使用爬蟲不可避免的就會遇到網站的各種封ip操作,因此就需要我們找尋代理,通過代理進行操作,屏蔽自己真實ip。
無私分享全套Python爬蟲乾貨,如果你也想學習Python,@ 私信小編獲取

import requests
import pymongo
from lxml.html import etree
class SelfIpProxy():
    """Scrape free proxies from xicidaili.com, store them in MongoDB,
    and validate each one against a live test URL."""

    def __init__(self):  # configuration area
        self.depth = 1        # number of listing pages to scrape
        self.timeout = 10     # per-request timeout (seconds) during validation
        self.collection = pymongo.MongoClient()['Proxies']['free2']
        # One validation target per scheme.
        self.url = {'http': "http://19ncc.medmeeting.org/cn",
                    'https': "https://www.baidu.com"}
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Referer': 'https://www.xicidaili.com/nn/2'}

    def get_ip(self):
        """Yield one {scheme: 'scheme://ip:port'} dict per proxy-table row."""
        urls = [
            'https://www.xicidaili.com/nn/{}'.format(i) for i in range(1, self.depth + 1)]
        for url in urls:
            html = requests.get(url, headers=self.headers, timeout=30)
            html.encoding = 'utf-8'
            e_html = etree.HTML(html.text)
            ips = e_html.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
            ports = e_html.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
            modes = e_html.xpath('//table[@id="ip_list"]/tr/td[6]/text()')
            for ip, port, mode in zip(ips, ports, modes):
                item = dict()
                item[mode.lower()] = '{}://{}:{}'.format(mode.lower(), ip, port)
                yield item

    def store_ip(self):
        """Persist every scraped proxy document into MongoDB."""
        for i in self.get_ip():
            self.collection.insert_one(i)

    def check_ip(self):
        """Probe each stored proxy; delete the ones that fail."""
        count = 0
        # no_cursor_timeout keeps the cursor alive during the long scan
        # (avoids pymongo's CursorNotFound error); it must therefore be
        # closed manually below.  NOTE: the original source had this comment
        # spilled onto its own line as bare code, which was a syntax error.
        demo = self.collection.find({}, {'_id': 0}, no_cursor_timeout=True)
        for ip in demo:
            count += 1
            print('正在測試第{}個ip'.format(count))
            for key, value in ip.items():
                try:
                    html = requests.get(self.url[key],
                                        headers=self.headers, proxies={key: value}, timeout=self.timeout)
                    html.encoding = 'utf-8'
                    html.raise_for_status()  # non-2xx responses count as failures
                    print('************當前ip測試通過,當前ip為{}************'.format(value))
                except Exception:
                    # was `except BaseException:` — that also swallowed
                    # KeyboardInterrupt/SystemExit, making the scan impossible
                    # to stop cleanly.
                    print('當前ip測試不通過,當前ip為{}'.format(value))
                    self.collection.delete_one(ip)
        demo.close()  # close the manually-opened cursor
if __name__ == '__main__':
    # All configuration lives in SelfIpProxy.__init__().
    my_ip = SelfIpProxy()
    my_ip.store_ip()   # scrape and store proxies in MongoDB (fast; no threading needed)
    my_ip.check_ip()   # validate each stored proxy
    # my_ip.anti_duplicate()  # deduplicate
結果,絕大部分都是不可用的,少量能用上:
正在測試第318個ip
當前ip測試不通過,當前ip為
https://114.239.255.179:9999
正在測試第319個ip
當前ip測試不通過,當前ip為
https://222.189.246.79:9999
正在測試第320個ip
當前ip測試不通過,當前ip為
https://163.204.240.117:9999
正在測試第321個ip
當前ip測試不通過,當前ip為http://120.83.99.253:9999
正在測試第322個ip
當前ip測試通過,當前ip為http://59.57.148.10:9999
正在測試第323個ip
當前ip測試不通過,當前ip為http://182.35.81.209:9999
正在測試第324個ip
當前ip測試不通過,當前ip為http://112.87.69.236:9999
正在測試第325個ip
當前ip測試不通過,當前ip為http://120.83.108.41:9999
改成多進程,本來想將多進程的函數也寫進類裡面,但是不知道怎麼回事main函數調用就沒反應了,無奈只能在class之外重寫了一個check_ip函數,全代碼如下:
import requests
import pymongo
from lxml.html import etree
from multiprocessing import Pool
class SelfIpProxy():
    """Scrape free proxies from xicidaili.com and keep them in MongoDB."""

    def __init__(self):  # configuration area
        self.depth = 10  # listing pages to scrape
        self.collection = pymongo.MongoClient()['Proxies']['free2']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Referer': 'https://www.xicidaili.com/nn/2'}

    def get_ip(self):
        """Yield one {scheme: 'scheme://ip:port'} dict per proxy-table row."""
        for page in range(1, self.depth + 1):
            response = requests.get(
                'https://www.xicidaili.com/nn/{}'.format(page),
                headers=self.headers, timeout=30)
            response.encoding = 'utf-8'
            tree = etree.HTML(response.text)
            rows = zip(
                tree.xpath('//table[@id="ip_list"]/tr/td[2]/text()'),
                tree.xpath('//table[@id="ip_list"]/tr/td[3]/text()'),
                tree.xpath('//table[@id="ip_list"]/tr/td[6]/text()'))
            for ip, port, mode in rows:
                scheme = mode.lower()
                yield {scheme: '{}://{}:{}'.format(scheme, ip, port)}

    def store_ip(self):
        """Insert every scraped proxy document into the collection."""
        for doc in self.get_ip():
            self.collection.insert_one(doc)

    def anti_duplicate(self):
        """Drop and rebuild the collection, keeping only unique documents."""
        # no_cursor_timeout keeps the long-lived cursor alive; close it manually.
        cursor = self.collection.find({}, {'_id': 0}, no_cursor_timeout=True)
        seen = []
        for doc in cursor:
            if doc not in seen:
                seen.append(doc)
        cursor.close()
        self.collection.drop()
        for doc in seen:
            self.collection.insert_one(doc)
def check_ip(proxy):
    """Validate one proxy dict of the form {scheme: 'scheme://ip:port'}.

    A proxy that answers the scheme-matched probe URL with a 2xx status is
    copied into the 'checked' collection; any failure is only reported.
    Designed to run inside a multiprocessing.Pool worker, so each call opens
    its own MongoClient.
    """
    url = {'http': "http://www.baidu.com", 'https': "https://www.baidu.com"}
    for key, value in proxy.items():
        try:
            html = requests.get(url[key], proxies={key: value}, timeout=10)
            html.encoding = 'utf-8'
            html.raise_for_status()  # non-2xx responses count as failures
            print('***************************當前ip測試通過,當前ip為{}***************************\n'.format(value))
            pymongo.MongoClient()['Proxies']['checked'].insert_one(proxy)
        except Exception:
            # was a bare `except:` — that also swallowed KeyboardInterrupt and
            # SystemExit, making the worker processes impossible to interrupt.
            print('當前ip測試失敗,當前ip為{}'.format(value))
if __name__ == '__main__':
    # All configuration lives in SelfIpProxy.__init__().
    my_ip = SelfIpProxy()
    my_ip.store_ip()  # scrape and store proxies in MongoDB (fast; no threading needed)
    proxies = []  # materialize stored proxies so they can be fanned out to workers
    # no_cursor_timeout keeps the long-lived cursor alive during the full scan;
    # it must therefore be closed manually.
    demo = my_ip.collection.find({}, {'_id': 0}, no_cursor_timeout=True)
    for doc in demo:
        proxies.append(doc)
    my_ip.collection.drop()
    demo.close()  # bug fix: original wrote `demo.close` (no parens), which never ran
    pool = Pool(8)  # validate proxies across 8 worker processes
    for proxy in proxies:
        pool.apply_async(check_ip, args=(proxy,))
    pool.close()
    pool.join()
    # my_ip.anti_duplicate()  # deduplicate
多進程的效果是原來大約4個小時才能跑完1000個驗證,現在大約半小時就能搞定,最後出來一共42個。
為了幫助大家更輕鬆的學好Python,我給大家分享一套Python學習資料,希望對正在學習的你有所幫助!
獲取方式:關注並私信小編 “ 學習 ”,即可免費獲取!