Hands-on sharing: a programmer walks you through building an IP pool, so you can scrape big data without IP bans and without fearing anti-crawler measures.

When you run a crawler, you will inevitably run into sites banning your IP one way or another, so you need to find proxies and send your requests through them, masking your real IP.
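Everything below builds on one mechanism: the proxies parameter of requests, which routes a request through a proxy instead of your own connection. A minimal sketch, assuming a placeholder proxy address (substitute a live one from your pool; httpbin.org is just a convenient echo service, not part of the original code):

import requests

# Route a single request through an HTTP proxy. The address below is a
# placeholder; substitute a live proxy from your pool.
proxy = {'http': 'http://1.2.3.4:8080'}
resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=10)
print(resp.text)  # prints the origin IP the target saw, i.e. the proxy's IP

The full pool builder, which scrapes xicidaili.com, stores candidates in MongoDB, and then validates them, follows: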


import requests
import pymongo
from lxml.html import etree


class SelfIpProxy():

    def __init__(self):  # configuration area
        self.depth = 1
        self.timeout = 10
        self.collection = pymongo.MongoClient()['Proxies']['free2']
        self.url = {'http': "http://19ncc.medmeeting.org/cn", 'https': "https://www.baidu.com"}
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Referer': 'https://www.xicidaili.com/nn/2'}

    def get_ip(self):  # scrape proxies from the listing site
        urls = [
            'https://www.xicidaili.com/nn/{}'.format(i) for i in range(1, self.depth + 1)]
        for url in urls:
            html = requests.get(url, headers=self.headers, timeout=30)
            html.encoding = 'utf-8'
            e_html = etree.HTML(html.text)
            ips = e_html.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
            ports = e_html.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
            modes = e_html.xpath('//table[@id="ip_list"]/tr/td[6]/text()')
            for ip, port, mode in zip(ips, ports, modes):
                item = dict()
                item[mode.lower()] = '{}://{}:{}'.format(mode.lower(), ip, port)
                yield item

    def store_ip(self):
        for i in self.get_ip():
            self.collection.insert_one(i)

    def check_ip(self):
        count = 0
        # keep the cursor open manually to avoid pymongo.errors.CursorNotFound
        demo = self.collection.find({}, {'_id': 0}, no_cursor_timeout=True)
        for ip in demo:
            count += 1
            print('Testing proxy #{}'.format(count))
            for key, value in ip.items():
                try:
                    html = requests.get(self.url[key],
                                        headers=self.headers, proxies={key: value}, timeout=self.timeout)
                    html.encoding = 'utf-8'
                    html.raise_for_status()
                    print('************ Proxy passed: {} ************'.format(value))
                except BaseException:
                    print('Proxy failed: {}'.format(value))
                    self.collection.delete_one(ip)
        demo.close()  # close the cursor manually

    def anti_duplicate(self):  # deduplicate
        demo = self.collection.find({}, {'_id': 0}, no_cursor_timeout=True)
        l = []
        for i in demo:
            if i not in l:
                l.append(i)
        demo.close()
        self.collection.drop()
        for i in l:
            self.collection.insert_one(i)


if __name__ == '__main__':
    # all settings live inside the class's __init__() method
    my_ip = SelfIpProxy()
    my_ip.store_ip()  # fetch and store proxies in MongoDB; this part is fast, no multithreading needed
    my_ip.check_ip()  # check whether each proxy works
    # my_ip.anti_duplicate()  # deduplicate
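A side note on anti_duplicate(): it dedupes with membership tests against a Python list, which is O(n²), and then drops and refills the collection. A sketch of an alternative I did not use above (insert_unique is a hypothetical helper name, not part of the original code): store the proxy address as MongoDB's _id, so the database rejects duplicates at insert time.

import pymongo
from pymongo.errors import DuplicateKeyError

collection = pymongo.MongoClient()['Proxies']['free2']

def insert_unique(item):
    # item looks like {'http': 'http://1.2.3.4:9999'}
    (mode, address), = item.items()
    try:
        # reusing the address as _id makes MongoDB enforce uniqueness itself
        collection.insert_one({'_id': address, mode: address})
    except DuplicateKeyError:
        pass  # already in the pool, nothing to do

Since check_ip() already projects _id away with {'_id': 0}, the rest of the pipeline would not notice the change.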

The result: the vast majority are unusable, with only a few that work:

Testing proxy #318
Proxy failed: https://114.239.255.179:9999

Testing proxy #319
Proxy failed: https://222.189.246.79:9999

Testing proxy #320
Proxy failed: https://163.204.240.117:9999

Testing proxy #321
Proxy failed: http://120.83.99.253:9999

Testing proxy #322
Proxy passed: http://59.57.148.10:9999

Testing proxy #323
Proxy failed: http://182.35.81.209:9999

Testing proxy #324
Proxy failed: http://112.87.69.236:9999

Testing proxy #325
Proxy failed: http://120.83.108.41:9999

Next I switched to multiprocessing. I originally wanted to put the multiprocessing function inside the class as well, but for some reason calling it from main did nothing at all, so I had no choice but to rewrite check_ip as a standalone function outside the class.
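My best guess at the cause, an assumption rather than something I verified: pool.apply_async() pickles the callable and its arguments to ship them to the worker processes. A bound method pickles its whole instance along with it, and the pymongo client sitting on self holds thread locks that cannot be pickled. On top of that, apply_async() swallows the worker-side exception until you call .get() on the returned AsyncResult, so the failure looks like silence. A minimal sketch that should reproduce the symptom; Demo and work are hypothetical stand-ins:

from multiprocessing import Pool

import pymongo


class Demo:
    def __init__(self):
        # a pymongo handle on the instance, just like SelfIpProxy.collection
        self.collection = pymongo.MongoClient()['Proxies']['free2']

    def work(self, x):
        return x


if __name__ == '__main__':
    d = Demo()
    with Pool(2) as pool:
        res = pool.apply_async(d.work, args=(1,))  # looks like it does nothing...
        # ...until .get() re-raises the error hit while pickling the task,
        # e.g. TypeError: cannot pickle '_thread.lock' object
        print(res.get(timeout=30))

With that in mind, here is the full multiprocessing version, with check_ip rewritten as a module-level function: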

import requests
import pymongo
from lxml.html import etree
from multiprocessing import Pool


class SelfIpProxy():

    def __init__(self):  # configuration area
        self.depth = 10
        self.collection = pymongo.MongoClient()['Proxies']['free2']
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Referer': 'https://www.xicidaili.com/nn/2'}

    def get_ip(self):  # scrape proxies from the listing site
        urls = [
            'https://www.xicidaili.com/nn/{}'.format(i) for i in range(1, self.depth + 1)]
        for url in urls:
            html = requests.get(url, headers=self.headers, timeout=30)
            html.encoding = 'utf-8'
            e_html = etree.HTML(html.text)
            ips = e_html.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
            ports = e_html.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
            modes = e_html.xpath('//table[@id="ip_list"]/tr/td[6]/text()')
            for ip, port, mode in zip(ips, ports, modes):
                item = dict()
                item[mode.lower()] = '{}://{}:{}'.format(mode.lower(), ip, port)
                yield item

    def store_ip(self):
        for i in self.get_ip():
            self.collection.insert_one(i)

    def anti_duplicate(self):  # deduplicate
        demo = self.collection.find({}, {'_id': 0}, no_cursor_timeout=True)
        l = []
        for i in demo:
            if i not in l:
                l.append(i)
        demo.close()
        self.collection.drop()
        for i in l:
            self.collection.insert_one(i)


def check_ip(proxy):
    url = {'http': "http://www.baidu.com", 'https': "https://www.baidu.com"}
    for key, value in proxy.items():
        try:
            html = requests.get(url[key], proxies={key: value}, timeout=10)
            html.encoding = 'utf-8'
            html.raise_for_status()
            print('*************************** Proxy passed: {} ***************************\n'.format(value))
            pymongo.MongoClient()['Proxies']['checked'].insert_one(proxy)
        except:
            print('Proxy failed: {}'.format(value))


if __name__ == '__main__':
    # all settings live inside the class's __init__() method
    my_ip = SelfIpProxy()
    my_ip.store_ip()  # fetch and store proxies in MongoDB; fast, no multithreading needed
    proxies = []  # collect the stored proxies into a list so they can be handed to worker processes
    demo = my_ip.collection.find({}, {'_id': 0}, no_cursor_timeout=True)  # keep the cursor open manually: the collection is long, and a slow iteration can otherwise time the cursor out
    for i in demo:
        proxies.append(i)
    my_ip.collection.drop()
    demo.close()  # close the cursor manually
    pool = Pool(8)  # start the multiprocessing phase
    for i in range(len(proxies)):
        pool.apply_async(check_ip, args=(proxies[i], ))
    pool.close()
    pool.join()
    # my_ip.anti_duplicate()  # deduplicate

The effect of multiprocessing: validating 1,000 proxies used to take about 4 hours, and now it finishes in roughly half an hour. In the end, 42 usable proxies came out.
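Those survivors land in the Proxies.checked collection, so a real crawler can draw on them directly. A minimal sketch of one way to consume the pool (httpbin.org is again just a stand-in target, not part of the original code):

import random

import pymongo
import requests

# Pick one verified proxy at random from the 'checked' collection and use it.
checked = pymongo.MongoClient()['Proxies']['checked']
candidates = list(checked.find({}, {'_id': 0}))
if candidates:
    proxy = random.choice(candidates)  # e.g. {'http': 'http://59.57.148.10:9999'}
    resp = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=10)
    print(resp.status_code, resp.text)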
