Crawler proxy pool: scraping millions of records made easy.

1. Today we'll build something very useful: a proxy IP pool. The end result is a task that runs at a fixed interval, crawls usable proxies from a target proxy-listing site into a MySQL database, and also checks whether the proxies already stored in the database are still usable, deleting any that are not.


2. Writing the spider that extracts proxy IPs into the database

2.1 Prepare the MySQL table

<code>
CREATE TABLE `t_ips` (
  `id` int(10) NOT NULL AUTO_INCREMENT COMMENT 'primary key',
  `ip` varchar(15) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT 'ip',
  `port` int(10) NOT NULL COMMENT 'port',
  `type` int(10) NOT NULL DEFAULT '0' COMMENT '0:http 1:https',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=421 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci COMMENT='proxy ip table';
</code>
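As a quick sanity check once the table exists, you can connect from Python and look at what the pool contains. This snippet is my own addition, not part of the original project; it assumes pymysql (the same driver used later for the timer task) and placeholder connection details.

<code>
# Assumed helper: list a few rows from the proxy pool.
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root',
                       password='your-db-password', db='xici', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT id, ip, port, type FROM t_ips LIMIT 5")
        for row in cursor.fetchall():
            # each row is (id, ip, port, type), where type is 0 for http and 1 for https
            print(row)
finally:
    conn.close()
</code>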

2.2 Create the Scrapy project and write items.py (one field per database column)

<code>
import scrapy


class IpsItem(scrapy.Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    httpType = scrapy.Field()
</code>

2.3 Write settings.py

<code>
# custom settings used by the spider and the timer task
MAX_PAGE = 2                  # number of listing pages to crawl
TYPE = 0                      # 0:http 1:https, stored with each scraped proxy
URL = 'http://www.bugng.com/gnpt?page='   # proxy listing page, page number is appended
TIMER_STOP_TIME = 20          # interval in seconds between pool-cleaning runs

BOT_NAME = 'ips'
SPIDER_MODULES = ['ips.spiders']
NEWSPIDER_MODULE = 'ips.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
ITEM_PIPELINES = {
    'ips.pipelines.IpsPipeline': 300,
}
RETRY_ENABLED = False
ROBOTSTXT_OBEY = False
DOWNLOAD_TIMEOUT = 2
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 2
</code>

2.4 Write the spider

bs4 is used here, so install it first (pip install beautifulsoup4).

<code>
import scrapy
import logging
from bs4 import BeautifulSoup
from ips.items import IpsItem
from ips.settings import *


class XicispiderSpider(scrapy.Spider):
    name = 'xiciSpider'
    allowed_domains = ['xicidaili.com']
    start_urls = ['http://xicidaili.com/']

    def start_requests(self):
        # the actual listing URL comes from settings (URL), one request per page
        req = []
        for i in range(1, MAX_PAGE):
            req.append(scrapy.Request(URL + str(i - 1)))
        return req

    def parse(self, response):
        print('@@@@@@@@@ start parsing ' + response.url)
        try:
            soup = BeautifulSoup(str(response.body, encoding="utf-8"), 'html.parser')
            trs = soup.find('table', {'class': 'table'}).find_all('tr')
            for tr in trs[1:]:  # skip the header row
                tds = tr.find_all('td')
                cur = 0
                item = IpsItem()
                item['httpType'] = TYPE
                for td in tds:
                    if cur == 0:
                        item['ip'] = td.text
                    if cur == 1:
                        item['port'] = td.text
                    cur = cur + 1
                yield item
        except Exception as e:
            logging.log(logging.WARN, '@@@@@@@@@ parse error ' + str(e))
</code>
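If you prefer not to add the bs4 dependency, the same table rows can also be read with Scrapy's built-in XPath selectors. This is my own sketch of a rough drop-in replacement for parse(), assuming the listing page keeps the same table layout:

<code>
    def parse(self, response):
        # sketch: same extraction as above, but with Scrapy selectors instead of bs4
        rows = response.xpath('//table[contains(@class, "table")]//tr')
        for tr in rows[1:]:  # skip the header row
            cells = [td.xpath('string(.)').get('').strip() for td in tr.xpath('./td')]
            if len(cells) < 2:
                continue
            item = IpsItem()
            item['httpType'] = TYPE
            item['ip'] = cells[0]
            item['port'] = cells[1]
            yield item
</code>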

2.5 Write the pipeline

The MySQL driver needs to be installed for this step: pip install mysqlclient

Two checks are performed before inserting into the database:

1. whether the record already exists

2. whether the proxy is actually usable

<code>
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
import logging
import requests


class IpsPipeline(object):

    def __init__(self):
        dbargs = dict(
            host='your-db-host',
            db='your-db-name',
            user='root',
            passwd='your-db-password',
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    def process_item(self, item, spider):
        # run the insert asynchronously on twisted's database thread pool
        self.dbpool.runInteraction(self.insert_into_table, item)
        return item

    def insert_into_table(self, conn, item):
        # check 1: skip proxies that are already in the table
        if self.exsist(item, conn):
            return
        # check 2: skip proxies that are not actually usable
        if self.proxyIpCheck(item['ip'], item['port']) is False:
            print("proxy unusable:", item['ip'], ':', str(item['port']))
            return
        sql = 'insert into t_ips (ip,port,type) VALUES ('
        sql = sql + '"' + item['ip'] + '",'
        sql = sql + str(item['port']) + ','
        sql = sql + str(item['httpType'])
        sql = sql + ')'
        try:
            conn.execute(sql)
            print(sql)
        except Exception as e:
            logging.log(logging.WARNING, "sql error >> " + sql)

    def exsist(self, item, conn):
        sql = 'select * from t_ips where ip="' + item['ip'] + '" and port=' + str(item['port'])
        try:
            conn.execute(sql)
            results = conn.fetchall()
            if len(results) > 0:
                return True
        except:
            return False
        return False

    def proxyIpCheck(self, ip, port):
        server = ip + ":" + str(port)
        proxies = {'http': 'http://' + server, 'https': 'https://' + server}
        try:
            r = requests.get('https://www.baidu.com/', proxies=proxies, timeout=1)
            if r.status_code == 200:
                return True
            else:
                return False
        except:
            return False
</code>
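The pipeline above builds its SQL by string concatenation, which works but breaks on odd characters and is open to injection if a listing page ever returns hostile values. A safer variant of the insert and existence check, sketched here as drop-in replacements for the two methods, uses the DB-API parameter style that MySQLdb understands:

<code>
    def insert_into_table(self, conn, item):
        # sketch: same two checks as above, but the driver handles quoting of values
        if self.exsist(item, conn):
            return
        if self.proxyIpCheck(item['ip'], item['port']) is False:
            print("proxy unusable:", item['ip'], ':', str(item['port']))
            return
        try:
            conn.execute(
                "INSERT INTO t_ips (ip, port, type) VALUES (%s, %s, %s)",
                (item['ip'], item['port'], item['httpType']),
            )
        except Exception as e:
            logging.log(logging.WARNING, "insert failed: " + str(e))

    def exsist(self, item, conn):
        try:
            conn.execute(
                "SELECT 1 FROM t_ips WHERE ip = %s AND port = %s",
                (item['ip'], item['port']),
            )
            return len(conn.fetchall()) > 0
        except Exception:
            return False
</code>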

2.6 Test the spider with scrapy crawl <spider name>, e.g. scrapy crawl xiciSpider

3. At this point the spider that extracts proxy IPs into the database is done. Next we write the scheduled task that keeps the pool clean.

<code>
import os
import threading

import pymysql
import requests

from settings import *


def run():
    clearIpPool()
    # re-schedule ourselves so the pool is cleaned every TIMER_STOP_TIME seconds
    timer = threading.Timer(TIMER_STOP_TIME, run)
    timer.start()


def clearIpPool():
    print("timer fired: cleaning the ip pool")
    # crawl fresh proxies into the database
    os.system('scrapy crawl xiciSpider --nolog')
    # then drop the ones that are no longer usable
    removeUnSafeProxyFromDB()
    print("timer run finished")


def removeUnSafeProxyFromDB():
    db = pymysql.connect(host="39.108.112.254", user="root",
                         password="abc123|||456", db="xici")
    cursor = db.cursor()
    sql = "SELECT * FROM t_ips"
    try:
        cursor.execute(sql)
        results = cursor.fetchall()
        for row in results:
            id = row[0]
            ip = row[1]
            port = row[2]
            if proxyIpCheck(ip, str(port)) is False:
                print("proxy unusable:", ip, ':', str(port))
                sql = "DELETE FROM t_ips WHERE id = " + str(id)
                cursor.execute(sql)
                print(sql)
                db.commit()
    except:
        print("Error: unable to fetch data")
    db.close()


def proxyIpCheck(ip, port):
    server = ip + ":" + str(port)
    proxies = {'http': 'http://' + server, 'https': 'https://' + server}
    try:
        r = requests.get('https://www.baidu.com/', proxies=proxies, timeout=1)
        if r.status_code == 200:
            return True
        else:
            return False
    except:
        return False


print("ip pool timer started, interval:", str(TIMER_STOP_TIME), 's')
timer = threading.Timer(TIMER_STOP_TIME, run)
timer.start()
</code>
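Finally, to actually consume the pool from another Scrapy project, a downloader middleware can pick a random proxy from t_ips and attach it to each outgoing request. This is not part of the original article; it is a minimal sketch with placeholder connection details:

<code>
import pymysql


class RandomProxyMiddleware(object):
    """Sketch: attach a random proxy from the t_ips pool to every request."""

    def __init__(self):
        self.db = pymysql.connect(host='127.0.0.1', user='root',
                                  password='your-db-password', db='xici', charset='utf8')

    def process_request(self, request, spider):
        with self.db.cursor() as cursor:
            # let MySQL pick a random row; fine for a pool of a few hundred proxies
            cursor.execute("SELECT ip, port, type FROM t_ips ORDER BY RAND() LIMIT 1")
            row = cursor.fetchone()
        if row:
            scheme = 'https' if row[2] == 1 else 'http'
            request.meta['proxy'] = '%s://%s:%s' % (scheme, row[0], row[1])

# enable it in the consuming project's settings.py, for example:
# DOWNLOADER_MIDDLEWARES = {'yourproject.middlewares.RandomProxyMiddleware': 543}
</code>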
