爬蟲練手-豆瓣top250(go版以及python版)

  最近學習 Go,就找了一個例子練習:【go語言爬蟲】go語言爬取豆瓣電影top250。思路大概就是獲取網頁,然後根據頁面元素,用正則表達式匹配電影名稱、評分、評論人數。原文有個地方需要修改下:pattern4 := `(.*?)`(此處正則表達式中的 HTML 標籤部分在轉載時遺失,僅剩捕獲組)。

爬蟲練手-豆瓣top250(go版以及python版)

這個例子可以稍作修改,變成併發的形式,以提高性能(參考 golang 併發 chan):

<code>

var

sem

chan

int

=

make

(

chan

int

,

10

);

for

i :=

0

; i

10

; i++ {

go

func

(i

int

)

{ header :=

map

[

string

]

string

{

"Host"

:

"movie.douban.com"

,

"Connection"

:

"keep-alive"

,

"Cache-Control"

:

"max-age=0"

,

"Upgrade-Insecure-Requests"

:

"1"

,

"User-Agent"

:

"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"

,

"Accept"

:

"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"

,

"Referer"

:

"https://movie.douban.com/top250"

, } fmt.Println(

"正在抓取第"

+ strconv.Itoa(i) +

"頁......"

) url :=

"https://movie.douban.com/top250?start="

+ strconv.Itoa(i*

25

) +

"&filter="

spider := &Spider{url, header} html := spider.get_html_header() pattern2 :=

`

(.*?)評價

`

rp2 := regexp.MustCompile(pattern2) find_txt2 := rp2.FindAllStringSubmatch(html,

-1

) pattern3 :=

`property="v:average">(.*?)

` rp3 := regexp.MustCompile(pattern3) find_txt3 := rp3.FindAllStringSubmatch(html,

-1

) pattern4 := (.*?) rp4 := regexp.MustCompile(pattern4) find_txt4 := rp4.FindAllStringSubmatch(html,

-1

)

for

i :=

0

; i

len

(find_txt2); i++ { fmt.Printf(

"%s %s %s\n"

, find_txt4[i][

1

], find_txt3[i][

1

], find_txt2[i][

1

], ) f.WriteString(find_txt4[i][

1

] +

"\t"

+ find_txt3[i][

1

] +

"\t"

+ find_txt2[i][

1

] +

"\t"

+

"\r\n"

) } sem 0

rp4 := regexp.MustCompile(pattern4) find_txt4 := rp4.FindAllStringSubmatch(html,i :=; i

for

i :=

0

; i

10

; i++ { close(sem) /<code>
爬蟲練手-豆瓣top250(go版以及python版)

  到這裡 Go 爬蟲部分已經介紹完畢。百無聊賴之際又寫了一個 Python 版,Python 很簡潔:

<code> 

import

re

import

urllib2

import

datetime

def

getDouban(i):

print

"爬取第" + str(i)+"頁"

html

=

"https://movie.douban.com/top250?start=" + str(i) + "&filter="

try

:

page

=

urllib2.urlopen(html, timeout=3)

result

=

page.read()

score

=

re.findall('property="v:average">(.*?)

',result)

person

=

re.findall('

(.*?)評價

',result)

name

= (.*?)

j

=

j+1

except

:

print

i

starttime

=

datetime.datetime.now()

params

=

[]

for

i in range(25):

getDouban(i)

endtime

=

datetime.datetime.now()

print

"爬蟲歷時"+str((endtime-starttime).seconds)+"s完成"

re.findall('

/<code>
爬蟲練手-豆瓣top250(go版以及python版)


分享到:


相關文章: