爬虫练手-豆瓣top250(go版以及python版)

  最近学习 go,就找了一个例子练习:【go语言爬虫】go语言爬取豆瓣电影top250。思路大概就是获取网页,然后根据页面元素,用正则表达式匹配电影名称、评分、评论人数。原文有个地方需要修改一下:pattern4 := `<img width="100" alt="(.*?)" src=`

爬虫练手-豆瓣top250(go版以及python版)

这个例子稍加修改就可以变成并发的形式,提高性能(参考 golang 并发 chan)

<code>

var

sem

chan

int

=

make

(

chan

int

,

10

);

for

i :=

0

; i

10

; i++ {

go

func

(i

int

)

{ header :=

map

[

string

]

string

{

"Host"

:

"movie.douban.com"

,

"Connection"

:

"keep-alive"

,

"Cache-Control"

:

"max-age=0"

,

"Upgrade-Insecure-Requests"

:

"1"

,

"User-Agent"

:

"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"

,

"Accept"

:

"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"

,

"Referer"

:

"https://movie.douban.com/top250"

, } fmt.Println(

"正在抓取第"

+ strconv.Itoa(i) +

"页......"

) url :=

"https://movie.douban.com/top250?start="

+ strconv.Itoa(i*

25

) +

"&filter="

spider := &Spider{url, header} html := spider.get_html_header() pattern2 :=

`

(.*?)评价

`

rp2 := regexp.MustCompile(pattern2) find_txt2 := rp2.FindAllStringSubmatch(html,

-1

) pattern3 :=

`property="v:average">(.*?)

` rp3 := regexp.MustCompile(pattern3) find_txt3 := rp3.FindAllStringSubmatch(html,

-1

) pattern4 := (.*?) rp4 := regexp.MustCompile(pattern4) find_txt4 := rp4.FindAllStringSubmatch(html,

-1

)

for

i :=

0

; i

len

(find_txt2); i++ { fmt.Printf(

"%s %s %s\n"

, find_txt4[i][

1

], find_txt3[i][

1

], find_txt2[i][

1

], ) f.WriteString(find_txt4[i][

1

] +

"\t"

+ find_txt3[i][

1

] +

"\t"

+ find_txt2[i][

1

] +

"\t"

+

"\r\n"

) } sem 0

rp4 := regexp.MustCompile(pattern4) find_txt4 := rp4.FindAllStringSubmatch(html,i :=; i

for

i :=

0

; i

10

; i++ { close(sem) /<code>
爬虫练手-豆瓣top250(go版以及python版)

  到这里 go 爬虫部分已经介绍完毕。百无聊赖之际又写了一个 python 版,python 的写法很简洁:

<code> 

import

re

import

urllib2

import

datetime

def

getDouban(i):

print

"爬取第" + str(i)+"页"

html

=

"https://movie.douban.com/top250?start=" + str(i) + "&filter="

try

:

page

=

urllib2.urlopen(html, timeout=3)

result

=

page.read()

score

=

re.findall('property="v:average">(.*?)

',result)

person

=

re.findall('

(.*?)评价

',result)

name

= (.*?)

j

=

j+1

except

:

print

i

starttime

=

datetime.datetime.now()

params

=

[]

for

i in range(25):

getDouban(i)

endtime

=

datetime.datetime.now()

print

"爬虫历时"+str((endtime-starttime).seconds)+"s完成"

re.findall('

</code>
爬虫练手-豆瓣top250(go版以及python版)


分享到:


相關文章: