最近在学习 Go,就找了一个例子来练手:《【go语言爬虫】go语言爬取豆瓣电影top250》。思路大概是先获取网页,然后根据页面元素,用正则表达式匹配出电影名称、评分和评论人数。原文有一处需要修改,即 pattern4 的正则表达式(修改后的表达式内容在本页转载时已丢失,请以原文页面结构为准)。
这个例子稍作修改就可以变成并发的形式,从而提高性能(参考 golang 并发 chan):
<code>var
semchan
int
=make
(chan
int
,10
);for
i :=0
; i10
; i++ {go
func
(i
int
) { header :=map
[string
]string
{"Host"
:"movie.douban.com"
,"Connection"
:"keep-alive"
,"Cache-Control"
:"max-age=0"
,"Upgrade-Insecure-Requests"
:"1"
,"User-Agent"
:"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"
,"Accept"
:"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
,"Referer"
:"https://movie.douban.com/top250"
, } fmt.Println("正在抓取第"
+ strconv.Itoa(i) +"页......"
) url :="https://movie.douban.com/top250?start="
+ strconv.Itoa(i*25
) +"&filter="
spider := &Spider{url, header} html := spider.get_html_header() pattern2 :=`
(.*?)评价
` rp2 := regexp.MustCompile(pattern2) find_txt2 := rp2.FindAllStringSubmatch(html,-1
) pattern3 :=`property="v:average">(.*?)
` rp3 := regexp.MustCompile(pattern3) find_txt3 := rp3.FindAllStringSubmatch(html,-1
) pattern4 := rp4 := regexp.MustCompile(pattern4) find_txt4 := rp4.FindAllStringSubmatch(html,-1
)for
i :=0
; ilen
(find_txt2); i++ { fmt.Printf("%s %s %s\n"
, find_txt4[i][1
], find_txt3[i][1
], find_txt2[i][1
], ) f.WriteString(find_txt4[i][1
] +"\t"
+ find_txt3[i][1
] +"\t"
+ find_txt2[i][1
] +"\t"
+"\r\n"
) } sem 0rp4 := regexp.MustCompile(pattern4) find_txt4 := rp4.FindAllStringSubmatch(html,i :=; i
for
i :=0
; i10
; i++ { close(sem) /<code>
到这里,Go 爬虫部分已经介绍完毕。百无聊赖之际,我又写了一个 Python 版,Python 的写法很简洁:
<code>import
re
import
urllib2
import
datetime
def
getDouban(i):
"爬取第" + str(i)+"页"
html
="https://movie.douban.com/top250?start=" + str(i) + "&filter="
try
:page
=urllib2.urlopen(html, timeout=3)
result
=page.read()
score
=re.findall('property="v:average">(.*?)
',result)person
=re.findall('
(.*?)评价
',result)name
=j
=j+1
except
:i
starttime
=datetime.datetime.now()
params
=[]
for
i in range(25):
getDouban(i)
endtime
=datetime.datetime.now()
"爬虫历时"+str((endtime-starttime).seconds)+"s完成"
re.findall('
</code>
关键字:regexp、FindAllStringSubmatch、练手