最近在學習 Go,就找了一個例子來練習:【go語言爬蟲】go語言爬取豆瓣電影top250。思路大概是先獲取網頁,然後根據頁面元素,用正則表達式匹配電影名稱、評分和評論人數。原文有一個地方需要修改一下,即 pattern4 的定義(此處的正則表達式內容在轉載排版時遺失):pattern4 := `
這個例子稍作修改就可以變成併發的形式,以提高性能(參考 golang 併發 chan):
<code>var
semchan
int
=make
(chan
int
,10
);for
i :=0
; i10
; i++ {go
func
(i
int
) { header :=map
[string
]string
{"Host"
:"movie.douban.com"
,"Connection"
:"keep-alive"
,"Cache-Control"
:"max-age=0"
,"Upgrade-Insecure-Requests"
:"1"
,"User-Agent"
:"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"
,"Accept"
:"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
,"Referer"
:"https://movie.douban.com/top250"
, } fmt.Println("正在抓取第"
+ strconv.Itoa(i) +"頁......"
) url :="https://movie.douban.com/top250?start="
+ strconv.Itoa(i*25
) +"&filter="
spider := &Spider{url, header} html := spider.get_html_header() pattern2 :=`
(.*?)評價
` rp2 := regexp.MustCompile(pattern2) find_txt2 := rp2.FindAllStringSubmatch(html,-1
) pattern3 :=`property="v:average">(.*?)
` rp3 := regexp.MustCompile(pattern3) find_txt3 := rp3.FindAllStringSubmatch(html,-1
) pattern4 := rp4 := regexp.MustCompile(pattern4) find_txt4 := rp4.FindAllStringSubmatch(html,-1
)for
i :=0
; ilen
(find_txt2); i++ { fmt.Printf("%s %s %s\n"
, find_txt4[i][1
], find_txt3[i][1
], find_txt2[i][1
], ) f.WriteString(find_txt4[i][1
] +"\t"
+ find_txt3[i][1
] +"\t"
+ find_txt2[i][1
] +"\t"
+"\r\n"
) } sem 0rp4 := regexp.MustCompile(pattern4) find_txt4 := rp4.FindAllStringSubmatch(html,i :=; i
for
i :=0
; i10
; i++ { close(sem) </code>
到這裡 Go 爬蟲部分已經介紹完畢。百無聊賴之際又寫了一個 Python 版,Python 寫起來很簡潔:
<code>import
re
import
urllib2
import
datetime
def
getDouban(i):
"爬取第" + str(i)+"頁"
html
="https://movie.douban.com/top250?start=" + str(i) + "&filter="
try
:page
=urllib2.urlopen(html, timeout=3)
result
=page.read()
score
=re.findall('property="v:average">(.*?)
',result)person
=re.findall('
(.*?)評價
',result)name
=j
=j+1
except
:i
starttime
=datetime.datetime.now()
params
=[]
for
i in range(25):
getDouban(i)
endtime
=datetime.datetime.now()
"爬蟲歷時"+str((endtime-starttime).seconds)+"s完成"
re.findall('
</code>