5
6 import urllib2
7 import lxml
8 import time
9 import progressbar
10 from bs4 import BeautifulSoup
11
def askURL(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.

    Returns:
        The bytes returned by the server, or an empty string when the
        request fails. On failure the error's ``code`` and/or ``reason``
        (whichever are present) are printed.
    """
    request = urllib2.Request(url)
    try:
        response = urllib2.urlopen(request)
    except urllib2.URLError as e:
        # HTTPError is a subclass of URLError, so this also covers failures
        # that never reach the server (DNS errors, refused connections),
        # which carry a ``reason`` but no ``code``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # There is no response object to read from on this path.
        return ''
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()
24
def getData(content_val, max_items=10, out_path='./dataText.txt'):
    """Extract review rows from a review-list page and append them to a file.

    Every row is written as one tab-separated line containing, in order:
    review title, movie title, movie URL, rating label, short review text,
    review URL and poster image URL.

    Args:
        content_val: HTML markup of the page to parse.
        max_items: Upper bound on the number of rows written per call
            (``None`` means no limit). Defaults to 10.
        out_path: Path of the text file the rows are appended to.

    Returns:
        None. Rows are appended to *out_path*; when at least one row was
        written the path is printed followed by an empty line. If the file
        cannot be opened or written, the error is printed instead.
    """
    import io  # local import: text-mode file handle with explicit encoding

    soup = BeautifulSoup(content_val, 'lxml')
    containers = soup.find_all(class_='review-list chart', recursive=True)

    comment_titles = []   # review titles
    movie_titles = []     # movie titles
    movie_urls = []       # movie page URLs
    ranks = []            # rating labels
    comment_urls = []     # review page URLs
    short_comments = []   # short review texts
    images = []           # poster image URLs
    for container in containers:
        subject_links = container.find_all('a', class_='subject-title')
        comment_titles.extend(
            tag.get_text() for tag in container.find_all(class_='title-link'))
        movie_titles.extend(tag.get_text() for tag in subject_links)
        movie_urls.extend(tag.get('href', u'') for tag in subject_links)
        ranks.extend(
            tag.get('title', u'')
            for tag in container.find_all('span', 'main-title-rating'))
        comment_urls.extend(
            tag.get('href', u'')
            for tag in container.find_all('a', class_='title-link'))
        short_comments.extend(
            _short_comment_text(tag)
            for tag in container.find_all('div', 'short-content'))
        images.extend(
            _poster_src(tag)
            for tag in container.find_all('a', class_='subject-img'))

    # zip() stops at the shortest list, so a page with fewer entries than
    # max_items (or with no matching container at all) no longer raises.
    # NOTE(review): the columns are matched purely by position; this assumes
    # every review on the page exposes all seven elements - confirm against
    # the live markup.
    rows = list(zip(comment_titles, movie_titles, movie_urls, ranks,
                    short_comments, comment_urls, images))[:max_items]
    if not rows:
        return

    # Keep everything as text and encode exactly once, at the file boundary.
    payload = u''.join(u'\t'.join(row) + u'\n' for row in rows)
    try:
        with io.open(out_path, 'a', encoding='utf-8') as out_file:
            out_file.write(payload)
    except IOError as error:
        print(error)
        return

    print(' '.join(['[ 當期文件路徑: ]', out_path]))
    print('')


def _short_comment_text(tag):
    """Return the stripped leading text of a short-content node.

    Reviews flagged as spoilers do not show their text on the listing page,
    which leaves the leading text empty; a fixed placeholder is returned in
    that case (illustration:
    http://img.vim-cn.com/56/b8a707fb0fcc9b4bc08c738d169f57409a36ce.png).
    """
    from bs4 import NavigableString

    first = tag.contents[0] if tag.contents else u''
    # The first child may be a nested tag rather than text.
    text = first.strip() if isinstance(first, NavigableString) else u''
    if text == u'':
        return u"這篇影評可能有劇透, 沒關係之後可以顯示全文 :-)"
    return text


def _poster_src(tag):
    """Return the ``src`` of the first <img> inside *tag*, or '' if absent."""
    img = tag.img
    if img is None:
        return u''
    return img.get('src', u'')
85
def main():
    """Fetch and process five listing pages, one after another.

    The ``start`` query parameter advances by 20 per page. For each page a
    status line is printed, the progress bar is shown, the page is
    downloaded with ``askURL`` and the result is passed to ``getData``.
    """
    listing_prefix = 'https://movie.douban.com/review/best/?start='
    for page_no, offset in enumerate(range(0, 100, 20), 1):
        page_url = listing_prefix + str(offset)
        print(r'正在抓取第' + str(page_no) + '個頁面,請稍等……')
        print_bar()
        page_html = askURL(page_url)
        getData(page_html)
94
def print_bar():
    """Step a ``progressbar.ProgressBar`` through 100 ticks of 0.05 s each.

    The bar only sleeps between ticks; it does not track any real work.
    """
    ticks = range(100)
    for _ in progressbar.ProgressBar()(ticks):
        time.sleep(0.05)
99
# Start the scraper only when this file is executed directly as a script.
if __name__ == '__main__':
    main()
閱讀更多 Python樂園 的文章