根據 DouBan 影評網站提供的視頻內容信息向用戶進行個性化推薦。

5

import io
import time
import urllib2

import lxml
import progressbar
from bs4 import BeautifulSoup

11

def askURL(url):
    """Download *url* and return the raw response body.

    On a failed request the HTTP status code and/or the failure reason
    (whichever the exception carries) is printed and an empty string is
    returned, so the caller always receives a string.

    :param url: absolute URL to fetch.
    :return: the response body as returned by ``response.read()``, or ``''``
        when the request failed.
    """
    request = urllib2.Request(url)
    try:
        response = urllib2.urlopen(request)
    except urllib2.URLError as e:
        # URLError is the base class of HTTPError: an HTTPError carries
        # ``code``, a plain URLError (DNS failure, refused connection, ...)
        # only ``reason`` -- hence the hasattr() probes.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        # There is no response object to read from on this path.
        return ''

    try:
        return response.read()
    finally:
        # Release the underlying socket whether or not read() succeeded.
        response.close()

24

# Text type of the running interpreter (``unicode`` on Python 2, ``str`` on 3).
_TEXT_TYPE = type(u'')


def _leading_text(tag):
    """Return the stripped first text node of *tag*, or u'' if there is none."""
    children = tag.contents
    if children and isinstance(children[0], _TEXT_TYPE):
        return children[0].strip()
    return u''


def _poster_src(link):
    """Return the ``src`` of the ``<img>`` inside *link*, or u'' if absent."""
    image = link.img
    if image is None:
        return u''
    return image.get('src', u'')


def getData(content_val, max_items=10):
    """Parse a Douban "best reviews" page and append its rows to a text file.

    Each row is written to ``./dataText.txt`` as one tab-separated line:
    review title, movie title, movie URL, rating, short review text,
    review URL, poster image URL.

    :param content_val: HTML markup of the page (as returned by ``askURL``).
    :param max_items: upper bound on the number of rows written per page;
        defaults to 10. Fewer rows are written when the page holds fewer
        complete entries.
    :return: ``None``. Nothing is written or printed when the page yields
        no complete entry.
    """
    soup = BeautifulSoup(content_val, 'lxml')
    containers = soup.find_all(class_='review-list chart', recursive=True)

    comment_titles = []
    movie_titles = []
    movie_urls = []
    ranks = []
    comment_urls = []
    short_comments = []
    posters = []

    # Accumulate over every matching container so that an empty page leaves
    # empty lists behind (instead of undefined names) and a page with more
    # than one container does not silently keep only the last one.
    for container in containers:
        # Review titles.
        comment_titles.extend(
            link.get_text() for link in container.find_all(class_='title-link'))

        # Movie titles and movie URLs come from the same anchors.
        subject_links = container.find_all('a', class_='subject-title')
        movie_titles.extend(link.get_text() for link in subject_links)
        movie_urls.extend(link.get('href', u'') for link in subject_links)

        # Movie ratings (stored in the span's ``title`` attribute).
        ranks.extend(
            span.get('title', u'')
            for span in container.find_all('span', 'main-title-rating'))

        # Review URLs.
        comment_urls.extend(
            link.get('href', u'')
            for link in container.find_all('a', class_='title-link'))

        # Short review text. Reviews flagged as spoilers are not shown on
        # https://movie.douban.com/review/best/ and come back empty, so a
        # placeholder is substituted for them.
        # Illustration: http://img.vim-cn.com/56/b8a707fb0fcc9b4bc08c738d169f57409a36ce.png
        for block in container.find_all('div', 'short-content'):
            text = _leading_text(block)
            if not text:
                text = u"這篇影評可能有劇透, 沒關係之後可以顯示全文 :-)"
            short_comments.append(text)

        # Poster images.
        posters.extend(
            _poster_src(link)
            for link in container.find_all('a', class_='subject-img'))

    columns = (comment_titles, movie_titles, movie_urls, ranks,
               short_comments, comment_urls, posters)
    # Only emit rows for which every column has a value; this replaces the
    # fixed range(0, 10) that raised IndexError on shorter pages.
    row_count = min([max_items] + [len(column) for column in columns])
    if row_count <= 0:
        return

    # Rows are built as text and encoded once, at the file boundary.
    rows = [
        u'\t'.join(column[index] for column in columns) + u'\n'
        for index in range(row_count)
    ]

    SQL_dataText = './dataText.txt'
    try:
        # One open per page rather than one per row; the context manager
        # closes the file even if a write fails.
        with io.open(SQL_dataText, 'a+', encoding='utf-8') as files:
            files.writelines(rows)
    except (IOError, OSError) as error:
        print(error)
    else:
        # Report the output path once per page (the original guarded this
        # with ``flag == 10``, which was never true).
        print('[ 當期文件路徑: ] ' + SQL_dataText)
        print('')

85

def main():
    """Scrape five consecutive pages of the Douban best-review listing.

    Pages are requested with ``start`` offsets 0, 20, 40, 60 and 80. For
    each page a status line is printed, ``print_bar()`` is run, the page is
    downloaded with ``askURL`` and the markup is handed to ``getData``.
    """
    listing_prefix = 'https://movie.douban.com/review/best/?start='
    for page_no, offset in enumerate(range(0, 100, 20), 1):
        page_url = listing_prefix + str(offset)
        print(r'正在抓取第' + str(page_no) + '個頁面,請稍等……')
        print_bar()
        getData(askURL(page_url))

94

def print_bar():
    """Render a progress bar that advances through 100 steps of 0.05 s each."""
    steps = range(100)
    progress = progressbar.ProgressBar()
    for _ in progress(steps):
        # The bar is driven purely by this fixed delay.
        time.sleep(0.05)

99

# Start the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()



分享到:


相關文章: