几十个爬虫大集合:人人贷、拍拍贷、气象数据、猫眼电影(完整项目)

www.umetrip.com --- 航旅纵横航班信息



代码展示:

# Standard library.
import os
import re
import time

# Third-party. (`import jieba` appeared twice in the listing; one statement
# is enough.)
import chardet
import jieba
import openpyxl
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtGui, QtWidgets
from selenium import webdriver

# Configure jieba before any segmentation happens. The dictionary path is
# relative, so it is resolved against the current working directory.
jieba.set_dictionary("dict/dict.txt")
jieba.initialize()

class Ui_MainWindow(object):
    """Widget layout for the main window.

    Uses the ``setupUi`` / ``retranslateUi`` split (this looks like pyuic5
    output, but that is not confirmed by the listing). ``setupUi`` creates
    the widgets and stores them as attributes on ``self``;
    ``retranslateUi`` assigns every user-visible caption.
    """

    def setupUi(self, MainWindow):
        """Create all widgets, menus and bars on *MainWindow*.

        Args:
            MainWindow: the ``QMainWindow`` instance to populate.
        """
        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(725, 590)

        # Central area: title list, its label and the four buttons.
        self.centralWidget = QtWidgets.QWidget(MainWindow)
        self.centralWidget.setObjectName("centralWidget")
        self.listWidget = QtWidgets.QListWidget(self.centralWidget)
        self.listWidget.setGeometry(QtCore.QRect(10, 40, 411, 471))
        self.listWidget.setObjectName("listWidget")
        self.pushButton = QtWidgets.QPushButton(self.centralWidget)
        self.pushButton.setGeometry(QtCore.QRect(479, 50, 171, 61))
        self.pushButton.setObjectName("pushButton")
        self.label = QtWidgets.QLabel(self.centralWidget)
        self.label.setGeometry(QtCore.QRect(10, 10, 51, 31))
        self.label.setObjectName("label")
        self.pushButton_2 = QtWidgets.QPushButton(self.centralWidget)
        self.pushButton_2.setGeometry(QtCore.QRect(430, 480, 80, 25))
        self.pushButton_2.setObjectName("pushButton_2")
        self.pushButton_3 = QtWidgets.QPushButton(self.centralWidget)
        self.pushButton_3.setGeometry(QtCore.QRect(480, 140, 171, 61))
        self.pushButton_3.setObjectName("pushButton_3")
        self.pushButton_4 = QtWidgets.QPushButton(self.centralWidget)
        self.pushButton_4.setGeometry(QtCore.QRect(480, 220, 171, 61))
        self.pushButton_4.setObjectName("pushButton_4")
        MainWindow.setCentralWidget(self.centralWidget)

        # Menu bar with a single menu.
        self.menuBar = QtWidgets.QMenuBar(MainWindow)
        self.menuBar.setGeometry(QtCore.QRect(0, 0, 725, 22))
        self.menuBar.setObjectName("menuBar")
        self.menu = QtWidgets.QMenu(self.menuBar)
        self.menu.setObjectName("menu")
        MainWindow.setMenuBar(self.menuBar)

        # Tool bar and status bar.
        self.mainToolBar = QtWidgets.QToolBar(MainWindow)
        self.mainToolBar.setObjectName("mainToolBar")
        MainWindow.addToolBar(QtCore.Qt.TopToolBarArea, self.mainToolBar)
        self.statusBar = QtWidgets.QStatusBar(MainWindow)
        self.statusBar.setObjectName("statusBar")
        MainWindow.setStatusBar(self.statusBar)

        # The only menu entry.
        self.action = QtWidgets.QAction(MainWindow)
        self.action.setObjectName("action")
        self.menu.addAction(self.action)
        self.menuBar.addAction(self.menu.menuAction())

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Assign the captions of the window and its widgets.

        Args:
            MainWindow: the window whose title is set.
        """
        _translate = QtCore.QCoreApplication.translate
        MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
        self.pushButton.setText(_translate("MainWindow", "采集"))  # "Collect"
        self.label.setText(_translate("MainWindow", "标题:"))  # "Titles:"
        self.pushButton_2.setText(_translate("MainWindow", "清空"))  # "Clear"
        # "Export word frequency"
        self.pushButton_3.setText(_translate("MainWindow", "导出词频"))
        # "Launch browser"
        self.pushButton_4.setText(_translate("MainWindow", "启动浏览器"))
        self.menu.setTitle(_translate("MainWindow", "菜单"))  # "Menu"
        self.action.setText(_translate("MainWindow", "退出"))  # "Quit"

def cutword(data):
    """Segment *data* with jieba and return the resulting token iterable.

    ``cut_all=False`` is passed through to ``jieba.cut`` unchanged.

    Args:
        data: the text to segment.

    Returns:
        Whatever ``jieba.cut`` returns for *data* (an iterable of tokens).
    """
    return jieba.cut(data, cut_all=False)

def wordfrequency(text):
    """Count how often each segmented word occurs in *text*.

    Latin letters, digits, whitespace and the listed ASCII/CJK punctuation
    are replaced by a single space before segmentation, so they never show
    up as words.

    Args:
        text: the raw text to analyse.

    Returns:
        dict mapping each word to its number of occurrences.
    """
    # Raw string: the pattern is full of regex escapes (\s, \d, \[ ...) that
    # are invalid *string* escapes and trigger warnings in a normal literal.
    # The pattern text itself is unchanged.
    sub_re = r'[a-zA-Z]+|[\s+\.\!\/_,$%^*\(\d+"\']+|[+—;—!:\(\):《》,。?、~@#¥%……&*()%~\[\]\|\?\·【】“”;-]+'
    text = re.sub(sub_re, ' ', text)
    result = {}
    for word in cutword(text):
        # Skip the separators introduced by the substitution above (and any
        # other whitespace-only token the segmenter may emit).
        if not word.strip():
            continue
        result[word] = result.get(word, 0) + 1
    return result

class Sycm(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window: collects item titles from a browser session, lists
    them, and exports their word frequencies to an Excel file.

    Attributes:
        titles: list of collected title strings shown in the list widget.
        browser: the selenium driver, or ``None`` until ``getBrowser`` runs.
    """

    def __init__(self):
        super().__init__()
        self.setupUi(self)
        self.titles = []
        # Created lazily by getBrowser(); crawl() checks for None so that
        # pressing "collect" first no longer raises AttributeError.
        self.browser = None
        self.basic_init()

    def basic_init(self):
        """Wire the menu action and the four buttons to their handlers."""
        self.action.triggered.connect(self.close)
        self.pushButton.clicked.connect(self.crawl)
        self.pushButton_3.clicked.connect(self.save_to_excel)
        self.pushButton_2.clicked.connect(self.clear_list)
        self.pushButton_4.clicked.connect(self.getBrowser)

    def message(self, text=''):
        """Show a modal box with an OK button.

        Args:
            text: the message to display.

        Returns:
            True when the box was dismissed with OK, otherwise False.
        """
        box = QtWidgets.QMessageBox.question(
            self, "提示", text, QtWidgets.QMessageBox.Ok)
        return box == QtWidgets.QMessageBox.Ok

    def getBrowser(self):
        """Start Firefox, open the login page and ask the user to log in."""
        self.browser = webdriver.Firefox()
        self.browser.get('https://sycm.taobao.com/custom/login.htm')
        self.browser.implicitly_wait(10)
        self.message(text='请在浏览器中登录')

    def save_to_excel(self):
        """Write the word frequencies of all collected titles to
        ``result/<timestamp>.xlsx``, most frequent word first."""
        timenow = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        words = wordfrequency(''.join(self.titles))
        result = sorted(words.items(), key=lambda x: x[1], reverse=True)
        excel = openpyxl.Workbook(write_only=True)
        sheet = excel.create_sheet()
        for item in result:
            sheet.append(item)
        # Only "directory already exists" is tolerated; other failures
        # (e.g. permissions) now surface instead of being swallowed.
        os.makedirs('result', exist_ok=True)
        excel.save('result/%s.xlsx' % timenow)

    def list_show(self):
        """Refresh the list widget from ``self.titles``."""
        self.listWidget.clear()
        for title in self.titles:
            self.listWidget.addItem(title)

    def clear_list(self):
        """Forget all collected titles and empty the list widget."""
        self.titles.clear()
        self.list_show()

    def crawl(self):
        """Collect titles from the page currently open in the browser.

        For each of the two ``rankTabIndex`` values the un-paged URL and
        then pages 2 through 7 are loaded; at most the first 100 titles
        per tab are kept. Any parse failure aborts the whole run after
        showing an error box, and nothing is added to ``self.titles``.
        """
        if self.browser is None:
            self.message(text='请先启动浏览器')
            return
        current_url = self.browser.current_url
        if 'rankTabIndex' in current_url:
            # Drop an existing tab selector so it can be appended afresh.
            current_url = (current_url
                           .replace('&rankTabIndex=0', '')
                           .replace('&rankTabIndex=1', ''))
        result = []
        for num in range(2):
            tab_url = current_url + '&rankTabIndex=' + str(num)
            # (url, seconds to wait after loading): the first page waits
            # 2 s, every following page 3 s, as in the original listing.
            visits = [(tab_url, 2)]
            visits += [(tab_url + '&page=' + str(page), 3)
                       for page in range(2, 8)]
            items = []
            for url, delay in visits:
                self.browser.get(url)
                # NOTE: sleeping here blocks the GUI thread.
                time.sleep(delay)
                try:
                    items += self.parser(self.browser.page_source)
                except Exception:
                    # UI boundary: report and abort, but do not swallow
                    # SystemExit/KeyboardInterrupt like a bare except.
                    self.message(text='解析错误')
                    return
            result += items[:100]
        self.titles += result
        self.list_show()

    def parser(self, html):
        """Extract titles from the ranking table in *html*.

        The title of a row is the ``alt`` text of its first ``<img>``;
        rows without an image (or without ``alt``) are skipped.

        Args:
            html: page source to parse.

        Returns:
            list of title strings in table order.

        Raises:
            AttributeError: when no matching table is found (``find``
                returned ``None``); ``crawl`` reports this to the user.
        """
        soup = BeautifulSoup(html, 'lxml')
        rows = soup.find(
            'table', {'class': ['table-ng', 'table-ng-basic']}).find_all('tr')
        result = []
        for tr in rows:
            img = tr.find('img')
            if img is None:
                continue
            alt = img.get('alt')
            if alt is not None:
                result.append(alt)
        return result

if __name__ == '__main__':
    import sys

    # Start the Qt application, show the main window and exit with the
    # event loop's return code.
    app = QtWidgets.QApplication(sys.argv)
    management = Sycm()
    management.show()
    sys.exit(app.exec_())


# Standard library.
import json
import os
import time

# Third-party.
import chardet
import requests

# Request headers sent with every suggestion query.
#
# The listing also contained an ':authority' entry. That is an HTTP/2
# pseudo-header, not a regular header field: a name starting with ':' is
# rejected by http.client (and by recent requests versions) as an invalid
# header name, so every request failed before it was sent. The host is
# derived from the URL anyway, so the entry is simply dropped.
headers = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
    'Accept': "*/*",
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}

def suggest(keyword):
    """Fetch search suggestions for *keyword* from suggest.taobao.com.

    Args:
        keyword: the search term to expand.

    Returns:
        list with the first element of every entry in the response's
        ``result`` array.

    Raises:
        requests.RequestException: on network failure or timeout.
        ValueError / KeyError: when the response is not the expected JSON.
    """
    response = requests.get(
        'https://suggest.taobao.com/sug',
        # Passing the query as params lets requests URL-encode the keyword
        # instead of pasting it raw into the URL.
        params={
            'q': keyword,
            'code': 'utf-8',
            'area': 'c2c',
            'nick': '',
            'sid': 'null',
        },
        headers=headers,
        # Without a timeout a stalled connection blocks the run forever.
        timeout=10,
    )
    data = json.loads(response.text)['result']
    # NOTE(review): the listing called .replace('', '') twice on each
    # suggestion, which is a no-op. Presumably the replaced markers were
    # lost when the listing was extracted — confirm against the upstream
    # repository before relying on the raw strings.
    return [item[0] for item in data]

def get_chardet(filename):
    """Return the encoding chardet detects for the file *filename*.

    Args:
        filename: path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of ``chardet.detect``'s result for the
        file's bytes.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(filename, 'rb') as fh:
        data = fh.read()
    return chardet.detect(data)['encoding']

def loadkeywords():
    """Load the seed keywords from every ``.txt`` file in ``keywords/``.

    Returns:
        dict mapping each file name to the list of its non-empty lines
        (line endings removed).
    """
    keywords = {}
    for filename in os.listdir('keywords'):
        # Suffix test: a substring test would also pick up names such as
        # 'seed.txt.bak'.
        if not filename.endswith('.txt'):
            continue
        path = os.path.join('keywords', filename)
        encoding = get_chardet(path)
        if encoding == 'GB2312':
            # Decode files detected as GB2312 with the GBK codec instead.
            encoding = 'GBK'
        with open(path, 'r', encoding=encoding) as fh:
            lines = [line.replace('\r', '').replace('\n', '') for line in fh]
        # A blank line would otherwise become an empty search keyword.
        keywords[filename] = [word for word in lines if word]
    return keywords

def save_to_txt(filename, deep, words):
    """Write the distinct entries of *words* to ``result/<name>_<deep>.txt``.

    The output name is *filename* with ``.txt`` replaced by
    ``_<deep>.txt``. Each word is written once, in first-seen order,
    followed by ``'\\r\\n'``.

    Args:
        filename: name of the keyword file the words were derived from.
        deep: crawl depth, embedded in the output file name.
        words: iterable of strings; duplicates are dropped.
    """
    # Nothing else in this script creates the output directory.
    os.makedirs('result', exist_ok=True)
    target = 'result/' + filename.replace('.txt', '_%s.txt' % deep)
    with open(target, 'w', encoding='utf-8') as f:
        # dict.fromkeys de-duplicates in O(n) while keeping first-seen
        # order (the original scanned a list for every word).
        for word in dict.fromkeys(words):
            f.write(word + '\r\n')

def main():
    """Expand every seed keyword through the suggestion service.

    Asks for a depth on stdin, then for each keyword file and each seed
    word queries suggestions ``deep`` times, feeding each round's
    suggestions into the next. All suggestions gathered for a file are
    handed to ``save_to_txt``.

    NOTE(review): the nesting below was reconstructed from a listing that
    had lost its indentation — confirm against the upstream repository.
    """
    keywords = loadkeywords()
    while True:
        try:
            deep = int(input("输入采集深度:"))
            break
        except ValueError:
            # Not a number: ask again. (A bare except here also trapped
            # Ctrl-C, making the prompt impossible to leave.)
            continue
    for filename in keywords:
        result = []
        for word in keywords[filename]:
            words = [word]
            count = 0
            for num in range(deep):
                suggest_words = []
                for need_word in words:
                    try:
                        suggest_words += suggest(need_word)
                    except Exception as exc:
                        # Best effort: skip this word, but say why instead
                        # of silently hiding every failure.
                        print(need_word, 'failed', exc)
                        continue
                # De-duplicate while keeping first-seen order (set() gave
                # a different order on every run).
                suggest_words = list(dict.fromkeys(suggest_words))
                words = suggest_words
                count += len(suggest_words)
                result += suggest_words
                print(word, 'deep', num + 1)
            print(word, 'get', count, 'ok')
        save_to_txt(filename, deep, result)

# Run only when executed as a script, so importing the module does not
# start prompting and crawling.
if __name__ == '__main__':
    main()


项目地址:

https://github.com/Nyloner/Nyspider.git


分享到:


相關文章: