www.umetrip.com --- 航旅纵横航班信息
代码展示:
from selenium import webdriver
2 import openpyxl
3 import jieba
4 from bs4 import BeautifulSoup
5 import time
6 import jieba
7 import os
8 import re
9 import chardet
10 from PyQt5 import QtCore, QtGui, QtWidgets
11
# Point jieba at the dictionary file dict/dict.txt (a relative path) and
# load it right away at import time instead of on the first cut() call.
jieba.set_dictionary("dict/dict.txt")
jieba.initialize()
14
class Ui_MainWindow(object):
    """Widget layout for the main window.

    ``setupUi`` builds the widgets and stores each one as an attribute on the
    instance; ``retranslateUi`` sets their visible captions.
    """

    def setupUi(self, MainWindow):
        """Create the child widgets, menu, tool bar and status bar of *MainWindow*."""

        def make_button(object_name, x, y, width, height):
            # All push buttons share the same three construction steps.
            button = QtWidgets.QPushButton(self.centralWidget)
            button.setGeometry(QtCore.QRect(x, y, width, height))
            button.setObjectName(object_name)
            return button

        MainWindow.setObjectName("MainWindow")
        MainWindow.resize(725, 590)

        central = QtWidgets.QWidget(MainWindow)
        central.setObjectName("centralWidget")
        self.centralWidget = central

        # Child creation order is kept: list, first button, label, then the
        # remaining three buttons.
        title_list = QtWidgets.QListWidget(central)
        title_list.setGeometry(QtCore.QRect(10, 40, 411, 471))
        title_list.setObjectName("listWidget")
        self.listWidget = title_list

        self.pushButton = make_button("pushButton", 479, 50, 171, 61)

        caption = QtWidgets.QLabel(central)
        caption.setGeometry(QtCore.QRect(10, 10, 51, 31))
        caption.setObjectName("label")
        self.label = caption

        self.pushButton_2 = make_button("pushButton_2", 430, 480, 80, 25)
        self.pushButton_3 = make_button("pushButton_3", 480, 140, 171, 61)
        self.pushButton_4 = make_button("pushButton_4", 480, 220, 171, 61)
        MainWindow.setCentralWidget(central)

        menu_bar = QtWidgets.QMenuBar(MainWindow)
        menu_bar.setGeometry(QtCore.QRect(0, 0, 725, 22))
        menu_bar.setObjectName("menuBar")
        self.menuBar = menu_bar
        menu = QtWidgets.QMenu(menu_bar)
        menu.setObjectName("menu")
        self.menu = menu
        MainWindow.setMenuBar(menu_bar)

        tool_bar = QtWidgets.QToolBar(MainWindow)
        tool_bar.setObjectName("mainToolBar")
        self.mainToolBar = tool_bar
        MainWindow.addToolBar(QtCore.Qt.TopToolBarArea, tool_bar)

        status_bar = QtWidgets.QStatusBar(MainWindow)
        status_bar.setObjectName("statusBar")
        self.statusBar = status_bar
        MainWindow.setStatusBar(status_bar)

        menu_action = QtWidgets.QAction(MainWindow)
        menu_action.setObjectName("action")
        self.action = menu_action
        menu.addAction(menu_action)
        menu_bar.addAction(menu.menuAction())

        self.retranslateUi(MainWindow)
        QtCore.QMetaObject.connectSlotsByName(MainWindow)

    def retranslateUi(self, MainWindow):
        """Set the window title and the captions of the buttons, label and menu."""
        translate = QtCore.QCoreApplication.translate

        def tr(source_text):
            # Every caption is looked up in the "MainWindow" context.
            return translate("MainWindow", source_text)

        MainWindow.setWindowTitle(tr("MainWindow"))
        self.pushButton.setText(tr("采集"))          # "collect"
        self.label.setText(tr("标题:"))              # "title:"
        self.pushButton_2.setText(tr("清空"))        # "clear"
        self.pushButton_3.setText(tr("导出词频"))    # "export word frequency"
        self.pushButton_4.setText(tr("启动浏览器"))  # "start browser"
        self.menu.setTitle(tr("菜单"))               # "menu"
        self.action.setText(tr("退出"))              # "exit"
70
def cutword(data):
    """Segment *data* with jieba (``cut_all=False``) and return jieba's result.

    The value comes straight from ``jieba.cut``; callers iterate over it to
    get the individual words.
    """
    return jieba.cut(data, cut_all=False)
74
def wordfrequency(text):
    """Count how often each segmented word occurs in *text*.

    Runs of Latin letters, and runs of whitespace, digits and the listed
    ASCII/CJK punctuation, are replaced by a single space before the text is
    segmented with ``cutword``. Empty and single-space tokens are not counted.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping each word to its number of occurrences, keyed in order
        of first appearance.
    """
    # Raw string: the pattern is full of regex escapes (\s, \d, \., ...) that
    # are not valid *string* escapes and make a plain literal emit warnings.
    sub_re = r'[a-zA-Z]+|[\s+\.\!\/_,$%^*\(\d+"\']+|[+—;—!:\(\):《》,。?、~@#¥%……&*()%~\[\]\|\?\·【】“”;-]+'
    text = re.sub(sub_re, ' ', text)
    result = {}
    for word in cutword(text):
        if word == '' or word == ' ':
            continue
        # dict.get replaces the former try/bare-except counting idiom.
        result[word] = result.get(word, 0) + 1
    return result
88
class Sycm(QtWidgets.QMainWindow, Ui_MainWindow):
    """Main window that collects titles from rank pages in a selenium browser.

    Collected titles are kept in ``self.titles``, shown in the list widget and
    can be exported as a word-frequency spreadsheet under ``result/``.
    """

    # Crawl limits: rank tabs 0..1, pages 1..7 per tab, and at most 100
    # titles kept per tab.
    _TAB_COUNT = 2
    _LAST_PAGE = 7
    _TITLES_PER_TAB = 100

    def __init__(self):
        super(Sycm, self).__init__()
        self.setupUi(self)
        self.titles = []
        # No browser exists until getBrowser() is triggered by the user.
        self.browser = None
        self.basic_init()

    def basic_init(self):
        """Connect the menu action and the four buttons to their handlers."""
        self.action.triggered.connect(self.close)
        self.pushButton.clicked.connect(self.crawl)
        self.pushButton_3.clicked.connect(self.save_to_excel)
        self.pushButton_2.clicked.connect(self.clear_list)
        self.pushButton_4.clicked.connect(self.getBrowser)

    def message(self, text=''):
        """Show a modal box with *text* and an Ok button.

        Returns:
            True when the box reports the Ok button, False otherwise.
        """
        box = QtWidgets.QMessageBox.question(
            self, "提示", text, QtWidgets.QMessageBox.Ok)
        return box == QtWidgets.QMessageBox.Ok

    def getBrowser(self):
        """Start Firefox, open the login page and ask the user to log in."""
        self.browser = webdriver.Firefox()
        self.browser.get('https://sycm.taobao.com/custom/login.htm')
        self.browser.implicitly_wait(10)
        self.message(text='请在浏览器中登录')

    def save_to_excel(self):
        """Write the word frequencies of all titles to ``result/<timestamp>.xlsx``.

        Rows are (word, count) pairs sorted by descending count.
        """
        timenow = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        # Join with a space so the segmenter cannot build a word out of the
        # end of one title and the start of the next; wordfrequency() drops
        # the space tokens again.
        text = ' '.join(self.titles)
        words = wordfrequency(text)
        result = sorted(words.items(), key=lambda x: x[1], reverse=True)
        excel = openpyxl.Workbook(write_only=True)
        sheet = excel.create_sheet()
        for item in result:
            sheet.append(item)
        # Tolerates an existing directory but, unlike the former bare
        # except, still reports real failures such as missing permissions.
        os.makedirs('result', exist_ok=True)
        excel.save('result/%s.xlsx' % timenow)

    def list_show(self):
        """Refresh the list widget so it shows exactly ``self.titles``."""
        self.listWidget.clear()
        for title in self.titles:
            self.listWidget.addItem(title)

    def clear_list(self):
        """Forget all collected titles and empty the list widget."""
        self.titles.clear()
        self.list_show()

    def crawl(self):
        """Collect titles from both rank tabs of the page open in the browser.

        For each tab, pages 1..7 are loaded and parsed, and the first 100
        titles of the tab are kept. If any page fails to parse, a message is
        shown and nothing from this run is added to ``self.titles``.
        """
        if self.browser is None:
            # Pressing "collect" before "start browser" used to raise
            # AttributeError; tell the user what to do instead.
            self.message(text='请先启动浏览器')
            return
        current_url = self.browser.current_url
        if 'rankTabIndex' in current_url:
            current_url = current_url.replace(
                '&rankTabIndex=0', '').replace('&rankTabIndex=1', '')
        result = []
        for num in range(self._TAB_COUNT):
            items = []
            for page in range(1, self._LAST_PAGE + 1):
                url = current_url + '&rankTabIndex=' + str(num)
                if page > 1:
                    # The first page is requested without a page parameter.
                    url += '&page=' + str(page)
                self.browser.get(url)
                # Fixed pause after each load: 2 s for the first page, 3 s
                # for the following ones, as before.
                time.sleep(2 if page == 1 else 3)
                try:
                    items += self.parser(self.browser.page_source)
                except Exception:
                    self.message(text='解析错误')
                    return
            result += items[:self._TITLES_PER_TAB]
        self.titles += result
        self.list_show()

    def parser(self, html):
        """Return the ``alt`` text of the first image in each table row.

        Looks up a ``<table>`` whose class matches ``table-ng`` or
        ``table-ng-basic`` and walks its ``<tr>`` rows; rows without an
        ``<img>``, or whose image has no ``alt`` attribute, are skipped.

        Raises:
            AttributeError: If no such table is present in *html*.
        """
        table = BeautifulSoup(html, 'lxml').find(
            'table', {'class': ['table-ng', 'table-ng-basic']})
        result = []
        for tr in table.find_all('tr'):
            img = tr.find('img')
            if img is None:
                continue
            alt = img.get('alt')
            if alt is None:
                # A None title would break the list widget and the export.
                continue
            result.append(alt)
        return result
182
183
# Script entry point: create the Qt application, show the collector window
# and hand control to the Qt event loop until the window is closed.
if __name__ == '__main__':
    import sys

    application = QtWidgets.QApplication(sys.argv)
    window = Sycm()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)
import requests
2 import json
3 import time
4 import os
5 import chardet
6
# Request headers sent with every suggest query.
# The former ':authority' entry was dropped: it is an HTTP/2 pseudo-header,
# not a valid HTTP/1.1 field name, and recent requests releases refuse header
# names that start with ':' (the Host header is derived from the URL anyway).
headers = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
    'Accept': "*/*",
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
14
15
def suggest(keyword, timeout=10):
    """Return the suggestions of the Taobao suggest endpoint for *keyword*.

    Args:
        keyword: The search term to expand.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with the first element of every entry in the response's
        ``result`` array.

    Raises:
        requests.RequestException: On network errors or timeout.
        ValueError, KeyError: If the response is not the expected JSON.
    """
    # Passing the query as params lets requests escape the keyword, so
    # characters such as '&' or '#' can no longer corrupt the query string.
    params = {
        'q': keyword,
        'code': 'utf-8',
        'area': 'c2c',
        'nick': '',
        'sid': 'null',
    }
    response = requests.get(
        'https://suggest.taobao.com/sug',
        params=params,
        headers=headers,
        timeout=timeout,
    )
    data = json.loads(response.text)['result']
    # NOTE(review): the original chained two .replace('', '') calls here,
    # which are no-ops; their arguments were presumably markup tags lost when
    # the code was pasted. Confirm whether suggestions need tag stripping.
    return [item[0] for item in data]
23
def get_chardet(filename):
    """Return the encoding name chardet detects for the file *filename*.

    The whole file is read as bytes and passed to ``chardet.detect``; the
    ``'encoding'`` entry of its result is returned unchanged.
    """
    # The with-block closes the handle, which the original never did.
    with open(filename, 'rb') as f:
        data = f.read()
    coding = chardet.detect(data)
    return coding['encoding']
28
def loadkeywords():
    """Load the seed keywords from every ``.txt`` file in ``keywords/``.

    Each file is decoded with the encoding reported by ``get_chardet``
    (GB2312 is opened as GBK) and contributes one keyword per non-empty line.

    Returns:
        A dict mapping each file name to the list of its keywords.
    """
    keywords = {}
    for filename in os.listdir('keywords'):
        # Suffix test: the former substring test also accepted names that
        # merely contain '.txt', e.g. 'a.txt.bak'.
        if not filename.endswith('.txt'):
            continue
        path = 'keywords/' + filename
        encoding = get_chardet(path)
        if encoding == 'GB2312':
            encoding = 'GBK'
        words = []
        # The with-block closes the file; it used to stay open.
        with open(path, 'r', encoding=encoding) as f:
            for line in f:
                word = line.replace('\r', '').replace('\n', '')
                if word:
                    # Blank lines carry no keyword to expand.
                    words.append(word)
        keywords[filename] = words
    return keywords
42
def save_to_txt(filename, deep, words):
    """Write the distinct entries of *words* to a text file under ``result/``.

    The output name is *filename* with every ``.txt`` replaced by
    ``_<deep>.txt``. Words are written UTF-8 encoded, one per line with
    ``\\r\\n`` line endings, in order of first appearance.

    Args:
        filename: Name of the keyword file the words were derived from.
        deep: Crawl depth, embedded in the output file name.
        words: Iterable of strings; duplicates are written only once.
    """
    # This script never created the directory, so the first run failed.
    os.makedirs('result', exist_ok=True)
    target = 'result/' + filename.replace('.txt', '_%s.txt' % deep)
    # dict.fromkeys removes duplicates in O(n) while keeping first-seen order.
    unique_words = dict.fromkeys(words)
    # newline='' writes the explicit '\r\n' untouched; default text mode on
    # Windows would expand it to '\r\r\n'.
    with open(target, 'w', encoding='utf-8', newline='') as f:
        f.writelines(word + '\r\n' for word in unique_words)
52
def main():
    """Expand every seed keyword through the suggest endpoint and save the results.

    Prompts for a crawl depth until an integer is entered. For each keyword
    file, every seed word is expanded level by level: the suggestions of one
    level become the queries of the next. All suggestions gathered for a file
    are written with ``save_to_txt``.
    """
    keywords = loadkeywords()
    while True:
        try:
            deep = int(input("输入采集深度:"))
        except ValueError:
            # Only non-numeric input repeats the prompt; Ctrl-C and EOF now
            # end the program instead of being swallowed by a bare except.
            continue
        break
    for filename, seeds in keywords.items():
        result = []
        for word in seeds:
            words = [word]
            count = 0
            for num in range(deep):
                suggest_words = []
                for need_word in words:
                    try:
                        suggest_words += suggest(need_word)
                    except Exception as exc:
                        # Best effort: report the failed query and carry on.
                        print(need_word, 'failed', exc)
                        continue
                # Order-preserving de-duplication keeps runs reproducible
                # (set() ordering varies between interpreter runs).
                suggest_words = list(dict.fromkeys(suggest_words))
                words = suggest_words
                count += len(suggest_words)
                result += suggest_words
                print(word, 'deep', num + 1)
            print(word, 'get', count, 'ok')
        # NOTE(review): nesting reconstructed from a paste that lost its
        # indentation; saving once per keyword file is assumed.
        save_to_txt(filename, deep, result)
81
# Run the crawl only when executed as a script, so that importing this
# module does not immediately block on the depth prompt.
if __name__ == '__main__':
    main()
项目地址:
https://github.com/Nyloner/Nyspider.git
閱讀更多 Python樂園 的文章