python爬蟲
爬蟲的概念
爬蟲是模擬瀏覽器發送請求、獲取響應的程序。
爬蟲的流程
發送請求 → 獲取響應 → 提取數據 → 保存
請求頭
通過請求頭模擬瀏覽器向服務器發送請求
Host:主機和端口號
Connection:鏈接類型
Upgrade-Insecure-Requests:升級為HTTPS請求
Accept:傳輸文件類型
Referer:頁面跳轉處
Accept-Encoding:文件編解碼格式
Cookie:cookie
x-requested-with:Ajax異步請求
響應狀態碼
200:成功
302:臨時轉移至新的url
307:臨時轉移至新的url
404:not found
500:服務器內部錯誤
爬蟲的分類
通用爬蟲:通常指搜索引擎的爬蟲
聚焦爬蟲:針對特定網站的爬蟲
爬蟲的工作流程
搜索引擎流程
抓取網頁 → 數據存儲 → 預處理 → 提供檢索服務、網站排名
聚焦爬蟲流程
url list → 響應內容 → 提取數據 → 入庫
響應內容 → 提取url → 加入url list
發送簡單的請求
安裝requests庫
pip3 install requests
發起一個簡單的request請求
import requests
# Fetch the Baidu home page with a plain GET request (no custom headers).
response = requests.get(url="http://www.baidu.com")
# The encoding is assigned before .text is read.
response.encoding = "utf-8"
status = response.status_code
text = response.text  # response body
print(text)
print(status)
發起一個帶headers的請求
import requests
# Send a GET request that carries a browser-like User-Agent header.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}
response = requests.get(url="http://www.baidu.com", headers=header)
response.encoding = "utf-8"
status = response.status_code
text = response.text  # response body
print(text)
print(status)
# Show the headers that were actually sent with the request.
print(response.request.headers)
發起帶參數的請求
import requests
# Run a Baidu search by passing the query string as a params mapping.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}
par = {"wd": "張三"}
response = requests.get(url="http://www.baidu.com/s", params=par, headers=header)
response.encoding = "utf-8"
status = response.status_code
text = response.text  # response body
print(text)
print(status)
# Show the headers and the final URL of the request that was sent.
print(response.request.headers)
print(response.request.url)
發送post請求
import requests
def get_content_length(data):
    """Return the size in bytes of ``data`` encoded as a URL-encoded form body.

    The previous implementation counted the raw characters of the keys and
    values plus the ``=`` / ``&`` separators. That under-counts whenever a
    value needs percent-encoding (non-ASCII text, ``/``, ``+``, ``=``) and
    returns -1 for an empty mapping.

    Args:
        data: Mapping of form field names to values.

    Returns:
        Length of the ``application/x-www-form-urlencoded`` body; 0 when
        ``data`` is empty.
    """
    # Local import keeps this snippet self-contained.
    from urllib.parse import urlencode

    # urlencode output is pure ASCII, so characters and bytes coincide.
    return len(urlencode(data).encode("ascii"))
# Log in to the local test site by POSTing the login form fields.
url = "http://localhost/"
par = {
    "txt_userName": "admin",
    "txt_userPwd": "123",
    "but_sigin": "登錄",
    "__EVENTTARGET": "",
    # Fixed: was "_EVENTARGUMENT" (single underscore), unlike every other
    # hidden field in this form.
    "__EVENTARGUMENT": "",
    "__VIEWSTATE": "jvSnUXEL/VE5n7y6wjx8T+if6kxOtL5RZwOROEHiWmIseLfbsua+mkFpteAxZMTrtRVgaO7cQYj90Ziw1hvSv7KeCChJga9R4DYPeP77Ypw=",
    "__VIEWSTATEGENERATOR": "9005994241",
    "__EVENTVALIDATION": "s/6wUJU3A8q2IrZV4ockZ4bKJm1jj4l1IEJm/C1OQSyauSTIxHbtAXVI9DP8ARz9X0iFrjeted/manpeRySaa7fU+T1ssbkfEfNB0MkmEE347WV9/jow73gaNCnKWVg2REhDPfYJ/LR+oLQrqBqDawKEly5WTksOlKgVmxF7+Gc=",
}
print(get_content_length(par))
header = {
    "Referer": "http://localhost/",
    "Origin": "http://localhost",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "keep-alive",
    # NOTE(review): requests appears to set Content-Length itself when it
    # prepares the body -- confirm whether this manual header is needed.
    "Content-Length": str(get_content_length(par)),
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}
response = requests.post(url, data=par, headers=header)
text = response.text
status = response.status_code
print(text)
print(status)
# Compare the headers that were sent with the headers the server returned.
print(response.request.headers)
print(response.headers)
代理
import requests
# Fetch the Baidu home page through an HTTP proxy.
header = {
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}
# Proxy settings. requests selects the proxy by the scheme of the target URL,
# so an "http"-only mapping is never used for the https:// request below; the
# "https" entry makes the proxy actually apply. Proxy URLs carry an explicit
# scheme, as the requests documentation asks for.
proxies = {
    "http": "http://118.163.120.181:52458",
    "https": "http://118.163.120.181:52458",
}
response = requests.get("https://www.baidu.com", headers=header, proxies=proxies)
response.encoding = "utf-8"
print(response.text)
保持session
import requests
def get_content_length(data):
    """Return the size in bytes of ``data`` encoded as a URL-encoded form body.

    The previous implementation counted the raw characters of the keys and
    values plus the ``=`` / ``&`` separators. That under-counts whenever a
    value needs percent-encoding (non-ASCII text, ``/``, ``+``, ``=``) and
    returns -1 for an empty mapping.

    Args:
        data: Mapping of form field names to values.

    Returns:
        Length of the ``application/x-www-form-urlencoded`` body; 0 when
        ``data`` is empty.
    """
    # Local import keeps this snippet self-contained.
    from urllib.parse import urlencode

    # urlencode output is pure ASCII, so characters and bytes coincide.
    return len(urlencode(data).encode("ascii"))
# Log in once, then reuse the same session for a follow-up page request.
url = "http://localhost/"
par = {
    "txt_userName": "admin",
    "txt_userPwd": "123",
    "but_sigin": "登錄",
    "__EVENTTARGET": "",
    # Fixed: was "_EVENTARGUMENT" (single underscore), unlike every other
    # hidden field in this form.
    "__EVENTARGUMENT": "",
    "__VIEWSTATE": "jvSnUXEL/VE5n7y6wjx8T+if6kxOtL5RZwOROEHiWmIseLfbsua+mkFpteAxZMTrtRVgaO7cQYj90Ziw1hvSv7KeCChJga9R4DYPeP77Ypw=",
    "__VIEWSTATEGENERATOR": "9005994241",
    "__EVENTVALIDATION": "s/6wUJU3A8q2IrZV4ockZ4bKJm1jj4l1IEJm/C1OQSyauSTIxHbtAXVI9DP8ARz9X0iFrjeted/manpeRySaa7fU+T1ssbkfEfNB0MkmEE347WV9/jow73gaNCnKWVg2REhDPfYJ/LR+oLQrqBqDawKEly5WTksOlKgVmxF7+Gc=",
}
print(get_content_length(par))
header = {
    "Referer": "http://localhost/",
    "Origin": "http://localhost",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "keep-alive",
    # NOTE(review): requests appears to set Content-Length itself when it
    # prepares the body -- confirm whether this manual header is needed.
    "Content-Length": str(get_content_length(par)),
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}
# The with-block closes the session once both requests are done.
with requests.Session() as session:
    # The login POST and the GET below go through the same session object.
    session.post(url, data=par, headers=header)
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
    }
    response = session.get("http://localhost/ChooseFunc.aspx", headers=header)
print(response.text)
使用cookie登錄
import requests
def get_content_length(data):
    """Return the size in bytes of ``data`` encoded as a URL-encoded form body.

    The previous implementation counted the raw characters of the keys and
    values plus the ``=`` / ``&`` separators. That under-counts whenever a
    value needs percent-encoding (non-ASCII text, ``/``, ``+``, ``=``) and
    returns -1 for an empty mapping.

    Args:
        data: Mapping of form field names to values.

    Returns:
        Length of the ``application/x-www-form-urlencoded`` body; 0 when
        ``data`` is empty.
    """
    # Local import keeps this snippet self-contained.
    from urllib.parse import urlencode

    # urlencode output is pure ASCII, so characters and bytes coincide.
    return len(urlencode(data).encode("ascii"))
# Form fields and request headers for the login POST.
url = "http://localhost/"
par = {
    "txt_userName": "admin",
    "txt_userPwd": "123",
    "but_sigin": "登錄",
    "__EVENTTARGET": "",
    # Fixed: was "_EVENTARGUMENT" (single underscore), unlike every other
    # hidden field in this form.
    "__EVENTARGUMENT": "",
    "__VIEWSTATE": "jvSnUXEL/VE5n7y6wjx8T+if6kxOtL5RZwOROEHiWmIseLfbsua+mkFpteAxZMTrtRVgaO7cQYj90Ziw1hvSv7KeCChJga9R4DYPeP77Ypw=",
    "__VIEWSTATEGENERATOR": "9005994241",
    "__EVENTVALIDATION": "s/6wUJU3A8q2IrZV4ockZ4bKJm1jj4l1IEJm/C1OQSyauSTIxHbtAXVI9DP8ARz9X0iFrjeted/manpeRySaa7fU+T1ssbkfEfNB0MkmEE347WV9/jow73gaNCnKWVg2REhDPfYJ/LR+oLQrqBqDawKEly5WTksOlKgVmxF7+Gc=",
}
print(get_content_length(par))
header = {
    "Referer": "http://localhost/",
    "Origin": "http://localhost",
    "Content-Type": "application/x-www-form-urlencoded",
    "Connection": "keep-alive",
    # NOTE(review): requests appears to set Content-Length itself when it
    # prepares the body -- confirm whether this manual header is needed.
    "Content-Length": str(get_content_length(par)),
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER",
}
# Raw Cookie header value copied from a browser session.
# NOTE(review): this looks like a real session cookie (BDUSS) committed to
# source -- consider loading it from the environment instead.
Cookie = "BAIDUID=FE80207B022A6E1E4D7BB1367A17F521:FG=1; BIDUPSID=FE80207B022A6E1E4D7BB1367A17F521; PSTM=1530773661; MCITY=-227%3A; BDUSS=U94WDlNWE9UNXdPQ2hLZUFJb3IzT1d6MVBra1J1MXBNTGxleDg5cElDLWx6dkpiQVFBQUFBJCQAAAAAAAAAAAEAAABjihIwxKu05unkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAKVBy1ulQctbW; shifen[72403812623_78176]=1540113943; BCLID=14310731544559705912; BDSFRCVID=XaPsJeC629x9PMR7z-PHuUya6eMiOacTH6aomc6Zd6LmrCfkf5B3EG0PDf8g0KAbHA5togKKWeOTHxRP; H_BDCLCKID_SF=tRKDoC02tCI3fP36q4nSb-De2fob-C62aKDshnjx-hcqEIL4eJAB0MuwjpCLQUvtQ65K_J5z2l7HHUbSj4QohtJBWx8qWPrH0KcW0D5Pth5nhMJeb67JDMPF-47CtR3y523i-b5vQpnWVxtu-n5jHjQXDGKH3J; H_PS_645EC=008cULc%2Bp%2Fdi4lUhaDQ3S%2Fa27VwOr8z2caFujxq6eaPb1yA6qq8yWL76nYy%2F0mIg4K%2FPyQ; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; delPer=0; BD_CK_SAM=1; PSINO=6; BDRCVFR[4Zjqyl1bxbt]=aeXf-1x8UdYcs; BD_HOME=1; H_PS_PSSID=1448_21079_27244; BD_UPN=16314753"
# Split the cookie string into a name -> value dict. partition() cuts at the
# FIRST "=" only, so values that themselves contain "=" (e.g. BAIDUID ends in
# ":FG=1") are kept whole; an item without "=" maps to an empty value instead
# of raising IndexError.
cookies = {
    name: value
    for name, _, value in (item.partition("=") for item in Cookie.split("; "))
}
print(cookies)
# POST the login form with the parsed cookies attached to the request.
response = requests.post(url=url, data=par, headers=header, cookies=cookies)
status = response.status_code
text = response.text
請求SSL證書(verify=False 表示跳過證書驗證)
# GET login_url with verify=False.
# NOTE(review): login_url is not defined in this snippet; verify=False
# presumably disables TLS certificate verification -- use for testing only.
requests.get(url=login_url, verify=False)