Crawler Project 04 - MovieBee (dybee.tv)

# Core code
# -*- coding: utf-8 -*-
import scrapy
from scrapy.cmdline import execute
from scrapy.http import Request
from urllib.parse import urljoin


class MifengSpider(scrapy.Spider):
    name = 'mifeng'
    allowed_domains = ['dybee.tv']
    start_urls = ['https://www.dybee.tv/movie//']

    def parse(self, response):
        # Collect the links to the movie detail pages on this listing page
        page_urls = response.xpath('//ul[@id="index_ajax_list"]//a[@class="user_list_kz"]')
        for page_url in page_urls:
            url = urljoin(response.url, page_url.xpath('./@href').extract_first())
            yield Request(url=url, callback=self.get_content)

        # Follow the "next page" link until there is none left
        next_url = response.xpath('//a[@class="next page-numbers"]/@href').extract_first()
        if next_url is not None:
            yield Request(url=next_url, callback=self.parse)
        else:
            print("All pages fetched")

    def get_content(self, response):
        print(response.status)
        title = response.xpath('//h1//text()').extract_first()  # movie title
        # Collect the download links for this movie
        link = ''
        movie_urls = response.xpath('//div[@id="normalDown"]//table//tr//a')
        for movie_url in movie_urls:
            link_title = movie_url.xpath('./text()').extract_first()
            link_addr = movie_url.xpath('./@href').extract_first()
            link = link + link_title + "===>" + link_addr + "\n"
        # Append the result to a plain-text file
        with open("movie01.txt", "a", encoding="utf-8") as f:
            f.write(title + "\n" + link)


if __name__ == '__main__':
    execute(["scrapy", "crawl", "mifeng"])
# For learning and reference only
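
The spider above writes its results straight to movie01.txt from inside get_content, which is fine for a quick demo. In a regular Scrapy project the same data usually flows through an item pipeline, so the spider only yields structured items. The sketch below shows that pattern under the assumption of a standard project layout; MovieItem, MovieFilePipeline and the module path in ITEM_PIPELINES are made-up names, not part of the original project.

# items.py (hypothetical) - fields matching what the spider extracts
import scrapy

class MovieItem(scrapy.Item):
    title = scrapy.Field()   # movie title taken from <h1>
    links = scrapy.Field()   # "name===>url" lines joined with "\n"

# pipelines.py (hypothetical) - appends every item to movie01.txt
class MovieFilePipeline:
    def open_spider(self, spider):
        self.f = open("movie01.txt", "a", encoding="utf-8")

    def process_item(self, item, spider):
        self.f.write(item["title"] + "\n" + item["links"])
        return item

    def close_spider(self, spider):
        self.f.close()

# settings.py (hypothetical) - register the pipeline
ITEM_PIPELINES = {
    "mifeng_project.pipelines.MovieFilePipeline": 300,
}

With this in place, get_content would simply yield MovieItem(title=title, links=link) instead of opening the file itself.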
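
The original post does not show the project's settings.py. If you try the spider yourself, a small download delay and an explicit User-Agent keep the crawl polite and reduce the chance of being blocked; the values below are only suggested defaults, not settings taken from the original project.

# settings.py (hypothetical) - conservative crawl settings
ROBOTSTXT_OBEY = True                 # respect the site's robots.txt
DOWNLOAD_DELAY = 1.0                  # wait about one second between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 4    # limit parallel requests to dybee.tv
USER_AGENT = "Mozilla/5.0 (compatible; mifeng-demo)"   # identify the crawler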