Python初學小爬蟲之Scrapy爬取美女壁紙圖片(附全部源碼)

項目地址:

https://github.com/JunWangCode/picScrapy.git


此爬蟲可以爬取http://www.jj20.com所有分類圖片,僅作實驗和參考,請勿用於其他用途,在Ubuntu系統Python3環境下完美運行

  1. 一些設置

# 爬取深度
DEPTH_LIMIT = 5
# 圖片存放位置
IMAGES_STORE = '/home/jwang/Videos/Pic'
# 圖片最小寬度
IMAGES_MIN_WIDTH = 500
# 圖片最小高度
IMAGES_MIN_HEIGHT = 500

還有一些選項需要注意:

# 下載延遲,別把別人站點拖垮了,慢點
DOWNLOAD_DELAY = 0.2
# 爬蟲併發數,默認是 16
CONCURRENT_REQUESTS = 20
  2. 啟動爬蟲

python3 -m scrapy crawl pic


核心代碼:

# -*- coding: utf-8 -*-

2

3 # Define here the models for your scraped items

4 #

5 # See documentation in:

6 # http://doc.scrapy.org/en/latest/topics/items.html

7

8 import scrapy

9

10

class PicscrapyItem(scrapy.Item):
    """Container for one scraped wallpaper gallery.

    Fields:
        image_urls: URLs of the images to download (consumed by the
            images pipeline via ``images_urls_field``).
        images: download results filled in by the images pipeline.
        title: gallery title; used by the pipeline as a folder name.
        category_name: image category; used as the top-level folder name.
    """

    image_urls = scrapy.Field()
    images = scrapy.Field()
    title = scrapy.Field()
    category_name = scrapy.Field()

16

17

# Goods (product) data
class AfscrapyItem(scrapy.Item):
    """Container for one scraped product listing.

    Fields mirror the columns inserted by ``WebcrawlerScrapyPipeline``:
    goods id, shop name, category, title, sales count, unit, price and
    seller location.
    """

    goods_id = scrapy.Field()
    shop_name = scrapy.Field()
    category_name = scrapy.Field()
    title = scrapy.Field()
    sales_num = scrapy.Field()
    unit = scrapy.Field()
    price = scrapy.Field()
    location = scrapy.Field()



# -*- coding: utf-8 -*-

2

3 # Define your item pipelines here

4 #

5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting

6 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

7 # -*- coding: utf-8 -*-

8 from urllib.parse import urlparse

9 import pymysql

10 import time

11 from scrapy.pipelines.images import ImagesPipeline

12 from scrapy import Request

13

14

def db_handler(host='192.168.0.111', user='root', passwd='',
               db='scrapy_data', charset='utf8'):
    """Open a MySQL connection with autocommit enabled.

    The connection parameters were previously hard-coded; they are now
    keyword arguments whose defaults reproduce the original values, so
    existing callers (``db_handler()``) behave exactly as before while
    new callers can point at a different server or database.

    NOTE(review): credentials (root with an empty password) are baked
    into the defaults — consider moving them to Scrapy settings or
    environment variables before deploying.

    Args:
        host: MySQL server address.
        user: MySQL user name.
        passwd: MySQL password.
        db: database (schema) name.
        charset: connection character set.

    Returns:
        An open ``pymysql`` connection with ``autocommit(True)`` set,
        so every successful ``execute`` is committed immediately.
    """
    conn = pymysql.connect(
        host=host,
        user=user,
        passwd=passwd,
        charset=charset,
        db=db,
        use_unicode=True,
    )
    conn.autocommit(True)
    return conn

26

27

class PicscrapyPipeline(ImagesPipeline):
    """Images pipeline that saves files under ``<category>/<title>/``."""

    def get_media_requests(self, item, info):
        # Carry title and category on request.meta so file_path() can
        # build the storage path for each downloaded image.
        requests = []
        for image_url in item.get(self.images_urls_field, []):
            requests.append(
                Request(image_url,
                        meta={'title': item['title'],
                              'cat': item['category_name']})
            )
        return requests

    # Overridden to change how downloaded image file names are generated.
    def file_path(self, request, response=None, info=None):
        if isinstance(request, Request):
            raw_url = request.url
        else:
            raw_url = request
        parsed = urlparse(raw_url)
        # NOTE(review): assumes the URL path has at least six segments
        # (index 5 exists) — TODO confirm against the site's URL layout.
        img_name = parsed.path.split('/')[5].split('.')[0]
        return request.meta['cat'] + '/' + request.meta['title'] + '/%s.jpg' % img_name

43

44

class WebcrawlerScrapyPipeline(object):
    """Pipeline that inserts scraped goods items into a MySQL table.

    The target table name is taken from ``spider.name``; the table is
    expected to have the columns listed in the INSERT statement below.
    """

    def __init__(self):
        self.db_object = db_handler()
        # Bug fix: the cursor must come from the SAME connection that we
        # roll back on error. The original called db_handler() a second
        # time here, so the cursor lived on a separate connection and
        # self.db_object.rollback() had no effect on it.
        self.cursor = self.db_object.cursor()

    def process_item(self, item, spider):
        # Items in the "全部" (all) category are skipped entirely.
        # NOTE(review): returning None drops the item silently for any
        # later pipelines; raising scrapy.exceptions.DropItem would be
        # the idiomatic way to do this — confirm before changing.
        if item['category_name'] == "全部":
            return
        try:
            # NOTE(review): the table name is string-concatenated from
            # spider.name; spider names are developer-controlled, but the
            # row values are properly parameterized below.
            sql = "insert into " + spider.name + "(goods_id, shop_name, " \
                  "category_name, title, sales_num, unit, price, location, created_at)" \
                  "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            params = (
                item['goods_id'], item['shop_name'], item['category_name'], item['title'],
                item['sales_num'], item['unit'], item['price'], item['location'],
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            )
            self.cursor.execute(sql, params)

        except pymysql.MySQLError as e:
            # Bug fix: pymysql raises MySQLError subclasses, never
            # RuntimeError, so the original handler (and its rollback)
            # was dead code and DB errors propagated unhandled.
            self.db_object.rollback()
            print(e)

        return item



分享到:


相關文章: