Project address:
https://github.com/JunWangCode/picScrapy.git
This crawler can scrape every image category on http://www.jj20.com. It is for experimentation and reference only; please do not use it for anything else. It runs without issues on Ubuntu under Python 3.
Some settings
# Crawl depth
DEPTH_LIMIT = 5
# Where downloaded images are stored
IMAGES_STORE = '/home/jwang/Videos/Pic'
# Minimum image width
IMAGES_MIN_WIDTH = 500
# Minimum image height
IMAGES_MIN_HEIGHT = 500
A few more options worth noting:
# Download delay -- go easy, don't drag someone else's site down
DOWNLOAD_DELAY = 0.2
# Number of concurrent requests (the default is 16)
CONCURRENT_REQUESTS = 20
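The two pipelines shown in the core code below also have to be enabled in settings.py, or Scrapy will ignore them. A minimal sketch, assuming the default project layout with a picScrapy package (the module path is a guess, not taken from the repository):

# Lower numbers run earlier in the pipeline chain
ITEM_PIPELINES = {
    'picScrapy.pipelines.PicscrapyPipeline': 1,
    'picScrapy.pipelines.WebcrawlerScrapyPipeline': 2,
}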
Start the crawler
python3 -m scrapy crawl pic
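The spider itself is not reproduced in this post; only the items and pipelines are. For orientation, a spider feeding PicscrapyItem might be shaped roughly like the sketch below. The start URL, selectors, and class name are illustrative assumptions, not the repository's actual code:

import scrapy
from picScrapy.items import PicscrapyItem  # hypothetical import path


class PicSpider(scrapy.Spider):
    name = 'pic'  # matches "scrapy crawl pic" above
    start_urls = ['http://www.jj20.com/']

    def parse(self, response):
        # Placeholder selectors -- the real page structure must be
        # inspected in the browser before writing these
        for gallery in response.css('div.gallery'):
            item = PicscrapyItem()
            item['title'] = gallery.css('h2::text').get()
            item['category_name'] = gallery.css('span.cat::text').get()
            # urljoin handles relative src attributes
            item['image_urls'] = [response.urljoin(src)
                                  for src in gallery.css('img::attr(src)').getall()]
            yield item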
Core code:
items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class PicscrapyItem(scrapy.Item):
    # image_urls is read by ImagesPipeline; images receives download results
    image_urls = scrapy.Field()
    images = scrapy.Field()
    title = scrapy.Field()
    category_name = scrapy.Field()


# Goods data (used by a separate spider, not the image crawler)
class AfscrapyItem(scrapy.Item):
    goods_id = scrapy.Field()
    shop_name = scrapy.Field()
    category_name = scrapy.Field()
    title = scrapy.Field()
    sales_num = scrapy.Field()
    unit = scrapy.Field()
    price = scrapy.Field()
    location = scrapy.Field()
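A quick way to sanity-check the item definitions is to populate one by hand; scrapy.Item supports dict-style access, and assigning to an undeclared field raises a KeyError. The import path below is a guess:

from picScrapy.items import PicscrapyItem  # hypothetical import path

item = PicscrapyItem()
item['title'] = 'example title'
item['category_name'] = 'scenery'
item['image_urls'] = ['http://www.jj20.com/example.jpg']
print(dict(item))  # items convert cleanly to plain dicts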
pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from urllib.parse import urlparse
import time

import pymysql
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


def db_handler():
    # Opens a MySQL connection with autocommit enabled
    conn = pymysql.connect(
        host='192.168.0.111',
        user='root',
        passwd='',
        charset='utf8',
        db='scrapy_data',
        use_unicode=True
    )
    conn.autocommit(True)
    return conn


class PicscrapyPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Pass title and category along to file_path() via request meta
        return [Request(x, meta={'title': item['title'], 'cat': item['category_name']})
                for x in item.get(self.images_urls_field, [])]

    # Overridden to control how downloaded image file names are generated:
    # <category>/<title>/<name>.jpg instead of the default SHA1 hash
    def file_path(self, request, response=None, info=None):
        if not isinstance(request, Request):
            url = request
        else:
            url = request.url
        url = urlparse(url)
        # Assumes the image URL path always has this fixed depth
        img_name = url.path.split('/')[5].split('.')[0]
        return request.meta['cat'] + '/' + request.meta['title'] + '/%s.jpg' % img_name


class WebcrawlerScrapyPipeline(object):
    def __init__(self):
        self.db_object = db_handler()
        # Reuse the same connection; the original code opened a second one
        # here, which rollback() on self.db_object could never affect
        self.cursor = self.db_object.cursor()

    def process_item(self, item, spider):
        # "全部" ("All") is the aggregate category; skip the DB insert but
        # still return the item so later pipelines receive it
        if item['category_name'] == "全部":
            return item
        try:
            sql = "insert into " + spider.name + "(goods_id, shop_name, " \
                  "category_name, title, sales_num, unit, price, location, created_at) " \
                  "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            params = (
                item['goods_id'], item['shop_name'], item['category_name'], item['title'],
                item['sales_num'], item['unit'], item['price'], item['location'],
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            )
            self.cursor.execute(sql, params)

        except pymysql.MySQLError as e:
            # pymysql raises MySQLError subclasses, not RuntimeError
            self.db_object.rollback()
            print(e)

        return item
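The insert in WebcrawlerScrapyPipeline assumes a table named after spider.name already exists in the scrapy_data database. The schema is not shown in the post; a one-off setup script might look like the sketch below, where the table name ('af') and the column types are guesses based on the item fields:

import pymysql

conn = pymysql.connect(host='192.168.0.111', user='root', passwd='',
                       charset='utf8', db='scrapy_data')
with conn.cursor() as cur:
    # Hypothetical schema -- adjust the table name to match spider.name
    cur.execute("""
        CREATE TABLE IF NOT EXISTS af (
            id INT AUTO_INCREMENT PRIMARY KEY,
            goods_id VARCHAR(64),
            shop_name VARCHAR(255),
            category_name VARCHAR(128),
            title VARCHAR(255),
            sales_num INT,
            unit VARCHAR(32),
            price DECIMAL(10, 2),
            location VARCHAR(128),
            created_at DATETIME
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()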