项目地址:
https://github.com/JunWangCode/picScrapy.git
此爬虫可以爬取http://www.jj20.com所有分类图片,仅作实验和参考,请勿用于其他用途,在Ubuntu系统Python3环境下完美运行
一些设置
# 爬取深度
DEPTH_LIMIT = 5
# 图片存放位置
IMAGES_STORE = '/home/jwang/Videos/Pic'
# 图片最小宽度
IMAGES_MIN_WIDTH = 500
# 图片最小高度
IMAGES_MIN_HEIGHT = 500
还有一些选项需要注意:
# 下载延迟,别把别人的站点拖垮了,慢点
DOWNLOAD_DELAY = 0.2
# 爬虫并发数,默认是 16
CONCURRENT_REQUESTS = 20
启动爬虫
python3 -m scrapy crawl pic
核心代码:
# -*- coding: utf-8 -*-
2
3 # Define here the models for your scraped items
4 #
5 # See documentation in:
6 # http://doc.scrapy.org/en/latest/topics/items.html
7
8 import scrapy
9
10
class PicscrapyItem(scrapy.Item):
    """Item for one scraped wallpaper gallery.

    image_urls/images follow the standard Scrapy ImagesPipeline contract;
    title and category_name are passed through request.meta by
    PicscrapyPipeline and used as directory names on disk.
    """
    image_urls = scrapy.Field()     # list of image URLs to download
    images = scrapy.Field()         # filled in by ImagesPipeline with download results
    title = scrapy.Field()          # gallery title, used as a sub-directory name
    category_name = scrapy.Field()  # category, used as the top-level directory name
16
17
# 商品数据 — product data, inserted into MySQL by WebcrawlerScrapyPipeline
class AfscrapyItem(scrapy.Item):
    """Item for one scraped product listing.

    Field names match the columns of the MySQL insert performed in
    WebcrawlerScrapyPipeline.process_item.
    """
    goods_id = scrapy.Field()       # product identifier
    shop_name = scrapy.Field()      # shop the product belongs to
    category_name = scrapy.Field()  # product category
    title = scrapy.Field()          # product title
    sales_num = scrapy.Field()      # number of sales
    unit = scrapy.Field()           # sales unit
    price = scrapy.Field()          # listed price
    location = scrapy.Field()       # seller location
# -*- coding: utf-8 -*-
2
3 # Define your item pipelines here
4 #
5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 # -*- coding: utf-8 -*-
8 from urllib.parse import urlparse
9 import pymysql
10 import time
11 from scrapy.pipelines.images import ImagesPipeline
12 from scrapy import Request
13
14
def db_handler():
    """Open an autocommitting PyMySQL connection to the ``scrapy_data`` database.

    Returns:
        pymysql.connections.Connection: a live connection with autocommit on,
        so each INSERT is persisted without explicit ``commit()`` calls.

    NOTE(review): host, user and the EMPTY root password are hard-coded here;
    move them into Scrapy settings / environment before any real deployment.
    """
    conn = pymysql.connect(
        host='192.168.0.111',
        user='root',
        passwd='',
        charset='utf8',
        db='scrapy_data',
        use_unicode=True,
    )
    conn.autocommit(True)
    return conn
26
27
class PicscrapyPipeline(ImagesPipeline):
    """ImagesPipeline subclass that saves files as <category>/<title>/<name>.jpg."""

    def get_media_requests(self, item, info):
        # Pass title/category along in request.meta so that file_path()
        # can build a human-readable directory layout.
        return [
            Request(url, meta={'title': item['title'], 'cat': item['category_name']})
            for url in item.get(self.images_urls_field, [])
        ]

    # Overridden to replace the default hash-based file name with a
    # readable <category>/<title>/<name>.jpg path.
    def file_path(self, request, response=None, info=None):
        if not isinstance(request, Request):
            url = request
        else:
            url = request.url
        url = urlparse(url)
        # NOTE(review): assumes the image URL path always has at least six
        # segments; a shorter path raises IndexError — confirm against the
        # site's URL scheme before reuse.
        img_name = url.path.split('/')[5].split('.')[0]
        return request.meta['cat'] + '/' + request.meta['title'] + '/%s.jpg' % img_name
43
44
class WebcrawlerScrapyPipeline(object):
    """Pipeline that inserts AfscrapyItem rows into a MySQL table named after the spider."""

    def __init__(self):
        self.db_object = db_handler()
        # BUG FIX: the cursor must come from the SAME connection we roll back.
        # The original called db_handler() a second time, so rollback() acted
        # on a connection the cursor had never written to.
        self.cursor = self.db_object.cursor()

    def process_item(self, item, spider):
        # Skip the aggregate "全部" (all) category — those items duplicate
        # rows already collected under their concrete categories.
        if item['category_name'] == "全部":
            # BUG FIX: return the item (not None) so later pipelines still see it.
            return item
        try:
            # Table name comes from spider.name (internal, trusted); row values
            # are bound as parameters so the driver escapes them.
            sql = ("insert into " + spider.name + "(goods_id, shop_name, "
                   "category_name, title, sales_num, unit, price, location, created_at)"
                   "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
            params = (
                item['goods_id'], item['shop_name'], item['category_name'], item['title'],
                item['sales_num'], item['unit'], item['price'], item['location'],
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            )
            self.cursor.execute(sql, params)
        except pymysql.MySQLError as e:
            # BUG FIX: PyMySQL raises pymysql.MySQLError subclasses, not
            # RuntimeError — the original handler could never fire.
            self.db_object.rollback()
            print(e)
        return item
閱讀更多 Python樂園 的文章