A Beginner's Python Scrapy Crawler for Wallpaper Images (Full Source Included)

Project repository:

https://github.com/JunWangCode/picScrapy.git


This crawler can fetch images from every category on http://www.jj20.com. It is intended for experimentation and reference only; please do not use it for any other purpose. It runs without issue on Ubuntu under Python 3.

  1. Some settings

# Crawl depth
DEPTH_LIMIT = 5
# Where downloaded images are stored
IMAGES_STORE = '/home/jwang/Videos/Pic'
# Minimum image width
IMAGES_MIN_WIDTH = 500
# Minimum image height
IMAGES_MIN_HEIGHT = 500

A few more options are worth noting:

# Download delay -- be polite and don't drag down the target site
DOWNLOAD_DELAY = 0.2
# Number of concurrent requests (default is 16)
CONCURRENT_REQUESTS = 20
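For these settings to take effect together, the two pipelines shown later also need to be registered in ITEM_PIPELINES (as the pipeline file's own header comment reminds us). A minimal consolidated settings.py sketch; the package name picScrapy is an assumption based on the repository name:

# settings.py -- a minimal consolidated sketch; 'picScrapy' as the
# package name is an assumption inferred from the repository name
BOT_NAME = 'picScrapy'
SPIDER_MODULES = ['picScrapy.spiders']

DEPTH_LIMIT = 5
DOWNLOAD_DELAY = 0.2
CONCURRENT_REQUESTS = 20

IMAGES_STORE = '/home/jwang/Videos/Pic'
IMAGES_MIN_WIDTH = 500
IMAGES_MIN_HEIGHT = 500

# Lower numbers run earlier; the image pipeline and the MySQL
# pipeline from pipelines.py are registered side by side
ITEM_PIPELINES = {
    'picScrapy.pipelines.PicscrapyPipeline': 1,
    'picScrapy.pipelines.WebcrawlerScrapyPipeline': 300,
}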
  2. Start the crawler

python3 -m scrapy crawl pic
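The command must be run from the project root (the directory containing scrapy.cfg). Equivalently, the spider can be launched from a plain Python script; a short sketch using Scrapy's CrawlerProcess, assuming the spider is registered under the name pic:

# run.py -- launch the 'pic' spider programmatically instead of via the CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up settings.py
process.crawl('pic')  # spider name, same as in 'scrapy crawl pic'
process.start()       # blocks until the crawl finishes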


Core code. First, items.py, which defines the data models:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class PicscrapyItem(scrapy.Item):
    image_urls = scrapy.Field()     # URLs of the images to download
    images = scrapy.Field()         # filled in by ImagesPipeline with download results
    title = scrapy.Field()          # gallery title, used in the file path
    category_name = scrapy.Field()  # category, used as the top-level directory


# Goods data
class AfscrapyItem(scrapy.Item):
    goods_id = scrapy.Field()
    shop_name = scrapy.Field()
    category_name = scrapy.Field()
    title = scrapy.Field()
    sales_num = scrapy.Field()
    unit = scrapy.Field()
    price = scrapy.Field()
    location = scrapy.Field()
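The spider itself is not reproduced in this excerpt; it lives in the repository. For orientation, here is a minimal sketch of a spider that yields PicscrapyItem objects. The start URL is real, but the CSS selectors are hypothetical placeholders, not the actual rules used against jj20.com's markup:

# spiders/pic.py -- illustrative only; the selectors below are
# hypothetical placeholders, not the real ones from the repository
import scrapy
from picScrapy.items import PicscrapyItem  # assumes package name picScrapy


class PicSpider(scrapy.Spider):
    name = 'pic'  # matches 'scrapy crawl pic'
    start_urls = ['http://www.jj20.com/']

    def parse(self, response):
        # Follow category links (placeholder selector)
        for href in response.css('a.category::attr(href)').getall():
            yield response.follow(href, callback=self.parse_gallery)

    def parse_gallery(self, response):
        item = PicscrapyItem()
        item['title'] = response.css('h1::text').get(default='untitled')
        item['category_name'] = response.css('.breadcrumb a::text').get(default='')
        # ImagesPipeline downloads everything listed in image_urls
        item['image_urls'] = response.css('img::attr(src)').getall()
        yield item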



Next, pipelines.py, which stores the downloaded images on disk and writes goods records to MySQL:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from urllib.parse import urlparse
import pymysql
import time
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request


def db_handler():
    conn = pymysql.connect(
        host='192.168.0.111',
        user='root',
        passwd='',
        charset='utf8',
        db='scrapy_data',
        use_unicode=True
    )
    conn.autocommit(True)
    return conn


class PicscrapyPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Pass the title and category along via the request's meta dict
        return [Request(x, meta={'title': item['title'], 'cat': item['category_name']})
                for x in item.get(self.images_urls_field, [])]

    # Overridden to change how downloaded image filenames are generated
    def file_path(self, request, response=None, info=None):
        if not isinstance(request, Request):
            url = request
        else:
            url = request.url
        url = urlparse(url)
        img_name = url.path.split('/')[5].split('.')[0]
        return request.meta['cat'] + '/' + request.meta['title'] + '/%s.jpg' % img_name


class WebcrawlerScrapyPipeline(object):
    def __init__(self):
        self.db_object = db_handler()
        # Reuse the same connection for the cursor
        # (the original opened a second, separate connection here)
        self.cursor = self.db_object.cursor()

    def process_item(self, item, spider):
        if item['category_name'] == "全部":  # "全部" means "All"; skip the aggregate category
            return item
        try:
            sql = "insert into " + spider.name + "(goods_id, shop_name, " \
                  "category_name, title, sales_num, unit, price, location, created_at)" \
                  "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            params = (
                item['goods_id'], item['shop_name'], item['category_name'], item['title'],
                item['sales_num'], item['unit'], item['price'], item['location'],
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            )
            self.cursor.execute(sql, params)

        except pymysql.MySQLError as e:  # pymysql raises MySQLError, not RuntimeError
            self.db_object.rollback()
            print(e)

        return item
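WebcrawlerScrapyPipeline inserts into a table named after the spider (spider.name). The schema is not included in this excerpt; below is a hypothetical CREATE TABLE whose column names are taken from the insert statement above, while the table name 'af' and all column types are assumptions:

# init_db.py -- one-off table setup for WebcrawlerScrapyPipeline.
# Only the column names come from the insert statement above; the
# table name 'af' and the column types are assumptions.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS af (
    id INT AUTO_INCREMENT PRIMARY KEY,
    goods_id VARCHAR(64),
    shop_name VARCHAR(255),
    category_name VARCHAR(255),
    title VARCHAR(255),
    sales_num INT,
    unit VARCHAR(32),
    price DECIMAL(10, 2),
    location VARCHAR(255),
    created_at DATETIME
)
"""

conn = pymysql.connect(host='192.168.0.111', user='root', passwd='',
                       charset='utf8', db='scrapy_data')
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()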


