python爬虫总结 — Scrapy 安装与使用笔记


安装Scrapy(有很多依赖库要装,略麻烦)

参考: https://www.cnblogs.com/liuliliuli2017/p/6746440.html

Scrapy中文文档: http://scrapy-chs.readthedocs.io/zh_CN/0.24/index.html

查看scrapy基本信息和功能

scrapy

测试爬虫性能

scrapy bench 

爬取网页信息(以百度首页为例)

scrapy fetch "http://www.baidu.com"

shell环境,可以在cmd进行操作(以百度为例)

scrapy shell "http://www.baidu.com"
print response.body  # 打印响应主体

创建项目(以ITcast为例)

scrapy startproject ITcast

settings.py屏蔽ROBOTSTXT_OBEY(不遵守机器人协议)

生成爬虫文件

# scrapy genspider example example_url
scrapy genspider itcast "http://www.itcast.cn"

items字段(items.py)

import scrapy


class ItcastItem(scrapy.Item):
    """Item describing one ITcast teacher record."""
    # define the fields for your item here like:
    # teacher name (老师姓名)
    name = scrapy.Field()
    # teacher title (老师职称)
    title = scrapy.Field()
    # teacher info (老师信息)
    info = scrapy.Field()

编写爬虫文件(itcast.py)

# -*- coding: utf-8 -*-
import scrapy
from ITcast.items import ItcastItem


class ItcastSpider(scrapy.Spider):
    """Spider that scrapes teacher name/title/info from itcast.cn."""
    # spider name (required)
    name = 'itcast'
    # FIX: allowed_domains takes bare domains, not URLs; the original
    # 'http://www.itcast.cn' would disable offsite filtering entirely
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        """Extract one ItcastItem per teacher block on the page."""
        node_list = response.xpath("//div[@class='li_txt']")
        # collect every item field
        items = []
        for node in node_list:
            item = ItcastItem()
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()
            # extract() returns a list; take the first match
            # (assumes every node has all three children — IndexError otherwise)
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            items.append(item)
        return items

检查爬虫是否无误

scrapy check itcast

运行爬虫

scrapy crawl itcast

查看爬虫

scrapy list

编写多个管道,则需要在settings文件中的ITEM_PIPELINES添加

例: 腾讯招聘(多页抓取)

items.py

技术分享图片
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class TencentItem(scrapy.Item):
    """Item for one Tencent HR job posting (only the name is collected)."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # position name (职位名)
    positionName = scrapy.Field()
    # the remaining fields are kept disabled, as in the original tutorial:
    # positionLink = scrapy.Field()    # job detail link (职位详情)
    # positionType = scrapy.Field()    # job category (职位类型)
    # peopleNumber = scrapy.Field()    # headcount (人数)
    # workLocation = scrapy.Field()    # work location (工作地点)
    # publishTime = scrapy.Field()     # publish time (发布时间)
View Code

pipelines.py

技术分享图片
# -*- coding: utf-8 -*-

# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TencentPipeline(object):
    """Write each scraped item to tencent.json, one JSON object per line."""

    def __init__(self):
        # FIX: open with an explicit UTF-8 encoding; with the platform
        # default (e.g. gbk on Chinese Windows) writing non-ASCII JSON
        # produced by ensure_ascii=False could raise UnicodeEncodeError
        self.f = open("tencent.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese text readable in the output file
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        # flush and release the output file when the spider finishes
        self.f.close()
View Code

settings.py开启管道

技术分享图片
# Enable the project pipeline; the number (0-1000) is its execution
# order — lower values run first.
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
}
View Code

tencent.py

技术分享图片
# -*- coding: utf-8 -*-
import scrapy
from Tencent.items import TencentItem


class TencentSpider(scrapy.Spider):
    """Crawl Tencent HR job listings, following the 'next' link page by page."""
    name = 'tencent'
    allowed_domains = ['tencent.com']
    base_url = "http://hr.tencent.com/position.php?&start="
    offset = 0
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        # job rows alternate between class 'even' and class 'odd'
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            item = TencentItem()
            item['positionName'] = node.xpath("./td[1]/a/text()").extract()[0]
            # other fields kept disabled, as in the original tutorial:
            # item['positionLink'] = node.xpath("./td[1]/a/@href").extract()[0]
            # item['positionType'] = node.xpath("./td[2]/text()").extract()[0]
            # item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0]
            # item['workLocation'] = node.xpath("./td[4]/text()").extract()[0]
            # item['publishTime'] = node.xpath("./td[5]/text()").extract()[0]
            yield item

        # fixed-offset pagination kept disabled, as in the original:
        # if self.offset < 2620:
        #     self.offset += 10
        #     yield scrapy.Request(self.base_url + str(self.offset),
        #                          callback=self.parse)

        # follow the 'next' link; on the last page its href becomes a
        # "javascript:..." pseudo-link, which stops the crawl
        next_page = response.xpath("//*[@id='next']/@href").extract()[0]
        if not next_page.startswith("java"):
            yield scrapy.Request("http://hr.tencent.com/" + next_page,
                                 callback=self.parse)
View Code

例: 斗鱼主播图片爬取(图片爬取)

items.py

技术分享图片
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DouyuItem(scrapy.Item):
    """Item for one Douyu streamer: nickname plus cover-image URL."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # streamer nickname
    nickname = scrapy.Field()
    # URL of the streamer's vertical cover image
    imagelink = scrapy.Field()
View Code

pipelines.py

技术分享图片
# -*- coding: utf-8 -*-

# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os

import scrapy
from scrapy.pipelines.images import ImagesPipeline

from Douyu.settings import IMAGES_STORE as image_store


class DouyuPipeline(ImagesPipeline):
    """Image pipeline that downloads each item's cover image and renames
    the stored file to "<nickname>.jpg" inside IMAGES_STORE."""

    def get_media_requests(self, item, info):
        # schedule the image download for this item
        image_link = item['imagelink']
        yield scrapy.Request(image_link)

    def item_completed(self, results, item, info):
        # results is a list of (success, info_dict) pairs; keep only the
        # relative storage paths of successful downloads
        image_path = [x['path'] for ok, x in results if ok]
        # NOTE(review): assumes at least one download succeeded —
        # image_path[0] raises IndexError otherwise
        os.rename(image_store + image_path[0],
                  image_store + item['nickname'] + ".jpg")
View Code

settings.py配置IMAGE_STORE和USER_AGENT并开启管道(同腾讯招聘)

技术分享图片
# Directory where ImagesPipeline stores downloaded files
IMAGES_STORE = "E:/PythonScrapy/Douyu/Douyu/Images/"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Linux; U; Android 4.4.2; zh-CN; HUAWEI MT7-TL00 Build/HuaweiMT7-TL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/40.0.2214.89 UCBrowser/11.3.8.909 Mobile Safari/537.36'
View Code

douyu.py

技术分享图片
# -*- coding: utf-8 -*-
import json

import scrapy

from Douyu.items import DouyuItem


class DouyuSpider(scrapy.Spider):
    """Spider for Douyu's mobile API: yields one item per vertical room."""
    name = 'douyu'
    allowed_domains = ['douyucdn.cn']
    baseURL = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        # NOTE(review): the API most likely serves UTF-8 JSON; 'gbk' is
        # kept from the original — confirm against a live response
        data_list = json.loads(response.body.decode('gbk'))['data']
        # an empty 'data' list means we paged past the last room: stop
        if not data_list:
            return
        for data in data_list:
            item = DouyuItem()
            item['nickname'] = data['nickname']
            item['imagelink'] = data['vertical_src']
            yield item

        # pagination kept disabled, as in the original tutorial:
        # self.offset += 20
        # yield scrapy.Request(self.baseURL + str(self.offset),
        #                      callback=self.parse)
View Code

python爬虫总结

评论关闭