Python Scrapy 网络采集使用代理的方法

关键词:python scrapy

1. 在 Scrapy 工程下新建 middlewares.py(文件名由下文配置中的 'project_name.middlewares.ProxyMiddleware' 推断),内容如下:


Import the base64 library; it is needed only if the proxy you are going to use requires authentication.

import base64

Start your middleware class

class ProxyMiddleware(object):
    """Downloader middleware that sends every request through an HTTP proxy.

    The proxy address and the credentials are placeholders; replace
    ``YOUR_PROXY_IP:PORT`` and ``USERNAME:PASSWORD`` with real values.
    """

    # overwrite process request
    def process_request(self, request, spider):
        """Attach the proxy location and basic-auth header to *request*.

        Args:
            request: Object exposing ``meta`` and ``headers`` mappings;
                both are mutated in place.
            spider: Unused by this method.

        Returns:
            None.
        """
        # Set the location of the proxy
        request.meta['proxy'] = "http://YOUR_PROXY_IP:PORT"

        # Use the following lines if your proxy requires authentication
        proxy_user_pass = "USERNAME:PASSWORD"
        # setup basic authentication for the proxy
        # b64encode is used instead of the removed base64.encodestring:
        # encodestring appended a trailing newline (and wrapped long input),
        # which corrupts an HTTP header value, and it rejects str on Python 3.
        encoded_user_pass = base64.b64encode(
            proxy_user_pass.encode('utf-8')
        ).decode('ascii')
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass


2. 在项目配置文件里(./project_name/settings.py)添加:

DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
    'project_name.middlewares.ProxyMiddleware': 100,
}

只要两步,现在请求就是通过代理的了。测试一下 ^_^

from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import Request

class TestSpider(CrawlSpider):
    """Minimal spider that dumps the fetched page to ``test.html``.

    NOTE(review): the values of ``domain_name`` and ``start_urls`` are empty
    in this listing (the original URLs appear to have been lost); fill them
    in before running. It is also unclear whether ``start_urls`` was meant to
    be commented out in the original — confirm.
    """

    name = "test"
    domain_name = ""
    # The following url is subject to change, you can get the last updated one from here :
    start_urls = [""]

    def parse(self, response):
        """Write the raw body of *response* to ``test.html`` in the working directory.

        Args:
            response: Object exposing a bytes ``body`` attribute.
        """
        # A context manager closes the file deterministically; the bare
        # open(...).write(...) form left the handle to the garbage collector.
        with open('test.html', 'wb') as output:
            output.write(response.body)

