Scraping proxy IPs with requests and verifying them: why does the IP-lookup site still report my original IP?




I adapted a script I found online that scrapes proxy IPs and verifies them. For the verification step, instead of logging in to Baidu, I switched to an IP-lookup site. I expected the lookup to go out through the scraped proxy, but the reported IP never changes. Could someone take a look at what's going on? The run output is attached:

#

{'https': u'183.221.50.139:8123'} Your IP: [218.88.XX.XX] From: Chengdu, Sichuan (Telecom)  OS: Unknown  Browser: Unknown 0.0 python-requests/2.4.0 CPython/2.7.3 Windows/7
{'https': u'116.236.216.116:8080'} Your IP: [218.88.XX.XX] From: Chengdu, Sichuan (Telecom)  OS: Unknown  Browser: Unknown 0.0 python-requests/2.4.0 CPython/2.7.3 Windows/7
{'https': u'183.221.160.44:8123'} Your IP: [218.88.XX.XX] From: Chengdu, Sichuan (Telecom)  OS: Unknown  Browser: Unknown 0.0 python-requests/2.4.0 CPython/2.7.3 Windows/7

#
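
Two details in this output already hint at the answer: the reported IP is always the local 218.88.XX.XX address, and the User-Agent the lookup site sees is python-requests itself. For context, requests only routes a call through a proxy when (a) that call actually receives a proxies= argument and (b) the dict contains a key matching the URL's scheme; a {'https': ...} entry is silently ignored for an http:// URL. A minimal sketch of correct usage (the proxy address below is a placeholder copied from the output above, not a known-live proxy):

import requests

# The key must match the URL scheme: an 'http' entry applies to http:// URLs,
# an 'https' entry to https:// URLs; a mismatched key means a direct connection.
proxies = {'http': 'http://183.221.50.139:8123'}  # placeholder address

r = requests.get('http://ip.chinaz.com/', proxies=proxies, timeout=3)
print r.status_code  # parse r.text for the reported IP; it should be the proxy's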

The code is as follows:

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup as bs
import Queue
import threading
import time
import sys

reload(sys)
sys.setdefaultencoding('utf-8')


# Append a validated proxy entry to a text file.
def writeproxy(porxyinfo):
    writefile = file('porxyinfo.txt', 'a+')
    writefile.write(porxyinfo)
    writefile.write('\n')
    writefile.close()


# Fetch a page and return its decoded text.
def GetPageText(url):
    r = requests.get(url)
    return r.text


# Parse the proxy-list table into entries of the form "type#ip:port".
def GetPostUrl(source):
    posturllist = []
    iplist = bs(source).find("table", {"id": "ip_list"}).findAll("tr")[1:]
    for item in iplist:
        getinfo = item.findAll("td")
        ip      = getinfo[1].get_text(strip=True)
        port    = getinfo[2].get_text(strip=True)
        address = getinfo[3].get_text(strip=True)
        type    = getinfo[5].get_text(strip=True)
        posturllist.append(type.lower() + '#' + ip + ':' + port)
    return posturllist


def Checkproxy(porxyinfo):
    proxies = {}
    if porxyinfo.split('#')[0] == 'http':
        proxies['http'] = porxyinfo.split('#')[1]
    else:
        proxies['https'] = porxyinfo.split('#')[1]
    r = requests.get("http://ip.chinaz.com/", proxies=proxies, timeout=3)
    if r:
        # NOTE: this second requests.get() carries no proxies= argument, so the
        # page being printed is always fetched over the direct connection.
        print proxies, bs(requests.get('http://ip.chinaz.com/').content).find("span", {"class": "info3"}).get_text(strip=True)
        # writeproxy(porxyinfo)
    else:
        print 'No'


def getproxyid():
    start = time.time()
    queue = Queue.Queue()

    class ThreadUrl(threading.Thread):
        """Threaded proxy checker: pulls "type#ip:port" entries off the queue."""
        def __init__(self, queue):
            threading.Thread.__init__(self)
            self.queue = queue

        def run(self):
            while True:
                porxyinfo = self.queue.get()
                try:
                    # The lock serializes the checks, so the five workers
                    # effectively run one at a time.
                    mutex.acquire()
                    try:
                        Checkproxy(porxyinfo)
                    except:
                        time.sleep(0.15)
                        mutex.release()
                        self.queue.task_done()
                        continue
                    time.sleep(0.15)
                    mutex.release()
                    self.queue.task_done()
                except Exception as e:
                    time.sleep(0.15)
                    self.queue.task_done()

    # First five pages of the anonymous (nn) and HTTPS (wn) proxy lists.
    pagenum = 5
    targets  = ['http://www.xici.net.co/nn/%d' % page for page in range(1, pagenum + 1)]
    targets += ['http://www.xici.net.co/wn/%d' % page for page in range(1, pagenum + 1)]

    for proxyurl in targets:
        try:
            PageText = GetPageText(proxyurl)
        except Exception as e:
            print e
            break
        PostUrlList = GetPostUrl(PageText)
        mutex = threading.Lock()
        for i in range(5):
            t = ThreadUrl(queue)
            t.setDaemon(True)
            try:
                t.start()
            except:
                pass
        for host in PostUrlList:
            queue.put(host)
        queue.join()

    print "Elapsed Time: %s" % (time.time() - start)


if __name__ == '__main__':
    getproxyid()
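
So the cause is twofold, both in Checkproxy. First, the proxied response r is only used for the truthiness test; the page that actually gets printed comes from a second requests.get('http://ip.chinaz.com/') that omits proxies=, so it always travels over the direct connection. Second, even the first request bypasses the https-type proxies (which is all the output above shows), because requests selects a proxy by URL scheme and the {'https': ...} key never matches the http:// test URL. A minimal rework, reusing the original parsing; it assumes ip.chinaz.com also answers over https for the https branch:

def Checkproxy(porxyinfo):
    scheme, addr = porxyinfo.split('#')
    proxies = {scheme: addr}
    # Test against an URL whose scheme matches the proxy type, and reuse the
    # *same* proxied response for the printout -- no second direct request.
    r = requests.get(scheme + '://ip.chinaz.com/', proxies=proxies, timeout=3)
    if r.ok:
        print proxies, bs(r.content).find("span", {"class": "info3"}).get_text(strip=True)
        # writeproxy(porxyinfo)
    else:
        print 'No'

If the lookup site does not serve https, point the https branch at any https:// echo-IP page instead; the essential part is passing proxies= to the request whose response you print.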
