多线程抓取 getproxy 网站代理信息并验证
文章由 Byrx.net 分享于 2019-03-23 07:03:06
#!/usr/bin/env python
# coding: utf-8
"""Fetch proxy listings from getproxy.jp, validate each proxy against
baidu.com using a pool of worker threads, and append the working ones
to porxyinfo.txt."""

import queue
import threading
import time

import requests
from lxml import etree


def writeproxy(porxyinfo):
    """Append one validated 'scheme#host:port' record to porxyinfo.txt."""
    # Context manager closes the file even if a write raises; the original
    # leaked the handle on error and wrote a literal backslash-n instead
    # of a newline ('\\n').
    with open('porxyinfo.txt', 'a+') as writefile:
        writefile.write(porxyinfo)
        writefile.write('\n')


def GetPageText(url):
    """Return the HTML body of *url* as text.

    Raises requests.RequestException on network failure (handled by the
    caller, which stops paging on the first bad page).
    """
    r = requests.get(url, timeout=10)
    return r.text


def GetPostUrl(source):
    """Parse one listing page into records like 'http#1.2.3.4:80'.

    The site renders its proxies in table rows 2..31.  Each row's text
    holds the address, terminated by an 'ID' token; rows mentioning
    'https' are tagged with the https scheme, all others with http.
    """
    posturllist = []
    x = etree.HTML(source)
    for hang in range(2, 32):
        proxyinfo = x.xpath('string(//tr[%s])' % hang)
        scheme = 'https' if 'https' in proxyinfo else 'http'
        posturllist.append(scheme + '#' + proxyinfo.split('ID')[0].strip())
    return posturllist


def Checkproxy(porxyinfo):
    """Validate one 'scheme#host:port' record by fetching baidu.com through it.

    A proxy is considered working when the fetched page contains the
    baidu ICP licence marker '030173'; working proxies are persisted
    via writeproxy().
    """
    scheme, address = porxyinfo.split('#', 1)
    proxies = {scheme: address}
    r = requests.get("http://www.baidu.com", proxies=proxies, timeout=5)
    if '030173' in r.text:
        writeproxy(porxyinfo)
    else:
        print('No')


def getproxyid():
    """Crawl listing pages 1-4 and validate every proxy with 5 workers.

    The original held a global Lock around every Checkproxy call, which
    serialized the workers and defeated the thread pool; checks are
    independent, so no lock is needed.  It also spawned 5 fresh threads
    per page (up to 20); one pool of 5 daemon workers suffices.
    """
    start = time.time()
    work_queue = queue.Queue()

    class ThreadUrl(threading.Thread):
        """Worker: pull proxy records off the shared queue and check them."""

        def __init__(self, work_queue):
            threading.Thread.__init__(self)
            self.queue = work_queue

        def run(self):
            while True:
                porxyinfo = self.queue.get()
                try:
                    # Best-effort scan: a dead proxy just refuses or times
                    # out, so failures are ignored at this worker boundary.
                    Checkproxy(porxyinfo)
                except Exception:
                    pass
                finally:
                    # Exactly one task_done() per get(), even on failure,
                    # so queue.join() below cannot deadlock or over-count.
                    time.sleep(0.15)
                    self.queue.task_done()

    # Collect all proxy records first; stop paging on the first fetch error.
    for pageid in range(1, 5):
        proxyurl = 'http://www.getproxy.jp/cn/indonesia/%s' % pageid
        try:
            PageText = GetPageText(proxyurl)
        except Exception as e:
            print(e)
            break
        for host in GetPostUrl(PageText):
            work_queue.put(host)

    # Daemon workers: the process may exit once the queue has drained.
    for _ in range(5):
        t = ThreadUrl(work_queue)
        t.daemon = True
        t.start()

    # Block until every queued proxy has been checked.
    work_queue.join()
    print("Elapsed Time: %s" % (time.time() - start))


if __name__ == '__main__':
    getproxyid()
评论关闭