多线程抓取 getproxy 网站代理信息并验证(getproxy 代理信息抓取脚本)


#!/usr/bin/env python
# coding: utf-8
"""Multi-threaded scraper that collects proxies from getproxy.jp and verifies them.

Pages 1-4 of the site's Indonesia listing are fetched; each table row yields a
``'scheme#host:port'`` record.  Five daemon worker threads drain a shared queue
and test every proxy by fetching baidu.com through it; proxies that really
forward the request are appended to ``porxyinfo.txt``, one per line.

Ported from Python 2 to Python 3.  ``requests`` and ``lxml`` are imported
inside the functions that use them so the module stays importable without them.
"""

import queue
import threading
import time


def writeproxy(porxyinfo):
    """Append one verified proxy record ('scheme#addr') to porxyinfo.txt.

    Uses a context manager so the file is closed even if the write fails
    (the original used ``file(...)`` and leaked the handle on error).
    """
    with open('porxyinfo.txt', 'a+') as writefile:
        writefile.write(porxyinfo)
        writefile.write('\n')  # one record per line


def GetPageText(url):
    """Return the HTML body of *url* as text."""
    import requests  # local import: see module docstring
    r = requests.get(url)
    return r.text


def GetPostUrl(source):
    """Parse one getproxy.jp listing page into proxy records.

    Scans table rows 2..31 (the site lists 30 proxies per page) and returns
    a list of ``'https#<addr>'`` / ``'http#<addr>'`` strings, depending on
    whether the row text advertises HTTPS support.
    """
    from lxml import etree  # local import: see module docstring

    posturllist = []
    page = etree.HTML(source)
    for row in range(2, 32):
        proxyinfo = page.xpath('string(//tr[%s])' % row)
        scheme = 'https' if 'https' in proxyinfo else 'http'
        # The text before the 'ID' column is the proxy's address portion.
        posturllist.append(scheme + '#' + proxyinfo.split('ID')[0].strip())
    return posturllist


def Checkproxy(porxyinfo):
    """Verify one ``'scheme#addr'`` record by fetching baidu.com through it.

    A response containing '030173' (Baidu's ICP licence number, present on
    the real page) proves the proxy actually forwarded the request; such
    proxies are persisted via writeproxy().  Propagates requests exceptions
    (connection failure, 5-second timeout) to the caller.
    """
    import requests  # local import: see module docstring

    scheme, addr = porxyinfo.split('#', 1)
    proxies = {}
    if scheme == 'http':
        proxies['http'] = addr
    else:
        proxies['https'] = addr
    r = requests.get('http://www.baidu.com', proxies=proxies, timeout=5)
    if '030173' in r.text:
        writeproxy(porxyinfo)
    else:
        print('No')


def getproxyid():
    """Scrape pages 1-4 of getproxy.jp's Indonesia list and verify each proxy.

    Five daemon worker threads drain a shared queue.  A lock serialises the
    check-and-write step (it also rate-limits the target host), and each
    worker pauses briefly between items, matching the original's pacing.
    """
    start = time.time()
    taskqueue = queue.Queue()
    mutex = threading.Lock()

    def worker():
        # Runs forever as a daemon.  task_done() sits in a finally so
        # queue.join() can never hang on a failed item, and the lock is
        # managed by `with` so it is always released (the original could
        # exit an error path holding the mutex).
        while True:
            porxyinfo = taskqueue.get()
            try:
                with mutex:
                    try:
                        Checkproxy(porxyinfo)
                    except Exception:
                        pass  # dead/unreachable proxy: skip, as the original did
                time.sleep(0.15)
            finally:
                taskqueue.task_done()

    # Fix: start the worker pool ONCE.  The original spawned 5 fresh threads
    # on every page iteration, leaking threads as pages accumulated.
    for _ in range(5):
        threading.Thread(target=worker, daemon=True).start()

    for pageid in range(1, 5):
        proxyurl = 'http://www.getproxy.jp/cn/indonesia/%s' % pageid
        try:
            PageText = GetPageText(proxyurl)
        except Exception as e:
            print(e)
            break
        for host in GetPostUrl(PageText):
            taskqueue.put(host)
        # Finish verifying this page's proxies before fetching the next page.
        taskqueue.join()

    print("Elapsed Time: %s" % (time.time() - start))


if __name__ == '__main__':
    getproxyid()

评论关闭