How do I fix a multi-threaded Python crawler that freezes? (python, crawler, scraping Baidu data)
I'm scraping Baidu data, but the crawler only runs for a little while before it freezes: no response and no error. I'm new to Python and have spent two weeks on this without figuring it out. Hoping the veterans here can point me in the right direction. Here is the code:
#coding:utf-8
'''Baidu ranking checker, multi-threaded version with proxies'''
import StringIO,pycurl,time,random,re,os,csv
from threading import Thread,Lock
from Queue import Queue
from bs4 import BeautifulSoup as bs

csvfile = open('serp_html.csv','wb') # file holding each keyword and its SERP source
bdjd_dict = {}
#bdjd_list = ["www.baidu.com","180.97.33.107","115.239.210.27","180.97.33.108","180.97.33.107","180.97.33.107","180.97.33.108","220.181.111.188","220.181.111.188","180.97.33.107","180.97.33.107","115.239.211.112","180.97.33.108","180.97.33.108","180.97.33.108","180.97.33.108","180.97.33.108","115.239.211.112","180.97.33.108","115.239.211.112","115.239.210.27","180.97.33.108","115.239.211.112","115.239.210.27","180.97.33.108","115.239.210.27","61.135.169.125","115.239.211.112","115.239.210.27","180.97.33.107","180.97.33.107","180.97.33.108","115.239.210.27","180.97.33.107","61.135.169.121","115.239.210.27","61.135.169.121","61.135.169.125","115.239.211.112","115.239.210.27","61.135.169.125","112.80.248.73","61.135.169.121","112.80.248.74","112.80.248.73","61.135.169.125","180.97.33.108","115.239.210.27","61.135.169.125","61.135.169.125","112.80.248.74","112.80.248.74","61.135.169.121","115.239.210.27","61.135.169.125","111.13.100.92","111.13.100.92","111.13.100.91","111.13.100.91","115.239.211.112","111.13.100.92","111.13.100.91","111.13.100.92","115.239.211.112","115.239.210.27","115.239.211.112","115.239.210.27","115.239.210.27","115.239.210.27","115.239.210.27"]
bdjd_list = ["www.baidu.com"]

# pick a random Baidu regional node
def getBDJD(bdjd_str):
    bdjd_list = bdjd_str.split(',')
    bdjd = random.choice(bdjd_list)
    return bdjd

daili_list = [] # proxy IP pool

# read the proxy file and pick one proxy at random
def ip():
    for x in open('hege_daili.txt'):
        x = x.strip()
        daili_list.append(x)
    newip = random.choice(daili_list)
    return newip

# drop a dead proxy from the proxy file (meant to be called from baidu_cont)
def daili_delete(ip):
    dailifile = open('daili_beifen.txt','w')
    for line in open('hege_daili.txt'):
        line = line.strip()
        if ip not in line:
            dailifile.write(line+"\n")
    os.system("mv daili_beifen.txt hege_daili.txt")

# Baidu search URL for a keyword
def baidu_url(word):
    return 'http://www.baidu.com/s?wd=%s' % word

def getUA():
    uaList = [
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
        'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
        'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
        'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
        'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    ]
    ua = random.choice(uaList)
    return ua

# fetch the Baidu SERP source through a proxy
def baidu_cont(url,headers,ip):
    while 1:
        try:
            c = pycurl.Curl()
            c.setopt(pycurl.MAXREDIRS,5)
            c.setopt(pycurl.REFERER, url)
            c.setopt(pycurl.FOLLOWLOCATION, True)
            c.setopt(pycurl.CONNECTTIMEOUT, 60)
            c.setopt(pycurl.TIMEOUT,120)
            c.setopt(pycurl.ENCODING,'gzip,deflate')
            c.setopt(c.PROXY,ip)
            c.fp = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.HTTPHEADER,headers)
            c.setopt(c.WRITEFUNCTION, c.fp.write)
            c.perform()
            #code = c.getinfo(c.HTTP_CODE) # response status code
            html = c.fp.getvalue()
            if '="http://verify.baidu.com' in html: # captcha page: back off 20 minutes
                time.sleep(1200)
                print 'restart'
                continue
            return html
        except Exception, what:
            information = 'error: %s' % what
            return str(information)

def search(req,line):
    text = re.search(req,line)
    if text:
        data = text.group(1)
    else:
        data = 'no'
    return data

url_list = []
for word in open('word'):
    word = word.strip()
    url_list.append(word)

class Fetcher:
    def __init__(self,threads):
        self.lock = Lock() # thread lock
        self.q_req = Queue() # task queue
        self.q_ans = Queue() # done queue
        self.threads = threads
        for i in range(threads):
            t = Thread(target=self.threadget) # the job each worker thread runs
            t.setDaemon(True) # die together with the main thread; must be set before start()
            t.start()
        self.running = 0 # number of threads currently working

    def __del__(self): # on teardown, wait for both queues to drain
        time.sleep(0.5)
        self.q_req.join()
        self.q_ans.join()

    # amount of unfinished work; 0 means everything is done
    def taskleft(self):
        return self.q_req.qsize()+self.q_ans.qsize()+self.running

    def push(self,req):
        self.q_req.put(req)

    def pop(self):
        return self.q_ans.get()

    # the worker loop
    def threadget(self):
        while True:
            line = self.q_req.get()
            word = line.strip()
            # 'with self.lock' acquires and releases the lock automatically,
            # so self.running is updated without races between threads
            with self.lock:
                self.running += 1
            bdjd_str = ','.join(bdjd_list)
            newip = ip()
            bdjd = getBDJD(bdjd_str)
            url = baidu_url(word)
            headers = [
                "Accept:*/*",
                "Accept-Encoding:gzip, deflate, sdch",
                "Accept-Language:zh-CN,zh;q=0.8,en;q=0.6",
                "Connection:keep-alive",
                #"Cookie:BIDUPSID=4DEE9B78AA3A97A51CFC916C43C30EC6; BAIDUID=380B58B009DF49761A4D9C30E19B34D0:FG=1; B64_BOT=1; PSTM=1432281826; BDRCVFR[G4oNZs7I7B3]=duGuzxHDPeTmy-lpA78QhPEUf; BDUSS=X5PYndScmJYc3lPSm1yQy15Vlg2YW45cWl3ZVFEalFjckFSNk16NHFIZTZkWVpWQVFBQUFBJCQAAAAAAAAAAAEAAAAJkstJv7TXvMTj1NnM-AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALroXlW66F5Vc; SFSSID=ap4nu9shvsoc52f5nvnq6nev42; uc_login_unique=95684e36f6d0a03b984187546ec66f06; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a01823607022; BDRCVFR[ltbVPlNi2ac]=mk3SLVN4HKm; BD_HOME=1; BD_UPN=32; sug=3; sugstore=1; ORIGIN=0; bdime=0; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; H_PS_645EC=5cdfPyG9PHeV%2BYlX03NQsmg4%2BZ0wlUr4XqxnORGloYCIlW8tFgZVbm9AliNr1w2Ls75g; BD_CK_SAM=1; BDSVRTM=106; H_PS_PSSID=13783_1454_13519_13075_12824_12867_14167_10562_12722_14155_14172_13202_14329_11951_13936_13741_14369_14182_8498_14195; WWW_ST=1432372813391",
                "Host:www.baidu.com",
                "RA-Sid:7739A016-20140918-030243-3adabf-48f828",
                "RA-Ver:2.10.3",
                "Referer:http://www.baidu.com/",
                "X-Requested-With:XMLHttpRequest",
                "User-Agent:%s" % getUA()
            ]
            html = baidu_cont(url, headers, newip)
            soup = bs(html)
            b_tags = soup.find_all('div', {'class': 'result c-container '})
            for line in b_tags:
                newline = str(line)
                number = search(r'id="(\d+)"',newline)
                urldiv = search(r'<span class="g">(.*?)</span>',newline) # the <span> that holds the result domain
                data = []
                data.append(word)
                data.append(newline)
                writer = csv.writer(csvfile,dialect='excel')
                writer.writerow(data)
            if len(b_tags) == 0:
                print html
            else:
                print '>> proxy IP: %s, fetched: %s, %s results' % (newip,word,len(b_tags))
            #self.q_ans.put((req,ans)) # push the finished task onto the done queue for the main program
            self.q_ans.put(word)
            with self.lock:
                self.running -= 1
            self.q_req.task_done() # tell the task queue this item is finished
            time.sleep(0.1) # don't spam

if __name__ == "__main__":
    #links = [ 'http://www.verycd.com/topics/%d/'%i for i in range(5420,5450) ]
    f = Fetcher(threads=10) # 10 worker threads
    for url in url_list:
        f.push(url) # push every keyword onto the task queue
    while f.taskleft(): # while any work is unfinished
        f.pop() # pull results off the done queue

'''
# if a Baidu node has timed out more than 10 times, drop it from the node list
# if 'error' in html:
#     print html
#     if 'Connection refused' in html:
#         # count timeouts per node; after 10 timeouts, remove the node from the list
#         if bdjd_dict.has_key(bdjd):
#             bdjd_dict[bdjd] += 1
#             print 'node %s: %s timeouts' % (bdjd,bdjd_dict[bdjd])
#             if int(bdjd_dict[bdjd]) >= 10:
#                 bdjd_list.remove(bdjd)
#                 print "node %s removed" % bdjd
#         else:
#             bdjd_dict[bdjd] = 1
#         continue
'''
If it's a crawler, a hang like this is usually caused by the network; you can add control at two layers:
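The answer doesn't spell out the two layers, but a plausible reading is (1) a hard timeout on every network call and (2) a bounded wait around each task. Layer 1 matters in the posted code: baidu_cont() sets CONNECTTIMEOUT and TIMEOUT but never pycurl.NOSIGNAL, and libcurl implements those timeouts with signals that are only delivered to the main thread, so inside worker threads they can quietly stop working. (Note also that the posted code calls time.sleep(1200) whenever the verify page appears, which by itself looks like a 20-minute freeze.) A minimal sketch of the request layer, assuming the same Python 2 + pycurl setup; the fetch name and the timeout values are illustrative:

import StringIO, pycurl

def fetch(url, proxy, headers):
    buf = StringIO.StringIO()
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.PROXY, proxy)
    c.setopt(pycurl.HTTPHEADER, headers)
    c.setopt(pycurl.CONNECTTIMEOUT, 15)  # fail fast on dead proxies
    c.setopt(pycurl.TIMEOUT, 60)         # hard cap on the whole transfer
    # Without NOSIGNAL, libcurl uses SIGALRM for timeouts; signals only
    # reach the main thread, so worker threads can hang past the limits.
    c.setopt(pycurl.NOSIGNAL, 1)
    c.setopt(pycurl.WRITEFUNCTION, buf.write)
    try:
        c.perform()
        return buf.getvalue()
    finally:
        c.close()

In the posted baidu_cont() the smallest change would be adding the single c.setopt(pycurl.NOSIGNAL, 1) line.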
Try lightweight coroutines instead. Use gevent: http://www.gevent.org/
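gevent only helps if the network I/O goes through Python sockets that monkey-patching can reach; the pycurl calls in the posted script run in C and would have to be swapped for urllib2 or requests. A minimal sketch under that assumption (URL pattern taken from the question, timeout values illustrative):

import gevent
from gevent import monkey
monkey.patch_all()  # must run before the network libraries are imported
import urllib2

def fetch(word):
    url = 'http://www.baidu.com/s?wd=%s' % word
    with gevent.Timeout(60, False):  # give up silently after 60 s instead of hanging
        return urllib2.urlopen(url).read()
    return None  # reached only when the timeout fired

words = [w.strip() for w in open('word')]
jobs = [gevent.spawn(fetch, w) for w in words]
gevent.joinall(jobs, timeout=600)  # overall cap on the whole batch
print '%d of %d fetched' % (sum(1 for j in jobs if j.value), len(jobs))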
Python 2.x's Queue has bugs.
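Whether or not Queue itself is at fault, the posted main loop calls f.pop(), a bare q_ans.get() with no timeout, so if one worker stalls the main thread blocks there forever and the process looks frozen. (The pop() path also never calls q_ans.task_done(), so the q_ans.join() in __del__ can never return.) A hedged sketch of a defensive drain loop; drain, taskleft and the 120 s give-up threshold are illustrative, not from the original:

from Queue import Empty

def drain(q_ans, taskleft, give_up_after=120):
    idle = 0
    while taskleft():
        try:
            word = q_ans.get(timeout=5)  # wake up regularly instead of blocking forever
            print 'done:', word
            idle = 0
        except Empty:
            idle += 5
            if idle >= give_up_after:  # nothing finished for 2 minutes: assume stuck
                print 'workers look stuck, bailing out'
                break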
Multithreading in Python... I can't even.
Switch to multiprocessing!
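A sketch of the multiprocessing route, replacing the pycurl fetch with urllib2 plus a timeout so one stuck request cannot freeze a worker (names and the 30 s timeout are illustrative):

from multiprocessing import Pool
import urllib2

def fetch(word):
    try:
        html = urllib2.urlopen('http://www.baidu.com/s?wd=%s' % word, timeout=30).read()
        return word, len(html)
    except Exception, e:
        return word, 'error: %s' % e

if __name__ == '__main__':
    words = [w.strip() for w in open('word')]
    pool = Pool(processes=10)
    for word, result in pool.imap_unordered(fetch, words):
        print word, result
    pool.close()
    pool.join()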
Just kill it. Multithreaded code doesn't usually hard-hang by itself; track down what specifically is causing it.
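Before killing it, it is worth seeing where the threads actually are. One way (Unix only) is a signal handler that dumps every thread's stack; paste something like this near the top of the script, then send the frozen process SIGQUIT (kill -QUIT <pid>, or Ctrl-\ in the terminal) and the traceback usually points straight at the blocking call:

import signal, sys, threading, traceback

def dump_stacks(signum, frame):
    names = dict((t.ident, t.name) for t in threading.enumerate())
    for ident, stack in sys._current_frames().items():
        print '--- thread %s (%s) ---' % (names.get(ident, '?'), ident)
        traceback.print_stack(stack)

signal.signal(signal.SIGQUIT, dump_stacks)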