多线程采集图片(多线程采集示例)
文章由 Byrx.net 分享于 2019-03-23 09:03:05
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Multi-threaded image scraper (ported from Python 2 to Python 3).

A producer thread (spiderList) enqueues the listing-page URLs of the
target site; THREAD_COUNT worker threads (spiderDetail) pull page URLs
off the shared queue, extract thumbnail sources from each page's HTML,
derive the full-size image URL and save every image under
download/<YYYY-MM-DD>/page_<n>/<md5-of-name>.<ext>.

Fixes applied to the original snippet:
  * ported urllib2 / Queue / file() to urllib.request / queue / open()
  * __getFileName referenced the undefined name ``bigImage`` (NameError)
  * ``result != ''`` compared re.findall's list to a string (always True)
  * __getBigImage returned False on empty input but callers tested ''
  * regex class ``[0-9a-zA-z]`` (A-z) accidentally matched punctuation
  * range(1, 117) silently dropped page 117
  * each of the 5 list threads enqueued the SAME full page list, so every
    page was downloaded five times; pages are now enqueued one at a time
    and shared between the workers (and the producer no longer calls
    task_done() without a matching get())
  * ``while 1: pass`` busy-wait replaced by Queue.join()
  * image files are closed deterministically via ``with``
"""
import datetime
import hashlib
import os
import re
import socket
import urllib.request
from queue import Queue
from threading import Thread

# Where downloaded images are stored, relative to this script.
DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')
socket.setdefaulttimeout(30)  # give up on stalled HTTP connections
THREAD_COUNT = 5              # number of concurrent download workers


def md5sum(s):
    """Return the hex MD5 digest of *s* (str is encoded as UTF-8 first)."""
    if isinstance(s, str):
        s = s.encode('utf-8')
    return hashlib.md5(s).hexdigest()


class spiderList(Thread):
    """Producer: enqueue one (page_number, listing_url) pair per page."""

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        # The site's listing runs from page 1 to page 117 inclusive
        # (the original range(1, 117) dropped the last page).
        for i in range(1, 118):
            self.queue.put((i, 'http://xxx.com/?page=%s' % i))


class spiderDetail(Thread):
    """Consumer: download every big image referenced by a listing page."""

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'
        }

    def run(self):
        while True:
            # self.page is consumed by __getDir() for the page_<n> folder.
            self.page, url = self.queue.get()
            try:
                request = urllib.request.Request(url=url, headers=self.header)
                html = urllib.request.urlopen(request).read().decode('utf-8', 'ignore')
                # findall returns a (possibly empty) list of thumbnail URLs.
                for src in re.findall(r'_src="([\w\W]+?)"', html):
                    big_image = self.__getBigImage(src)
                    if big_image:
                        data = urllib.request.urlopen(big_image).read()
                        with open(self.__getFileName(big_image), 'wb') as fh:
                            fh.write(data)
            except Exception:
                # Best effort: one broken page or image must not kill the
                # worker thread; move on to the next queued page.
                pass
            finally:
                self.queue.task_done()

    def __getDir(self):
        """Return (creating it if needed) download/<date>/page_<n>."""
        date_dir = datetime.datetime.now().strftime('%Y-%m-%d')
        save_dir = os.path.join(DOWNLOAD_BASEDIR, date_dir, 'page_%d' % self.page)
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        return save_dir

    def __getBigImage(self, url):
        """Map a thumbnail URL like .../img-thumb.jpg to .../img.jpg.

        Returns '' for an empty or non-matching *url* (the original
        returned False on empty input even though the caller tested '').
        """
        if not url:
            return ''
        parts = re.split(r'-([0-9a-zA-Z]+)\.', url)
        if len(parts) < 3:  # no "-suffix.ext" pattern present
            return ''
        return parts[0] + '.' + parts[2]

    def __getFileName(self, url):
        """Build the target path: <dir>/<md5 of basename>.<original ext>."""
        # The original referenced the undefined global ``bigImage`` here.
        base_name = os.path.basename(url)
        root, ext = os.path.splitext(base_name)
        return os.path.join(self.__getDir(), md5sum(root) + ext)


if __name__ == '__main__':
    queue = Queue()
    producer = spiderList(queue)
    producer.daemon = True
    producer.start()
    for _ in range(THREAD_COUNT):
        worker = spiderDetail(queue)
        worker.daemon = True
        worker.start()
    producer.join()
    queue.join()  # block until every enqueued page has been processed
评论关闭