Multi-threaded image scraper (multi-threaded collection)


#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Multi-threaded image scraper (Python 2).

One producer thread (spiderList) enqueues every listing-page URL of the
target site; a pool of consumer threads (spiderDetail) fetches each page,
extracts the thumbnail sources, derives the full-size image URL and saves
the image under DOWNLOAD_BASEDIR/<YYYY-MM-DD>/page_<n>/.
"""
import os
import re
import socket
import urllib2
from threading import Thread
from Queue import Queue

# Directory the downloaded images are saved under.
DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')
socket.setdefaulttimeout(30)
THREAD_COUNT = 5   # number of downloader (consumer) threads
PAGE_COUNT = 116   # the site's listing runs from page 1 to page 116


def md5sum(s):
    """Return the hex MD5 digest of byte string *s*.

    Falls back to the long-deprecated ``md5`` module on ancient
    interpreters that lack ``hashlib``.
    """
    try:
        import hashlib
        return hashlib.md5(s).hexdigest()
    except ImportError:
        import md5
        return md5.new(s).hexdigest()


class spiderList(Thread):
    """Producer: enqueue one ``(page_number, url)`` pair per listing page."""

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        # Pages 1..PAGE_COUNT inclusive (original code used range(1, 117)).
        for page in range(1, PAGE_COUNT + 1):
            self.queue.put((page, 'http://xxx.com/?page=%s' % page))


class spiderDetail(Thread):
    """Consumer: fetch a listing page and download every image it links."""

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'
        }

    def run(self):
        while True:
            # Each queue item is one page; task_done() is called by the
            # consumer so that queue.join() in main reflects real progress.
            self.page, url = self.queue.get()
            try:
                self.__processPage(url)
            except Exception:
                # Best effort: one broken page or image must not kill the
                # worker thread; remaining pages still get processed.
                pass
            finally:
                self.queue.task_done()

    def __processPage(self, url):
        """Fetch one listing page and save every full-size image on it."""
        rq = urllib2.urlopen(urllib2.Request(url=url, headers=self.header))
        try:
            html = rq.read()
        finally:
            rq.close()
        # re.findall returns a (possibly empty) list; iterating it directly
        # replaces the original dead "result != ''" check.
        for src in re.findall(r'_src="([\w\W]+?)"', html):
            bigImage = self.__getBigImage(src)
            if bigImage:
                self.__saveImage(bigImage)

    def __saveImage(self, imageUrl):
        """Download *imageUrl* and write it to its md5-named target file."""
        rq = urllib2.urlopen(imageUrl)
        try:
            img = rq.read()
        finally:
            rq.close()
        out = open(self.__getFileName(imageUrl), 'wb')  # was file(...), never closed
        try:
            out.write(img)
        finally:
            out.close()

    def __getDir(self):
        """Return (creating if needed) the save directory for the current page."""
        import datetime
        dateDir = datetime.datetime.now().strftime('%Y-%m-%d')
        saveDir = os.path.join(DOWNLOAD_BASEDIR, dateDir, 'page_%d' % self.page)
        if not os.path.isdir(saveDir):
            os.makedirs(saveDir)
        return saveDir

    def __getBigImage(self, url):
        """Map a thumbnail URL like ``foo-abc12.jpg`` to the original ``foo.jpg``.

        Returns '' (falsy, consistent with the caller's truth test) when the
        URL is empty or does not match the expected thumbnail pattern.
        """
        if not url:
            return ''
        # Fixed character class: the original [0-9a-zA-z] accidentally
        # included the punctuation between 'Z' and 'a' in ASCII.
        args = re.split(r'\-([0-9a-zA-Z]+)\.', url)
        if len(args) < 3:
            return ''
        return args[0] + '.' + args[2]

    def __getFileName(self, url):
        """Build the target path: <dir>/<md5 of basename-sans-ext><ext>.

        Bug fix: the original referenced the undefined name ``bigImage``
        here, raising NameError on first use; it now uses the *url* param.
        """
        baseName = os.path.basename(url)
        root, ext = os.path.splitext(baseName)
        return os.path.join(self.__getDir(), md5sum(root) + ext)


if __name__ == '__main__':
    queue = Queue()
    # One producer is enough: the original started THREAD_COUNT producers,
    # each enqueuing the full page list, so every page was scraped 5 times.
    lt = spiderList(queue)
    lt.setDaemon(True)
    lt.start()
    for _ in range(THREAD_COUNT):
        dt = spiderDetail(queue)
        dt.setDaemon(True)
        dt.start()
    lt.join()      # wait until every page URL has been enqueued...
    queue.join()   # ...then until every page has been processed (no busy-wait)
# This snippet originates from http://byrx.net

Comments closed