多线程采集图片,多线程采集,do.py
多线程采集图片,多线程采集,do.py
do.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Multi-threaded image scraper.

A single producer thread (``spiderList``) queues one task per listing page;
``THREAD_COUNT`` worker threads (``spiderDetail``) fetch each listing page,
extract the ``_src="..."`` thumbnail URLs, derive the full-size image URL and
save the image under ``DOWNLOAD_BASEDIR/<YYYY-MM-DD>/page_<n>/``.

The module imports cleanly on both Python 2.6+ and Python 3.
"""
import datetime
import os
import re
import socket
import sys
import time
from threading import Thread

try:
    import urllib2
except ImportError:  # Python 3: urlopen/Request live in urllib.request
    import urllib.request as urllib2

try:
    from Queue import Queue
except ImportError:  # Python 3
    from queue import Queue

DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')  # save location
socket.setdefaulttimeout(30)
THREAD_COUNT = 5  # number of download worker threads


def md5sum(s):
    """Return the hexadecimal MD5 digest of *s*.

    Text input is encoded as UTF-8 first, because the hash objects only
    accept bytes on Python 3.
    """
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    try:
        import hashlib
    except ImportError:
        # Very old interpreters without hashlib ship the md5 module instead.
        import md5
        m = md5.new()
    else:
        m = hashlib.md5()
    m.update(s)
    return m.hexdigest()


class spiderList(Thread):
    """Producer thread: queue one ``(page_number, url)`` task per listing page."""

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        # The site's listing runs from page 1 to page 117 inclusive.
        # No task_done() here: only the consumer that get()s a task may
        # mark it done, otherwise Queue.join() bookkeeping breaks.
        for page in range(1, 118):
            self.queue.put((page, 'http://xxx.com/?page=%s' % page))


class spiderDetail(Thread):
    """Worker thread: download every image referenced by queued listing pages."""

    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue
        self.page = 1  # listing page currently being processed (names the save dir)
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2'
        }

    def run(self):
        """Process ``(page_number, url)`` tasks until the process exits."""
        while True:
            page, url = self.queue.get()
            self.page = page
            try:
                self.__crawlPage(url)
            except Exception as exc:
                # Thread boundary: report the failure and keep the worker
                # alive so one bad page does not stall the whole queue.
                sys.stderr.write('failed to process %s: %s\n' % (url, exc))
            finally:
                self.queue.task_done()

    def __crawlPage(self, url):
        """Fetch one listing page and save every image it references."""
        response = urllib2.urlopen(urllib2.Request(url=url, headers=self.header))
        try:
            # NOTE(review): the page encoding is not visible here; UTF-8 is
            # assumed and undecodable bytes are dropped.
            html = response.read().decode('utf-8', 'ignore')
        finally:
            response.close()

        for src in re.findall(r'_src="([\w\W]+?)"', html):
            bigImage = self.__getBigImage(src)
            if not bigImage:
                continue
            try:
                self.__saveImage(bigImage)
            except Exception as exc:
                # A single broken image must not abort the rest of the page.
                sys.stderr.write('failed to download %s: %s\n' % (bigImage, exc))

    def __saveImage(self, bigImage):
        """Download *bigImage* and write it to its destination file."""
        response = urllib2.urlopen(bigImage)
        try:
            img = response.read()
        finally:
            response.close()
        with open(self.__getFileName(bigImage), 'wb') as fh:
            fh.write(img)

    def __getDir(self):
        """Return (creating it if needed) the save directory for ``self.page``."""
        dateDir = datetime.datetime.now().strftime('%Y-%m-%d')
        saveDir = os.path.join(DOWNLOAD_BASEDIR, dateDir, 'page_%d' % self.page)
        try:
            os.makedirs(saveDir)
        except OSError:
            # Either it already existed or another worker created it first;
            # anything else (permissions, bad path) is a real error.
            if not os.path.isdir(saveDir):
                raise
        return saveDir

    def __getBigImage(self, url):
        """Derive the full-size image URL from a thumbnail URL.

        Strips the ``-<suffix>`` that precedes the extension, e.g.
        ``photo-small.jpg`` -> ``photo.jpg``. Returns ``''`` when *url* is
        empty or carries no such suffix, so callers can skip it.
        """
        if not url:
            return ''
        # NOTE(review): ``A-z`` also matches ``[ \ ] ^ _ ```; it looks like a
        # typo for ``A-Z`` but is kept as-is pending a check of the site's URLs.
        args = re.split(r"\-([0-9a-zA-z]+)\.", url)
        if len(args) < 3:
            return ''
        return args[0] + '.' + args[2]

    def __getFileName(self, url):
        """Return the local path for *url*: ``<save dir>/<md5 of stem><ext>``."""
        stem, ext = os.path.splitext(os.path.basename(url))
        return os.path.join(self.__getDir(), md5sum(stem) + ext)


def main():
    """Start the workers and the producer, then wait for all tasks to finish."""
    queue = Queue()

    for _ in range(THREAD_COUNT):
        worker = spiderDetail(queue)
        worker.daemon = True  # workers loop forever; let the process exit without them
        worker.start()

    producer = spiderList(queue)
    producer.daemon = True
    producer.start()

    # Wait until every page is queued, then until every task is marked done.
    producer.join()
    queue.join()


if __name__ == '__main__':
    main()
相关内容
- 很好玩的一个面试题,很好玩一个面试题,[Python]代码
- 基于Tornado And MySQL的RSS移动服务端.zip,tornadorss,[Python
- 初识聚类算法: DBSACN,初识聚类算法dbsacn,[Python]代码#
- 初识聚类算法: 凝聚层次聚类,初识聚类,[Python]代码#
- 初识聚类算法: 基本K均值,初识聚类k均值,[Python]代码
- Rock-paper-scissors-lizard-Spock Game,,[Python]代码#
- 元芳,你怎么看,元芳,你,O(∩_∩)OSTR =
- wxPython实现sqlite3数据库的gui界面,wxpythonsqlite3,sql.py#!
- 简单的生成html,简单生成html,[Python]代码cl
- Levenshtein字符串相似度,Levenshtein字符串,Levenshtein距
评论关闭