采集文中的图片,采集文中图片,Python语言: 采集
文章由Byrx.net分享于2019-03-23 07:03:10
采集文中的图片,采集文中图片,Python语言: 采集
# Python: collect (download) the images referenced in an article.
"""Download every remote image referenced by ``content.html``.

The script reads ``content.html`` from the directory that contains this file,
fetches each absolute ``src=`` URL on its own thread into ``download/``, and
writes ``result.html`` in which the URLs of the images that were fetched
successfully are replaced by links to the local copies.
"""
import hashlib
import os
import re
import sys
import threading
import time
import urllib.request

_HERE = os.path.dirname(__file__)
DOWNLOAD_BASEDIR = os.path.join(_HERE, 'download')
DOWNLOAD_BASEURL = './download/'

# Matches src=URL, src='URL' and src="URL" for absolute http(s) URLs; the
# captured URL ends at the first space or quote character.
pt = re.compile(r"""src=['"]?(https?://.*?)[ '"]""")


def md5sum(s):
    """Return the hexadecimal MD5 digest of *s*.

    *s* may be ``bytes`` or text; text is encoded as UTF-8 first because
    hashlib only accepts bytes. The digest is used to build file names, not
    for anything security-sensitive.
    """
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    return hashlib.md5(s).hexdigest()


class Download(threading.Thread):
    """Thread that saves the resource at one URL into ``DOWNLOAD_BASEDIR``.

    After a successful run ``filename`` is the path of the saved file and
    ``local_url`` is the relative URL to reference it by. When the download
    fails both stay ``None`` and ``error`` holds the exception that was raised.

    Args:
        url: Absolute URL to fetch.
        timeout: Socket timeout in seconds passed to ``urlopen``.
    """

    def __init__(self, url, timeout=30):
        threading.Thread.__init__(self)
        self.url = url
        self.timeout = timeout
        self.filename = None
        self.local_url = None
        self.error = None

    @staticmethod
    def _extension_for(content_type):
        """Map a Content-Type header value to a file extension (no dot)."""
        # Drop parameters such as "; charset=utf-8" and keep the subtype.
        subtype = content_type.split(';', 1)[0].partition('/')[2]
        # The header is server-controlled: strip anything that is not safe in
        # a file name so it cannot inject path separators.
        subtype = re.sub(r'[^0-9a-z+.-]+', '', subtype.strip().lower())
        # 'html' is mapped to 'jpg' as in the original script.
        if subtype in ('', 'jpeg', 'html'):
            return 'jpg'
        return subtype

    def run(self):
        """Fetch ``self.url`` and store it under an MD5-derived file name."""
        try:
            with urllib.request.urlopen(self.url, timeout=self.timeout) as response:
                content_type = response.headers.get('content-type', 'image/jpeg')
                payload = response.read()
            basename = "%s.%s" % (md5sum(self.url),
                                  self._extension_for(content_type))
            filename = os.path.join(DOWNLOAD_BASEDIR, basename)
            with open(filename, 'wb') as out:
                out.write(payload)
        except Exception as exc:  # thread boundary: record, let the caller report
            self.error = exc
            return
        self.filename = filename
        # Set last: a non-None local_url is the caller's success signal.
        self.local_url = DOWNLOAD_BASEURL + basename


def main():
    """Download the images of ``content.html`` and write ``result.html``."""
    # The directory is usually left over from a previous run.
    os.makedirs(DOWNLOAD_BASEDIR, exist_ok=True)

    # latin-1 maps every byte to exactly one code point, so the page
    # round-trips byte-for-byte whatever its real encoding is.
    with open(os.path.join(_HERE, 'content.html'), 'rb') as src:
        content = src.read().decode('latin-1')

    # De-duplicate (keeping order) so two threads never write the same file.
    urls = list(dict.fromkeys(pt.findall(content)))

    print(time.ctime())
    thread_pools = []
    for url in urls:
        current = Download(url)
        thread_pools.append(current)
        current.start()

    result_text = content
    for result in thread_pools:
        print("%s threads running" % threading.active_count())
        result.join(5)
        if result.is_alive():
            # Still downloading after the grace period: keep the remote URL.
            continue
        if result.local_url is None:
            sys.stderr.write("failed to download %s: %s\n"
                             % (result.url, result.error))
            continue
        result_text = result_text.replace(result.url, result.local_url)

    with open(os.path.join(_HERE, 'result.html'), 'wb') as out:
        out.write(result_text.encode('latin-1'))

    print("%s threads running" % threading.active_count())
    # The main thread itself is always counted, hence "> 1".
    if threading.active_count() > 1:
        print("Can not stop")
    print(time.ctime())


if __name__ == '__main__':
    main()

# Snippet from http://byrx.net
评论关闭