采集文中的图片,采集文中图片,Python语言: 采集


"""Collect the images referenced by an HTML document.

Reads ``content.html`` from the directory containing this script, downloads
every ``src=http://...`` URL found in it (one thread per distinct URL) into
the ``download`` sub-directory, and writes ``result.html`` in which each
successfully downloaded URL is replaced by its ``./download/`` path.
"""
import os
import re
import sys
import threading
import time
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

DOWNLOAD_BASEDIR = os.path.join(os.path.dirname(__file__), 'download')
DOWNLOAD_BASEURL = './download/'

# Matches src=URL with an optional opening quote; the URL ends at the first
# space or quote. Only plain http:// URLs are collected.
_IMAGE_SRC_RE = re.compile(r"""src=['"]?(http://.*?)[ '"]""")

# A MIME subtype is used as a file extension only if it consists of these
# characters, so a server cannot smuggle path separators into the filename.
_SAFE_EXTENSION_RE = re.compile(r'^[A-Za-z0-9.+-]+\Z')


def md5sum(s):
    """Return the hexadecimal MD5 digest of *s*.

    Text is encoded as UTF-8 before hashing; byte strings are hashed as-is.
    """
    try:
        import hashlib
        digest = hashlib.md5()
    except ImportError:
        # Legacy fallback kept from the original script for interpreters
        # that do not provide hashlib.
        import md5
        digest = md5.new()
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    digest.update(s)
    return digest.hexdigest()


def extension_for(content_type):
    """Return the file extension to use for a Content-Type header value.

    Parameters after ``;`` are ignored. ``jpeg`` and ``html`` subtypes map to
    ``jpg`` (as in the original script); a missing or unsafe subtype also
    falls back to ``jpg``.
    """
    mime = content_type.split(';', 1)[0].strip()
    subtype = mime.partition('/')[2].strip().lower()
    if subtype in ('jpeg', 'html') or not _SAFE_EXTENSION_RE.match(subtype):
        return 'jpg'
    return subtype


def find_image_urls(html):
    """Return the distinct image URLs in *html*, in order of first appearance."""
    seen = set()
    urls = []
    for url in _IMAGE_SRC_RE.findall(html):
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


class Download(threading.Thread):
    """Thread that saves the resource at ``url`` into DOWNLOAD_BASEDIR.

    Once the thread has finished, ``filename`` and ``local_url`` describe the
    saved file. If the download failed they stay ``None`` and ``error`` holds
    the exception that was raised.
    """

    def __init__(self, url):
        threading.Thread.__init__(self)
        self.url = url
        self.filename = None
        self.local_url = None
        self.error = None

    def run(self):
        # Thread boundary: record and report the failure instead of leaving
        # the object half-initialised for the code that joins this thread.
        try:
            self._fetch()
        except Exception as exc:
            self.error = exc
            sys.stderr.write("download of %s failed: %s\n" % (self.url, exc))

    def _fetch(self):
        """Download ``self.url`` and store it under an MD5-derived name."""
        response = urlopen(self.url)
        try:
            content_type = response.headers.get('content-type', 'image/jpeg')
            data = response.read()
        finally:
            response.close()
        basename = "%s.%s" % (md5sum(self.url), extension_for(content_type))
        filename = os.path.join(DOWNLOAD_BASEDIR, basename)
        with open(filename, 'wb') as out:
            out.write(data)
        # Publish the result only after the file is completely written.
        self.filename = filename
        self.local_url = DOWNLOAD_BASEURL + basename


def _ensure_download_dir():
    """Create DOWNLOAD_BASEDIR unless it already exists."""
    try:
        os.mkdir(DOWNLOAD_BASEDIR)
    except OSError:
        if not os.path.isdir(DOWNLOAD_BASEDIR):
            raise


def main():
    """Download the images of content.html and write result.html."""
    base_dir = os.path.dirname(__file__)
    _ensure_download_dir()

    with open(os.path.join(base_dir, 'content.html'), 'rb') as src:
        # latin-1 maps every byte to one code point and back, so the page is
        # written out byte-for-byte unchanged apart from the replaced URLs,
        # whatever its real encoding is.
        content = src.read().decode('latin-1')

    print(time.ctime())
    workers = [Download(url) for url in find_image_urls(content)]
    for worker in workers:
        worker.start()

    result_text = content
    for worker in workers:
        print("%s threads running" % threading.active_count())
        worker.join(5)  # wait at most 5 seconds for each download
        if not worker.is_alive() and worker.local_url is not None:
            result_text = result_text.replace(worker.url, worker.local_url)

    with open(os.path.join(base_dir, 'result.html'), 'wb') as dst:
        dst.write(result_text.encode('latin-1'))

    running = threading.active_count()
    print("%s threads running" % running)
    # The main thread is always counted, so only a count above one means a
    # download thread is still running.
    if running > 1:
        print("Can not stop")
    print(time.ctime())


if __name__ == '__main__':
    main()

# This snippet comes from http://byrx.net

评论关闭