分享用多线程下载百度大图图片源码,多线程大图,分享用多线程下载百度大图
分享用多线程下载百度大图图片源码,多线程大图,分享用多线程下载百度大图
分享一份用多线程下载百度大图的源码。这是在网上看到的,代码很多,慢慢看吧。
添加两个异常处理,使得程序在url读取时不因为异常而退出;
修改一处条件语句错误;
修改Baidu模块nextPage函数,修改App模块nextPage函数参数;
修改几处错误,改为多线程下载;
添加代理功能;
使用Queue代替threadDown子类;
独立Config配置;
添加煎蛋模块,未实现功能;
添加代理密码验证,未测试;
App.py-----------------------------------------
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
"""Entry point: search the configured site and download the result images."""
import os
import sys
from Queue import Queue
from thread import start_new_thread

from Baidu import getImageUrlList, search, nextPage, searchResult
from Config import Config
from Downloader import downloadFromQueue
from FileHelper import getFilenameFromURL, addExtension, makedir
from NetworkPrepare import prepare


def baseURL():
    """Return the first result-page URL for the site selected in Config.

    Raises:
        ValueError: if Config.site is not one of the sites handled here.
    """
    if Config.site == 'baidu':
        return search(Config.keyword, Config.addtional)
    if Config.site == 'jandan':
        return 'http://jandan.net/ooxx'
    raise ValueError('Unsupported site: %r' % (Config.site,))


def main():
    """Collect image URLs page by page, then download them with worker threads."""
    # Install the urllib2 opener before any request is made.
    prepare()
    makedir(Config.directory)
    print('Generate search url')
    URL = baseURL()

    # Never try to fetch more than the search reports as available.
    count = min(searchResult(URL), Config.count)
    if not count:
        print("No search result at current condition.")
        sys.exit(1)

    # Walk result pages until enough URLs have been collected.
    imglist = []
    page = 0
    sys.stdout.write('Fetching page')
    while len(imglist) < count:
        sys.stdout.write(' %d' % page)
        page += 1
        tmplist = getImageUrlList(URL)
        if not tmplist:
            # An empty page would never advance the offset; stop instead of
            # requesting the same URL forever.
            break
        imglist.extend(tmplist)
        URL = nextPage(URL, len(tmplist))
    print('')  # end the progress line

    count = len(imglist)
    print("There're %d files to download" % count)

    # Drop URLs whose target file name already exists. The directory is
    # listed once instead of once per URL.
    existing = set(os.listdir(Config.directory))
    imglist = [url for url in imglist
               if getFilenameFromURL(url) not in existing]
    print("There's %d files already downloaded." % (count - len(imglist)))

    print('Fetching list of %d files' % len(imglist))
    queue = Queue()
    for url in imglist:
        queue.put(url)
    failure = []  # shared list the workers append failed URLs to
    for _ in range(Config.thread_count):
        start_new_thread(downloadFromQueue,
                         (queue, failure, Config.directory, Config.timeout))
    # Blocks until every queued URL has been marked done by a worker.
    queue.join()
    print("%d failed to fetch." % len(failure))


def clean():
    """Post-process the download directory and persist cookies."""
    # 1. Give every file in the directory a '.jpg' extension if it needs one
    #    (the decision is made by addExtension).
    print('Adding extension ...')
    for fname in os.listdir(Config.directory):
        addExtension(os.path.join(Config.directory, fname), '.jpg')
    print('done.')
    # 2. Save the cookie jar.
    Config.cj.save()


if __name__ == "__main__":
    main()
    clean()
Baidu.py---------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
"""Helpers for building Baidu image-search URLs and scraping result pages."""
import json
import re
from urllib import urlencode

from Downloader import getStream
from MyParser import MyParser
from String import longestString, cutTo, cutBegin, getCodingContent

# Patterns are compiled once at import time instead of on every call.
_OBJ_URL_RE = re.compile(r'(?<="objURL":").*?(?=")')
_IMGDATA_RE = re.compile(r'(?<=var imgdata =).*?(?=;v)')
# \d+ (not \d*) so an empty match can never reach int().
_LIST_NUM_RE = re.compile(r'(?<="listNum":)\d+(?=,)')
_PAGE_OFFSET_RE = re.compile(r'&pn=(\d*)')


def getImageUrlFromScript(script):
    """Return every "objURL" value found in a script text, stripped."""
    return [found.strip() for found in _OBJ_URL_RE.findall(script)]


def getImageUrlList(url):
    """Return the stripped 'objURL' of every entry on the result page `url`."""
    return [entry['objURL'].strip() for entry in _getJsonList(url)]


def _getJsonList(url):
    """Return the 'data' entries of the page's `var imgdata` JSON block.

    Raises:
        IOError: if the page cannot be opened.
        ValueError: if the page contains no imgdata block.
    """
    stream = getStream(url)
    if not stream:
        raise IOError('Cannot open result page: %s' % url)
    data = getCodingContent(stream)
    match = _IMGDATA_RE.search(data)
    if match is None:
        raise ValueError('No "var imgdata" block found in: %s' % url)
    # The last element is dropped, as in the original code -- presumably a
    # trailing placeholder entry; verify against a live page.
    return json.loads(match.group(0))['data'][:-1]


def nextPage(url, pn):
    """Return `url` with its `pn` offset advanced by `pn` results.

    A missing or empty offset counts as 0. Parameters that follow `&pn=`
    are preserved rather than being fed to int().
    """
    match = _PAGE_OFFSET_RE.search(url)
    if match is None:
        return url + '&pn=' + str(pn)
    offset = int(match.group(1) or 0) + pn
    return url[:match.start()] + '&pn=' + str(offset) + url[match.end():]


def search(keyword, addtionParams=None):
    """Generate a search url by the given keyword.

    params
        keyword: utf8 string
        addtionParams: optional dict of extra query parameters
    """
    url = 'http://image.baidu.com/i?'
    params = _getParams('http://image.baidu.com', MyParser())
    params.update(addtionParams or {})
    # The keyword is re-encoded from utf8 to gbk before being url-encoded.
    params['word'] = keyword.decode('utf8').encode('gbk')
    return url + urlencode(params)


def searchResult(url):
    """Return the result count ("listNum") reported by the page, or 0.

    0 is returned when the page cannot be opened, contains no script text,
    or the longest script has no listNum value.
    """
    stream = getStream(url)
    if not stream:
        return 0
    parser = MyParser()
    parser.feed(getCodingContent(stream))
    scripts = list(parser.scriptList)
    parser.close()
    if not scripts:
        return 0
    # The count is looked up in the longest script on the page.
    match = _LIST_NUM_RE.search(longestString(scripts))
    if match is None:
        return 0
    return int(match.group(0))


def _getParams(url, parser):
    """Get a dict contained the url params (the parser's formParams).

    Raises:
        IOError: if the page cannot be opened.
    """
    stream = getStream(url)
    if not stream:
        raise IOError('Cannot open page: %s' % url)
    parser.feed(getCodingContent(stream))
    return parser.formParams


def _appendParams(adict):
    """Generate a url with params in adict ("k=v" pairs joined by '&')."""
    return '&'.join(key + '=' + adict[key] for key in adict)
Config.py---------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
"""Central settings shared by the downloader modules."""
from cookielib import LWPCookieJar


class Config:
    # ---- what to fetch ----
    # Site to use; the original author also noted 'jandan' as an alternative.
    site = 'baidu'
    # Search keyword. Do not change this file's encoding.
    keyword = '美女'
    # Extra search parameters (width and height); may be an empty dict.
    addtional = {'width': '1920', 'height': '1200'}
    # Number of images to download; per the author's note it is effectively
    # rounded up to a multiple of 20.
    count = 30

    # ---- how to fetch ----
    # Directory the images are stored in.
    directory = r'image'
    # Number of download threads.
    thread_count = 15
    # Download timeout; the author found 10 a little too small.
    timeout = 20

    # ---- proxy ----
    use_proxy = False
    proxy = 'http://localhost:7001'
    proxy_auth = False
    proxy_user = 'user_name'
    proxy_pass = 'password'

    # ---- cookies ----
    use_cookies = True
    cookies = 'cookies.txt'
    # Cookie jar bound to the file named above.
    cj = LWPCookieJar(cookies)
Downloader.py------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
"""HTTP helpers: open a URL and drain a queue of URLs to disk."""
import urllib2
from Queue import Empty

from FileHelper import getFilenameFromURL, writeBinFile


class UserAgent:
    """User-Agent strings sent with every request."""
    Mozilla = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14'


def getStream(url, timeout=10):
    """Open `url` and return the response stream, or False on any failure."""
    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', UserAgent.Mozilla)
        return urllib2.urlopen(request, timeout=timeout)
    except (Exception, SystemExit):
        # SystemExit is caught as well so the program keeps running.
        print("URL open error. Probably timed out.")
        return False


def downloadFromQueue(queue, failure, directory='.', timeout=10):
    """Get files from a list of urls.

    Takes URLs from `queue` until it is empty, saves each one into
    `directory`, and appends every URL that could not be saved to `failure`.

    return : list, contained the failure fetch
    """
    while True:
        try:
            # Non-blocking get: with several workers, an empty()-then-get()
            # pair can block forever when another worker takes the last item.
            url = queue.get_nowait()
        except Empty:
            break
        stream = False
        try:
            stream = getStream(url, timeout=timeout)
            file_name = getFilenameFromURL(url)
            if stream and writeBinFile(stream, file_name, directory):
                print('Fetching %s done.' % url)
            else:
                failure.append(url)
        except Exception:
            # Worker boundary: record the URL as failed rather than letting
            # the thread die with the item still unaccounted for.
            failure.append(url)
        finally:
            if stream:
                stream.close()
            # Always mark the item done, otherwise queue.join() never returns.
            queue.task_done()
    return failure
FileHelper.py------------------------------------------------------------------
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
"""File-system helpers used by the downloader."""
import os
import re

# Leading run of word characters and dots; anything after it (for example
# a '?query' part) is cut off.
_NAME_RE = re.compile(r'\w*[\.\w]*')


def getFilenameFromURL(url):
    """Return the file-name part of `url` (used by Downloader).

    Takes the text after the last '/' and keeps its leading run of word
    characters and dots. The result may be an empty string.
    """
    tail = url[url.rfind('/') + 1:]
    return _NAME_RE.match(tail).group(0)


def addExtension(fname, ext):
    """Append `ext` to the file `fname` if its name has no extension (used by App).

    Only the final path component is inspected, so a dot in a directory
    name does not suppress the rename.
    """
    if '.' not in os.path.basename(fname):
        rename(fname, ext)


def rename(old, ext):
    """Rename `old` to `old + ext` without overwriting an existing file.

    While the target exists, '2' is prefixed to the extension
    ('.jpg' -> '2.jpg' -> '22.jpg' ...).
    """
    while os.path.isfile(old + ext):
        ext = '2' + ext
    print('rename %s %s' % (old, old + ext))
    os.rename(old, old + ext)


def makedir(directory):
    """Create `directory` (including missing parents) if it does not exist.

    If `directory` exists as a file the error is deliberately not caught,
    so the program exits.
    """
    if not os.path.isdir(directory):
        os.makedirs(directory)


def writeBinFile(stream, file_name, directory='.', mode='wb'):
    """Read everything from `stream` and write it to directory/file_name.

    Returns True on success. Returns False, leaving no partial file behind,
    when the name is empty, the file already exists, or reading/writing fails.
    """
    if not file_name:
        # Nothing usable to name the file with; opening the bare directory
        # path would raise.
        print('Fetching error. Empty file name.')
        return False
    file_name = os.path.join(directory, file_name)
    if os.path.isfile(file_name):
        print('File %s exist.' % file_name)
        return False
    CHUNK_SIZE = 1024
    try:
        with open(file_name, mode) as fp:
            while True:
                chunk = stream.read(CHUNK_SIZE)
                if not chunk:
                    break
                fp.write(chunk)
    except (Exception, SystemExit):
        print('Fetching error. Probably timed out.')
        # The file is closed once the with-block is left, so the partial
        # download can now be removed safely.
        if os.path.isfile(file_name):
            os.remove(file_name)
        return False
    return True
MyParser.py---------------------------------------------------
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
"""HTML parser collecting script bodies and the search form's hidden fields."""
try:
    import HTMLParser
except ImportError:
    # Python 3 ships the same class in html.parser.
    import html.parser as HTMLParser


class MyParser(HTMLParser.HTMLParser):
    """Parser exposing two results after feed():

    scriptList -- list with the text of every non-empty <script> element
    formParams -- dict name -> value of the hidden <input> fields found
                  inside the form whose name attribute is 'f1'
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        self._init_state()

    def _init_state(self):
        """Set every collected result and parsing flag to its initial value."""
        self.toggle_script_parse = False   # currently inside <script>
        self.toggle_form_parse = False     # currently inside <form name="f1">
        self._script_started = False       # current script already has an entry
        self.scriptList = []
        self.formParams = {}
        self.result = 0

    def handle_starttag(self, tag, attrs):
        HTMLParser.HTMLParser.handle_starttag(self, tag, attrs)
        attrs = dict(attrs)
        if tag == 'script':
            self.toggle_script_parse = True
            self._script_started = False
        # Only the form named 'f1' is scanned for hidden inputs.
        if tag == 'form' and attrs.get('name') == 'f1':
            self.toggle_form_parse = True
        if tag == 'input' and self.toggle_form_parse:
            name = attrs.get('name')
            # Inputs without a name are skipped; a missing value is stored
            # as an empty string instead of raising KeyError.
            if attrs.get('type') == 'hidden' and name is not None:
                self.formParams[name] = attrs.get('value') or ''

    def handle_endtag(self, tag):
        HTMLParser.HTMLParser.handle_endtag(self, tag)
        if tag == 'script':
            # Closing the element ends capture even when the script had no
            # body (e.g. <script src=...></script>); otherwise the next text
            # on the page would be recorded as a script.
            self.toggle_script_parse = False
            self._script_started = False
        if tag == 'form' and self.toggle_form_parse:
            self.toggle_form_parse = False

    def handle_data(self, data):
        HTMLParser.HTMLParser.handle_data(self, data)
        if not self.toggle_script_parse:
            return
        if self._script_started:
            # The parser may deliver one script body in several pieces;
            # keep them together in a single entry.
            self.scriptList[-1] += data
        else:
            self.scriptList.append(data)
            self._script_started = True

    def reset(self):
        HTMLParser.HTMLParser.reset(self)
        self._init_state()
NetworkPrepare.py-----------------------------------------------
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
"""Install the global urllib2 opener (cookies, proxy, optional proxy auth)."""
import urllib2

from Config import Config


def proxy_handler(proxy, use_proxy, proxy_auth=False, puser='', ppass=''):
    """Return a ProxyHandler for `proxy`, or one with no proxies.

    `proxy_auth`, `puser` and `ppass` are accepted but not used here;
    proxy authentication is set up in prepare().
    """
    if use_proxy:
        return urllib2.ProxyHandler({"http": proxy})
    # An empty mapping is passed when the proxy is disabled.
    return urllib2.ProxyHandler({})


def cookie_handler(cj):
    """Reload `cj` from its file if possible and wrap it in a cookie processor."""
    try:
        # revert() takes an optional file name, not the jar itself; without
        # an argument it reloads from the file the jar was created with.
        cj.revert()
    except (IOError, ValueError):
        # Best effort: no readable cookie file (e.g. first run) or no file
        # name configured -- continue with the jar as it is.
        pass
    cj.clear_expired_cookies()
    return urllib2.HTTPCookieProcessor(cj)


def prepare():
    """Build the opener from Config and install it as urllib2's default."""
    handlers = [
        cookie_handler(Config.cj),
        proxy_handler(Config.proxy, Config.use_proxy),
    ]
    if Config.proxy_auth:
        # Proxy credentials (marked as untested by the original author).
        pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
        pm.add_password(None, Config.proxy,
                        Config.proxy_user, Config.proxy_pass)
        handlers.append(urllib2.ProxyBasicAuthHandler(pm))
    urllib2.install_opener(urllib2.build_opener(*handlers))
String.py---------------------------------------------
#!/usr/bin/env python2
# -*- coding:utf-8 -*-
"""Small string helpers and charset detection for downloaded pages."""
import re

# 'charset=' followed by an optionally quoted encoding name.
_CHARSET_RE = re.compile(r'charset\s*=\s*["\']?\s*([\w.:-]+)', re.I)


def determinCoding(content, header):
    """Determin a coding of a given url content and it's header.

    params
        content : the page body
        header  : mapping-like response headers (must support .get)
    return
        The charset from the Content-Type header if present, otherwise the
        lower-cased charset declared in the content, otherwise None.
    """
    content_type = header.get('Content-Type')
    if content_type:
        match = _CHARSET_RE.search(content_type)
        if match:
            return match.group(1)
    if isinstance(content, bytes):
        # latin-1 maps every byte to a character, so the body can always be
        # searched regardless of its real encoding.
        content = content.decode('latin-1')
    match = _CHARSET_RE.search(content)
    if match:
        return str(match.group(1).lower())
    return None


def getCodingContent(stream):
    """Return a string in which is the content of given url.

    Reads and closes `stream`, then decodes the body with the detected
    charset, falling back to utf-8 when none is declared.

    return - content : unicode string
    """
    try:
        content = stream.read()
        coding = determinCoding(content, stream.headers)
    finally:
        stream.close()
    return content.decode(coding or 'utf-8')


def longestString(alist):
    """Return the longest string of a list of strings ('' for an empty list).

    When several strings share the maximum length the first one is returned.
    """
    if not alist:
        return ''
    return max(alist, key=len)


def cutTo(str_1, str_2):
    """Cut str_1 to the position just before str_2 (str_2 not included).

    str_1 is returned unchanged when it does not contain str_2.
    """
    pos = str_1.find(str_2)
    if pos < 0:
        return str_1
    return str_1[:pos]


def cutBegin(str_1, str_2):
    """Return the part of str_1 after the first str_2, or None if absent."""
    pos = str_1.find(str_2)
    if pos < 0:
        return None
    return str_1[pos + len(str_2):]
编橙之家文章,
相关内容
- 使用Python完成访问同时下载网页内容的方法,python网页
- 分享用Python3抓中文网页的操作步骤,python3中文网页,我
- 练手的Python Tcp协议小示例,pythontcp,这是我练手写的一
- 分享冒泡排序与选择排序源码示例,冒泡排序源码示例
- wxPython系统剪切板使用方法,wxpython使用方法,想知道w
- python登陆163获取邮件和通讯录列表方法,python163,此贴中
- python实现逆波兰计算简单方法,python实现波兰,这是一段
- 判断并修改图片分辨率的python方法,平时工作中常会遇
- python应用脚本之解析淘宝助理数据包csv文件,pythoncsv
- Python之SMTP学习笔记方法详解,pythonsmtp学习笔记,这是关
评论关闭