Sharing source code for a multi-threaded downloader of Baidu large images


This multi-threaded downloader for Baidu large images was found online; there is quite a lot of code, so take your time reading through it. Changes made to the original:
Added two exception handlers so the program no longer exits when reading a url raises an exception;
Fixed a wrong conditional statement;
Reworked the nextPage function in the Baidu module and the nextPage call arguments in the App module;
Fixed several bugs and switched to multi-threaded downloading;
Added proxy support;
Replaced the threadDown subclass with Queue (a sketch of the pattern follows this list);
Split the configuration out into a separate Config module;
Added a jandan (煎蛋) module, functionality not yet implemented;
Added proxy password authentication, untested;
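
The switch from a threadDown subclass to Queue boils down to the worker pattern below. This is only a minimal standalone sketch of that pattern (the names worker/results are illustrative, not from the project); the real worker is Downloader.downloadFromQueue and the threads are started in App.main:

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
# Minimal sketch of the Queue + worker-thread pattern used by App/Downloader.
from Queue import Queue, Empty
from thread import start_new_thread

def worker(queue, results):
  # keep pulling urls until the queue is drained
  while True:
    try:
      url = queue.get_nowait()  # non-blocking get; the project code uses empty()/get()
    except Empty:
      return
    results.append(url)         # the real worker downloads the url here
    queue.task_done()           # required so that queue.join() can return

if __name__ == '__main__':
  queue = Queue()
  for i in range(10):
    queue.put('http://example.com/%d.jpg' % i)
  results = []
  for i in range(4):            # number of worker threads
    start_new_thread(worker, (queue, results))
  queue.join()                  # blocks until every item got task_done()
  print '%d urls processed' % len(results)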

App.py-----------------------------------------

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
from Baidu import getImageUrlList, search, nextPage, searchResult
from Downloader import downloadFromQueue
from FileHelper import getFilenameFromURL, addExtension, makedir
from Queue import Queue
from thread import start_new_thread
from Config import Config
from NetworkPrepare import prepare
import os, sys

def baseURL():
  if Config.site == 'baidu':
    return search(Config.keyword, Config.addtional)
  if Config.site == 'jandan':
    return 'http://jandan.net/ooxx'

def main():
  # set up the urllib2 opener (cookies, proxy)
  prepare()
  while_n = 0  # loop counter
  imglist = []
  makedir(Config.directory)
  print 'Generate search url'
  URL = baseURL()
  # download #############
  # take the smaller of the search result count and Config.count
  count = min(searchResult(URL), Config.count)
  # exit when there is no search result
  if not count:
    print "No search result at current condition."
    sys.exit(1)
  # collect the requested number of urls into a list
  print 'Fetching page',
  while len(imglist) < count:
    print while_n,
    while_n += 1
    tmplist = getImageUrlList(URL)
    imglist = imglist + tmplist
    URL = nextPage(URL, len(tmplist))
  print ''  # newline
  count = len(imglist)
  print "There are %d files to download" % count
  # drop urls whose files have already been downloaded
  imglist = [url for url in imglist
             if not getFilenameFromURL(url) in os.listdir(Config.directory)]
  print "There are %d files already downloaded." % (count - len(imglist))
  # download the remaining list
  print 'Fetching list of %d files' % len(imglist)
  queue = Queue()
  for url in imglist:
    queue.put(url)
  failure = []
  for i in range(Config.thread_count):
    start_new_thread(downloadFromQueue,
                     (queue, failure, Config.directory, Config.timeout))
  queue.join()
  print "%d failed to fetch." % len(failure)

def clean():
  # cleanup
  # 1. add file extensions
  print 'Adding extension ...'
  for fname in os.listdir(Config.directory):
    addExtension(Config.directory + os.sep + fname, '.jpg')
  print 'done.'
  # 2. save cookies
  Config.cj.save()

if __name__ == "__main__":
  main()
  clean()

Baidu.py---------------------------------------------------------------------

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
from Downloader import getStream
from MyParser import MyParser
from String import longestString, cutTo, cutBegin, getCodingContent
from urllib import urlencode
import json
import re

def getImageUrlFromScript(script):
  pattern = re.compile(r'(?<="objURL":").*?(?=")')
  groups = pattern.findall(script)
  new_group = [amatch.strip() for amatch in groups]  # a more Pythonic way
  return new_group

def getImageUrlList(url):
  imglist = []
  for i in _getJsonList(url):
    imglist.append(i['objURL'].strip())
  return imglist

def _getJsonList(url):
  stream = getStream(url)
  data = getCodingContent(stream)
  pattern = re.compile(r'(?<=var imgdata =).*?(?=;v)')
  block = pattern.findall(data)[0]
  jsonlist = json.loads(block)
  return jsonlist['data'][:-1]

def nextPage(url, pn):
  url_pn = cutBegin(url, '&pn=')
  if not url_pn:
    url_pn = 0
  url_pn = int(url_pn) + pn
  return cutTo(url, '&pn') + '&pn=' + str(url_pn)

def search(keyword, addtionParams={}):
  """Generate a search url for the given keyword.
  params keyword: utf8 string"""
  url = 'http://image.baidu.com/i?'
  parser = MyParser()
  params = _getParams('http://image.baidu.com', parser)
  params.update(addtionParams)
  params.update({'word': keyword.decode('utf8').encode('gbk')})
  return url + urlencode(params)

def searchResult(url):
  parser = MyParser()
  parser.feed(getCodingContent(getStream(url)))
  block = longestString(parser.scriptList)
  parser.close()
  pattern = re.compile(r'(?<="listNum":)\d*(?=,)')
  count = pattern.findall(block)
  if count:
    count = int(count[0])
    return count
  return 0

def _getParams(url, parser):
  """Get a dict containing the url params."""
  stream = getStream(url)
  data = getCodingContent(stream)
  parser.feed(data)
  return parser.formParams

def _appendParams(adict):
  """Generate a url query string from the params in adict."""
  p = [key + '=' + adict[key] for key in adict]
  return '&'.join(p)
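
To make the paging logic concrete, here is roughly how nextPage behaves on a simplified url (the real url produced by search() carries more parameters; 'word=test' below is just a stand-in):

>>> from Baidu import nextPage
>>> nextPage('http://image.baidu.com/i?word=test', 20)
'http://image.baidu.com/i?word=test&pn=20'
>>> nextPage('http://image.baidu.com/i?word=test&pn=20', 20)
'http://image.baidu.com/i?word=test&pn=40'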

Config.py---------------------------------------------------------------------

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
from cookielib import LWPCookieJar

class Config:
  keyword = '美女'  # the keyword to search for; do not change the file's encoding
  addtional = {'width':'1920', 'height':'1200'}  # width and height, may be empty {}
  directory = r'image'  # where to store the images
  count = 30        # number of files to download, rounded up to a multiple of 20
  thread_count = 15 # number of threads
  timeout = 20      # download timeout in seconds; 10 seemed a bit short, so 20 is used
  # proxy settings
  proxy = 'http://localhost:7001'
  use_proxy = False
  proxy_user = 'user_name'
  proxy_pass = 'password'
  proxy_auth = False
  cookies = 'cookies.txt'
  use_cookies = True
  cj = LWPCookieJar(cookies)
  site = 'baidu'
  #site = 'jandan'
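
As a usage note, the proxy is enabled by editing the class attributes above (or by overriding them before NetworkPrepare.prepare() runs, as in the sketch below). The host and credentials here are placeholders, not values from the original post:

from Config import Config

Config.use_proxy = True
Config.proxy = 'http://proxy.example.com:8080'  # placeholder proxy address
Config.proxy_auth = True                        # password auth is marked untested above
Config.proxy_user = 'someuser'                  # placeholder credentials
Config.proxy_pass = 'somepass'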

Downloader.py------------------------------------------------------------

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
from FileHelper import getFilenameFromURL, writeBinFile
import urllib2

def getStream(url, timeout=10):
  # return a url stream, or False on failure
  request = urllib2.Request(url)
  request.add_header('User-Agent', UserAgent.Mozilla)
  try:
    stream = urllib2.urlopen(request, timeout=timeout)
  except (Exception, SystemExit):  # catch SystemExit to keep running
    print "URL open error. Probably timed out."
    return False
  return stream

def downloadFromQueue(queue, failure, directory='.', timeout=10):
  """Download files from a queue of urls.
  return : list containing the urls that failed to fetch"""
  while not queue.empty():
    url = queue.get()
    stream = getStream(url, timeout=timeout)
    file_name = getFilenameFromURL(url)
    if stream and writeBinFile(stream, file_name, directory):
      queue.task_done()
      print "Fetching", url, 'done.'
      continue
    failure.append(url)
    queue.task_done()
  return failure

class UserAgent:
  Mozilla = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14'

FileHelper.py------------------------------------------------------------------

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
import re, os

def getFilenameFromURL(url):
  # used by Downloader
  pos = url.rfind('/')
  shorted = url[pos + 1:]
  pattern = re.compile(r'\w*[\.\w]*')
  f_name = pattern.findall(shorted)[0]
  return f_name

def addExtension(fname, ext):
  # used by App to add a file extension
  # only add one if there is no extension yet
  if '.' not in fname:
    rename(fname, ext)

def rename(old, ext):
  # ext = '.jpg'
  if os.path.isfile(old + ext):
    ext = '2' + ext
    rename(old, ext)
    return None
  print 'rename', old, old + ext
  os.rename(old, old + ext)

def makedir(directory):
  if not os.path.isdir(directory):
    os.mkdir(directory)  # do not catch the exception when directory is a file; let the program exit

def writeBinFile(stream, file_name, directory='.', mode='wb'):
  """Read from the given url stream and write to file_name."""
  file_name = directory + os.sep + file_name
  if os.path.isfile(file_name):
    print 'File %s exists.' % file_name
    return False
  CHUNK_SIZE = 1024
  with open(file_name, mode) as fp:
    while True:
      try:
        chunk = stream.read(CHUNK_SIZE)
      except (Exception, SystemExit):
        print 'Fetching error. Probably timed out.'
        fp.close()
        os.remove(file_name)
        return False
      if not chunk:
        break
      fp.write(chunk)
  return True

MyParser.py---------------------------------------------------

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
import HTMLParser

class MyParser(HTMLParser.HTMLParser):
  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    self.toggle_script_parse = False
    self.toggle_form_parse = False
    self.scriptList = []
    self.formParams = {}
    self.result = 0

  def handle_starttag(self, tag, attrs):
    HTMLParser.HTMLParser.handle_starttag(self, tag, attrs)
    attrs = dict(attrs)
    if tag == 'script':
      self.toggle_script_parse = True
    # start parsing the form to collect the attrs of its hidden input tags
    if tag == 'form' and attrs.has_key('name') and attrs['name'] == 'f1':
      self.toggle_form_parse = True
    if tag == 'input' and self.toggle_form_parse:
      if attrs.has_key('type') and attrs['type'] == 'hidden':
        key = attrs['name']
        value = attrs['value']
        self.formParams[key] = value

  def handle_endtag(self, tag):
    HTMLParser.HTMLParser.handle_endtag(self, tag)
    if tag == 'form' and self.toggle_form_parse:
      self.toggle_form_parse = False

  def handle_data(self, data):
    HTMLParser.HTMLParser.handle_data(self, data)
    if self.toggle_script_parse:
      self.scriptList.append(data)
      self.toggle_script_parse = False

  def reset(self):
    HTMLParser.HTMLParser.reset(self)
    self.toggle_script_parse = False
    self.toggle_form_parse = False
    self.scriptList = []
    self.formParams = {}
    self.result = 0

NetworkPrepare.py-----------------------------------------------

#!/usr/bin/env python2
# -*- coding:utf-8 -*-
import urllib2
from Config import Config

def proxy_handler(proxy, use_proxy, proxy_auth=False, puser='', ppass=''):
  if use_proxy:
    return urllib2.ProxyHandler({"http": proxy})
  return urllib2.ProxyHandler({})

def cookie_handler(cj):
  try:
    cj.revert()  # reload cookies from the file given to LWPCookieJar
  except Exception:
    pass
  cj.clear_expired_cookies()
  return urllib2.HTTPCookieProcessor(cj)

def prepare():
  ch = cookie_handler(Config.cj)
  ph = proxy_handler(Config.proxy, Config.use_proxy)
  if Config.proxy_auth:
    pm = urllib2.HTTPPasswordMgrWithDefaultRealm()
    pm.add_password(None, Config.proxy, Config.proxy_user, Config.proxy_pass)
    urllib2.install_opener(urllib2.build_opener(ch, ph, urllib2.ProxyBasicAuthHandler(pm)))
    return
  urllib2.install_opener(urllib2.build_opener(ch, ph))

String.py---------------------------------------------

#!/usr/bin/env python2
# -*- coding:utf-8 -*-

def determinCoding(content, header):
  """Determine the coding of the given url content and its header.
  params header : the response headers (supports dict-style access)"""
  content_type = header['Content-Type']
  tag = 'charset='
  if content_type:
    if tag in content_type:
      pos = content_type.index(tag) + len(tag)
      return content_type[pos:]
  content = content.lower()
  if tag in content:
    startpos = content.index(tag) + len(tag)
    endpos = content.index('"', startpos)
    return content[startpos:endpos]

def getCodingContent(stream):  # detect the stream's encoding and decode it
  """Return the decoded content of the given url stream.
  return - content : unicode string"""
  content = stream.read()
  coding = determinCoding(content, stream.headers)
  stream.close()
  return content.decode(coding)

def longestString(alist):
  """Return the longest string of a list of strings."""
  a_new_list = [len(a_str) for a_str in alist]
  pos = a_new_list.index(max(a_new_list))
  return alist[pos]

def cutTo(str_1, str_2):
  """Cut str_1 to the position just before str_2."""
  # str_2 is not included
  if not str_2 in str_1:
    return str_1
  pos = str_1.index(str_2)
  return str_1[0:pos]

def cutBegin(str_1, str_2):
  # used by Baidu.nextPage
  if not str_2 in str_1:
    return None
  pos = str_1.index(str_2) + len(str_2)
  return str_1[pos:]
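
A quick illustration of determinCoding, with a plain dict standing in for the real response headers (the actual object passed in is stream.headers, which also supports ['Content-Type'] access):

>>> from String import determinCoding
>>> determinCoding('<html></html>', {'Content-Type': 'text/html; charset=gbk'})
'gbk'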
