# 抓取dribbble图片,抓取dribbble


#!/usr/bin/env python2# -*- coding: utf-8 -*-"""# @brief: 抓取dribbble.com里的图片,包括附件。#         主要用于平时在设计方面的材料收集。"""from BeautifulSoup import BeautifulSoupfrom colorama import init, Fore #控制台颜色from urllib2 import urlopenfrom urllib import urlretrievefrom progressbar import *import codecsimport timeimport jsonimport osimport re# 通过使用autoreset参数可以让变色效果只对当前输出起作用,输出完成后颜色恢复默认设置init(autoreset=True)class Dribbble:  HOMEPAGE = 'http://dribbble.com/shots/everyone'  TOP = 0  #0表示不限制   allPageList = []  pageList = []  hasUpdate = False  def __init__(self, top=0):    self.TOP = top    now = time.strftime("%Y%m%d", time.localtime())    # 获取本地缓存    _pageList = self.cacheShots()    if _pageList and len(_pageList) and (_pageList['update'] == str(now)):      self.allPageList = _pageList['url']    self.getAllShotsUrl()    # 如果数据有更新,则保存数据    if self.hasUpdate:      self.allPageList.extend(self.pageList)      self.cacheShots({        'url': self.allPageList,        'update': now      })    for item in self.pageList:      if item:        self.downShot(item)  def getAllShotsUrl(self):    pageUrl = self.HOMEPAGE + '?page='    pageIndex = 1    isEnd = False    # python貌似没有三目    if len(self.pageList):      lastUpdateShotUrl = self.pageList[len(self.pageList) - 1]    else:      lastUpdateShotUrl = ''    while not isEnd:      # 获取当前页面所有作品的url      shotsUrl = self.getShotUrl(pageUrl + str(pageIndex))      if (self.TOP and pageIndex <= self.TOP) or not self.TOP:        if len(shotsUrl):          # 缓存得到更新的url          tmp = []          for url in shotsUrl:            # 如果当前url和缓存中最后的url一样,则跳出。            if url == lastUpdateShotUrl:              isEnd = True              break            else:              tmp.insert(0, 'http://dribbble.com' + url)              self.hasUpdate = True          tmp.extend(self.pageList)          self.pageList = tmp          #页数加1          pageIndex += 1        else:          isEnd = True      else:        isEnd = True  def getShotUrl(self, url):    page = urlopen(url) 
   soup = BeautifulSoup(page.read())    ol = soup.find('ol',{'class': 'dribbbles group'})    # 如果没有内容,直接返回空数组    if not ol:      return []    links = ol.findAll('a',{'class': 'dribbble-link'})    # 返回的结果    result = []    for i in links:      try:        href = i['href']        if href:          result.append(href)      except:        pass    return result  def downShot(self, url):    #创建目录    dirname = 'dribbble'    try:      os.mkdir(dirname)    except Exception, e:      pass    page = urlopen(url)    soup = BeautifulSoup(page.read())    #正常展示的作品    #例如 shotDefaultUrl = http://dribbble.s3.amazonaws.com/users/34934/screenshots/1268076/gym_preview_1x.jpg    shotDefaultUrl = 'http:' + soup.find('div', {'class': 'single-img'}).find('img')['src']    shot2XOriginUrl = re.sub(r'_1x', '', shotDefaultUrl)    preUrl = re.match(r'.*\\/', shotDefaultUrl).group()    #附件大图,这正是我要下载东西    shotAttachments = soup.find('div', {'class': 'attachments'})    shotAttachmentsList = []    hasAttachmentShot = False    if shotAttachments:      shotAttachmentsList = shotAttachments.findAll('a')    widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker('>-=')),' ', ETA(), ' ', FileTransferSpeed()]    pbar = ProgressBar(widgets=widgets)    global hasReset    hasReset = True    def dlProgress(blockCount, blockSize, totalSize):      pbar.maxval = totalSize      global hasReset      if hasReset:        pbar.start()        hasReset = False      pbar.update(int(min(blockCount * blockSize, totalSize)))    def retrieveImg(url, path, filename=0):      if filename:        print '\\n' + Fore.CYAN + filename      urlretrieve(url, path, dlProgress)      pbar.finish()      hasReset = True    for item in shotAttachmentsList:      if item:        fileIds = re.findall(r'\\d+$', item['href'])[0]        filename = item.string        filepath = dirname + '/' + filename        #下载附件        attachmentUrl = preUrl + 'attachments/' + fileIds + '/' +  filename        if os.path.isfile(filepath):         
 filepath = dirname + '/' + re.sub(r'\\.', '0.', filename)        retrieveImg(attachmentUrl, filepath, filename)        hasAttachmentShot = True    if not hasAttachmentShot:      filename = re.findall(r'[^/\\\\\\\\]+$', shot2XOriginUrl)[0]      filepath = dirname + '/' + filename      if os.path.isfile(filepath):        filepath = dirname + '/' + re.sub(r'\\.', '0.', filename)      retrieveImg(shot2XOriginUrl, filepath, filename)  def cacheShots(self, data=0):    if data:      f = codecs.open('dribbble.json', 'w')      f.write(json.dumps(data, indent=2, ensure_ascii=False))      f.close()    else:      try:        f = codecs.open('dribbble.json', 'r')        data = json.loads(f.read())      except:        return False    return dataif __name__ == '__main__':  startTime = time.time()  print Fore.CYAN + 'Downloading...', '\\n'  Dribbble()  print '\\n' + Fore.GREEN + 'Download OK!'  endTime = time.time()  # 秒数精确到小数点后两位  print Fore.YELLOW + '共耗时:'.decode('utf-8'), Fore.YELLOW + '%.2f' %(endTime - startTime), Fore.YELLOW + '秒'.decode('utf-8')#该片段来自于http://byrx.net

评论关闭