抓取dribbble图片,抓取dribbble,#!/usr/bin/e
#!/usr/bin/env python2# -*- coding: utf-8 -*-"""# @brief: 抓取dribbble.com里的图片,包括附件。# 主要用于平时在设计方面的材料收集。"""from BeautifulSoup import BeautifulSoupfrom colorama import init, Fore #控制台颜色from urllib2 import urlopenfrom urllib import urlretrievefrom progressbar import *import codecsimport timeimport jsonimport osimport re# 通过使用autoreset参数可以让变色效果只对当前输出起作用,输出完成后颜色恢复默认设置init(autoreset=True)class Dribbble: HOMEPAGE = 'http://dribbble.com/shots/everyone' TOP = 0 #0表示不限制 allPageList = [] pageList = [] hasUpdate = False def __init__(self, top=0): self.TOP = top now = time.strftime("%Y%m%d", time.localtime()) # 获取本地缓存 _pageList = self.cacheShots() if _pageList and len(_pageList) and (_pageList['update'] == str(now)): self.allPageList = _pageList['url'] self.getAllShotsUrl() # 如果数据有更新,则保存数据 if self.hasUpdate: self.allPageList.extend(self.pageList) self.cacheShots({ 'url': self.allPageList, 'update': now }) for item in self.pageList: if item: self.downShot(item) def getAllShotsUrl(self): pageUrl = self.HOMEPAGE + '?page=' pageIndex = 1 isEnd = False # python貌似没有三目 if len(self.pageList): lastUpdateShotUrl = self.pageList[len(self.pageList) - 1] else: lastUpdateShotUrl = '' while not isEnd: # 获取当前页面所有作品的url shotsUrl = self.getShotUrl(pageUrl + str(pageIndex)) if (self.TOP and pageIndex <= self.TOP) or not self.TOP: if len(shotsUrl): # 缓存得到更新的url tmp = [] for url in shotsUrl: # 如果当前url和缓存中最后的url一样,则跳出。 if url == lastUpdateShotUrl: isEnd = True break else: tmp.insert(0, 'http://dribbble.com' + url) self.hasUpdate = True tmp.extend(self.pageList) self.pageList = tmp #页数加1 pageIndex += 1 else: isEnd = True else: isEnd = True def getShotUrl(self, url): page = urlopen(url) soup = BeautifulSoup(page.read()) ol = soup.find('ol',{'class': 'dribbbles group'}) # 如果没有内容,直接返回空数组 if not ol: return [] links = ol.findAll('a',{'class': 'dribbble-link'}) # 返回的结果 result = [] for i in links: try: href = i['href'] if href: result.append(href) except: pass return result def downShot(self, url): #创建目录 dirname = 'dribbble' try: os.mkdir(dirname) except Exception, e: pass page = urlopen(url) soup = BeautifulSoup(page.read()) #正常展示的作品 #例如 shotDefaultUrl = http://dribbble.s3.amazonaws.com/users/34934/screenshots/1268076/gym_preview_1x.jpg shotDefaultUrl = 'http:' + soup.find('div', {'class': 'single-img'}).find('img')['src'] shot2XOriginUrl = re.sub(r'_1x', '', shotDefaultUrl) preUrl = re.match(r'.*\\/', shotDefaultUrl).group() #附件大图,这正是我要下载东西 shotAttachments = soup.find('div', {'class': 'attachments'}) shotAttachmentsList = [] hasAttachmentShot = False if shotAttachments: shotAttachmentsList = shotAttachments.findAll('a') widgets = ['Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker('>-=')),' ', ETA(), ' ', FileTransferSpeed()] pbar = ProgressBar(widgets=widgets) global hasReset hasReset = True def dlProgress(blockCount, blockSize, totalSize): pbar.maxval = totalSize global hasReset if hasReset: pbar.start() hasReset = False pbar.update(int(min(blockCount * blockSize, totalSize))) def retrieveImg(url, path, filename=0): if filename: print '\\n' + Fore.CYAN + filename urlretrieve(url, path, dlProgress) pbar.finish() hasReset = True for item in shotAttachmentsList: if item: fileIds = re.findall(r'\\d+$', item['href'])[0] filename = item.string filepath = dirname + '/' + filename #下载附件 attachmentUrl = preUrl + 'attachments/' + fileIds + '/' + filename if os.path.isfile(filepath): filepath = dirname + '/' + re.sub(r'\\.', '0.', filename) retrieveImg(attachmentUrl, filepath, filename) hasAttachmentShot = True if not hasAttachmentShot: filename = re.findall(r'[^/\\\\\\\\]+$', shot2XOriginUrl)[0] filepath = dirname + '/' + filename if os.path.isfile(filepath): filepath = dirname + '/' + re.sub(r'\\.', '0.', filename) retrieveImg(shot2XOriginUrl, filepath, filename) def cacheShots(self, data=0): if data: f = codecs.open('dribbble.json', 'w') f.write(json.dumps(data, indent=2, ensure_ascii=False)) f.close() else: try: f = codecs.open('dribbble.json', 'r') data = json.loads(f.read()) except: return False return dataif __name__ == '__main__': startTime = time.time() print Fore.CYAN + 'Downloading...', '\\n' Dribbble() print '\\n' + Fore.GREEN + 'Download OK!' endTime = time.time() # 秒数精确到小数点后两位 print Fore.YELLOW + '共耗时:'.decode('utf-8'), Fore.YELLOW + '%.2f' %(endTime - startTime), Fore.YELLOW + '秒'.decode('utf-8')#该片段来自于http://byrx.net
评论关闭