Batch-download all albums of a specified artist (CAPTCHA restriction removed)


I had long wanted to organize and download every album by my favorite artists, but there were simply too many of them. Since I had recently started learning Python, I thought: why not write a script to automate the downloading? So I spent some time putting this together and am sharing it for anyone who needs it. :)

When I started writing this, I did not expect that crawling too frequently or for too long would trigger a CAPTCHA. I tried several ways around the CAPTCHA problem without a good solution, so I added a separate "generate download list" step. At first that step stored the final download URLs, but it turned out those URLs expire. In the end the list stores the address of each track's download page instead, and the download command visits that page again to obtain a fresh download URL (a minimal sketch of this two-phase split follows).
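To make the split concrete, here is a minimal sketch of the idea. The helper names write_list_entry and resolve_and_download are illustrative only; the full script below does the same job in crawlAlbum, crawlDownloadUrl and downMusic.

#coding=utf-8
# Sketch only: phase 1 records the download *page* per track, phase 2 resolves
# the short-lived real download URL just before fetching the file.
import re,urllib,urllib2

def write_list_entry(listFile,musicPage,title,path):
    # Phase 1: one tab-separated line per track -- page URL, title, target folder.
    listFile.write('%s\t%s\t%s\n' % (musicPage,title,path))

def resolve_and_download(musicPage,filename):
    # Phase 2: fetch the track's download page now and pull out a fresh URL.
    downHtml=urllib2.urlopen(musicPage+'/download').read()
    downUrl=re.search(r'http://[^ ]*xcode\.[a-z0-9]*',downHtml,re.M).group()
    urllib.urlretrieve(downUrl,filename)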

The script uses two open-source modules: gevent and BeautifulSoup.

updated-----------------------------------------------------------------------

The CAPTCHA restriction has been lifted: if a CAPTCHA page comes back, the script now extracts the required cookie from that page and re-issues the request.

#coding=utf-8
import urllib,urllib2,re,os,json,gevent,traceback
from BeautifulSoup import BeautifulSoup
from gevent import monkey
monkey.patch_all()

rootUrl='http://music.baidu.com'
artistId=2825 # Want to batch-download and organize every album of your favorite artist? Replace this with the artist's Baidu Music id, e.g. http://music.baidu.com/artist/2825
pagesize=10
savePath='G:\\crawl\\david bowie\\' # Change this to the folder you want to save into
listDir='_____downlist\\'
handleCount=0
BAIDUVERIFY=''

def crawlList():
    # Fetch the artist page, work out how many album-list pages there are,
    # then crawl each page concurrently to build the download list.
    artistUrl=rootUrl+'/artist/'+str(artistId)
    homeHtml=request(artistUrl)
    soup=BeautifulSoup(homeHtml)
    try:
        pagecount=len(soup.findAll("div",{"class":"page-inner"})[1].findAll(text=re.compile(r'\d+')))
    except:
        print traceback.print_exc()
        print homeHtml
        return
    jobs=[]
    listPath=savePath+listDir
    if not os.path.exists(listPath):
        os.mkdir(listPath)
    for i in range(pagecount):
        jobs.append(gevent.spawn(crawlPage,i))
    gevent.joinall(jobs)

def request(url):
    # GET the url; if Baidu answers with a CAPTCHA page, extract the
    # BAIDUVERIFY cookie from it and retry the same url with that cookie.
    global BAIDUVERIFY
    req=urllib2.Request(url)
    if BAIDUVERIFY!='':
        req.add_header('Cookie','BAIDUVERIFY='+BAIDUVERIFY+';')
    resp=urllib2.urlopen(req)
    html=resp.read()
    verify=getBaiduVerify(html)
    if verify!='':
        print u'Verify cookie extracted, re-issuing the request'
        BAIDUVERIFY=verify
        return request(url)
    return html

def getBaiduVerify(html):
    # The CAPTCHA page carries hidden vcode/id/di fields; joined with ':'
    # they form the value of the BAIDUVERIFY cookie.
    vcode=re.search(r'name="vcode" value="(.*?)"',html,re.I)
    id=re.search(r'name="id" value="(.*?)"',html,re.I)
    di=re.search(r'name="di" value="(.*?)"',html,re.I)
    if vcode and id and di:
        return vcode.group(1)+':'+id.group(1)+':'+di.group(1)
    return ''

def crawlPage(page):
    # Fetch one page of the artist's album list and crawl every album on it.
    start=page*pagesize
    albumListUrl='http://music.baidu.com/data/user/getalbums?start=%d&ting_uid=%d&order=time' % (start,artistId)
    print albumListUrl
    albumListHtml=json.loads(request(albumListUrl))["data"]["html"]
    albumListSoup=BeautifulSoup(albumListHtml)
    covers=albumListSoup.findAll('a',{'class':'cover'})
    pagePath=savePath+listDir+str(page)+'\\'
    if not os.path.exists(pagePath):
        os.mkdir(pagePath)
    for cover in covers:
        try:
            crawlAlbum(pagePath,rootUrl+cover['href'],cover['title'])
        except:
            print traceback.print_exc()

def crawlAlbum(pagePath,albumUrl,title):
    # Write one tab-separated line per track: download page, title, target folder.
    print albumUrl,title
    albumHtml=request(albumUrl)
    albumSoup=BeautifulSoup(albumHtml)
    musicWraps=albumSoup.findAll('span',{'class':'song-title '})
    title=re.subn(r'\\|\/|:|\*|\?|\"|\<|\>|\|','',title)[0]
    path=savePath+title+'\\'
    albumListPath=pagePath+title+'.txt'
    albumFile=open(albumListPath,'w')
    for wrap in musicWraps:
        link=wrap.find('a')
        try:
            musicPage=rootUrl+link['href']
            albumFile.write('%s\t%s\t%s\n' % (musicPage,link['title'],path)) # The real download URL expires, so store the download page instead
        except:
            print traceback.print_exc()
    albumFile.close()

def crawlDownloadUrl(musicPage):
    # Visit the track's download page and extract a fresh download URL.
    downPage=musicPage+'/download'
    downHtml=request(downPage)
    downUrl=re.search(r'http://[^ ]*xcode\.[a-z0-9]*',downHtml,re.M).group()
    return downUrl

def downList():
    # Walk the saved download lists, one gevent job per list page.
    listPath=savePath+listDir
    jobs=[]
    for pageDir in os.listdir(listPath):
        jobs.append(gevent.spawn(downPage,listPath+pageDir))
    gevent.joinall(jobs)

def downPage(pagePath):
    # Read every album list file in this page directory and download its tracks.
    for filename in os.listdir(pagePath):
        filePath=pagePath+'\\'+filename
        albumFile=open(filePath,'r')
        try:
            for args in albumFile.readlines():
                arrArgs=args.split('\t')
                downMusic(arrArgs[0],arrArgs[1],arrArgs[2].replace('\n',''))
        except:
            print traceback.print_exc()
        finally:
            albumFile.close()

def downMusic(musicPage,title,path):
    # Resolve the real download URL only now, then save the mp3 into its album folder.
    global handleCount
    if not os.path.exists(path):
        os.mkdir(path)
    handleCount+=1
    print handleCount,musicPage,title,path
    filename=path+re.subn(r'\\|\/|:|\*|\?|\"|\<|\>|\|','',title)[0]+'.mp3'
    if os.path.isfile(filename):
        return
    downUrl=crawlDownloadUrl(musicPage)
    try:
        urllib.urlretrieve(downUrl,filename)
    except:
        print traceback.print_exc()
        os.remove(filename)

if __name__=='__main__':
    print u'Commands:\n\tlist\tgenerate the download list\n\tdown\tstart downloading\n\texit\tquit'
    cmd=raw_input('>>>')
    while cmd!='exit':
        if cmd=='list':
            crawlList()
            print u'Download list generated'
        elif cmd=='down':
            downList()
            print u'Download finished'
        else:
            print 'unknown cmd'
        cmd=raw_input('>>>')
# This snippet originally appeared at http://byrx.net
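For reference, an interactive session would look roughly like this (output abbreviated; the URLs and counts depend on the artist and folders configured at the top of the script, and the file name baidu_albums.py is made up):

$ python baidu_albums.py
Commands:
        list    generate the download list
        down    start downloading
        exit    quit
>>>list
http://music.baidu.com/data/user/getalbums?start=0&ting_uid=2825&order=time
...
Download list generated
>>>down
1 http://music.baidu.com/song/... ...
...
Download finished
>>>exit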
