使用urllib模拟登录豆瓣电台,抓取红心歌曲列表,增加封面提取


本人新手,刚学urllib,拿豆瓣登录练手,参考了社区里几位前辈的代码,在此表示感谢。本脚本使用urllib模拟登录豆瓣电台,抓取红心歌曲列表并保存到本地(list.txt),同时下载每首歌的专辑封面。登录时用到的验证码图片保存在当前目录下(v.jpg)。

运行环境是 Python 3。貌似 Python 3 可以参考的实例很少。

"""Log in to Douban FM with urllib and export the "liked" (red-heart) songs.

The script prompts for the account e-mail and password, downloads the login
captcha to ``v.jpg`` in the current directory, asks the user to type the
captcha text, logs in, and then walks the paginated "mine" page.  For every
liked song it prints ``title-performer-album``, appends that line to
``list.txt`` and downloads the album cover as ``<that name>.jpg``.
"""

import json
import os
import re
import sys
import time
import urllib
from html import unescape
from http import cookiejar
from urllib import parse, request

MAIN_URL = 'http://douban.fm/mine'
LOGIN_PAGE_URL = 'http://douban.fm/common_login?redir=/mine'
CAPTCHA_ID_URL = 'http://douban.fm/j/new_captcha'
CAPTCHA_IMAGE_URL = 'http://www.douban.com/misc/captcha?size=m&id=%s'
LOGIN_URL = 'http://douban.fm/j/login'

# The patterns tolerate any amount of whitespace between tags so that they do
# not depend on how the served markup happens to be indented.
SONG_RE = re.compile(
    r'<div class="props">\s*'
    r'<p class="song_title">(.+?)</p>\s*'
    r'<p class="performer">(.+?)</p>\s*'
    r'<p class="source">\s*'
    r'<a\s+href="(.+?)" target="_blank"\s*>(.+?)</a>\s*'
    r'</p>\s*'
    r'</div>'
)
COVER_RE = re.compile(r'<a class="nbg" href="(.+?)"\s+title="点击看大图">')
NEXT_RE = re.compile(
    r'<span class="next">\s*'
    r'<link rel="next" href="(.+?)"/>\s*'
    r'<a href=".*?"\s*>后页(?:>|&gt;)</a>\s*'
    r'</span>'
)
# Characters that are not allowed in file names on at least one common OS.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def parse_captcha_id(body):
    """Return the captcha id contained in the ``new_captcha`` response body.

    The id may arrive wrapped in double quotes and/or backslash-escaped
    quotes; surrounding whitespace, quotes and backslashes are removed.

    Raises:
        ValueError: if nothing is left after stripping.
    """
    captcha_id = body.strip().strip('"\\')
    if not captcha_id:
        raise ValueError('empty captcha id in response: %r' % body)
    return captcha_id


def extract_songs(page):
    """Return ``(title, performer, album_url, album)`` tuples found in *page*.

    HTML entities in the captured fields are decoded.
    """
    return [
        tuple(unescape(field) for field in match)
        for match in SONG_RE.findall(page)
    ]


def extract_cover_url(page):
    """Return the full-size cover URL from an album page, or None if absent."""
    match = COVER_RE.search(page)
    return unescape(match.group(1)) if match else None


def extract_next_url(page, current_url):
    """Return the absolute URL of the next list page, or None on the last page.

    The ``href`` is resolved against *current_url*, so relative, rooted and
    absolute links are all handled.
    """
    match = NEXT_RE.search(page)
    if match is None:
        return None
    return parse.urljoin(current_url, unescape(match.group(1)))


def build_image_name(title, performer, album):
    """Return ``title-performer-album`` with unsafe file-name characters
    replaced by ``&``."""
    return UNSAFE_FILENAME_CHARS.sub('&', title + '-' + performer + '-' + album)


def fetch_text(opener, url, data=None):
    """Fetch *url* through *opener* (POST when *data* is given) as UTF-8 text."""
    with opener.open(url, data) as response:
        return response.read().decode('utf8')


def download_cover(album_url, filename):
    """Download the cover shown on the album page *album_url* to *filename*.

    Returns True on success.  A missing cover link or a network/decoding
    problem is reported and yields False, so one bad album does not abort
    the whole export.
    """
    try:
        with request.urlopen(album_url) as response:
            album_page = response.read().decode('utf8')
        cover_url = extract_cover_url(album_page)
        if cover_url is None:
            print('未找到封面:%s' % album_url)
            return False
        request.urlretrieve(cover_url, filename)
    except (OSError, ValueError) as err:
        # URLError/HTTPError are OSError subclasses; ValueError covers
        # undecodable pages and malformed URLs.
        print('封面下载失败:%s (%s)' % (filename, err))
        return False
    return True


def login(opener, params):
    """Solve the captcha interactively and log in.

    *params* must already hold the account fields; ``captcha_id`` and
    ``captcha_solution`` are added to it.  Returns the decoded JSON answer
    of the login endpoint.
    """
    # Load the login page through the cookie-aware opener so that any
    # cookies it sets are sent with the following requests.
    with opener.open(LOGIN_PAGE_URL) as response:
        response.read()

    # Get captcha_id.
    form = parse.urlencode({'ck': 'null'}).encode('ascii')
    params['captcha_id'] = parse_captcha_id(
        fetch_text(opener, CAPTCHA_ID_URL, form))

    # Get captcha_solution: save the image and let the user read it.
    request.urlretrieve(CAPTCHA_IMAGE_URL % params['captcha_id'], 'v.jpg')
    params['captcha_solution'] = input('请输入图片上的验证码:')

    form = parse.urlencode(params).encode('ascii')
    return json.loads(fetch_text(opener, LOGIN_URL, form))


def export_liked_songs(opener, first_url, list_path='list.txt'):
    """Walk all liked-song pages starting at *first_url*.

    Each song name is printed, written as one line to *list_path*, and its
    cover is downloaded next to it.
    """
    visited = set()
    url = first_url
    with open(list_path, 'w', encoding='utf-8') as listfile:
        # Stop on the last page, or if the site links back to a seen page.
        while url and url not in visited:
            visited.add(url)
            page = fetch_text(opener, url)
            for title, performer, album_url, album in extract_songs(page):
                imgname = build_image_name(title, performer, album)
                print(imgname)
                listfile.write(imgname + '\n')
                download_cover(album_url, imgname + '.jpg')
            url = extract_next_url(page, url)
    print('over')


def main():
    """Run the interactive export; return the process exit status."""
    params = {"source": "radio"}
    params['alias'] = input('请输入邮箱:')
    params['form_password'] = input('请输入密码:')

    cookie = cookiejar.CookieJar()
    opener = request.build_opener(request.HTTPCookieProcessor(cookie))

    ans = login(opener, params)
    if 'err_no' in ans:
        print('登录错误:%s' % ans['err_msg'])
        return 1

    print('登陆成功')
    record = ans['user_info']['play_record']
    print('累计收听:%s 首' % record['played'])
    print('加红心:%s 首' % record['liked'])
    print('不再收听:%s 首' % record['banned'])

    # Fetch the liked-song list, starting from the first page.
    export_liked_songs(opener, MAIN_URL)
    return 0


if __name__ == '__main__':
    sys.exit(main())

# Snippet source: http://byrx.net

评论关闭