python爬虫 爬取百度音乐,,#coding=ut
python爬虫 爬取百度音乐,,#coding=ut
# coding=utf-8
"""Crawl Baidu Music tag listing pages and print the songs found on them.

The script runs unchanged on Python 2 and Python 3: every ``print`` call
takes exactly one argument, so the statement form and the function form
produce the same output.
"""
import re
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://music.baidu.com/"
PAGE_SIZE = 20
FIRST_PAGE = 100
LAST_PAGE = 150
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Patterns are compiled once here instead of once per song item.
_INDEX_RE = re.compile(
    r'<span class="index-num index-hook" style="width: 25px;">(.*?)</span>'
    r'<span class="song-info'
)
# NOTE(review): the second group starts right after `title="` and runs up to
# `</a>`, so it also captures the end of the title attribute and the link
# text. The pattern is kept exactly as in the original; confirm against the
# live markup before tightening it.
_SONG_RE = re.compile(
    r'href="(.*?)" target="_blank" title="(.*?)</a><div class="extra-info">'
)
_APPENDIX_STRICT_RE = re.compile(
    r'<div class="extra-info"><span class="appendix">(.*?)</span></div>'
)
_APPENDIX_LOOSE_RE = re.compile(r'<span class="appendix">(.*?)</span>')
_AUTHOR_LIST_RE = re.compile(r'<span class="author_list" title="(.*?)">')
_AUTHOR_URL_RE = re.compile(
    r'<a hidefocus="true" href="(.*?)" target="_blank">'
)


def _build_headers(url):
    """Return the request headers for *url*; the page is its own Referer."""
    # No Cookie header is sent. The original carried a commented-out one
    # holding real session credentials, which must not live in source code.
    return {
        "Host": "music.baidu.com",
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Referer": url,
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
    }


def _flatten(html):
    """Replace line breaks with ``<br/>`` so ``.`` in the patterns can span them."""
    return (
        html.replace('\r\n\t', '<br/>')
        .replace('\n\t', '<br/>')
        .replace('\n', '<br/>')
    )


def _first_group(pattern, text, default=""):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    match = pattern.search(text)
    if match is None:
        return default
    return match.group(1)


def _absolute_url(href):
    """Join a site-relative *href* onto BASE_URL; an empty href stays empty."""
    if not href:
        return ""
    # BASE_URL already ends with "/"; drop a leading one to avoid "//".
    return BASE_URL + href.lstrip("/")


def _parse_song_item(flat_html):
    """Extract the fields of one flattened song ``<div>``.

    Returns a dict with the keys ``index_num``, ``song_url``, ``song_name``,
    ``appendix``, ``author_list`` and ``author_url``, or ``None`` when the
    song link itself cannot be found. Optional fields that are absent are
    returned as empty strings.
    """
    song_match = _SONG_RE.search(flat_html)
    if song_match is None:
        return None
    song_href, song_name = song_match.groups()

    # Prefer the stricter pattern; fall back to the looser one when the
    # appendix span is not the only child of the extra-info div.
    appendix = _first_group(_APPENDIX_STRICT_RE, flat_html, default=None)
    if appendix is None:
        appendix = _first_group(_APPENDIX_LOOSE_RE, flat_html)

    return {
        "index_num": _first_group(_INDEX_RE, flat_html),
        "song_url": _absolute_url(song_href),
        "song_name": song_name,
        "appendix": appendix,
        "author_list": _first_group(_AUTHOR_LIST_RE, flat_html),
        "author_url": _absolute_url(_first_group(_AUTHOR_URL_RE, flat_html)),
    }


def spider():
    """Fetch listing pages FIRST_PAGE..LAST_PAGE and print each song found.

    For every page the URL and the number of song items are printed; for
    every item the flattened HTML, the author list, the song name and a
    separator line are printed. Pages that do not answer with HTTP 200 are
    skipped silently, and items without a recognisable song link are skipped.

    Returns:
        None.

    Raises:
        requests.RequestException: on connection errors or when a request
            exceeds REQUEST_TIMEOUT seconds.
    """
    # One session for the whole crawl, closed when the crawl ends.
    with requests.Session() as session:
        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            start = page * PAGE_SIZE
            url = (
                'http://music.baidu.com/tag/%E6%96%B0%E6%AD%8C?size=20&start='
                + str(start) + '&third_type=0'
            )
            print(url)

            result = session.get(
                url=url, headers=_build_headers(url), timeout=REQUEST_TIMEOUT
            )
            if result.status_code != 200:
                continue

            soup = BeautifulSoup(result.content, 'html.parser')
            # The trailing space in the class value is kept from the original.
            result_divs = soup.find_all(
                'div', attrs={"class": "song-item clearfix "}
            )
            print(len(result_divs))

            for result_div in result_divs:
                flat_html = _flatten(str(result_div))
                print(flat_html)

                song = _parse_song_item(flat_html)
                if song is None:
                    continue

                print(song["author_list"])
                print(song["song_name"])
                print("=" * 88)
                # time.sleep(2)  # delay was disabled in the original


spider()
python爬虫 爬取百度音乐
相关内容
- 女神说不能每张照片P的一样,所以朋友圈开三天可见,
- python中requests库get方法带参数请求,,起因是想爬五等分
- python基础知识 05 python语言中的大整数,,第五课 python
- Python 字典 len()方法,,描述Python 字
- 在Windows上安装pytorch,,电脑环境为 :win
- Python(64)_写函数,判断用户传入的值(字符串,列表,
- python学习-10 运算符1,,1.加+,减-,乘*
- [原创]win7/64位系统+python3.7.2下安装wordcloud库失败之解决
- python 将IP地址转换成打包后的32位格式,,python 2.7
- python json格式转xml格式,,import xml
评论关闭