urllib:简单的贴吧页面爬取代码
from urllib import request
import time
import urllib
import urllib.parse


def loadPage(url, filename):
    """Send a GET request to *url* and return the raw response bytes.

    *filename* is only used for the progress message.
    """
    print('正在下载' + filename)
    # FIX: the original header key was 'User - Agent' (with spaces), which is
    # not a valid User-Agent header; the value also had spaces injected into
    # every token. Use a proper browser UA string so the server serves the page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/65.0.3325.181 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(req).read()


def writePage(html, filename):
    """Write the downloaded bytes *html* to the local file *filename*."""
    print('正在保存' + filename)
    # 'wb' because loadPage returns raw bytes, not decoded text.
    with open(filename, 'wb') as f:
        f.write(html)
    print('-------------------------------')


# Observed URL pattern for tieba result pages:
#   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0     page 1
#   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50    page 2
#   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100   page 3
# i.e. pn = (page - 1) * 50.
def tiebaSpider(url, beginPage, endPage):
    """Download pages beginPage..endPage (inclusive) and save each to disk.

    *url* must already contain the encoded `kw=` query parameter.
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50
        filename = 'd:/yemian/第' + str(page) + '页.html'
        # BUG FIX: the original built '&pn-<n>', an invalid query parameter,
        # so every request returned the first page. It must be '&pn=<n>'.
        fullurl = url + '&pn=' + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)


if __name__ == '__main__':
    kw = input('请输入需要爬取页面的贴吧名:')
    beginPage = int(input('请输入起始页:'))
    endPage = int(input('请输入结束页:'))
    url = 'http://tieba.baidu.com/f?'
    # urlencode percent-escapes the (possibly non-ASCII) tieba name.
    key = urllib.parse.urlencode({'kw': kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
    print('谢谢使用')
    time.sleep(10)
urllib:简单的贴吧页面爬取代码
评论关闭