urllib:简单的贴吧页面爬取代码


import time
import urllib.parse
import urllib.request


def loadPage(url, filename):
    """Download the page at *url* and return its raw bytes.

    *filename* is used only for the progress message.
    """
    print('正在下载' + filename)
    # Browser-like User-Agent so Tieba serves the normal page.
    # BUG FIX: header name must be 'User-Agent' (original had spaces:
    # 'User - Agent'), and the UA value must not contain stray spaces.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/65.0.3325.181 Safari/537.36')
    }
    req = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(req).read()


def writePage(html, filename):
    """Write the raw page bytes *html* to the local file *filename*."""
    print('正在保存' + filename)
    with open(filename, 'wb') as f:
        f.write(html)
    print('-------------------------------')


# Tieba list pages follow this pattern (50 posts per page):
#   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0     page 1
#   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50    page 2
#   http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100   page 3 ...
def tiebaSpider(url, beginPage, endPage):
    """Fetch pages beginPage..endPage (inclusive) and save each to disk."""
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50  # 50 posts per page
        filename = 'd:/yemian/第' + str(page) + '页.html'
        # BUG FIX: original built '&pn-<n>', an invalid query string that
        # made every request fetch page 1; the separator must be '&pn='.
        fullurl = url + '&pn=' + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)


if __name__ == '__main__':
    kw = input('请输入需要爬取页面的贴吧名:')
    beginPage = int(input('请输入起始页:'))
    endPage = int(input('请输入结束页:'))
    url = 'http://tieba.baidu.com/f?'
    # urlencode percent-escapes non-ASCII forum names for the query string.
    key = urllib.parse.urlencode({'kw': kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
    # BUG FIX: these two statements previously ran at module top level,
    # so merely importing the module printed and slept for 10 seconds.
    print('谢谢使用')
    time.sleep(10)

  

urllib:简单的贴吧页面爬取代码

评论关闭