翻页抓取,,#!/usr/bin/e


#!/usr/bin/env python
# coding: utf-8
# NOTE(review): the interpreter only honours a shebang on the first line and a
# coding declaration on the first or second line of the file -- confirm these
# two lines end up at the top of the script when it is saved.
"""Paginated scraper.

Walks the numbered listing pages of the company directory, extracts every
fragment that starts with ``www`` and ends at the closing ``</td>`` tag, and
appends the cleaned fragments to a text file, one per line.
"""
import io
import os
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved Request/urlopen to urllib.request; alias it so the rest
    # of the script can use a single name on both interpreter lines.
    import urllib.request as urllib2

# Number of the last listing page to fetch (pages are numbered from 1).
LAST_PAGE = 78
# File that collects the extracted entries; opened in append mode.
OUTPUT_FILE = 'but.txt'


def getHtml(url):
    """Download *url* and return its body decoded as UTF-8 text.

    A browser-like User-Agent header is sent with the request.

    Raises whatever ``urlopen`` raises on network/HTTP failure, and
    ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        return response.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()


def urlPages(page):
    """Return the URL of listing page number *page*."""
    return 'https://but/company/lists/page/' + str(page)


def findList(html):
    """Return every substring of *html* that runs from ``www`` up to and
    including the nearest following ``</td>``.

    ``re.S`` lets a match span line breaks. The trailing ``</td>`` is left in
    place here; it is removed when the entries are written out.
    """
    return re.findall('www.*?</td>', html, re.S)


def _append_items(items, path=OUTPUT_FILE):
    """Append each item, with ``</td>`` removed, as one line of *path*."""
    if not items:
        # Nothing matched: do not create or touch the output file.
        return
    # One open per page instead of one per item; ``with`` guarantees the
    # handle is closed on error, and the explicit encoding avoids an implicit
    # ASCII encode of the decoded text on Python 2.
    with io.open(path, 'a', encoding='utf-8') as out:
        for item in items:
            out.write(item.replace('</td>', '') + u'\n')


def main():
    """Fetch pages 1..LAST_PAGE in order and store the entries found on each."""
    for page in range(1, LAST_PAGE + 1):
        html = getHtml(urlPages(page))
        _append_items(findList(html))
    print(u'\n\n本王的网站下载完毕啦!')


if __name__ == '__main__':
    main()

评论关闭