Scraping Dynamic Web Page Data


The deals feed on zhide.fanli.com is loaded dynamically, so instead of driving a browser the script requests the site's AJAX endpoint (ajaxGetItem) directly, fragment by fragment, and parses each returned chunk of HTML with BeautifulSoup.

# coding: utf-8
import re
import urllib2

from bs4 import BeautifulSoup


class fanli():
    def __init__(self):
        self.usr_agent = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36')
        # A dict, not a set: {'usr_agent', self.usr_agent} would build a set
        self.headers = {'User-Agent': self.usr_agent}

    def getHtml(self, pageIndex, p):
        # Fetch one AJAX fragment; pageIndex is the page, p the sub-page.
        url = ('http://zhide.fanli.com/index/ajaxGetItem?cat_id=0&tag=&page='
               + str(pageIndex) + '-' + str(p) + '&area=0&tag_id=0&shop_id=0')
        try:
            request = urllib2.Request(url, headers=self.headers)
            page = urllib2.urlopen(request)
            return page.read()
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print u'Connection failed:', e.reason
            return None

    def getItems(self):
        with open('fanli.txt', 'a') as f:
            f.write('name|category|recommender|upvotes|downvotes\n')
        for pageIndex in range(1, 51):
            for p in range(1, 11):
                html = self.getHtml(pageIndex, p)
                if html is None:
                    continue
                # Drop the ' J_tklink_tmall' marker so every item shares the same classes
                data = re.sub(' J_tklink_tmall', '', html)
                soup = BeautifulSoup(data, 'html.parser')
                name_list = soup.find_all('a', class_='J-item-track nodelog')            # product name
                fenlei_list = soup.find_all('a', class_='nine')                          # category
                usr_list = soup.find_all('div', class_='item-user')                      # recommender
                yes_list = soup.find_all('a', class_='l item-vote-yes J-item-vote-yes')  # upvotes
                no_list = soup.find_all('a', class_='l item-vote-no J-item-vote-no')     # downvotes
                with open('fanli.txt', 'a') as f:
                    # Short fragments raised IndexError: list index out of range
                    # (first seen on page 11), so cap at the actual item count.
                    for i in range(min(5, len(name_list))):
                        f.write(name_list[i].get_text(strip=True).encode('utf-8') + '|'
                                + fenlei_list[i].get_text(strip=True).encode('utf-8') + '|'
                                + usr_list[i].get_text(strip=True).encode('utf-8') + '|'
                                + yes_list[i].get_text(strip=True).encode('utf-8') + '|'
                                + no_list[i].get_text(strip=True).encode('utf-8') + '\n')


spider = fanli()
spider.getItems()
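The script above is Python 2 (urllib2, print statements). As a minimal sketch of the same fetch under Python 3, where urllib2's pieces now live in urllib.request and urllib.error, a single fragment could be requested like this; that the zhide.fanli.com endpoint still responds, and that its payload is UTF-8, are assumptions:

import urllib.error
import urllib.request

def get_html(page_index, p):
    # Same ajaxGetItem endpoint and query string as the script above.
    url = ('http://zhide.fanli.com/index/ajaxGetItem?cat_id=0&tag=&page='
           '%d-%d&area=0&tag_id=0&shop_id=0' % (page_index, p))
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode('utf-8')  # assumed UTF-8 payload
    except urllib.error.URLError as e:
        print('Connection failed:', getattr(e, 'reason', e))
        return None

The parsing half ports unchanged, since BeautifulSoup's find_all and get_text behave the same on both versions; only the .encode('utf-8') calls become unnecessary once the output file is opened in text mode.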
