使用Python抓取58同城(南京站)的演出票信息


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape show-ticket listings from 58.com (Nanjing site) into CSV files.

This is a Python 2 script (it relies on ``urllib2`` and ``raw_input``).
Each listing page is fetched, parsed with BeautifulSoup and written to
``./data/ticket_<index>.csv`` with one row per listing.
"""
import contextlib
import csv
import os
import re
import sys
import urllib2

from bs4 import BeautifulSoup as bs

# Python 2 only: make implicit unicode <-> str conversions use UTF-8 so that
# printing and formatting non-ASCII text does not raise UnicodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

BASE_URL = 'http://nj.58.com/piao/'
DATA_DIR = './data/'
USER_AGENT = ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:32.0) '
              'Gecko/20100101 Firefox/32.0')
# Placeholder written to the CSV when a listing lacks a field.
MISSING = '---'


def GetAllLink():
    """Ask how many pages to crawl, then scrape each page in turn.

    Creates the ``./data/`` output directory if it does not exist yet.
    The first page lives at the base URL; page ``n`` (n >= 2) lives at
    ``<base>pn<n>/``.

    Raises:
        ValueError: if the text entered at the prompt is not an integer.
    """
    num = int(raw_input("爬取多少页:>"))
    if not os.path.exists(DATA_DIR):
        os.mkdir(DATA_DIR)
    for i in range(num):
        page_no = i + 1
        if page_no == 1:
            url = BASE_URL
        else:
            url = '%spn%s/' % (BASE_URL, page_no)
        GetPage(url, i)


def _cell_text(tag):
    """Return the tag's text as UTF-8 bytes, or MISSING when tag is None."""
    if tag is None:
        return MISSING
    return tag.get_text().encode('utf-8')


def _parse_row(row):
    """Turn one ``<tr>`` into a CSV record, or None if it is not a listing.

    A row counts as a listing only when it contains the title link
    (``<a class="t">``). Every other field is optional and falls back to
    MISSING, so a listing without a price or date can neither raise nor
    shift the values of the listings that follow it.
    """
    title = row.find('a', 't')
    if title is None:
        return None
    href = title.get('href')
    return [
        _cell_text(title),                                # title
        href.encode('utf-8') if href else MISSING,        # url
        _cell_text(row.find('b', 'pri')),                 # selling price
        _cell_text(row.find('del')),                      # original price
        _cell_text(row.find('span', 'pr25')),             # show time
    ]


def GetPage(url, num):
    """Fetch one listing page and save its listings to a CSV file.

    Args:
        url: Address of the listing page to download.
        num: Zero-based page index; used in the output file name
            ``./data/ticket_<num>.csv`` and, plus one, in the progress
            message.

    The page is decoded as UTF-8. Rows are taken from the first ``<table>``
    in the document; if the page has no table, only the header row is
    written.
    """
    headers = {'User-Agent': USER_AGENT}
    req = urllib2.Request(url, headers=headers)
    # urllib2 responses are not context managers in Python 2; closing()
    # guarantees the connection is released even if read/decode fails.
    with contextlib.closing(urllib2.urlopen(req)) as response:
        page = response.read().decode('utf-8')

    # No parser is named, so BeautifulSoup picks the best one installed.
    soup = bs(page)
    table = soup.table
    rows = table.find_all('tr') if table is not None else []

    # Extract the fields row by row so that they stay aligned per listing.
    records = []
    for row in rows:
        record = _parse_row(row)
        if record is not None:
            records.append(record)

    # Binary mode is what the Python 2 csv module expects; the with-block
    # closes the file even when writing raises.
    with open('./data/ticket_%s.csv' % num, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['标题', 'url', '售价', '原价', '演出时间'])
        # Rows are written directly; the csv module does the quoting, so a
        # '|' or ',' inside a title cannot break the column layout.
        writer.writerows(records)

    print("[Result]:> 页面 %s 信息保存完毕!" % (num + 1))


if __name__ == '__main__':
    GetAllLink()

评论关闭