抓取三大电商今日特价:抓取京东、新蛋、易迅的今日特价


抓取京东、新蛋、易迅的今日特价,省得每次都要打开三个网页了,嘿嘿。

Beautiful Soup 好用是好用,就是要先搞清楚 Tag 之类的概念,有时候容易混淆。

# -*- coding: utf-8 -*-
"""Print today's special offers from three online shops.

The script downloads the front pages of JD (www.jd.com), Newegg China
(zhadan.newegg.com.cn) and 51buy (www.51buy.com), picks the promotional
entries out of the HTML with Beautiful Soup and prints them to stdout.

It targets Python 2 (it relies on ``urllib2``).  ``print_function`` is
imported so that every ``print`` is an ordinary function call.
"""
from __future__ import print_function

import contextlib
import json
import re
import sqlite3
import urllib2
from time import gmtime, strftime, localtime

from bs4 import BeautifulSoup

# Browser identity sent to JD only; the other two sites are fetched with the
# default urllib2 headers.
_JD_USER_AGENT = ('Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; '
                  'WOW64; Trident/5.0)')

# Captures the remainder of the line that follows "pvalues:" on a Newegg
# detail page ("." never matches a newline, so the capture stops there).
_PRICE_RE = re.compile('pvalues:(.*)')


def _fetch(url, user_agent=None):
    """Download *url* and return the raw (undecoded) response body.

    The response object is always closed, including when ``read()`` raises.
    When *user_agent* is given it is sent as the ``User-agent`` header.
    """
    request = urllib2.Request(url)
    if user_agent is not None:
        request.add_header('User-agent', user_agent)
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return response.read()


def _list_items(nodes, index):
    """Return the ``<li>`` tags below ``nodes[index]``, or ``[]`` if absent."""
    if index >= len(nodes):
        return []
    return nodes[index].find_all('li')


def getPrice(link):
    """Return the price text of the first promoted item reachable from *link*.

    *link* is fetched and every ``<p class="promoText">`` on it is followed
    through its first anchor.  For each detail page the link, the promo
    paragraphs and the ``<h1>`` headings are printed; the text following
    ``pvalues:`` on the first matching line is returned.

    Returns ``None`` when no followed page contains ``pvalues:``.
    """
    listing = BeautifulSoup(_fetch(link), from_encoding='gb2312')
    for promo in listing("p", "promoText"):
        anchor = promo.a
        if anchor is None or not anchor.get('href'):
            # A promo paragraph without a usable link cannot be followed.
            continue
        detail_link = anchor['href']
        print(detail_link)

        detail_html = _fetch(detail_link)
        detail = BeautifulSoup(detail_html, from_encoding='gb2312')
        for blurb in detail("p", "promoText"):
            print(blurb.text)
        for heading in detail("h1"):
            print(heading.text)

        # Search the body that was just downloaded instead of requesting the
        # page a second time; the first match is the first matching line.
        match = _PRICE_RE.search(detail_html)
        if match is not None:
            return match.group(1)
    return None


def show_jd_deals():
    """Print the JD entries whose ``onclick`` mentions ``fengkuang2012``."""
    print('jd:', '*' * 60)
    soup = BeautifulSoup(_fetch('http://www.jd.com', _JD_USER_AGENT),
                         from_encoding='gb18030')
    deals = soup("li", onclick=re.compile('fengkuang2012'))
    print(deals)
    print("\n" * 5)
    for deal in deals:
        print(deal)
        print("\n" * 2)
        # The positional lookups below depend on the exact child layout of
        # each <li>; they raise if the markup differs.
        cells = deal.contents
        print('物品:', cells[3].contents[0].text)  # item name
        print('价格:', cells[5].contents[1].text)  # price
        print('链接:', cells[1].a['href'])         # link
        print('图片:', cells[1].img['src'])        # image
        print('*' * 80)


def show_newegg_deals():
    """Print price and image for each Newegg entry, flagging locked ones."""
    # The blank lines and the label are one argument so that no separator
    # space ends up between them.
    print("\n\n" + 'newegg_cn:', '*' * 60)
    soup = BeautifulSoup(_fetch('http://zhadan.newegg.com.cn'),
                         from_encoding='gb2312')
    for panel in soup("div", 'inner'):
        for li in panel.find_all('li'):
            # An <li> without a class attribute is treated as not locked.
            classes = li.get('class') or []
            if classes and classes[-1] == u'locked':
                print("Locked item: ", li.a['href'])
            else:
                print(getPrice(li.a['href']))
            print("Img Link: ", li.img['src'])


def show_51buy_deals():
    """Print today's and tomorrow's hot lists from the 51buy front page."""
    print("\n\n" + '51buy:', '>' * 60)
    soup = BeautifulSoup(_fetch('http://www.51buy.com'), from_encoding='utf8')

    children = None
    for panel in soup("div", 'bd_inner'):
        children = panel.contents
        print(children)
        print("\n" * 5)
    if children is None:
        # No "bd_inner" block on the page: there is nothing to index into.
        print("No 'bd_inner' block found on 51buy.")
        return

    # The lists are read from the last "bd_inner" block seen above.
    # Today's hot list:
    print("Today's hot list, come on baby:\n\n")
    for li in _list_items(children, 1):
        print(li.a['href'])
        print(li.a['title'])
        print(li.a.text)
        print(li.find("div", 'price').text)
        print(li.img['_src'])
        print('\n')
    print('\n' * 5)

    print("Tomorrow hot list to be expected:\n\n")
    # Not yet started -- for tomorrow's hot list there is no price
    # information, only a "wait" block.
    for li_t in _list_items(children, 3):
        print(li_t.a['href'])
        print(li_t.a['title'])
        print(li_t.a.text)
        print(li_t.find("div", 'wait').text)
        print(li_t.img['_src'])
        print('\n')


def main():
    """Scrape and print the offers of all three shops in turn."""
    # To go through an HTTP proxy, install an opener first:
    #     proxyhandler = urllib2.ProxyHandler({'http': 'http://proxy.xxxx.com'})
    #     opener = urllib2.build_opener(proxyhandler)
    #     urllib2.install_opener(opener)

    # NOTE(review): items.db is opened (and so created) but nothing is
    # written to it -- presumably a placeholder for persisting the scraped
    # items; confirm before removing.
    with contextlib.closing(sqlite3.connect("items.db")):
        show_jd_deals()
        show_newegg_deals()
        show_51buy_deals()


if __name__ == '__main__':
    main()
# This snippet comes from http://byrx.net

评论关闭