抓取三大电商今日特价,抓取三大电商,抓京东, 新蛋,易迅的今
抓取三大电商今日特价,抓取三大电商,抓京东, 新蛋,易迅的今
抓取京东、新蛋、易迅的今日特价,省得每次都要打开三个网页了,嘿嘿。
Beautiful Soup 好用是好用,就是要先搞清楚 Tag 之类的概念,有时候容易混淆。
# -*- coding: gb2312 -*-import urllib2import reimport jsonfrom bs4 import BeautifulSoupimport sqlite3from time import gmtime, strftime, localtimedef getPrice(link): f3 = urllib2.urlopen(link) html3 = f3.read() try: html3.decode('gb2312') except: pass soup_hot = BeautifulSoup(html3, from_encoding='gb2312') tag_hot = soup_hot("p", "promoText") for tag in tag_hot: detail_link = tag.a['href'] print detail_link f4 = urllib2.urlopen(detail_link) html4 = f4.read()## try:## html4.decode('gb2312')## except:## pass soup_detail = BeautifulSoup(html4, from_encoding='gb2312') for tag in soup_detail("p","promoText"): print tag.text for tag2 in soup_detail("h1"): print tag2.text f5 = urllib2.urlopen(detail_link) pattern_price = re.compile("pvalues(.*)") for line in f5.readlines(): m = re.search('pvalues:(.*)', line) if m is not None: return m.group(1) else: continueif __name__ == '__main__':## proxyhandler = urllib2.ProxyHandler({'http':'http://proxy.xxxx.com'})## opener = urllib2.build_opener(proxyhandler)## urllib2.install_opener(opener) conn = sqlite3.connect("items.db") cur = conn.cursor() current_time_x = localtime() current_date = strftime("%Y/%m/%d", current_time_x) current_time = strftime("%H:%M", current_time_x) print 'jd:', '*'*60 request = urllib2.Request('http://www.jd.com') request.add_header('User-agent', 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)') f = urllib2.urlopen(request) html = f.read() try: html.decode('gb18030') except: pass soup = BeautifulSoup(html,from_encoding='gb18030') myContentList = soup("li", onclick=re.compile('fengkuang2012')) print myContentList print "\\n"*5 for tag in myContentList: print tag print "\\n"*2 print '物品:',tag.contents[3].contents[0].text print '价格:',tag.contents[5].contents[1].text print '链接:',tag.contents[1].a['href'] print '图片:',tag.contents[1].img['src'] print '*'*80 print "\\n\\n", 'newegg_cn:', '*' * 60 f2 = urllib2.urlopen('http://zhadan.newegg.com.cn') html2 = f2.read() try: 
html2.decode('gb2312') except: pass soup_newegg = BeautifulSoup(html2, from_encoding='gb2312') tag_bomb = soup_newegg("div", 'inner') for tag in tag_bomb: lis = tag.find_all('li') for li in lis: if li['class'][-1] != u'locked': print getPrice(li.a['href']) print "Img Link: ", li.img['src'] else: print "Locked item: ", li.a['href'] print "Img Link: ", li.img['src'] print "\\n\\n", '51buy:', '>'*60 f_51buy = urllib2.urlopen('http://www.51buy.com') html_51buy = f_51buy.read() soup_51buy = BeautifulSoup(html_51buy, from_encoding='utf8') tag_quickbuy = soup_51buy("div", 'bd_inner') for tag in tag_quickbuy: mylist = tag.contents print mylist print "\\n"*5# Today's hot list: print "Today's hot list, come on baby:\\n\\n" lis = mylist[1].find_all('li') for li in lis: print li.a['href'] print li.a['title'] print li.a.text print li.find("div", 'price').text print li.img['_src'] print '\\n' print '\\n'*5 print "Tomorrow hot list to be expected:\\n\\n"# Not yet started -- For tomorrow's hot list, no price information lis_tomorrow = mylist[3].find_all('li') for li_t in lis_tomorrow: print li_t.a['href'] print li_t.a['title'] print li_t.a.text print li_t.find("div", 'wait').text print li_t.img['_src'] print '\\n'#该片段来自于http://byrx.net
相关内容
- 如何把图片存入数据库,图片存入数据库,MYSQL 是支持把
- Python堆排序(最大堆),python堆排序最大堆,# -*- codin
- Python递归遍历文件夹,寻找包含某个字符串的文本文件
- Python实现读取文件,python实现读取,def get_byte
- Python3 的 tkinter 简单实例,python3tkinter,创建一个小窗口
- 从Bing中国上下载背景图片,bing中国背景图片,改用url
- saltstack reg模块修复,saltstackreg模块,原来的modules/r
- 局域网聊天---当前有图形和基本的架构 分享有时间的可
- 下载进度条显示,进度条显示,#!/usr/bin/p
- 掷骰子游戏,,游戏规则:玩家投掷两个骰
评论关闭