Python: Scrape Juhuasuan (聚划算) Product Pages for Product Info and Save It Locally as XML


This article shares a complete working example of a Python spider that crawls Juhuasuan (聚划算) product listing pages, extracts each product's title, price, image, and link, and saves the results locally as an XML file, for your reference. The full code is as follows:

#!/usr/bin/python 
# -*- coding: utf-8 -*- 
#Spider.py 
 
import urllib2 
import httplib 
import StringIO 
import gzip 
import re 
import chardet 
import sys 
import os 
import datetime 
from xml.dom.minidom import Document 
from BeautifulSoup import BeautifulSoup 
 
## Work around UnicodeDecodeError when printing Chinese text on the console 
reload(sys) 
sys.setdefaultencoding("utf8") 
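## (reload() is needed because Python 2 removes sys.setdefaultencoding 
## from the sys module at interpreter startup) 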
##################################################### 
 
## Debug switch: when enabled, httplib prints the HTTP request headers and debug logs 
DEBUG = 1 
NO_DEBUG = 0 
httplib.HTTPConnection.debuglevel = DEBUG 
## Switch: whether to print the fetched page source 
showSrcCode = False 
## Compression scheme accepted from the server 
ZIP_TYPE = "gzip" 
 
fileName = "auctions" 
location = "d://spiderData/" 
 
## request headers 
headerConfig = {"User-Agent":"taobao-yanyuan.qzs", "Accept-encoding":ZIP_TYPE} 
##################################################### 
 
 
#############class SpiderConfig ##################### 
class SpiderConfig: 
  """ 
    configuration for spider name and url 
  """ 
  def __init__(self, name, url): 
    self.name = name 
    self.url = url 
##################################################### 
 
##############class SpiderAuctionDomain############## 
class SpiderAuctionDomain: 
  """ 
    Stores the information for one auction scraped by the spider 
  """ 
  def __init__(self): 
    ## instance fields rather than shared class attributes; the name "link" 
    ## matches the field used by parse() and save() below 
    self.title = "" 
    self.link = "" 
    self.img = "" 
    self.price = "" 
 
##################################################### 
 
########class SpiderDefaultErrorHandler############## 
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler): 
  def http_error_default(self, req, fp, code, msg, hdrs): 
    """ 
      default error process handler for spider 
    """ 
    result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp) 
    result.status = code 
    result.url = req.get_full_url() 
 
    print "<", result.url, "Exception code :", result.status, ">" 
 
    return result 
##################################################### 
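## note: because http_error_default returns the HTTPError object instead of 
## raising it, opener.open() hands back a response-like object whose .status 
## and .url record the failure, letting the spider log the error and continue 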
 
#############class SpiderHandler##################### 
class SpiderHandler: 
  """ 
    spider handler 
  """ 
 
  def spider(self, spiderConfig): 
    try: 
      request = urllib2.Request(spiderConfig.url) 
 
      ## configure request headers 
      for key,val in headerConfig.items(): 
        request.add_header(key, val) 
 
      ## build opener 
      opener = urllib2.build_opener(SpiderDefaultErrorHandler()) 
 
      ## open request 
      openRequest = opener.open(request) 
 
      ## read data 
      spiderData = openRequest.read() 
 
      ## close 
      opener.close() 
 
      if 0 == len(spiderData): 
        return 
 
      if ZIP_TYPE == openRequest.headers.get("Content-Encoding"): 
        spiderData = self.gzipData(spiderData) 
 
      if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode: 
        print spiderData 
 
      # parse html 
      self.parse(spiderData) 
 
    except Exception,x: 
      print "spider process Exception:", x 
 
 
 
  def parse(self, spiderData): 
    """ 
      parse html content 
    """ 
 
    if httplib.HTTPConnection.debuglevel == DEBUG: 
      charsetAnalyze = chardet.detect(spiderData) 
      print "analyze spider data encode :",charsetAnalyze["encoding"] 
 
    print "执行解析", fileName 
 
    soup = BeautifulSoup(spiderData) 
    encode = soup.originalEncoding 
 
    encoding = lambda x : x.encode(encode) 
 
    if httplib.HTTPConnection.debuglevel == DEBUG: 
      print "识别到编码:", encode 
      title = soup.head.title.string 
      print encoding(title) 
 
    spiderContents = soup.findAll(name="div", attrs={"class":"main-box avil"}) 
    ## stringify each matched div so the regexes below can search its raw html 
    auctions = ["%s" % s for s in spiderContents] 
 
    if not auctions: 
      return 
 
    auctionList = [] 
 
    for auc in auctions: 
      auctionDomain = SpiderAuctionDomain() 
      # parse auction link 
      links = re.search(re.compile(r'<a href=[\"|\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"|\']', re.IGNORECASE), auc) 
      if links is not None : 
        auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0])) 
 
      #parse auction title 
      titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc) 
      if titles is not None: 
        auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0])) 
 
      #parse auction price 
      price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc) 
      if price is not None: 
        auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0]) 
 
      #parse image url 
      imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc) 
      if imgs is not None: 
        auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0]) 
 
      auctionList.append(auctionDomain) 
 
    print "成功解析商品信息:" 
    for a in auctionList: 
      print "--->",a.title 
 
    # sort auction list by price 
    auctionList = self.sortAuctionList(auctionList) 
 
    # save to file 
    self.save(auctionList) 
 
    print "解析完成" 
 
    pass 
 
  def sortAuctionList(self, auctionList): 
    """ 
      bubble sort: order the auctions by price, ascending 
    """ 
    length = len(auctionList) 
    if length < 2: 
      return auctionList 
    for i in range(length - 1): 
      for j in range(length - i - 1): 
        if float(auctionList[j].price) > float(auctionList[j+1].price): 
          auctionList[j], auctionList[j+1] = auctionList[j+1], auctionList[j] 
    return auctionList 
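  ## note: Python's built-in sorted() gives the same ordering in one line, e.g. 
  ## sorted(auctionList, key=lambda a: float(a.price)); the hand-rolled bubble 
  ## sort above is kept from the original article 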
 
  def save(self, auctionList): 
    if auctionList is not None: 
      doc = Document() 
 
      auctions = doc.createElement("auctions") 
      doc.appendChild(auctions) 
 
      for auc in auctionList: 
        auction = doc.createElement("auction") 
        auctions.appendChild(auction) 
 
        self.generateXML(doc, auction, "title", auc.title) 
        self.generateXML(doc, auction, "price", auc.price) 
        self.generateXML(doc, auction, "img", auc.img) 
        self.generateXML(doc, auction, "link", auc.link) 
 
      if not os.path.exists(location): 
        os.mkdir(location) 
 
      ## "f" instead of "file", which shadows the builtin 
      f = open(location + fileName + ".xml", 'w') 
      f.write(doc.toprettyxml()) 
      f.close() 
 
      if httplib.HTTPConnection.debuglevel == DEBUG: 
        print doc.toprettyxml() 
 
  def generateXML(self, doc, f, name, txt): 
    """ 
      append a child element <name> containing txt to node f 
    """ 
    c = doc.createElement(name) 
    f.appendChild(c) 
    c.appendChild(doc.createTextNode(txt)) 
 
  def gzipData(self, spiderData): 
    """ 
      get data from gzip 
    """ 
    if 0 == len(spiderData): 
      return spiderData 
    spiderDataStream = StringIO.StringIO(spiderData) 
    spiderData = gzip.GzipFile(fileobj=spiderDataStream).read() 
    return spiderData 
##################################################### 
 
if __name__ == "__main__": 
  nowtime = lambda: datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %H:%M:%S") 
 
  needSpiderUrl = {"suzhou":"http://ju.taobao.com/suzhou", 
           "hangzhou":"http://ju.taobao.com/hangzhou", 
           "shanghai":"http://ju.taobao.com/shanghai", 
           "beijing":"http://ju.taobao.com/beijing", 
           "chengdu":"http://ju.taobao.com/chengdu"} 
 
  configList = [] 
  for k,v in needSpiderUrl.items(): 
    spiderConfig = SpiderConfig(k, v) 
    configList.append(spiderConfig) 
 
  spiderHandler = SpiderHandler() 
 
  print "爬虫执行开始时间:",nowtime() 
  for spiderConfig in configList: 
    fileName = spiderConfig.name 
    spiderHandler.spider(spiderConfig) 
 
  print "爬虫执行完毕时间:",nowtime() 

That is the entire content of this article. I hope it helps with your studies, and please continue to support 帮客之家.
