A simple RSS reader tool

The script below fetches an RSS or Atom feed, extracts each entry's title, link, date and summary with BeautifulSoup, and prints them to the console.


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""A simple RSS/Atom reader: fetch a feed, parse its entries and print them."""
import re
import sys
import urllib2

import requests
from bs4 import BeautifulSoup as sp

reload(sys)
sys.setdefaultencoding("utf-8")

headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}


def urlread(url):
    """Download the feed; fall back to urllib2 if requests fails."""
    try:
        req = requests.get(url, headers=headers)
        req.encoding = "utf-8"
        return req.text.encode("utf-8")
    except Exception:
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        return response.read().encode("utf-8")


class Item:
    """One feed entry: title, link, publication date and a short summary."""

    def __init__(self, title, link, date, description):
        self.title = title.strip()
        self.link = link.strip()
        self.pubDate = date.strip()
        self.description = self.filter(description).strip()

    def filter(self, description):
        """Strip HTML tags and newlines, then truncate to 240 characters."""
        description = re.sub("<.*?>", '', description)
        description = re.sub("\r", '', description)
        description = re.sub("\n", '', description)
        description = re.sub("&nbsp;", " ", description)
        if len(description) > 240:
            description = description[:240] + '...'
        return description

    def __str__(self):
        return "%s\n%s\n%s\n<%s>\n" % (
            self.title,
            self.link,
            self.description,
            self.pubDate,
        )

    __repr__ = __str__


class BSParser(object):
    def __init__(self, url):
        xml = urlread(url)
        self.reset(xml)

    def reset(self, xml=None):
        if xml is None:
            self.soup = sp("<xml> </xml>", "xml")
        else:
            self.soup = sp(xml, "xml")

    def callback(self, method, obj, tags):
        """Call soup.<method>(tag) for each candidate tag until one matches.

        This lets the same parser handle both RSS (item/pubDate/description)
        and Atom (entry/published/content) feeds.
        """
        rst = None
        attr = method.lower()
        for tag in tags:
            try:
                rst = getattr(obj, attr)(tag)
            except Exception:
                continue
            if rst:
                break
        return rst

    def getfields(self, tags=("item", "entry")):
        return self.callback(method="FIND_ALL", obj=self.soup, tags=tags)

    def gettitle(self, obj, tags=("title",)):
        return self.callback("FIND", obj, tags).text

    def getlink(self, obj, tags=("link",)):
        # RSS puts the URL in the tag text; Atom puts it in the href attribute.
        rst = self.callback("FIND", obj, tags).text
        if not rst:
            rst = self.callback("FIND", obj, tags).get("href")
        return rst

    def getdate(self, obj, tags=("pubDate", "published")):
        return self.callback("FIND", obj, tags).text

    def getdescription(self, obj, tags=("description", "content")):
        return self.callback("FIND", obj, tags).text

    def run(self):
        """Yield an Item for every entry in the feed."""
        for item in self.getfields():
            title = self.gettitle(item)
            link = self.getlink(item)
            date = self.getdate(item)
            description = self.getdescription(item)
            yield Item(title, link, date, description)


def test():
    # Replace the placeholder URL with the feed you want to read.
    parser = BSParser("http://example.com/rss.xml")
    for item in parser.run():
        print item


if __name__ == "__main__":
    test()
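
To make the fallback logic in callback()/getfields() concrete, here is a minimal sketch (the two inline snippets below are made-up stand-ins for real feeds, not part of the original script): BeautifulSoup's find_all is tried with each candidate tag name in order, so the same code reads both RSS (<item>) and Atom (<entry>) feeds.

# -*- coding: utf-8 -*-
# Minimal sketch of the tag-name fallback used by BSParser.callback().
# The two XML strings are hypothetical examples, not real feeds.
from bs4 import BeautifulSoup as sp

rss = "<rss><channel><item><title>hello</title></item></channel></rss>"
atom = "<feed><entry><title>world</title></entry></feed>"

for xml in (rss, atom):
    soup = sp(xml, "xml")
    for tag in ("item", "entry"):          # same fallback order as getfields()
        fields = getattr(soup, "find_all")(tag)
        if fields:                         # first tag name that matches wins
            break
    print fields[0].find("title").text     # prints "hello", then "world"

The same pattern is reused for pubDate/published and description/content, which is why the parser needs no explicit RSS-versus-Atom branch.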
