python使用SGMLParser提取文本正文内容,pythonsgmlparser,如下代码:from sg
python使用SGMLParser提取文本正文内容,pythonsgmlparser,如下代码:from sg
如下代码(仅适用于 Python 2,sgmllib 模块在 Python 3 中已被移除)使用 SGMLParser 解析网页,并把文字最多的 <div> 作为正文返回:
from sgmllib import SGMLParserfrom urllib import urlopenfrom urlparse import urljoinclass AllTextParser(SGMLParser): def __init__(self, url): SGMLParser.__init__(self) self.url = url self.is_js = False self.is_st = False self.div = [] self.charset = 'utf-8' self.all_div = [] www = urlopen(url) self.feed(www.read()) www.close() def start_script(self, a): self.is_js = True def end_script(self): self.is_js = False def start_style(self, a): self.is_st = True def end_style(self): self.is_st = False def start_p(self, a): if self.div: self.div[-1][0] += '<p>' def end_p(self): if self.div: self.div[-1][0] += '</p>' def start_a(self, a): if self.div: d = dict(a) try:self.div[-1][0] += '<a target="_blank" href="%s">' % urljoin(self.url, d['href']) except:pass def end_a(self): if self.div: self.div[-1][0] += '</a>' def start_img(self, a): if self.div: d = dict(a) try:self.div[-1][0] += '<img src="%s" border="0" alt="%s" />' % (urljoin(self.url, d['src']), d.get('alt', '')) except:pass def start_meta(self, a): d = dict(a) if d.get('http-equiv') == 'Content-Type': try: self.charset = d.get('content').split('=')[1].lower() if self.charset[:2] == 'gb': self.charset = 'gb18030' self.type = d.get('content').split(';')[0] except:pass '''elif d.has_key('name'): try:setattr(self, d.get('name'), d.get('content')) except:pass''' def start_br(self, a): if self.div: self.div[-1][0] += '<br />' def start_div(self, a): self.div.append(['', 0]) def end_div(self): try: self.all_div.append(self.div[-1]) del(self.div[-1]) except: pass def handle_data(self, text): if self.is_js or self.is_st: pass elif self.div: self.div[-1][0] += text self.div[-1][1] += len(text) def get_result(self): m = 0 c = 0 mc = 0 for x in self.all_div: l = x[1] if l > m: mc = c m = l c += 1 if self.charset not in ['utf-8', 'utf8']: return self.all_div[mc][0].decode(self.charset).encode('utf8') return self.all_div[mc][0]if __name__ == '__main__': webSite = 
AllTextParser('http://byrx.net/code-snippet/1625/Hibernate-how-achieve-pagination-search') print webSite.get_result()
相关内容
- Python thread socket server,pythonsocket,从网上参考了一些代码
- python list使用技巧总结,pythonlist使用技巧,判断一个 l
- python dict使用技巧,pythondict,在 Dictionary
- Python统计列表中元素出现的次数,python统计列表元素
- Python yield使用示例,pythonyield示例,下面2段代码通过Py
- python使用eval,exec将字符串(string)转为字典(dict),pythone
- Python POP3 收取邮件,pythonpop3,python 为我们提供
- python爬虫抓网页的总结,python爬虫抓,学用python也有3个
- python核心模块之pickle和cPickle的使用示例,picklecpickle,
- Python使用ConfigParser读写ini配置文件,,ini文件格式概述
评论关闭