获取网页的正文:有什么办法可以直接获取一个网页的正文?
有什么办法可以直接获取一个网页的正文?想想就觉得挺难,里面有这么多HTML元素,还真不知道应该提取哪里。
这里有一个原理的讲解:http://code.google.com/p/cx-extractor/
下面是该算法的 Python 实现(原代码基于 Python 2)。
# -*- coding=utf-8 -*-
"""Heuristic main-text extraction from an HTML page.

The ``<body>`` of the page is stripped of markup, split into lines, and
scored with a sliding window of ``BLOCK_HEIGHT`` lines.  Runs of dense
windows are taken to be the body text.
Algorithm background: http://code.google.com/p/cx-extractor/

Usage::

    extractor = TextExtract(html)
    print(extractor.title)
    print(extractor.content)
"""
import cProfile
import codecs
import contextlib
import os
import re
import sys

try:  # Python 2
    import urllib2
    from urllib2 import URLError
except ImportError:  # Python 3
    import urllib.request as urllib2
    from urllib.error import URLError

if sys.version_info[0] == 2:
    # Legacy Python 2 hack kept from the original script: make implicit
    # str <-> unicode conversions use UTF-8 instead of ASCII.
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding('utf-8')

re_title = re.compile(r'<title>(.*?)</title>', re.I | re.U | re.S)
re_body = re.compile(r'<body[^>]*>.*</body>', re.I | re.U | re.S)
re_doc_type = re.compile(r'<!DOCTYPE.*?>', re.I | re.U | re.S)
re_comment = re.compile(r'<!--.*?-->', re.I | re.U | re.S)
# The original pattern was '<script.[^>]*>', which requires an extra
# character after '<script' and therefore fails on a bare '<script>' tag.
re_js = re.compile(r'<script[^>]*>.*?</script>', re.I | re.U | re.S)
re_css = re.compile(r'<style[^>]*>.*?</style>', re.I | re.U | re.S)
re_special = re.compile(r'&.{2,8};|&#.{2,8};', re.I | re.U | re.S)
re_other = re.compile(r'<[^>]*>', re.I | re.U | re.S)
re_whitespace = re.compile(r'\s+')

# Number of consecutive lines summed into one density window.
BLOCK_HEIGHT = 3
# A window must hold more than this many characters to start a text block.
THRESHOLD = 90
# Page fetched when the script is run without a command-line argument.
DEFAULT_URL = 'http://www.v-find.com'


class TextExtract(object):
    """Extract the title and the main text from an HTML document.

    Extraction runs in the constructor; the results are available as the
    ``title`` and ``content`` attributes afterwards.

    Args:
        new_html: The HTML document as a text string.
        join: If true (default), concatenate every dense text block found in
            the page; otherwise keep only the first one.
    """

    def __init__(self, new_html, join=True):
        self.html = new_html
        self.join = join
        self.text_start = 0
        self.text_end = 0
        self.text_body = ''
        self.block_len = []
        self.title = ''
        self.content = ''
        self.extract()

    def extract(self):
        """Run the full pipeline: title, body, tag stripping, text blocks."""
        self.extract_title()
        self.extract_body()
        self.remove_tags()
        self.extract_text()

    def extract_title(self):
        """Store the contents of the first ``<title>`` element, if any."""
        m = re_title.search(self.html)
        if m:
            self.title = m.group(1)

    def extract_body(self):
        """Store the ``<body>...</body>`` span (tags included), if any."""
        m = re_body.search(self.html)
        if m:
            self.text_body = m.group()

    def remove_tags(self):
        """Strip doctype, comments, scripts, styles, entities and tags.

        Order matters: scripts and styles are removed together with their
        contents before the generic tag pattern removes the remaining markup.
        """
        for pattern in (re_doc_type, re_comment, re_js, re_css,
                        re_special, re_other):
            self.text_body = pattern.sub('', self.text_body)

    def extract_text(self):
        """Locate dense text blocks in the stripped body and fill ``content``.

        ``content`` is set to ``'nothing can find'`` when no window exceeds
        ``THRESHOLD``.
        """
        # Start from a clean state so a repeated extract() does not
        # accumulate results from the previous run.
        self.block_len = []
        self.content = ''

        # Collapse runs of whitespace and trim every line.
        lines = [re_whitespace.sub(' ', line).strip()
                 for line in self.text_body.split('\n')]

        # Drop short lines (< 30 chars) that sit alone between two empty
        # lines; they are treated as noise rather than body text.
        for i in range(1, len(lines) - 1):
            if 0 < len(lines[i]) < 30 and not lines[i - 1] and not lines[i + 1]:
                lines[i] = ''

        # block_len[i] = total characters in lines[i : i + BLOCK_HEIGHT].
        for i in range(len(lines) - BLOCK_HEIGHT):
            self.block_len.append(
                sum(len(line) for line in lines[i:i + BLOCK_HEIGHT]))

        self.text_start = self.find_text_start(0)
        self.text_end = 0
        if self.text_start == 0:
            self.content = 'nothing can find'
            return

        if not self.join:
            self.text_end = self.find_text_end(self.text_start)
            self.content += self.get_text(lines)
            return

        line_count = len(lines)
        while self.text_end < line_count:
            self.text_end = self.find_text_end(self.text_start)
            self.content += self.get_text(lines)
            self.text_start = self.find_text_start(self.text_end)
            if self.text_start == 0:
                break
            self.text_end = self.text_start

    def find_text_start(self, index):
        """Return the first window at or after *index* that starts a block.

        A block starts at window ``i`` when ``block_len[i] > THRESHOLD`` and
        the following window is non-empty.

        Returns:
            The window index, or 0 when no such window exists.

        NOTE: 0 doubles as the "not found" value, so a block that genuinely
        starts at window 0 is reported as not found.  Kept as-is because
        callers compare the result against 0.
        """
        for i in range(index, len(self.block_len) - 1):
            if self.block_len[i] > THRESHOLD and self.block_len[i + 1] > 0:
                return i
        return 0

    def find_text_end(self, index):
        """Return the first window at or after *index* that ends a block.

        A block ends at the first pair of consecutive empty windows; if there
        is none, the index of the last window is returned.
        """
        blk_len = len(self.block_len)
        for i in range(index, blk_len - 1):
            if self.block_len[i] == 0 and self.block_len[i + 1] == 0:
                return i
        return blk_len - 1

    def get_text(self, lines):
        """Return ``lines[text_start:text_end]``, each followed by a newline."""
        return ''.join(
            line + '\n' for line in lines[self.text_start:self.text_end])


def _decode_body(raw, headers):
    """Decode the raw response bytes using the declared charset.

    Falls back to UTF-8 (with replacement characters) when no charset is
    declared or the declared one is unknown to Python.
    """
    charset = None
    if hasattr(headers, 'get_content_charset'):  # Python 3 message object
        charset = headers.get_content_charset()
    elif hasattr(headers, 'getparam'):  # Python 2 message object
        charset = headers.getparam('charset')
    try:
        return raw.decode(charset or 'utf-8', 'replace')
    except LookupError:
        return raw.decode('utf-8', 'replace')


def main(url=DEFAULT_URL):
    """Fetch *url* and print the extracted main text.

    Network and HTTP errors are printed instead of raised.
    """
    try:
        # closing() works on both Python 2 and 3 response objects.
        with contextlib.closing(urllib2.urlopen(url)) as response:
            raw = response.read()
            headers = response.headers
    except URLError as e:  # HTTPError is a subclass of URLError
        print(e)
        return
    print(TextExtract(_decode_body(raw, headers)).content)


if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_URL)

# This snippet comes from http://byrx.net
相关内容
- Python 的 Twisted 和 ZeroMQ 集成的示例,twistedzeromq,import
- 像fileinput一样使用,处理tarfile文件组,fileinputtarfile,py
- 一只从百度开始不断搜索的小爬虫,百度搜索爬虫,这是
- 不带重复的全排列,不带重复排列,from sys imp
- 简单的猜数字代码,简单猜数字代码,# -*- coding
- 在多玩图片上下载妹子图,玩图片妹子图,# -*- coding
- 带有重复的全排列,带有重复排列,输入开始排列数字和
- 商品条码校验,,这两天刚学着python
- python tic-tac 改进输出版本,pythontic-tac,def print_bo
- 产生a-z的字符串,产生a-z字符串,# method 1pr
评论关闭