python 将html转换为markdown,pythonmarkdown,markdown格式要比
python 将html转换为markdown,pythonmarkdown,markdown格式要比
markdown格式要比html格式清爽的多,所以在某些场合需要将html转换为markdown,下面是一段转换的代码片段,来自git,有修改:
#!/usr/bin/env python#encoding=utf8from HTMLParser import HTMLParserimport reimport osimport sysclass Html2MarkdownParser(HTMLParser): def __init__(self): self._markdown = '' self._tag_stack = [] self._tag_attr_data = {} self._handled_tag_body_data = '' self._convertible_tags = ['a', 'b', 'blockquote', 'em', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre', 'strong', 'ul'] # FIXME: special characters HTMLParser.__init__(self) def _append_to_markdown(self, new_markdown): if len(self._markdown) > 1: if re.match('\s', self._markdown[-1:]): self._markdown += new_markdown else: self._markdown += ' ' + new_markdown else: self._markdown += new_markdown # <a /> def handle_start_a(self, attrs): self._tag_attr_data = dict(attrs) def handle_end_a(self): a_tag = '' a_tag += '[' + self._handled_tag_body_data + ']' a_tag += '(' + self._tag_attr_data.get('href') title = self._tag_attr_data.get('title') if title: a_tag += ' "' + title + '") ' else: a_tag += ') ' self._append_to_markdown(a_tag) # <b /> def handle_end_b(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('*' + self._handled_tag_body_data + '*') # <blockquote /> def handle_end_blockquote(self): blockquote_body = self._handled_tag_body_data.split(os.linesep) for blockquote_line in blockquote_body: blockquote_line = blockquote_line.strip() self._append_to_markdown('> ' + blockquote_line + os.linesep) # <em /> def handle_end_em(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('*' + self._handled_tag_body_data + '*') # <h1 /> def handle_end_h1(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('# ' + self._handled_tag_body_data + ' #' + os.linesep) # <h2 /> def handle_end_h2(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('## ' + self._handled_tag_body_data + ' ##' + os.linesep) # <h3 /> def handle_end_h3(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('### ' + self._handled_tag_body_data + ' ###' + os.linesep) # <h4 /> def handle_end_h4(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('#### ' + self._handled_tag_body_data + ' ####' + os.linesep) # <h5 /> def handle_end_h5(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('##### ' + self._handled_tag_body_data + ' #####' + os.linesep) # <h6 /> def handle_end_h6(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('###### ' + self._handled_tag_body_data + ' ######' + os.linesep) # <hr /> def handle_start_hr(self, attrs): self._append_to_markdown('* * *' + os.linesep) # <li /> def handle_end_li(self): if len(self._tag_stack): if self._tag_stack[-1] == 'ol': self._append_to_markdown('1. ' + self._handled_tag_body_data + os.linesep) elif self._tag_stack[-1] == 'ul': self._append_to_markdown('* ' + self._handled_tag_body_data + os.linesep) # <p /> def handle_start_p(self, attrs): if len(self._markdown) > 1: if self._markdown[-2:] == '%s%s' % (os.linesep, os.linesep): pass elif self._markdown[-1:] == os.linesep: self._markdown += os.linesep else: self._markdown += os.linesep + os.linesep def handle_end_p(self): self._markdown += '%s%s' % (os.linesep, os.linesep) # <pre /> def handle_end_pre(self): code_lines = self._handled_tag_body_data.split('\n') for code_line in code_lines: self._append_to_markdown(' ' + code_line + os.linesep) # <strong /> def handle_end_strong(self): self._handled_tag_body_data = self._handled_tag_body_data.replace(os.linesep, ' ') self._append_to_markdown('**' + self._handled_tag_body_data + '**') ## ### def handle_starttag(self, tag, attrs): self._tag_stack.append(tag) try: eval('self.handle_start_' + tag + '(attrs)') except AttributeError, e: pass def handle_endtag(self, tag): self._tag_stack.pop() try: eval('self.handle_end_' + tag + '()') # Collapse three successive CRs into two before moving on while len(self._markdown) > 2 and\ self._markdown[-3:] == '%s%s%s' % (os.linesep, os.linesep, os.linesep): self._markdown = self._markdown[:-3] + '%s%s' % (os.linesep, os.linesep) except AttributeError, e: pass self._tag_attr_data = {} self._handled_tag_body_data = '' def handle_data(self, data): data = os.linesep.join(data.strip().split(os.linesep)) if len(self._tag_stack) and self._tag_stack[-1] not in ['p']: self._handled_tag_body_data += data else: self._append_to_markdown(data) def get_markdown(self): return self._markdown.rstrip() + '\n'def main(): p = Html2MarkdownParser() buf = ''' here place your html code''' p.feed(buf) p.close() print p.get_markdown()if __name__ == "__main__": sys.exit(main())
注意这个只可以做有限的转换,table,div等标签是转换不了的。
相关内容
- python使用twisted库实现udp广播,pythontwisted,python使用twi
- python复制某文件夹下指定扩展名的文件,python扩展名
- python 对字符串数组按照相似性排序,python相似性,# [S
- 用python来备份目录,python备份目录,#!/usr/bin/p
- 基于用户推荐算法的k-nearest neighbor算法,k-nearestneighb
- python中thread的setDaemon、join的用法,pythonsetdaemon,#! /usr
- python将经纬度转换为kml文件,,读取文件,将其中的经纬
- python 获得日期是星期几,python星期几,如下代码:from
- python 获得一个月有多少天,python获得,在python的date
- Python 模板引擎性能对比,python性能对比,对比目标,j
评论关闭