python_读取 doc,docx,pdf


#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract plain text from .docx, .pdf and legacy .doc files.

* ``readDocx`` uses python-docx.
* ``readPdf`` uses pdfminer.
* ``readDoc`` drives Microsoft Word through COM (Windows only) to export the
  document as a text file, then reads that file back.

The module was written for Python 2; the code below is kept syntactically
valid on Python 3 as well.
"""
import io
import os
import shutil
import sys
import tempfile

try:
    from cStringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3

import docx
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from win32com import client

if sys.version_info[0] == 2:
    # Python 2 only: switch the interpreter's implicit str/unicode codec to
    # GB2312. Kept because existing callers may rely on this import-time side
    # effect; neither name exists on Python 3, hence the version guard.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('gb2312')

# Value passed as the FileFormat argument of Word's Document.SaveAs
# (wdFormatText in the WdSaveFormat enumeration).
WD_FORMAT_TEXT = 2

# Codec used to decode the text file that Word writes.
DOC_TEXT_ENCODING = 'gbk'


def readDocx(docxPath):
    """Return the text of a .docx file.

    Args:
        docxPath: Path of the .docx file to open.

    Returns:
        The text of every paragraph in ``Document.paragraphs``, stripped of
        surrounding whitespace and joined with newlines.
    """
    doc = docx.Document(docxPath)
    return '\n'.join(p.text.strip() for p in doc.paragraphs)


def readPdf(pdfPath):
    """Return the text that pdfminer extracts from a PDF file.

    Args:
        pdfPath: Path of the PDF file to open.

    Returns:
        Everything the ``TextConverter`` wrote while processing the pages.
    """
    resource_manager = PDFResourceManager()
    output = StringIO()
    try:
        device = TextConverter(resource_manager, output, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            with open(pdfPath, 'rb') as fp:
                # An empty page-number set and maxpages=0 place no
                # restriction on which pages are processed.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Release the converter even when a page fails to parse.
            device.close()
        return output.getvalue()
    finally:
        output.close()


def readDoc(docPath):
    """Return the text of a legacy .doc file by exporting it through Word.

    Word opens the document, saves it as a text file into a private
    temporary directory, and that file is then read back and decoded as GBK.
    The temporary directory is removed before returning, so nothing is left
    behind (earlier versions wrote to a fixed ``c:/temp.txt``).

    Args:
        docPath: Path of the .doc file, handed to Word's ``Documents.Open``.

    Returns:
        The exported lines, stripped of surrounding whitespace and joined
        with newlines.
    """
    temp_dir = tempfile.mkdtemp()
    text_path = os.path.join(temp_dir, 'temp.txt')
    try:
        word = client.Dispatch('Word.Application')
        try:
            doc = word.Documents.Open(docPath)
            try:
                doc.SaveAs(text_path, WD_FORMAT_TEXT)
            finally:
                doc.Close()
        finally:
            # Always shut Word down so a failure does not leave the
            # application running in the background.
            word.Quit()

        with io.open(text_path, 'r', encoding=DOC_TEXT_ENCODING) as f:
            return '\n'.join(line.strip() for line in f)
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == '__main__':
    # Swap in readDocx('d:/1.docx') or readPdf('d:/3.pdf') to try the other
    # readers.
    docValue = readDoc('d:/2.doc')
    print(docValue)

python_读取 doc,docx,pdf

评论关闭