python_读取 doc,docx,pdf
python_读取 doc,docx,pdf
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract plain text from .docx, .pdf and legacy .doc files.

Requires Python 3. The third-party back ends (python-docx, pdfminer and
pywin32) are imported inside the function that needs them, so the module
can be imported -- and the other readers used -- on machines where one of
them is missing (pywin32, for instance, exists only on Windows).
"""
import io
import os
import sys
import tempfile

# Value of Word's WdSaveFormat.wdFormatText: save the document as plain text.
_WD_FORMAT_TEXT = 2


def readDocx(docxPath):
    """Return the text of a .docx file.

    Each paragraph's text is stripped of surrounding whitespace and the
    paragraphs are joined with newlines.

    Args:
        docxPath: path of the .docx file, handed to ``docx.Document``.

    Returns:
        The joined paragraph text as a single string.
    """
    import docx  # python-docx; imported lazily, see module docstring

    document = docx.Document(docxPath)
    return '\n'.join(para.text.strip() for para in document.paragraphs)


def readPdf(pdfPath):
    """Return the text pdfminer extracts from a PDF file.

    Args:
        pdfPath: path of the PDF file; it is opened in binary mode.

    Returns:
        Whatever text the ``TextConverter`` produced for the pages yielded
        by ``PDFPage.get_pages``.
    """
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.pdfpage import PDFPage

    resource_manager = PDFResourceManager()
    # The context managers close the buffer and the PDF file even when
    # pdfminer raises part-way through the document.
    with io.StringIO() as buffer, open(pdfPath, 'rb') as pdf_file:
        device = TextConverter(
            resource_manager, buffer, codec='utf-8', laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            pages = PDFPage.get_pages(
                pdf_file,
                set(),       # no page-number filter
                maxpages=0,  # no page limit
                password='',
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        finally:
            device.close()
        return buffer.getvalue()


def readDoc(docPath, encoding='gbk'):
    """Return the text of a legacy .doc file, converted by Microsoft Word.

    Word is driven through COM to save the document as plain text into a
    private temporary directory; that text file is then read back, each
    line stripped, and the lines joined with newlines.

    Args:
        docPath: path of the .doc file. It is made absolute before being
            passed to Word, because Word does not share this process's
            working directory.
        encoding: encoding used to decode the text file Word writes.
            Defaults to ``'gbk'``.
            NOTE(review): Word presumably writes the system ANSI code page
            for this save format -- confirm, and pass a different encoding
            on systems where that is not GBK.

    Returns:
        The document text as a single string.
    """
    from win32com import client  # pywin32; Windows only

    source_path = os.path.abspath(docPath)
    # A fresh temporary directory avoids clashing with other runs and is
    # removed automatically, unlike a fixed file in the drive root.
    with tempfile.TemporaryDirectory() as work_dir:
        text_path = os.path.join(work_dir, 'converted.txt')

        word = client.Dispatch('Word.Application')
        try:
            # Open an existing file.
            document = word.Documents.Open(source_path)
            try:
                document.SaveAs(text_path, _WD_FORMAT_TEXT)
            finally:
                # Close without saving so a failed conversion cannot leave
                # Word waiting on a "save changes?" prompt.
                document.Close(False)
        finally:
            # Always shut Word down, otherwise a failure would leave a
            # Word process running in the background.
            word.Quit()

        with open(text_path, 'r', encoding=encoding) as text_file:
            return '\n'.join(line.strip() for line in text_file)


def _main(argv):
    """Print the text of the file named in ``argv[1]``.

    The reader is chosen by file extension. With no argument the script
    reads ``d:/2.doc``.
    """
    path = argv[1] if len(argv) > 1 else 'd:/2.doc'
    readers = {'.docx': readDocx, '.pdf': readPdf, '.doc': readDoc}
    extension = os.path.splitext(path)[1].lower()
    reader = readers.get(extension)
    if reader is None:
        raise SystemExit(f'unsupported file type: {path}')
    print(reader(path))


if __name__ == '__main__':
    _main(sys.argv)
python_读取 doc,docx,pdf
评论关闭