python2.7将doc文件转换成文本文件(windows),python2.7doc,依赖选项WindowsP


依赖选项

WindowsPython 2.7pywin32, http://sourceforge.net/projects/pywin32/Word
# a script that converts word file to txt files# requires word application on Windows machine# requirement:#    1. Windows platform#    2. python 2.7#    3. pywin32, download from http://sourceforge.net/projects/pywin32/#    4. word application installed on running machinefrom win32com.client import constants, Dispatchimport pythoncomimport globimport osfrom zipfile import ZipFile# convert the word file to a text file.# @arg wordapp: The word IDispatch object# @arg wordfile: The word file name# @returns: The txt file namedef convert_to_text(wordapp, wordfile):    name, ext = os.path.splitext(wordfile)    if ext != '.doc' and ext != '.docx':        return None    txtfile = name + '.txt'    print txtfile    wordapp.Documents.Open(os.path.abspath(wordfile))    wdFormatTextLineBreaks = 3    wordapp.ActiveDocument.SaveAs(os.path.abspath(txtfile),                                   FileFormat=wdFormatTextLineBreaks)    wordapp.ActiveDocument.Close()    return txtfile# a generator that iterates all doc files in the current work dirdef next_doc():    for d in glob.glob('*.doc'):        yield d    for d in glob.glob('*.docx'):        yield d# convert all doc/docx files and zip all output txt files as the zipfilenamedef convert_and_zip(zipfilename):    word = Dispatch("Word.Application")    with ZipFile(zipfilename, 'w') as fzip:        for doc in next_doc():            print 'converting ', doc, '...'            txtfile = convert_to_text(word, doc)            if txtfile:                fzip.write(txtfile)    word.Quit()

评论关闭