Python 使用 pyPdf 解析 PDF 文件:从 PDF 中抽取指定页


pyPdf 模块提供了方便的解析 PDF 文件的接口。下面是一个简单的命令行程序,用于从 PDF 文件中抽取指定页码范围的页面,并将其保存为新的 PDF 文件(未指定输出路径时写到标准输出)。

#! /usr/bin/env python

##############################################################################
##  Copyright 2012 Jeet Sukumaran.
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 3 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License along
##  with this program. If not, see <http://www.gnu.org/licenses/>.
##
##############################################################################

"""Extract specified pages from source PDF."""

import sys
import os
import argparse
import pyPdf

__prog__ = os.path.basename(__file__)
__version__ = "1.0.0"
__description__ = __doc__
__author__ = 'Jeet Sukumaran'
__copyright__ = 'Copyright (C) 2012 Jeet Sukumaran.'


def main():
    """
    Main CLI handler.

    Parses the command line, copies the requested page range from the source
    PDF into a new PDF and writes the result to ``--output-filepath`` or, when
    that option is not given, to standard output.

    Command-line arguments:
        SOURCE-PDF   path to the input PDF (opened in binary mode by argparse).
        FIRST-PAGE   1-based number of the first page to extract.
        LAST-PAGE    1-based number of the last page, or ``+N`` to extract the
                     first page plus the N pages that follow it.

    Exits with a usage error (``parser.error``) when FIRST-PAGE is below 1,
    LAST-PAGE is not an integer, the range is reversed, or the range extends
    past the end of the source document.
    """
    parser = argparse.ArgumentParser(description=__description__)
    parser.add_argument("--version", action="version", version="%(prog)s " + __version__)
    parser.add_argument("src_pdf",
            metavar="SOURCE-PDF",
            type=argparse.FileType('rb'),
            help="path to input pdf file")
    parser.add_argument("first_page",
            metavar="FIRST-PAGE",
            type=int,
            help="number of first page (1-based index: first page is '1')")
    parser.add_argument("last_page",
            metavar="LAST-PAGE",
            type=str,
            help="number of last page; if preceded by '+' (e.g., '+30'), specifies number of pages following first page to extract")
    parser.add_argument("-o", "--output-filepath",
            type=str,
            default=None,
            help="path to output file (if not given, will write to standard output)")
    args = parser.parse_args()

    # argparse has already opened the source file; make sure it is closed on
    # every exit path. It must stay open until the output has been written,
    # because the writer pulls page data from the reader's stream at write time.
    try:
        # Page numbers are 1-based on the command line, 0-based inside pyPdf.
        # Reject values below 1: a negative index would silently select pages
        # counted from the end of the document.
        if args.first_page < 1:
            parser.error("FIRST-PAGE must be 1 or greater")
        first_page = args.first_page - 1

        try:
            if args.last_page.startswith("+"):
                last_page = args.last_page[1:].replace(" ", "")
                if not last_page:
                    sys.exit("Need to specify number of pages")
                last_page = first_page + int(last_page)
            else:
                last_page = int(args.last_page) - 1
        except ValueError:
            parser.error("LAST-PAGE must be an integer, optionally preceded by '+': %r"
                    % args.last_page)

        pdf_in = pyPdf.PdfFileReader(args.src_pdf)
        num_pages = pdf_in.getNumPages()
        if last_page < first_page:
            parser.error("last page (%d) precedes first page (%d)"
                    % (last_page + 1, first_page + 1))
        if last_page >= num_pages:
            parser.error("page range %d-%d is outside the document (%d pages)"
                    % (first_page + 1, last_page + 1, num_pages))

        pdf_out = pyPdf.PdfFileWriter()
        for pg_num in range(first_page, last_page + 1):
            pdf_out.addPage(pdf_in.getPage(pg_num))

        if args.output_filepath:
            out_path = os.path.expandvars(os.path.expanduser(args.output_filepath))
            with open(out_path, "wb") as out_stream:
                pdf_out.write(out_stream)
        else:
            # A PDF is binary data: on Python 3 it has to go to the underlying
            # byte stream, while Python 2's sys.stdout has no ``buffer``
            # attribute and accepts bytes directly. Standard output is flushed
            # but deliberately not closed, since this function does not own it.
            out_stream = getattr(sys.stdout, "buffer", sys.stdout)
            pdf_out.write(out_stream)
            out_stream.flush()
    finally:
        args.src_pdf.close()


if __name__ == '__main__':
    main()

评论关闭