开发者

How to extract PDF fields from a filled out form in Python?

I'm trying to use Python to processes some PDF forms that were filled out and signed using Adobe Acrobat Reader.

I've tried:

  • The pdfminer demo: it didn't dump any of the filled out data.
  • pyPdf: it maxed a core for 2 minutes when I tried to load the file with PdfFileReader(f) and I just gave up and killed it.
  • Jython and PDFBox: got that working great but the startup time is excessive, I'll just write an external utility in straight Java if that's my only option.

I can keep hunting for libraries and trying them but I'm hoping someone already has an efficient solution for this.


Update: Based on Steven's answer I looked into pdfminer and it did the trick nicely.

from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PD开发者_C百科FDocument
from pdfminer.pdftypes import resolve1, PDFObjRef

def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples"""
    with open(filename, 'rb') as file:
        parser = PDFParser(file)
        doc = PDFDocument(parser)
        return [load_fields(resolve1(f)) for f in
                   resolve1(doc.catalog['AcroForm'])['Fields']]

def load_fields(field):
    """Recursively load form fields"""
    form = field.get('Kids', None)
    if form:
        return [load_fields(resolve1(f)) for f in form]
    else:
        # Some field types, like signatures, need extra resolving
        return (field.get('T').decode('utf-16'), resolve1(field.get('V')))

def parse_cli():
    """Load command line arguments"""
    parser = ArgumentParser(description='Dump the form contents of a PDF.')
    parser.add_argument('file', metavar='pdf_form',
                    help='PDF Form to dump the contents of')
    parser.add_argument('-o', '--out', help='Write output to file',
                      default=None, metavar='FILE')
    parser.add_argument('-p', '--pickle', action='store_true', default=False,
                      help='Format output for python consumption')
    return parser.parse_args()

def main():
    args = parse_cli()
    form = load_form(args.file)
    if args.out:
        with open(args.out, 'w') as outfile:
            if args.pickle:
                pickle.dump(form, outfile)
            else:
                pp = pprint.PrettyPrinter(indent=2)
                file.write(pp.pformat(form))
    else:
        if args.pickle:
            print(pickle.dumps(form))
        else:
            pp = pprint.PrettyPrinter(indent=2)
            pp.pprint(form)

if __name__ == '__main__':
    main()


You should be able to do it with pdfminer, but it will require some delving into the internals of pdfminer and some knowledge about the pdf format (wrt forms of course, but also about pdf's internal structures like "dictionaries" and "indirect objects").

This example might help you on your way (I think it will work only on simple cases, with no nested fields etc...)

import sys
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1

filename = sys.argv[1]
fp = open(filename, 'rb')

parser = PDFParser(fp)
doc = PDFDocument(parser)
fields = resolve1(doc.catalog['AcroForm'])['Fields']
for i in fields:
    field = resolve1(i)
    name, value = field.get('T'), field.get('V')
    print '{0}: {1}'.format(name, value)

EDIT: forgot to mention: if you need to provide a password, pass it to doc.initialize()


Python 3.6+:

pip install PyPDF2

# -*- coding: utf-8 -*-

from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader


def _getFields(obj, tree=None, retval=None, fileobj=None):
    """
    Extracts field data if this PDF contains interactive form fields.
    The *tree* and *retval* parameters are for recursive use.

    :param fileobj: A file object (usually a text file) to write
        a report to on all interactive form fields found.
    :return: A dictionary where each key is a field name, and each
        value is a :class:`Field<PyPDF2.generic.Field>` object. By
        default, the mapping name is used for keys.
    :rtype: dict, or ``None`` if form data could not be located.
    """
    fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name', '/TU': 'Alternate Field Name',
                       '/TM': 'Mapping Name', '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}
    if retval is None:
        retval = OrderedDict()
        catalog = obj.trailer["/Root"]
        # get the AcroForm tree
        if "/AcroForm" in catalog:
            tree = catalog["/AcroForm"]
        else:
            return None
    if tree is None:
        return retval

    obj._checkKids(tree, retval, fileobj)
    for attr in fieldAttributes:
        if attr in tree:
            # Tree is a field
            obj._buildField(tree, retval, fileobj, fieldAttributes)
            break

    if "/Fields" in tree:
        fields = tree["/Fields"]
        for f in fields:
            field = f.getObject()
            obj._buildField(field, retval, fileobj, fieldAttributes)

    return retval


def get_form_fields(infile):
    infile = PdfFileReader(open(infile, 'rb'))
    fields = _getFields(infile)
    return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())



if __name__ == '__main__':
    from pprint import pprint

    pdf_file_name = 'FormExample.pdf'

    pprint(get_form_fields(pdf_file_name))


The Python PyPDF2 package (successor to pyPdf) is very convenient:

import PyPDF2
f = PyPDF2.PdfFileReader('form.pdf')
ff = f.getFields()

Then ff is a dict that contains all the relevant form information.


Quick and dirty 2-minute job; just use PDFminer to convert PDF to xml and then grab all of the fields.

from xml.etree import ElementTree
from pprint import pprint
import os

def main():
    print "Calling PDFDUMP.py"
    os.system("dumppdf.py -a FILE.pdf > out.xml")

    # Preprocess the file to eliminate bad XML.
    print "Screening the file"
    o = open("output.xml","w") #open for append
    for line in open("out.xml"):
       line = line.replace("&#", "Invalid_XML") #some bad data in xml for formatting info.
       o.write(line) 
    o.close()

    print "Opening XML output"
    tree = ElementTree.parse('output.xml')
    lastnode = ""
    lastnode2 = ""
    list = {}
    entry = {}

    for node in tree.iter(): # Run through the tree..        
        # Check if New node
        if node.tag == "key" and node.text == "T":
            lastnode = node.tag + node.text
        elif lastnode == "keyT":
            for child in node.iter():
                entry["ID"] = child.text
            lastnode = ""

        if node.tag == "key" and node.text == "V":
            lastnode2 = node.tag + node.text
        elif lastnode2 == "keyV":
            for child in node.iter():
                if child.tag == "string":
                    if entry.has_key("ID"):
                        entry["Value"] = child.text
                        list[entry["ID"]] = entry["Value"]
                        entry = {}
            lastnode2 = ""

    pprint(list)

if __name__ == '__main__':
  main()

It isn't pretty, just a simple proof of concept. I need to implement it for a system I'm working on so I will be cleaning it up, but I thought I would post it in case anyone finds it useful.


Update for latest version of pdf miner (change import and parser/doc setup in first function)

from argparse import ArgumentParser
import pickle
import pprint
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1
from pdfminer.pdftypes import PDFObjRef

def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples"""
    with open(filename, 'rb') as file:
        parser = PDFParser(file)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        #doc.set_parser(parser)
        doc.initialize()
        return [load_fields(resolve1(f)) for f in
            resolve1(doc.catalog['AcroForm'])['Fields']]

def load_fields(field):
    """Recursively load form fields"""
    form = field.get('Kids', None)
    if form:
        return [load_fields(resolve1(f)) for f in form]
    else:
        # Some field types, like signatures, need extra resolving
        return (field.get('T').decode('utf-8'), resolve1(field.get('V')))

def parse_cli():
    """Load command line arguments"""
    parser = ArgumentParser(description='Dump the form contents of a PDF.')
    parser.add_argument('file', metavar='pdf_form',
        help='PDF Form to dump the contents of')
    parser.add_argument('-o', '--out', help='Write output to file',
        default=None, metavar='FILE')
    parser.add_argument('-p', '--pickle', action='store_true', default=False,
        help='Format output for python consumption')
    return parser.parse_args()

def main():
    args = parse_cli()
    form = load_form(args.file)
    if args.out:
        with open(args.out, 'w') as outfile:
            if args.pickle:
                pickle.dump(form, outfile)
            else:
                pp = pprint.PrettyPrinter(indent=2)
                file.write(pp.pformat(form))
    else:
        if args.pickle:
            print pickle.dumps(form)
        else:
            pp = pprint.PrettyPrinter(indent=2)
            pp.pprint(form)

if __name__ == '__main__':
    main()


I created a library to do this: pip install fillpdf

from fillpdf import fillpdfs
fillpdfs.get_form_fields("ex.pdf")

Credit to dvska's answer, for basis of library code.


There is a typo on these lines:

file.write(pp.pformat(form))

Should be:

outfile.write(pp.pformat(form))
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜