Skip to content

Commit

Permalink
Command line argument parsing changed
Browse files Browse the repository at this point in the history
  • Loading branch information
Christoph Holtermann committed Oct 24, 2013
1 parent 3b4e69a commit 784346b
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 29 deletions.
111 changes: 82 additions & 29 deletions HocrConverter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,38 @@
"""HocrConverter
Usage:
HocrConverter.py [-tIbm] <inputHocrFile> <outputPdfFile>
HocrConverter.py [-tIbnm] <inputHocrFile> <outputPdfFile> <inputImageFile> ...
HocrConverter.py (-h | --help)
Options:
-h --help Show this screen.
-t Make ocr-text visible
-I include images
-b draw bounding boxes around ocr-text
-n don't read images supplied in hocr-file
-m do multiple pages in hocr and output pdf
"""

from reportlab.pdfgen.canvas import Canvas
from reportlab.lib.units import inch
from xml.etree.ElementTree import ElementTree
import Image, re, sys
import Image
import re
import sys
try:
from docopt import docopt
except ImportError:
exit('This program requires that `docopt` command line parsing library'
' is installed: \n pip install docopt\n'
'https://github.com/docopt/docopt')
try:
from schema import Schema, And, Or, Use, SchemaError, Optional
except ImportError:
exit('This program requires that `schema` data-validation library'
' is installed: \n pip install schema\n'
'https://github.com/halst/schema')

class HocrConverter():
"""
Expand Down Expand Up @@ -117,7 +148,7 @@ def _setup_image(self, imageFileName):

return (im, width, height)

def to_pdf(self, imageFileName, outFileName, fontname="Courier", fontsize=8, withVisibleOCRText=False, withVisibleImage=True, withVisibleBoundingBoxes=False, takePictureFromHocr=True, multiplePages=False):
def to_pdf(self, imageFileName, outFileName, fontname="Courier", fontsize=8, withVisibleOCRText=False, withVisibleImage=True, withVisibleBoundingBoxes=False, noPictureFromHocr=False, multiplePages=False):
"""
Creates a PDF file with an image superimposed on top of the text.
Expand Down Expand Up @@ -167,7 +198,7 @@ def to_pdf(self, imageFileName, outFileName, fontname="Courier", fontsize=8, wit
print "Parse Results:",parse_result

if parse_result.has_key("file"):
if takePictureFromHocr:
if not noPictureFromHocr:
imageFileName_ocr_page = parse_result["file"]
print "ocr_page file", imageFileName_ocr_page

Expand Down Expand Up @@ -279,38 +310,60 @@ def to_text(self, outFileName):
f.write(self.__str__())
f.close()

def setGlobal( varName ):
    """Build a callback that stores its argument in a module-level global.

    The returned function assigns the value it receives to the global
    variable named `varName`, echoes the assignment to stdout, and always
    returns True. The script uses these callbacks as `schema` validators,
    so the True result lets every value pass while the assignment happens
    as a side effect.

    Args:
        varName: name of the module-level variable to (re)bind.

    Returns:
        A one-argument function `setValue(value)` that returns True.
    """
    def setValue( value ):
        # Single-argument print() parses under both Python 2 and Python 3
        # and emits the same "name = value" line in each.
        print("%s = %s" % (varName, value))
        globals()[varName] = value
        return True
    return setValue

def appendGlobal( varName ):
    """Build a callback that appends its argument to a module-level list.

    The returned function appends the value it receives to the global
    named `varName`, echoes the resulting contents to stdout, and always
    returns True. The script uses it as a `schema` validator for repeated
    arguments, so each validated element is collected as a side effect.

    Args:
        varName: name of an existing module-level object with `append`.

    Returns:
        A one-argument function `appendValue(value)` that returns True.

    Raises:
        KeyError: (from the returned function) if no global called
            `varName` exists when a value is appended.
    """
    def appendValue( value ):
        # Look the container up once; it must already exist as a global.
        target = globals()[varName]
        target.append(value)
        # Single-argument print() parses under both Python 2 and Python 3.
        print("%s = %s" % (varName, target))
        return True
    return appendValue

if __name__ == "__main__":

# Taking care of command line Arguments
if len(sys.argv) < 4:
print 'Usage: python HocrConverter.py [-t] [-I] [-b] [-f] [-m] inputHocrFile inputImageFile outputPdfFile'
sys.exit(1)
# Variables to control program function
withVisibleOCRText = False;
withVisibleImage = True;
withVisibleBoundingBoxes = False;
takePictureFromHocr = True
noPictureFromHocr = False
multiplePages = False

# Only single Arguments possible, not combinations like -tIbf
while sys.argv[1][0] == "-":
arg = sys.argv.pop(1)
if arg == "-t":
withVisibleOCRText = True;
elif arg == "-I":
withVisibleImage = False;
elif arg == "-b":
withVisibleBoundingBoxes = True;
elif arg == "-n":
takePictureFromHocr = False
elif arg == "-m":
multiplePages = True
inputImageFileNames = []
inputImageFileName = None
inputHocrFileName = None

if takePictureFromHocr:
inputImageFileName = None
outputPdfFileName = sys.argv[2]
# Taking care of command line arguments
arguments = docopt(__doc__)
print(arguments)

# Validation of arguments and setting of global variables
schema = Schema({
'<inputHocrFile>': And( setGlobal( "inputHocrFileName" ), Use(open, error="Can't open <inputHocrFile>") ) ,
'--help': bool,
'-I': setGlobal( "withVisibleImage" ),
'-b': setGlobal( "withVisibleBoundingBoxes" ),
'-m': setGlobal( "multiplePages" ),
'-n': setGlobal( "noPictureFromHocr" ),
'-t': setGlobal( "withVisibleOCRText" ),
'<inputImageFile>': [ And( appendGlobal( "inputImageFileNames" ), Use(open, error="Can't open <inputImageFile>") ) ],
'<outputPdfFile>': setGlobal( "outputPdfFileName" ) })
try:
args = schema.validate(arguments)
except SchemaError as e:
print "Error:"
print " ",e
print "Error Details:"
print " ", e.autos
exit(1)

if inputImageFileNames:
inputImageFileName = inputImageFileNames[0]
else:
inputImageFileName = sys.argv[2]
outputPdfFileName = sys.argv[3]
inputImageFileName = None

hocr = HocrConverter(sys.argv[1])
hocr.to_pdf( inputImageFileName, outputPdfFileName, withVisibleOCRText=withVisibleOCRText, withVisibleImage=withVisibleImage, withVisibleBoundingBoxes=withVisibleBoundingBoxes, takePictureFromHocr=takePictureFromHocr, multiplePages=multiplePages )
hocr = HocrConverter( inputHocrFileName )
hocr.to_pdf( inputImageFileName, outputPdfFileName, withVisibleOCRText=withVisibleOCRText, withVisibleImage=withVisibleImage, withVisibleBoundingBoxes=withVisibleBoundingBoxes, noPictureFromHocr=noPictureFromHocr, multiplePages=multiplePages )
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ Included some aspects from the fork of https://github.com/zw/HocrConverter:

He seems to use tesseract hocr files. I haven't tested that. I didn't include the "word"-object interpretation.

For command line parsing and validation I use some external libraries:
- docopt
- schema

As it stands, the script is meant more as a way to understand the concept than as a finished tool.

Expand Down

0 comments on commit 784346b

Please sign in to comment.