ocropus · JKamlah · Jul 26, 2019 · Jul 26, 2019 · Jul 26, 2019 · Jul 26, 2019
diff --git a/hocr-simplify b/hocr-simplify
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+# Simplify an hOCR document: collapse the markup below a chosen
+# typesetting level to plain text and/or strip selected properties
+# from the title attributes.
+
+from __future__ import print_function
+import argparse
+import re
+import sys
+import os
+
+from lxml import etree, html
+
+# adjacent string literals are concatenated as-is, so each fragment
+# carries its own trailing space
+parser = argparse.ArgumentParser(
+    description=('change level of typesetting and/or '
+                 'remove properties to create '
+                 'a simplified hocr-version'))
+# property names advertised in the help text of --remove-properties
+properties = ['baseline', 'bbox', 'cflow', 'cuts', 'hardbreak', 'image',
+              'imagemd5', 'lpageno', 'ppageno', 'nlp', 'order', 'poly',
+              'scan_res', 'textangle', 'x_bboxes', 'x_font', 'x_fsize',
+              'x_confs', 'x_scanner', 'x_source', 'x_wconf']
+
+parser.add_argument('file', nargs='?', default=sys.stdin)
+parser.add_argument('-t', '--typesetting', type=str,
+                    choices=['glyph', 'word', 'line', 'par', 'carea', 'page'],
+                    help='Maximum level of typesetting')
+parser.add_argument('-r', '--remove-properties', nargs='+',
+                    help='List of properties: {}'.format(','.join(properties)))
+parser.add_argument('fileout', nargs='?',
+                    help="Outputpath, default: print to terminal")
+parser.add_argument('-v', '--verbose',
+                    action='store_true', help='Verbose, default: %(default)s')
+
+args = parser.parse_args()
+
+doc = html.parse(args.file)
+# change level of typesetting
+if args.typesetting:
+    # words use the "ocrx_" class prefix, every other level "ocr_"
+    if args.typesetting in ["word"]:
+        args.typesetting = "ocrx_" + args.typesetting
+    else:
+        args.typesetting = "ocr_" + args.typesetting
+
+    # flatten every element of that class to its plain text
+    for node in doc.xpath("//*[@class='{}']".format(args.typesetting)):
+        if args.verbose:
+            print(re.sub(r'\s+', '\x20', node.text_content()).strip())
+        node.text = node.text_content().strip()
+        for child in list(node):
+            node.remove(child)
+
+# remove properties
+if args.remove_properties:
+    for node in doc.xpath("//*[@title]"):
+        title = node.get("title")
+        kept = []
+        removed = False
+        for prop in title.split(";"):
+            prop = prop.strip()
+            # a trailing ";" or a doubled ";;" yields empty fragments
+            if not prop:
+                continue
+            # the key is the first token; a value after it is optional
+            key = prop.split(None, 1)[0]
+            if key in args.remove_properties:
+                removed = True
+                if args.verbose:
+                    print("Replaced :{}".format(title))
+            else:
+                kept.append(prop)
+        # rewrite the attribute only if something was dropped, so
+        # untouched titles keep their original formatting
+        if removed:
+            node.set("title", "; ".join(kept))
+
+# serialize once; both output branches emit the same text
+output = etree.tostring(doc, pretty_print=True).decode('UTF-8')
+
+# if no outputpath is given, print to terminal
+if args.fileout is None:
+    print(output)
+else:
+    # create output path if needed; dirname is '' for a bare
+    # filename, and os.makedirs('') would raise
+    outdir = os.path.dirname(args.fileout)
+    if outdir and not os.path.isdir(outdir):
+        os.makedirs(outdir)
+
+    # write new hocr-files
+    with open(args.fileout, "w") as f:
+        f.write(output)
diff --git a/test/hocr-simplify/hocr-simplify.tsht b/test/hocr-simplify/hocr-simplify.tsht
@@ -0,0 +1,15 @@
+#!/usr/bin/env tsht
+TESTDATA="../testdata"
+SIMPLEFILE="./tess.simple.hocr"
+
+# exactly one assertion ("equals") runs on the success path
+plan 1
+
+# cleanup: deletes the generated file (presumably run by tsht as an after-hook; confirm)
+after () {
+    rm -f "$SIMPLEFILE"
+}
+hocr-simplify "$TESTDATA/tess.hocr" -t page > "$SIMPLEFILE" || fail 'hocr-simplify'
+# wc -c reports the byte count directly; cutting the fifth space-separated
+# field out of "ls -l" breaks as soon as ls pads a column with extra spaces
+equals 3870 $(wc -c < "$SIMPLEFILE") 'filesize == 3870'
diff --git a/test/smoke.tsht b/test/smoke.tsht
@@ -1,6 +1,6 @@
 #!/usr/bin/env tsht
 
-for f in check combine eval eval-geom eval-lines extract-g1000 extract-images lines merge-dc pdf split;do
+for f in check combine eval eval-geom eval-lines extract-g1000 extract-images lines merge-dc pdf simplify split;do
     exec_ok "hocr-$f" "--help"
     exec_ok "hocr-$f" "-h"
 done