virantha · dpnova · Sep 20, 2017
diff --git a/pypdfocr/pypdfocr.py b/pypdfocr/pypdfocr.py
@@ -21,27 +21,27 @@
 import itertools
 from functools import wraps
 
-from version import __version__
+from .version import __version__
 from PIL import Image
 import yaml
 
 import multiprocessing
 # Replace the Popen routine to allow win32 pyinstaller to build
 from multiprocessing import forking
-from pypdfocr_multiprocessing import _Popen
+from .pypdfocr_multiprocessing import _Popen
 forking.Popen = _Popen
 
-from pypdfocr_pdf import PyPdf
-from pypdfocr_tesseract import PyTesseract
-from pypdfocr_gs import PyGs
-from pypdfocr_watcher import PyPdfWatcher
-from pypdfocr_pdffiler import PyPdfFiler
-from pypdfocr_filer_dirs import PyFilerDirs
-from pypdfocr_filer_evernote import PyFilerEvernote
-from pypdfocr_preprocess import PyPreprocess
+from .pypdfocr_pdf import PyPdf
+from .pypdfocr_tesseract import PyTesseract
+from .pypdfocr_gs import PyGs
+from .pypdfocr_watcher import PyPdfWatcher
+from .pypdfocr_pdffiler import PyPdfFiler
+from .pypdfocr_filer_dirs import PyFilerDirs
+from .pypdfocr_filer_evernote import PyFilerEvernote
+from .pypdfocr_preprocess import PyPreprocess
 
 def error(text):
     print("ERROR: %s" % text)
     sys.exit(-1)
 
 # decorator to retry multiple times
@@ -299,16 +299,16 @@ def _setup_filing(self):
         keyword_count = 0
         folder_count = 0
         if 'folders' in self.config:
-            for folder, keywords in self.config['folders'].items():
+            for folder, keywords in list(self.config['folders'].items()):
                 folder_count +=1
                 keyword_count += len(keywords)
                 # Make sure keywords are lower-cased before adding
                 keywords = [str(x).lower() for x in keywords]
                 self.filer.add_folder_target(folder, keywords)
 
         print ("Filing of PDFs is enabled")
-        print (" - %d target filing folders" % (folder_count))
-        print (" - %d keywords" % (keyword_count))
+        print((" - %d target filing folders" % (folder_count)))
+        print((" - %d keywords" % (keyword_count)))
 
 
     def _setup_external_tools(self):
@@ -337,7 +337,7 @@ def run_conversion(self, pdf_filename):
             :returns: OCR'ed PDF
             :rtype: filename string
         """
-        print ("Starting conversion of %s" % pdf_filename)
+        print(("Starting conversion of %s" % pdf_filename))
         try:
             # Make the images for Tesseract
             img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
@@ -367,11 +367,11 @@ def run_conversion(self, pdf_filename):
             time.sleep(1)
             if not self.debug:
                 # Need to clean up the original image files before preprocessing
-                if locals().has_key("fns"): # Have to check if this was set before exception raised
+                if "fns" in locals(): # Have to check if this was set before exception raised
                     logging.info("Cleaning up %s" % fns)
                     self._clean_up_files(fns)
 
-                if locals().has_key("preprocess_imagefilenames"):  # Have to check if this was set before exception raised
+                if "preprocess_imagefilenames" in locals():  # Have to check if this was set before exception raised
                     logging.info("Cleaning up %s" % preprocess_imagefilenames)
                     self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs
                     for ext in [".hocr", ".html", ".txt"]:
@@ -384,7 +384,7 @@ def run_conversion(self, pdf_filename):
                     #self._clean_up_files([x[1].replace(".hocr", ".txt") for x in hocr_filenames])
 
 
-        print ("Completed conversion successfully to %s" % ocr_pdf_filename)
+        print(("Completed conversion successfully to %s" % ocr_pdf_filename))
         return ocr_pdf_filename
 
     def file_converted_file(self, ocr_pdffilename, original_pdffilename):
@@ -399,11 +399,11 @@ def file_converted_file(self, ocr_pdffilename, original_pdffilename):
             "rtype: string
         """
         filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename)  
-        print("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path)))
+        print(("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path))))
 
         tgt_path = self.pdf_filer.file_original(original_pdffilename)
         if tgt_path != original_pdffilename:
-            print("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path)))
+            print(("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path))))
         return os.path.dirname(filed_path)
 
 
@@ -467,7 +467,7 @@ def go(self, argv):
                 except KeyboardInterrupt:
                     break
                 except Exception as e:
-                    print traceback.print_exc(e)
+                    traceback.print_exc()  # prints the active exception itself; takes no exception arg and returns None
                     py_watcher.stop()
 
         else:

diff --git a/pypdfocr/pypdfocr_filer.py b/pypdfocr/pypdfocr_filer.py
@@ -14,12 +14,11 @@
 import abc
 import os, logging
 
-class PyFiler(object):
+class PyFiler(object, metaclass=abc.ABCMeta):
     """ Abstract base class for defining filing objects, whether you want to 
     save to a file-system/directory structure or to something like Evernote
 
     """
-    __metaclass__ = abc.ABCMeta
 
     @abc.abstractmethod
     def move_to_matching_folder(self, filename):

diff --git a/pypdfocr/pypdfocr_filer_dirs.py b/pypdfocr/pypdfocr_filer_dirs.py
@@ -16,7 +16,7 @@
 import os
 import shutil
 
-from pypdfocr_filer import PyFiler
+from .pypdfocr_filer import PyFiler
 
 """
     Implementation of a filer class 

diff --git a/pypdfocr/pypdfocr_filer_evernote.py b/pypdfocr/pypdfocr_filer_evernote.py
@@ -19,7 +19,7 @@
 import time
 import sys
 
-from pypdfocr_filer import PyFiler
+from .pypdfocr_filer import PyFiler
 
 import functools
 
@@ -87,7 +87,7 @@ def get_target_folder(self):
         return self._target_folder
     def set_target_folder (self, target_folder):
         """ Override this to make sure we only have the basename"""
-        print("Setting target_folder %s" % target_folder)
+        print(("Setting target_folder %s" % target_folder))
         if target_folder:
             self._target_folder = os.path.basename(target_folder)
         else:
@@ -134,14 +134,14 @@ def _connect_to_evernote(self, dictUserInfo):
             user = self.user_store.getUser()
         except EDAMUserException as e:
             err = e.errorCode
-            print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.parameter))
+            print(("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.parameter)))
         except EDAMSystemException as e:
             err = e.errorCode
-            print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.message))
+            print(("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.message)))
             sys.exit(-1)
 
         if user:
-            print("Authenticated to evernote as user %s" % user.username)
+            print(("Authenticated to evernote as user %s" % user.username))
         return True
 
     def add_folder_target(self, folder, keywords):
@@ -274,9 +274,9 @@ def move_to_matching_folder(self, filename, foldername):
             logging.info("[MATCH] %s --> %s" % (filename, foldername))
 
         # Check if the evernote notebook exists
-        print ("Checking for notebook named %s" % foldername)
+        print(("Checking for notebook named %s" % foldername))
         notebook = self._check_and_make_notebook(foldername)
-        print("Uploading %s to %s" % (filename, foldername))
+        print(("Uploading %s to %s" % (filename, foldername)))
 
         note = self._create_evernote_note(notebook, filename)
 

diff --git a/pypdfocr/pypdfocr_gs.py b/pypdfocr/pypdfocr_gs.py
@@ -26,7 +26,7 @@
 import glob
 
 def error(text):
-    print("ERROR: %s" % text)
+    print(("ERROR: %s" % text))
     exit(-1)
 
 class PyGs(object):
@@ -112,7 +112,7 @@ def _find_windows_gs(self):
             error(self.msgs['GS_MISSING_BINARY'])
 
     def _warn(self, msg):
-        print("WARNING: %s" % msg)
+        print(("WARNING: %s" % msg))
 
     def _get_dpi(self, pdf_filename):
         if not os.path.exists(pdf_filename):
@@ -157,7 +157,7 @@ def _get_dpi(self, pdf_filename):
             if abs(xdpi-ydpi) > xdpi*.05:  # Make sure the two dpi's are within 5%
                 self._warn("X-dpi is %d, Y-dpi is %d, defaulting to %d" % (xdpi, ydpi, self.output_dpi))
             else:
-                print("Using %d DPI" % self.output_dpi)
+                print(("Using %d DPI" % self.output_dpi))
 
 
         except Exception as e:
@@ -174,7 +174,7 @@ def _run_gs(self, options, output_filename, pdf_filename):
             out = subprocess.check_output(cmd, shell=True)
 
         except subprocess.CalledProcessError as e:
-            print e.output
+            print(e.output.decode(errors="replace"))
-            if "undefined in .getdeviceparams" in e.output:
+            if b"undefined in .getdeviceparams" in e.output:  # check_output() captures bytes on Python 3
                 error(self.msgs['GS_OUTDATED'])
             else:

diff --git a/pypdfocr/pypdfocr_pdf.py b/pypdfocr/pypdfocr_pdf.py
@@ -31,7 +31,7 @@
 import tempfile
 import glob
 
-import cStringIO
+import io
 import base64
 import zlib
 import math
@@ -52,7 +52,7 @@
 from reportlab.lib.enums import TA_LEFT
 from reportlab.platypus.paragraph import Paragraph
 
-from pypdfocr_util import Retry
+from .pypdfocr_util import Retry
 from functools import partial
 
 class RotatedPara(Paragraph):

diff --git a/pypdfocr/pypdfocr_pdffiler.py b/pypdfocr/pypdfocr_pdffiler.py
@@ -1,80 +1,80 @@
-
-# Copyright 2013 Virantha Ekanayake All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-    Provides capability to search PDFs and file to a specific folder based
-    on keywords
-"""
-
-from sets import Set    
-import sys, os
-import re
-import logging
-import shutil
-
-from PyPDF2 import PdfFileReader
-from pypdfocr_filer import PyFiler
-from pypdfocr_filer_dirs import PyFilerDirs
-
-class PyPdfFiler(object):
-    def __init__(self, filer):
-
-        assert isinstance(filer, PyFiler)
-        self.filer = filer  # Must be a subclass of PyFiler
-
-        # Whether to fall back on filename for matching keywords against
-        # if there is no match in the text
-        self.file_using_filename = False 
-
-    def iter_pdf_page_text(self, filename):
-        self.filename = filename
-        reader = PdfFileReader(filename)
-        logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
-        for pgnum in range(reader.getNumPages()):
-            text = reader.getPage(pgnum).extractText()
-            text = text.encode('ascii', 'ignore')
-            text = text.replace('\n', ' ')
-            yield text
-
-    def _get_matching_folder(self, pdfText):
-        searchText = pdfText.lower()
-        for folder,strings in self.filer.folder_targets.items():
-            for s in strings:
-                logging.debug("Checking string %s" % s)
-                if s in searchText:
-                    logging.info("Matched keyword '%s'" % s)
-                    return folder
-        # No match found, so return 
-        return None
-
-    def file_original (self, original_filename):
-        return self.filer.file_original(original_filename)
-
-    def move_to_matching_folder(self, filename):
-        for page_text in self.iter_pdf_page_text(filename):
-            tgt_folder = self._get_matching_folder(page_text)
-            if tgt_folder: break  # Stop searching through pdf pages as soon as we find a match
-
-        if not tgt_folder and self.file_using_filename:
-            tgt_folder = self._get_matching_folder(filename)
-
-        tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder)
-        return tgt_file
-        
-if __name__ == '__main__':
-    p = PyPdfFiler(PyFilerDirs())
-    for page_text in p.iter_pdf_page_text("scan_ocr.pdf"):
-        print (page_text)
-
+
+# Copyright 2013 Virantha Ekanayake All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+    Provides capability to search PDFs and file to a specific folder based
+    on keywords
+"""
+
+import os
+import sys
+import re
+import logging
+import shutil
+
+from PyPDF2 import PdfFileReader
+from .pypdfocr_filer import PyFiler
+from .pypdfocr_filer_dirs import PyFilerDirs
+
+class PyPdfFiler(object):
+    """File a PDF into the folder whose keywords match its text, via a PyFiler."""
+    def __init__(self, filer):
+        assert isinstance(filer, PyFiler)
+        self.filer = filer  # Must be a subclass of PyFiler
+
+        # Whether to fall back on filename for matching keywords against
+        # if there is no match in the text
+        self.file_using_filename = False
+
+    def iter_pdf_page_text(self, filename):
+        self.filename = filename
+        reader = PdfFileReader(filename)
+        logging.info("pdf scanner found %d pages in %s" % (reader.getNumPages(), filename))
+        for pgnum in range(reader.getNumPages()):
+            text = reader.getPage(pgnum).extractText()
+            # Strip non-ASCII but stay a str: bytes.replace('\n', ' ') raises TypeError on Python 3
+            text = text.encode('ascii', 'ignore').decode('ascii')
+            yield text.replace('\n', ' ')
+
+    def _get_matching_folder(self, pdfText):
+        """Return the first folder having a keyword contained in lower-cased pdfText, else None."""
+        searchText = pdfText.lower()
+        for folder, strings in self.filer.folder_targets.items():
+            for s in strings:
+                logging.debug("Checking string %s" % s)
+                if s in searchText:
+                    logging.info("Matched keyword '%s'" % s)
+                    return folder
+        return None
+
+    def file_original (self, original_filename):
+        return self.filer.file_original(original_filename)
+
+    def move_to_matching_folder(self, filename):
+        tgt_folder = None  # stays None when the PDF yields no pages (was an UnboundLocalError)
+        for page_text in self.iter_pdf_page_text(filename):
+            tgt_folder = self._get_matching_folder(page_text)
+            if tgt_folder: break  # Stop searching through pdf pages as soon as we find a match
+        if not tgt_folder and self.file_using_filename:
+            tgt_folder = self._get_matching_folder(filename)
+
+        tgt_file = self.filer.move_to_matching_folder(filename, tgt_folder)
+        return tgt_file
+
+if __name__ == '__main__':
+    p = PyPdfFiler(PyFilerDirs())
+    for page_text in p.iter_pdf_page_text("scan_ocr.pdf"):
+        print (page_text)
+