Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Output of 2to3. #67

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 21 additions & 21 deletions pypdfocr/pypdfocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,27 @@
import itertools
from functools import wraps

from version import __version__
from .version import __version__
from PIL import Image
import yaml

import multiprocessing
# Replace the Popen routine to allow win32 pyinstaller to build
from multiprocessing import forking
from pypdfocr_multiprocessing import _Popen
from .pypdfocr_multiprocessing import _Popen
forking.Popen = _Popen

from pypdfocr_pdf import PyPdf
from pypdfocr_tesseract import PyTesseract
from pypdfocr_gs import PyGs
from pypdfocr_watcher import PyPdfWatcher
from pypdfocr_pdffiler import PyPdfFiler
from pypdfocr_filer_dirs import PyFilerDirs
from pypdfocr_filer_evernote import PyFilerEvernote
from pypdfocr_preprocess import PyPreprocess
from .pypdfocr_pdf import PyPdf
from .pypdfocr_tesseract import PyTesseract
from .pypdfocr_gs import PyGs
from .pypdfocr_watcher import PyPdfWatcher
from .pypdfocr_pdffiler import PyPdfFiler
from .pypdfocr_filer_dirs import PyFilerDirs
from .pypdfocr_filer_evernote import PyFilerEvernote
from .pypdfocr_preprocess import PyPreprocess

def error(text):
print("ERROR: %s" % text)
print(("ERROR: %s" % text))
sys.exit(-1)

# decorator to retry multiple times
Expand Down Expand Up @@ -299,16 +299,16 @@ def _setup_filing(self):
keyword_count = 0
folder_count = 0
if 'folders' in self.config:
for folder, keywords in self.config['folders'].items():
for folder, keywords in list(self.config['folders'].items()):
folder_count +=1
keyword_count += len(keywords)
# Make sure keywords are lower-cased before adding
keywords = [str(x).lower() for x in keywords]
self.filer.add_folder_target(folder, keywords)

print ("Filing of PDFs is enabled")
print (" - %d target filing folders" % (folder_count))
print (" - %d keywords" % (keyword_count))
print((" - %d target filing folders" % (folder_count)))
print((" - %d keywords" % (keyword_count)))


def _setup_external_tools(self):
Expand Down Expand Up @@ -337,7 +337,7 @@ def run_conversion(self, pdf_filename):
:returns: OCR'ed PDF
:rtype: filename string
"""
print ("Starting conversion of %s" % pdf_filename)
print(("Starting conversion of %s" % pdf_filename))
try:
# Make the images for Tesseract
img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename)
Expand Down Expand Up @@ -367,11 +367,11 @@ def run_conversion(self, pdf_filename):
time.sleep(1)
if not self.debug:
# Need to clean up the original image files before preprocessing
if locals().has_key("fns"): # Have to check if this was set before exception raised
if "fns" in locals(): # Have to check if this was set before exception raised
logging.info("Cleaning up %s" % fns)
self._clean_up_files(fns)

if locals().has_key("preprocess_imagefilenames"): # Have to check if this was set before exception raised
if "preprocess_imagefilenames" in locals(): # Have to check if this was set before exception raised
logging.info("Cleaning up %s" % preprocess_imagefilenames)
self._clean_up_files(preprocess_imagefilenames) # splat the hocr_filenames as it is a list of pairs
for ext in [".hocr", ".html", ".txt"]:
Expand All @@ -384,7 +384,7 @@ def run_conversion(self, pdf_filename):
#self._clean_up_files([x[1].replace(".hocr", ".txt") for x in hocr_filenames])


print ("Completed conversion successfully to %s" % ocr_pdf_filename)
print(("Completed conversion successfully to %s" % ocr_pdf_filename))
return ocr_pdf_filename

def file_converted_file(self, ocr_pdffilename, original_pdffilename):
Expand All @@ -399,11 +399,11 @@ def file_converted_file(self, ocr_pdffilename, original_pdffilename):
"rtype: string
"""
filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename)
print("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path)))
print(("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path))))

tgt_path = self.pdf_filer.file_original(original_pdffilename)
if tgt_path != original_pdffilename:
print("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path)))
print(("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path))))
return os.path.dirname(filed_path)


Expand Down Expand Up @@ -467,7 +467,7 @@ def go(self, argv):
except KeyboardInterrupt:
break
except Exception as e:
print traceback.print_exc(e)
print(traceback.print_exc(e))
py_watcher.stop()

else:
Expand Down
3 changes: 1 addition & 2 deletions pypdfocr/pypdfocr_filer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,11 @@
import abc
import os, logging

class PyFiler(object):
class PyFiler(object, metaclass=abc.ABCMeta):
""" Abstract base class for defining filing objects, whether you want to
save to a file-system/directory structure or to something like Evernote

"""
__metaclass__ = abc.ABCMeta

@abc.abstractmethod
def move_to_matching_folder(self, filename):
Expand Down
2 changes: 1 addition & 1 deletion pypdfocr/pypdfocr_filer_dirs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
import shutil

from pypdfocr_filer import PyFiler
from .pypdfocr_filer import PyFiler

"""
Implementation of a filer class
Expand Down
14 changes: 7 additions & 7 deletions pypdfocr/pypdfocr_filer_evernote.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import time
import sys

from pypdfocr_filer import PyFiler
from .pypdfocr_filer import PyFiler

import functools

Expand Down Expand Up @@ -87,7 +87,7 @@ def get_target_folder(self):
return self._target_folder
def set_target_folder (self, target_folder):
""" Override this to make sure we only have the basename"""
print("Setting target_folder %s" % target_folder)
print(("Setting target_folder %s" % target_folder))
if target_folder:
self._target_folder = os.path.basename(target_folder)
else:
Expand Down Expand Up @@ -134,14 +134,14 @@ def _connect_to_evernote(self, dictUserInfo):
user = self.user_store.getUser()
except EDAMUserException as e:
err = e.errorCode
print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.parameter))
print(("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.parameter)))
except EDAMSystemException as e:
err = e.errorCode
print("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.message))
print(("Error attempting to authenticate to Evernote: %s - %s" % (EDAMErrorCode._VALUES_TO_NAMES[err], e.message)))
sys.exit(-1)

if user:
print("Authenticated to evernote as user %s" % user.username)
print(("Authenticated to evernote as user %s" % user.username))
return True

def add_folder_target(self, folder, keywords):
Expand Down Expand Up @@ -274,9 +274,9 @@ def move_to_matching_folder(self, filename, foldername):
logging.info("[MATCH] %s --> %s" % (filename, foldername))

# Check if the evernote notebook exists
print ("Checking for notebook named %s" % foldername)
print(("Checking for notebook named %s" % foldername))
notebook = self._check_and_make_notebook(foldername)
print("Uploading %s to %s" % (filename, foldername))
print(("Uploading %s to %s" % (filename, foldername)))

note = self._create_evernote_note(notebook, filename)

Expand Down
8 changes: 4 additions & 4 deletions pypdfocr/pypdfocr_gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import glob

def error(text):
print("ERROR: %s" % text)
print(("ERROR: %s" % text))
exit(-1)

class PyGs(object):
Expand Down Expand Up @@ -112,7 +112,7 @@ def _find_windows_gs(self):
error(self.msgs['GS_MISSING_BINARY'])

def _warn(self, msg):
print("WARNING: %s" % msg)
print(("WARNING: %s" % msg))

def _get_dpi(self, pdf_filename):
if not os.path.exists(pdf_filename):
Expand Down Expand Up @@ -157,7 +157,7 @@ def _get_dpi(self, pdf_filename):
if abs(xdpi-ydpi) > xdpi*.05: # Make sure the two dpi's are within 5%
self._warn("X-dpi is %d, Y-dpi is %d, defaulting to %d" % (xdpi, ydpi, self.output_dpi))
else:
print("Using %d DPI" % self.output_dpi)
print(("Using %d DPI" % self.output_dpi))


except Exception as e:
Expand All @@ -174,7 +174,7 @@ def _run_gs(self, options, output_filename, pdf_filename):
out = subprocess.check_output(cmd, shell=True)

except subprocess.CalledProcessError as e:
print e.output
print(e.output)
if "undefined in .getdeviceparams" in e.output:
error(self.msgs['GS_OUTDATED'])
else:
Expand Down
4 changes: 2 additions & 2 deletions pypdfocr/pypdfocr_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import tempfile
import glob

import cStringIO
import io
import base64
import zlib
import math
Expand All @@ -52,7 +52,7 @@
from reportlab.lib.enums import TA_LEFT
from reportlab.platypus.paragraph import Paragraph

from pypdfocr_util import Retry
from .pypdfocr_util import Retry
from functools import partial

class RotatedPara(Paragraph):
Expand Down
160 changes: 80 additions & 80 deletions pypdfocr/pypdfocr_pdffiler.py
Original file line number Diff line number Diff line change
@@ -1,80 +1,80 @@
# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Provides capability to search PDFs and file to a specific folder based
on keywords
"""
from sets import Set
import sys, os
import re
import logging
import shutil
from PyPDF2 import PdfFileReader
from pypdfocr_filer import PyFiler
from pypdfocr_filer_dirs import PyFilerDirs
class PyPdfFiler(object):
    """Pick a target folder for a PDF by keyword and hand it to a filer.

    The text of each page is searched for the keywords registered in
    ``filer.folder_targets``; the first folder with a matching keyword wins
    and the actual move is delegated to the ``filer`` backend.
    """

    def __init__(self, filer):
        """Remember the filing backend.

        :param filer: backend providing ``folder_targets``,
            ``move_to_matching_folder`` and ``file_original``
        :type filer: PyFiler
        """
        assert isinstance(filer, PyFiler)
        self.filer = filer  # Must be a subclass of PyFiler

        # Whether to fall back on filename for matching keywords against
        # if there is no match in the text
        self.file_using_filename = False

    def iter_pdf_page_text(self, filename):
        """Yield the ASCII-only text of each page of ``filename``.

        :param filename: path of the PDF to read
        :returns: generator of one text string per page, newlines replaced
            by spaces
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        num_pages = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s" % (num_pages, filename))
        for pgnum in range(num_pages):
            text = reader.getPage(pgnum).extractText()
            # Drop non-ASCII characters, then decode again so the result is
            # text rather than bytes: calling bytes.replace() with str
            # arguments raises TypeError on Python 3.
            text = text.encode('ascii', 'ignore').decode('ascii')
            text = text.replace('\n', ' ')
            yield text

    def _get_matching_folder(self, pdfText):
        """Return the first folder with a keyword contained in ``pdfText``.

        The text is lower-cased before the substring test, so only
        lower-case keywords can match.

        :param pdfText: text to search
        :returns: the matching key of ``filer.folder_targets``, or None
        """
        searchText = pdfText.lower()
        for folder, strings in self.filer.folder_targets.items():
            for s in strings:
                logging.debug("Checking string %s" % s)
                if s in searchText:
                    logging.info("Matched keyword '%s'" % s)
                    return folder
        # No match found, so return
        return None

    def file_original(self, original_filename):
        """Delegate filing of the original (pre-OCR) file to the backend."""
        return self.filer.file_original(original_filename)

    def move_to_matching_folder(self, filename):
        """File ``filename`` under the folder matched by its page text.

        :param filename: path of the PDF to scan and file
        :returns: whatever ``filer.move_to_matching_folder`` returns
        """
        # Pre-bind so a PDF with zero pages does not hit an unbound local
        # in the checks below.
        tgt_folder = None
        for page_text in self.iter_pdf_page_text(filename):
            tgt_folder = self._get_matching_folder(page_text)
            if tgt_folder:
                break  # Stop searching through pdf pages as soon as we find a match

        if not tgt_folder and self.file_using_filename:
            tgt_folder = self._get_matching_folder(filename)

        return self.filer.move_to_matching_folder(filename, tgt_folder)
if __name__ == '__main__':
    # Manual smoke run: print the extracted text of every page of a sample PDF.
    pdf_filer = PyPdfFiler(PyFilerDirs())
    page_texts = pdf_filer.iter_pdf_page_text("scan_ocr.pdf")
    for text in page_texts:
        print(text)

# Copyright 2013 Virantha Ekanayake All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Provides capability to search PDFs and file to a specific folder based
on keywords
"""

from sets import Set
import sys, os
import re
import logging
import shutil

from PyPDF2 import PdfFileReader
from .pypdfocr_filer import PyFiler
from .pypdfocr_filer_dirs import PyFilerDirs

class PyPdfFiler(object):
    """Pick a target folder for a PDF by keyword and hand it to a filer.

    The text of each page is searched for the keywords registered in
    ``filer.folder_targets``; the first folder with a matching keyword wins
    and the actual move is delegated to the ``filer`` backend.
    """

    def __init__(self, filer):
        """Remember the filing backend.

        :param filer: backend providing ``folder_targets``,
            ``move_to_matching_folder`` and ``file_original``
        :type filer: PyFiler
        """
        assert isinstance(filer, PyFiler)
        self.filer = filer  # Must be a subclass of PyFiler

        # Whether to fall back on filename for matching keywords against
        # if there is no match in the text
        self.file_using_filename = False

    def iter_pdf_page_text(self, filename):
        """Yield the ASCII-only text of each page of ``filename``.

        :param filename: path of the PDF to read
        :returns: generator of one text string per page, newlines replaced
            by spaces
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        num_pages = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s" % (num_pages, filename))
        for pgnum in range(num_pages):
            text = reader.getPage(pgnum).extractText()
            # Drop non-ASCII characters, then decode again so the result is
            # text rather than bytes: calling bytes.replace() with str
            # arguments raises TypeError on Python 3.
            text = text.encode('ascii', 'ignore').decode('ascii')
            text = text.replace('\n', ' ')
            yield text

    def _get_matching_folder(self, pdfText):
        """Return the first folder with a keyword contained in ``pdfText``.

        The text is lower-cased before the substring test, so only
        lower-case keywords can match.

        :param pdfText: text to search
        :returns: the matching key of ``filer.folder_targets``, or None
        """
        searchText = pdfText.lower()
        # The dict is not mutated while iterating, so no list() copy is needed.
        for folder, strings in self.filer.folder_targets.items():
            for s in strings:
                logging.debug("Checking string %s" % s)
                if s in searchText:
                    logging.info("Matched keyword '%s'" % s)
                    return folder
        # No match found, so return
        return None

    def file_original(self, original_filename):
        """Delegate filing of the original (pre-OCR) file to the backend."""
        return self.filer.file_original(original_filename)

    def move_to_matching_folder(self, filename):
        """File ``filename`` under the folder matched by its page text.

        :param filename: path of the PDF to scan and file
        :returns: whatever ``filer.move_to_matching_folder`` returns
        """
        # Pre-bind so a PDF with zero pages does not hit an unbound local
        # in the checks below.
        tgt_folder = None
        for page_text in self.iter_pdf_page_text(filename):
            tgt_folder = self._get_matching_folder(page_text)
            if tgt_folder:
                break  # Stop searching through pdf pages as soon as we find a match

        if not tgt_folder and self.file_using_filename:
            tgt_folder = self._get_matching_folder(filename)

        return self.filer.move_to_matching_folder(filename, tgt_folder)

if __name__ == '__main__':
    # Manual smoke run: print the extracted text of every page of a sample PDF.
    pdf_filer = PyPdfFiler(PyFilerDirs())
    page_texts = pdf_filer.iter_pdf_page_text("scan_ocr.pdf")
    for text in page_texts:
        print(text)

Loading