Skip to content

Commit 63e3b6e

Browse files
bugfix relating to pdfminer when pdftotext not available, and some related tidying
1 parent e9ad64e commit 63e3b6e

File tree

3 files changed

+40
-149
lines changed

3 files changed

+40
-149
lines changed

cardinal_pythonlib/extract_text.py

Lines changed: 34 additions & 148 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,14 @@
2929
3030
.. code-block:: bash
3131
32-
sudo apt-get install antiword
33-
pip install docx pdfminer
32+
sudo apt-get install antiword # required for DOC
33+
sudo apt-get install poppler-utils # provides pdftotext; optional, but best way for PDF
34+
sudo apt-get install binutils # provides strings; strings/strings2 needed as generic fallback
35+
sudo apt-get install strings2 # as above
36+
sudo apt-get install unrtf # required for RTF
37+
38+
pip install chardet # improves character type detection
39+
pip install pdfminer.six # optional; backup option for PDF
3440
3541
- Author: Rudolf Cardinal (rudolf@pobox.com)
3642
- Created: Feb 2015
@@ -71,7 +77,7 @@
7177
# =============================================================================
7278

7379
import argparse
74-
from io import StringIO # Python 3
80+
from io import StringIO
7581
import io
7682
import logging
7783
import os
@@ -88,7 +94,6 @@
8894
Iterator,
8995
List,
9096
Optional,
91-
Union,
9297
)
9398
from xml.etree import ElementTree as ElementTree
9499

@@ -101,8 +106,6 @@
101106
import prettytable
102107
from semantic_version import Version
103108

104-
# import texttable # ... can't deal with Unicode properly
105-
106109
from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler
107110

108111
try:
@@ -112,47 +115,14 @@
112115
chardet = None
113116
UniversalDetector = None
114117

115-
try:
116-
# noinspection PyPackageRequirements
117-
import docx # pip install python-docx (NOT docx) - BUT python-docx requires lxml which has C dependencies # noqa: E501
118-
119-
# noinspection PyPackageRequirements
120-
import docx.document
121-
122-
# noinspection PyPackageRequirements
123-
import docx.oxml.table
124-
125-
# noinspection PyPackageRequirements
126-
import docx.oxml.text.paragraph
127-
128-
# noinspection PyPackageRequirements
129-
import docx.table
130-
131-
# noinspection PyPackageRequirements
132-
import docx.text.paragraph
133-
134-
DOCX_DOCUMENT_TYPE = "docx.document.Document"
135-
DOCX_TABLE_TYPE = Union["docx.table.Table", "CustomDocxTable"]
136-
DOCX_CONTAINER_TYPE = Union[DOCX_DOCUMENT_TYPE, "docx.table._Cell"]
137-
DOCX_BLOCK_ITEM_TYPE = Union[
138-
"docx.text.paragraph.Paragraph", "docx.table.Table"
139-
]
140-
except ImportError:
141-
docx = None
142-
DOCX_DOCUMENT_TYPE = None
143-
DOCX_TABLE_TYPE = "CustomDocxTable"
144-
DOCX_CONTAINER_TYPE = None
145-
DOCX_BLOCK_ITEM_TYPE = None
146-
147-
try:
148-
import docx2txt # pip install docx2txt
149-
except ImportError:
150-
docx2txt = None
151-
152118
try:
153119
# noinspection PyPackageRequirements
154120
import pdfminer # pip install pdfminer.six
155121

122+
assert (
123+
int(pdfminer.__version__) > 20191010
124+
), "pdfminer installed but too old" # version string is e.g. '20191125'
125+
156126
# noinspection PyPackageRequirements
157127
import pdfminer.pdfinterp
158128

@@ -167,20 +137,9 @@
167137
except ImportError:
168138
pdfminer = None
169139

170-
try:
171-
# noinspection PyPackageRequirements
172-
import pyth # pip install pyth (PYTHON 2 ONLY; https://pypi.python.org/pypi/pyth/0.5.4) # noqa: E501
173-
174-
# noinspection PyPackageRequirements
175-
import pyth.plugins.rtf15.reader
176-
177-
# noinspection PyPackageRequirements
178-
import pyth.plugins.plaintext.writer
179-
except ImportError:
180-
pyth = None
181-
182140
log = get_brace_style_log_with_null_handler(__name__)
183141

142+
184143
# =============================================================================
185144
# Constants
186145
# =============================================================================
@@ -192,6 +151,7 @@
192151
SYS_ENCODING = sys.getdefaultencoding()
193152
ENCODING = "utf-8"
194153

154+
195155
# =============================================================================
196156
# External tool map
197157
# =============================================================================
@@ -542,6 +502,7 @@ def rstrip_all_lines(text: str) -> str:
542502
# PDF
543503
# =============================================================================
544504

505+
545506
# noinspection PyUnresolvedReferences,PyUnusedLocal
546507
def convert_pdf_to_txt(
547508
filename: str = None,
@@ -561,11 +522,11 @@ def convert_pdf_to_txt(
561522
elif pdfminer: # Memory-hogging method
562523
with get_filelikeobject(filename, blob) as fp:
563524
rsrcmgr = pdfminer.pdfinterp.PDFResourceManager()
564-
retstr = StringIO()
525+
str_io = StringIO()
565526
codec = ENCODING
566527
laparams = pdfminer.layout.LAParams()
567528
device = pdfminer.converter.TextConverter(
568-
rsrcmgr, retstr, codec=codec, laparams=laparams
529+
rsrcmgr, str_io, codec=codec, laparams=laparams
569530
)
570531
interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
571532
rsrcmgr, device
@@ -583,7 +544,7 @@ def convert_pdf_to_txt(
583544
check_extractable=True,
584545
):
585546
interpreter.process_page(page)
586-
text = retstr.getvalue().decode(ENCODING)
547+
text = str_io.getvalue()
587548
return text
588549
else:
589550
raise AssertionError("No PDF-reading tool available")
@@ -949,7 +910,7 @@ def wordwrap(text: str, width: int) -> str:
949910

950911

951912
def docx_process_table(
952-
table: DOCX_TABLE_TYPE, config: TextProcessingConfig
913+
table: CustomDocxTable, config: TextProcessingConfig
953914
) -> str:
954915
"""
955916
Converts a DOCX table to text.
@@ -1061,69 +1022,9 @@ def get_cell_text(cell_) -> str:
10611022

10621023

10631024
# -----------------------------------------------------------------------------
1064-
# With the docx library
1025+
# DOCX
10651026
# -----------------------------------------------------------------------------
10661027

1067-
_ = '''
1068-
# noinspection PyProtectedMember,PyUnresolvedReferences
1069-
def docx_docx_iter_block_items(parent: DOCX_CONTAINER_TYPE) \
1070-
-> Iterator[DOCX_BLOCK_ITEM_TYPE]:
1071-
"""
1072-
Iterate through items of a DOCX file.
1073-
1074-
See https://github.com/python-openxml/python-docx/issues/40.
1075-
1076-
Yield each paragraph and table child within ``parent``, in document order.
1077-
Each returned value is an instance of either :class:`Table` or
1078-
:class:`Paragraph`. ``parent`` would most commonly be a reference to a main
1079-
:class:`Document` object, but also works for a :class:`_Cell` object, which
1080-
itself can contain paragraphs and tables.
1081-
1082-
NOTE: uses internals of the ``python-docx`` (``docx``) library; subject to
1083-
change; this version works with ``docx==0.8.5``.
1084-
"""
1085-
if isinstance(parent, docx.document.Document):
1086-
parent_elm = parent.element.body
1087-
elif isinstance(parent, docx.table._Cell):
1088-
parent_elm = parent._tc
1089-
else:
1090-
raise ValueError("something's not right")
1091-
1092-
for child in parent_elm.iterchildren():
1093-
if isinstance(child, docx.oxml.text.paragraph.CT_P):
1094-
yield docx.text.paragraph.Paragraph(child, parent)
1095-
elif isinstance(child, docx.oxml.table.CT_Tbl):
1096-
yield docx.table.Table(child, parent)
1097-
1098-
1099-
# noinspection PyUnresolvedReferences
1100-
def docx_docx_gen_text(doc: DOCX_DOCUMENT_TYPE,
1101-
config: TextProcessingConfig) -> Iterator[str]:
1102-
"""
1103-
Iterate through a DOCX file and yield text.
1104-
1105-
Args:
1106-
doc: DOCX document to process
1107-
config: :class:`TextProcessingConfig` control object
1108-
1109-
Yields:
1110-
pieces of text (paragraphs)
1111-
1112-
"""
1113-
if in_order:
1114-
for thing in docx_docx_iter_block_items(doc):
1115-
if isinstance(thing, docx.text.paragraph.Paragraph):
1116-
yield docx_process_simple_text(thing.text, config.width)
1117-
elif isinstance(thing, docx.table.Table):
1118-
yield docx_process_table(thing, config)
1119-
else:
1120-
for paragraph in doc.paragraphs:
1121-
yield docx_process_simple_text(paragraph.text, config.width)
1122-
for table in doc.tables:
1123-
yield docx_process_table(table, config)
1124-
'''
1125-
1126-
11271028
# noinspection PyUnusedLocal
11281029
def convert_docx_to_text(
11291030
filename: str = None,
@@ -1211,25 +1112,12 @@ def convert_docx_to_text(
12111112
text += docx_text_from_xml(xml, config)
12121113
return text
12131114

1214-
# elif docx:
1215-
# with get_filelikeobject(filename, blob) as fp:
1216-
# # noinspection PyUnresolvedReferences
1217-
# document = docx.Document(fp)
1218-
# return '\n\n'.join(
1219-
# docx_docx_gen_text(document, config))
1220-
# elif docx2txt:
1221-
# if filename:
1222-
# return docx2txt.process(filename)
1223-
# else:
1224-
# raise NotImplementedError("docx2txt BLOB handling not written")
1225-
# else:
1226-
# raise AssertionError("No DOCX-reading tool available")
1227-
12281115

12291116
# =============================================================================
12301117
# ODT
12311118
# =============================================================================
12321119

1120+
12331121
# noinspection PyUnusedLocal
12341122
def convert_odt_to_text(
12351123
filename: str = None,
@@ -1259,6 +1147,7 @@ def convert_odt_to_text(
12591147
# HTML
12601148
# =============================================================================
12611149

1150+
12621151
# noinspection PyUnusedLocal
12631152
def convert_html_to_text(
12641153
filename: str = None,
@@ -1277,6 +1166,7 @@ def convert_html_to_text(
12771166
# XML
12781167
# =============================================================================
12791168

1169+
12801170
# noinspection PyUnusedLocal
12811171
def convert_xml_to_text(
12821172
filename: str = None,
@@ -1295,6 +1185,7 @@ def convert_xml_to_text(
12951185
# RTF
12961186
# =============================================================================
12971187

1188+
12981189
# noinspection PyUnresolvedReferences,PyUnusedLocal
12991190
def convert_rtf_to_text(
13001191
filename: str = None,
@@ -1314,13 +1205,6 @@ def convert_rtf_to_text(
13141205
return get_cmd_output(*args)
13151206
else:
13161207
return get_cmd_output_from_stdin(blob, *args)
1317-
elif pyth: # Very memory-consuming:
1318-
# https://github.com/brendonh/pyth/blob/master/pyth/plugins/rtf15/reader.py # noqa: E501
1319-
with get_filelikeobject(filename, blob) as fp:
1320-
doc = pyth.plugins.rtf15.reader.Rtf15Reader.read(fp)
1321-
return pyth.plugins.plaintext.writer.PlaintextWriter.write(
1322-
doc
1323-
).getvalue()
13241208
else:
13251209
raise AssertionError("No RTF-reading tool available")
13261210

@@ -1332,11 +1216,6 @@ def availability_rtf() -> bool:
13321216
unrtf = tools["unrtf"]
13331217
if unrtf:
13341218
return True
1335-
elif pyth:
1336-
log.warning(
1337-
"RTF conversion: unrtf missing; " "using pyth (less efficient)"
1338-
)
1339-
return True
13401219
else:
13411220
return False
13421221

@@ -1345,6 +1224,7 @@ def availability_rtf() -> bool:
13451224
# DOC
13461225
# =============================================================================
13471226

1227+
13481228
# noinspection PyUnusedLocal
13491229
def convert_doc_to_text(
13501230
filename: str = None,
@@ -1378,6 +1258,7 @@ def availability_doc() -> bool:
13781258
# Anything
13791259
# =============================================================================
13801260

1261+
13811262
# noinspection PyUnusedLocal
13821263
def convert_anything_to_text(
13831264
filename: str = None,
@@ -1523,7 +1404,7 @@ def is_text_extractor_available(extension: str) -> bool:
15231404
if info is None:
15241405
return False
15251406
availability = info[AVAILABILITY]
1526-
if type(availability) == bool:
1407+
if type(availability) is bool:
15271408
return availability
15281409
elif callable(availability):
15291410
return availability()
@@ -1551,7 +1432,6 @@ def main() -> None:
15511432
"""
15521433
Command-line processor. See ``--help`` for details.
15531434
"""
1554-
logging.basicConfig(level=logging.DEBUG)
15551435
parser = argparse.ArgumentParser(
15561436
formatter_class=argparse.ArgumentDefaultsHelpFormatter
15571437
)
@@ -1582,7 +1462,13 @@ def main() -> None:
15821462
default=DEFAULT_MIN_COL_WIDTH,
15831463
help="Minimum column width for tables",
15841464
)
1465+
parser.add_argument(
1466+
"--verbose",
1467+
action="store_true",
1468+
help="Be verbose",
1469+
)
15851470
args = parser.parse_args()
1471+
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
15861472
if args.availability:
15871473
for ext in args.availability:
15881474
if ext.lower() == "none":

cardinal_pythonlib/version_string.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,5 @@
3131
3232
"""
3333

34-
VERSION_STRING = "2.0.1"
34+
VERSION_STRING = "2.0.2"
3535
# Use semantic versioning: https://semver.org/

docs/source/changelog.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -859,3 +859,8 @@ Quick links:
859859
:func:`cardinal_pythonlib.sqlalchemy.alembic_func.get_current_revision` where
860860
since SQLAlchemy 2.0, the database connection was persisting, resulting in a
861861
metadata lock.
862+
863+
- Bugfix to :func:`cardinal_pythonlib.extract_text.convert_pdf_to_txt` for the
864+
case where ``pdftotext`` is unavailable and ``pdfminer`` is used instead. Also
865+
removed antique ``pyth`` support, shifted from unmaintained ``pdfminer`` to
866+
maintained ``pdfminer.six``, and removed unused code around importing ``docx`` and ``docx2txt``.

0 commit comments

Comments
 (0)