Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 34 additions & 148 deletions cardinal_pythonlib/extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,14 @@

.. code-block:: bash

sudo apt-get install antiword
pip install docx pdfminer
sudo apt-get install antiword # required for DOC
sudo apt-get install poppler-utils # provides pdftotext; optional, but best way for PDF
sudo apt-get install binutils # provides strings; strings/strings2 needed as generic fallback
sudo apt-get install strings2 # as above
sudo apt-get install unrtf # required for RTF

pip install chardet # improves character type detection
pip install pdfminer.six # optional, backup option for PDF

- Author: Rudolf Cardinal (rudolf@pobox.com)
- Created: Feb 2015
Expand Down Expand Up @@ -71,7 +77,7 @@
# =============================================================================

import argparse
from io import StringIO # Python 3
from io import StringIO
import io
import logging
import os
Expand All @@ -88,7 +94,6 @@
Iterator,
List,
Optional,
Union,
)
from xml.etree import ElementTree as ElementTree

Expand All @@ -101,8 +106,6 @@
import prettytable
from semantic_version import Version

# import texttable # ... can't deal with Unicode properly

from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler

try:
Expand All @@ -112,47 +115,14 @@
chardet = None
UniversalDetector = None

try:
# noinspection PyPackageRequirements
import docx # pip install python-docx (NOT docx) - BUT python-docx requires lxml which has C dependencies # noqa: E501

# noinspection PyPackageRequirements
import docx.document

# noinspection PyPackageRequirements
import docx.oxml.table

# noinspection PyPackageRequirements
import docx.oxml.text.paragraph

# noinspection PyPackageRequirements
import docx.table

# noinspection PyPackageRequirements
import docx.text.paragraph

DOCX_DOCUMENT_TYPE = "docx.document.Document"
DOCX_TABLE_TYPE = Union["docx.table.Table", "CustomDocxTable"]
DOCX_CONTAINER_TYPE = Union[DOCX_DOCUMENT_TYPE, "docx.table._Cell"]
DOCX_BLOCK_ITEM_TYPE = Union[
"docx.text.paragraph.Paragraph", "docx.table.Table"
]
except ImportError:
docx = None
DOCX_DOCUMENT_TYPE = None
DOCX_TABLE_TYPE = "CustomDocxTable"
DOCX_CONTAINER_TYPE = None
DOCX_BLOCK_ITEM_TYPE = None

try:
import docx2txt # pip install docx2txt
except ImportError:
docx2txt = None

try:
# noinspection PyPackageRequirements
import pdfminer # pip install pdfminer

assert (
int(pdfminer.__version__) > 20191010
), "pdfminer installed but too old" # version string is e.g. '20191125'

# noinspection PyPackageRequirements
import pdfminer.pdfinterp

Expand All @@ -167,20 +137,9 @@
except ImportError:
pdfminer = None

try:
# noinspection PyPackageRequirements
import pyth # pip install pyth (PYTHON 2 ONLY; https://pypi.python.org/pypi/pyth/0.5.4) # noqa: E501

# noinspection PyPackageRequirements
import pyth.plugins.rtf15.reader

# noinspection PyPackageRequirements
import pyth.plugins.plaintext.writer
except ImportError:
pyth = None

log = get_brace_style_log_with_null_handler(__name__)


# =============================================================================
# Constants
# =============================================================================
Expand All @@ -192,6 +151,7 @@
SYS_ENCODING = sys.getdefaultencoding()
ENCODING = "utf-8"


# =============================================================================
# External tool map
# =============================================================================
Expand Down Expand Up @@ -542,6 +502,7 @@ def rstrip_all_lines(text: str) -> str:
# PDF
# =============================================================================


# noinspection PyUnresolvedReferences,PyUnusedLocal
def convert_pdf_to_txt(
filename: str = None,
Expand All @@ -561,11 +522,11 @@ def convert_pdf_to_txt(
elif pdfminer: # Memory-hogging method
with get_filelikeobject(filename, blob) as fp:
rsrcmgr = pdfminer.pdfinterp.PDFResourceManager()
retstr = StringIO()
str_io = StringIO()
codec = ENCODING
laparams = pdfminer.layout.LAParams()
device = pdfminer.converter.TextConverter(
rsrcmgr, retstr, codec=codec, laparams=laparams
rsrcmgr, str_io, codec=codec, laparams=laparams
)
interpreter = pdfminer.pdfinterp.PDFPageInterpreter(
rsrcmgr, device
Expand All @@ -583,7 +544,7 @@ def convert_pdf_to_txt(
check_extractable=True,
):
interpreter.process_page(page)
text = retstr.getvalue().decode(ENCODING)
text = str_io.getvalue()
return text
else:
raise AssertionError("No PDF-reading tool available")
Expand Down Expand Up @@ -949,7 +910,7 @@ def wordwrap(text: str, width: int) -> str:


def docx_process_table(
table: DOCX_TABLE_TYPE, config: TextProcessingConfig
table: CustomDocxTable, config: TextProcessingConfig
) -> str:
"""
Converts a DOCX table to text.
Expand Down Expand Up @@ -1061,69 +1022,9 @@ def get_cell_text(cell_) -> str:


# -----------------------------------------------------------------------------
# With the docx library
# DOCX
# -----------------------------------------------------------------------------

_ = '''
# noinspection PyProtectedMember,PyUnresolvedReferences
def docx_docx_iter_block_items(parent: DOCX_CONTAINER_TYPE) \
-> Iterator[DOCX_BLOCK_ITEM_TYPE]:
"""
Iterate through items of a DOCX file.

See https://github.com/python-openxml/python-docx/issues/40.

Yield each paragraph and table child within ``parent``, in document order.
Each returned value is an instance of either :class:`Table` or
:class:`Paragraph`. ``parent`` would most commonly be a reference to a main
:class:`Document` object, but also works for a :class:`_Cell` object, which
itself can contain paragraphs and tables.

NOTE: uses internals of the ``python-docx`` (``docx``) library; subject to
change; this version works with ``docx==0.8.5``.
"""
if isinstance(parent, docx.document.Document):
parent_elm = parent.element.body
elif isinstance(parent, docx.table._Cell):
parent_elm = parent._tc
else:
raise ValueError("something's not right")

for child in parent_elm.iterchildren():
if isinstance(child, docx.oxml.text.paragraph.CT_P):
yield docx.text.paragraph.Paragraph(child, parent)
elif isinstance(child, docx.oxml.table.CT_Tbl):
yield docx.table.Table(child, parent)


# noinspection PyUnresolvedReferences
def docx_docx_gen_text(doc: DOCX_DOCUMENT_TYPE,
config: TextProcessingConfig) -> Iterator[str]:
"""
Iterate through a DOCX file and yield text.

Args:
doc: DOCX document to process
config: :class:`TextProcessingConfig` control object

Yields:
pieces of text (paragraphs)

"""
if in_order:
for thing in docx_docx_iter_block_items(doc):
if isinstance(thing, docx.text.paragraph.Paragraph):
yield docx_process_simple_text(thing.text, config.width)
elif isinstance(thing, docx.table.Table):
yield docx_process_table(thing, config)
else:
for paragraph in doc.paragraphs:
yield docx_process_simple_text(paragraph.text, config.width)
for table in doc.tables:
yield docx_process_table(table, config)
'''


# noinspection PyUnusedLocal
def convert_docx_to_text(
filename: str = None,
Expand Down Expand Up @@ -1211,25 +1112,12 @@ def convert_docx_to_text(
text += docx_text_from_xml(xml, config)
return text

# elif docx:
# with get_filelikeobject(filename, blob) as fp:
# # noinspection PyUnresolvedReferences
# document = docx.Document(fp)
# return '\n\n'.join(
# docx_docx_gen_text(document, config))
# elif docx2txt:
# if filename:
# return docx2txt.process(filename)
# else:
# raise NotImplementedError("docx2txt BLOB handling not written")
# else:
# raise AssertionError("No DOCX-reading tool available")


# =============================================================================
# ODT
# =============================================================================


# noinspection PyUnusedLocal
def convert_odt_to_text(
filename: str = None,
Expand Down Expand Up @@ -1259,6 +1147,7 @@ def convert_odt_to_text(
# HTML
# =============================================================================


# noinspection PyUnusedLocal
def convert_html_to_text(
filename: str = None,
Expand All @@ -1277,6 +1166,7 @@ def convert_html_to_text(
# XML
# =============================================================================


# noinspection PyUnusedLocal
def convert_xml_to_text(
filename: str = None,
Expand All @@ -1295,6 +1185,7 @@ def convert_xml_to_text(
# RTF
# =============================================================================


# noinspection PyUnresolvedReferences,PyUnusedLocal
def convert_rtf_to_text(
filename: str = None,
Expand All @@ -1314,13 +1205,6 @@ def convert_rtf_to_text(
return get_cmd_output(*args)
else:
return get_cmd_output_from_stdin(blob, *args)
elif pyth: # Very memory-consuming:
# https://github.com/brendonh/pyth/blob/master/pyth/plugins/rtf15/reader.py # noqa: E501
with get_filelikeobject(filename, blob) as fp:
doc = pyth.plugins.rtf15.reader.Rtf15Reader.read(fp)
return pyth.plugins.plaintext.writer.PlaintextWriter.write(
doc
).getvalue()
else:
raise AssertionError("No RTF-reading tool available")

Expand All @@ -1332,11 +1216,6 @@ def availability_rtf() -> bool:
unrtf = tools["unrtf"]
if unrtf:
return True
elif pyth:
log.warning(
"RTF conversion: unrtf missing; " "using pyth (less efficient)"
)
return True
else:
return False

Expand All @@ -1345,6 +1224,7 @@ def availability_rtf() -> bool:
# DOC
# =============================================================================


# noinspection PyUnusedLocal
def convert_doc_to_text(
filename: str = None,
Expand Down Expand Up @@ -1378,6 +1258,7 @@ def availability_doc() -> bool:
# Anything
# =============================================================================


# noinspection PyUnusedLocal
def convert_anything_to_text(
filename: str = None,
Expand Down Expand Up @@ -1523,7 +1404,7 @@ def is_text_extractor_available(extension: str) -> bool:
if info is None:
return False
availability = info[AVAILABILITY]
if type(availability) == bool:
if type(availability) is bool:
return availability
elif callable(availability):
return availability()
Expand Down Expand Up @@ -1551,7 +1432,6 @@ def main() -> None:
"""
Command-line processor. See ``--help`` for details.
"""
logging.basicConfig(level=logging.DEBUG)
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
Expand Down Expand Up @@ -1582,7 +1462,13 @@ def main() -> None:
default=DEFAULT_MIN_COL_WIDTH,
help="Minimum column width for tables",
)
parser.add_argument(
"--verbose",
action="store_true",
help="Be verbose",
)
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if args.availability:
for ext in args.availability:
if ext.lower() == "none":
Expand Down
2 changes: 1 addition & 1 deletion cardinal_pythonlib/version_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@

"""

VERSION_STRING = "2.0.1"
VERSION_STRING = "2.0.2"
# Use semantic versioning: https://semver.org/
5 changes: 5 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -859,3 +859,8 @@ Quick links:
:func:`cardinal_pythonlib.sqlalchemy.alembic_func.get_current_revision` where
since SQLAlchemy 2.0, the database connection was persisting, resulting in a
metadata lock.

- Bugfix to :func:`cardinal_pythonlib.extract_text.convert_pdf_to_txt` for the
case where ``pdftotext`` is unavailable. Also removed antique ``pyth`` support,
shifted from the unmaintained ``pdfminer`` to the maintained ``pdfminer.six``,
and removed unused code around importing ``docx`` and ``docx2txt``.