2929
3030.. code-block:: bash
3131
32- sudo apt-get install antiword
33- pip install docx pdfminer
32+ sudo apt-get install antiword # required for DOC
33+ sudo apt-get install poppler-utils # provides pdftotext; optional, but best way for PDF
34+ sudo apt-get install binutils # provides strings; strings/strings2 needed as generic fallback
35+ sudo apt-get install strings2 # as above
36+ sudo apt-get install unrtf # required for RTF
37+
38+ pip install chardet # improves character encoding detection
39+ pip install pdfminer.six # optional, backup option for PDF
3440
3541- Author: Rudolf Cardinal (rudolf@pobox.com)
3642- Created: Feb 2015
7177# =============================================================================
7278
7379import argparse
74- from io import StringIO # Python 3
80+ from io import StringIO
7581import io
7682import logging
7783import os
8894 Iterator ,
8995 List ,
9096 Optional ,
91- Union ,
9297)
9398from xml .etree import ElementTree as ElementTree
9499
101106import prettytable
102107from semantic_version import Version
103108
104- # import texttable # ... can't deal with Unicode properly
105-
106109from cardinal_pythonlib .logs import get_brace_style_log_with_null_handler
107110
108111try :
112115 chardet = None
113116 UniversalDetector = None
114117
115- try :
116- # noinspection PyPackageRequirements
117- import docx # pip install python-docx (NOT docx) - BUT python-docx requires lxml which has C dependencies # noqa: E501
118-
119- # noinspection PyPackageRequirements
120- import docx .document
121-
122- # noinspection PyPackageRequirements
123- import docx .oxml .table
124-
125- # noinspection PyPackageRequirements
126- import docx .oxml .text .paragraph
127-
128- # noinspection PyPackageRequirements
129- import docx .table
130-
131- # noinspection PyPackageRequirements
132- import docx .text .paragraph
133-
134- DOCX_DOCUMENT_TYPE = "docx.document.Document"
135- DOCX_TABLE_TYPE = Union ["docx.table.Table" , "CustomDocxTable" ]
136- DOCX_CONTAINER_TYPE = Union [DOCX_DOCUMENT_TYPE , "docx.table._Cell" ]
137- DOCX_BLOCK_ITEM_TYPE = Union [
138- "docx.text.paragraph.Paragraph" , "docx.table.Table"
139- ]
140- except ImportError :
141- docx = None
142- DOCX_DOCUMENT_TYPE = None
143- DOCX_TABLE_TYPE = "CustomDocxTable"
144- DOCX_CONTAINER_TYPE = None
145- DOCX_BLOCK_ITEM_TYPE = None
146-
147- try :
148- import docx2txt # pip install docx2txt
149- except ImportError :
150- docx2txt = None
151-
152118try :
153119 # noinspection PyPackageRequirements
154120 import pdfminer # pip install pdfminer.six
155121
122+ assert (
123+ int (pdfminer .__version__ ) > 20191010
124+ ), "pdfminer installed but too old" # version string is e.g. '20191125'
125+
156126 # noinspection PyPackageRequirements
157127 import pdfminer .pdfinterp
158128
167137except ImportError :
168138 pdfminer = None
169139
170- try :
171- # noinspection PyPackageRequirements
172- import pyth # pip install pyth (PYTHON 2 ONLY; https://pypi.python.org/pypi/pyth/0.5.4) # noqa: E501
173-
174- # noinspection PyPackageRequirements
175- import pyth .plugins .rtf15 .reader
176-
177- # noinspection PyPackageRequirements
178- import pyth .plugins .plaintext .writer
179- except ImportError :
180- pyth = None
181-
182140log = get_brace_style_log_with_null_handler (__name__ )
183141
142+
184143# =============================================================================
185144# Constants
186145# =============================================================================
192151SYS_ENCODING = sys .getdefaultencoding ()
193152ENCODING = "utf-8"
194153
154+
195155# =============================================================================
196156# External tool map
197157# =============================================================================
@@ -542,6 +502,7 @@ def rstrip_all_lines(text: str) -> str:
542502# PDF
543503# =============================================================================
544504
505+
545506# noinspection PyUnresolvedReferences,PyUnusedLocal
546507def convert_pdf_to_txt (
547508 filename : str = None ,
@@ -561,11 +522,11 @@ def convert_pdf_to_txt(
561522 elif pdfminer : # Memory-hogging method
562523 with get_filelikeobject (filename , blob ) as fp :
563524 rsrcmgr = pdfminer .pdfinterp .PDFResourceManager ()
564- retstr = StringIO ()
525+ str_io = StringIO ()
565526 codec = ENCODING
566527 laparams = pdfminer .layout .LAParams ()
567528 device = pdfminer .converter .TextConverter (
568- rsrcmgr , retstr , codec = codec , laparams = laparams
529+ rsrcmgr , str_io , codec = codec , laparams = laparams
569530 )
570531 interpreter = pdfminer .pdfinterp .PDFPageInterpreter (
571532 rsrcmgr , device
@@ -583,7 +544,7 @@ def convert_pdf_to_txt(
583544 check_extractable = True ,
584545 ):
585546 interpreter .process_page (page )
586- text = retstr .getvalue (). decode ( ENCODING )
547+ text = str_io .getvalue ()
587548 return text
588549 else :
589550 raise AssertionError ("No PDF-reading tool available" )
@@ -949,7 +910,7 @@ def wordwrap(text: str, width: int) -> str:
949910
950911
951912def docx_process_table (
952- table : DOCX_TABLE_TYPE , config : TextProcessingConfig
913+ table : CustomDocxTable , config : TextProcessingConfig
953914) -> str :
954915 """
955916 Converts a DOCX table to text.
@@ -1061,69 +1022,9 @@ def get_cell_text(cell_) -> str:
10611022
10621023
10631024# -----------------------------------------------------------------------------
1064- # With the docx library
1025+ # DOCX
10651026# -----------------------------------------------------------------------------
10661027
1067- _ = '''
1068- # noinspection PyProtectedMember,PyUnresolvedReferences
1069- def docx_docx_iter_block_items(parent: DOCX_CONTAINER_TYPE) \
1070- -> Iterator[DOCX_BLOCK_ITEM_TYPE]:
1071- """
1072- Iterate through items of a DOCX file.
1073-
1074- See https://github.com/python-openxml/python-docx/issues/40.
1075-
1076- Yield each paragraph and table child within ``parent``, in document order.
1077- Each returned value is an instance of either :class:`Table` or
1078- :class:`Paragraph`. ``parent`` would most commonly be a reference to a main
1079- :class:`Document` object, but also works for a :class:`_Cell` object, which
1080- itself can contain paragraphs and tables.
1081-
1082- NOTE: uses internals of the ``python-docx`` (``docx``) library; subject to
1083- change; this version works with ``docx==0.8.5``.
1084- """
1085- if isinstance(parent, docx.document.Document):
1086- parent_elm = parent.element.body
1087- elif isinstance(parent, docx.table._Cell):
1088- parent_elm = parent._tc
1089- else:
1090- raise ValueError("something's not right")
1091-
1092- for child in parent_elm.iterchildren():
1093- if isinstance(child, docx.oxml.text.paragraph.CT_P):
1094- yield docx.text.paragraph.Paragraph(child, parent)
1095- elif isinstance(child, docx.oxml.table.CT_Tbl):
1096- yield docx.table.Table(child, parent)
1097-
1098-
1099- # noinspection PyUnresolvedReferences
1100- def docx_docx_gen_text(doc: DOCX_DOCUMENT_TYPE,
1101- config: TextProcessingConfig) -> Iterator[str]:
1102- """
1103- Iterate through a DOCX file and yield text.
1104-
1105- Args:
1106- doc: DOCX document to process
1107- config: :class:`TextProcessingConfig` control object
1108-
1109- Yields:
1110- pieces of text (paragraphs)
1111-
1112- """
1113- if in_order:
1114- for thing in docx_docx_iter_block_items(doc):
1115- if isinstance(thing, docx.text.paragraph.Paragraph):
1116- yield docx_process_simple_text(thing.text, config.width)
1117- elif isinstance(thing, docx.table.Table):
1118- yield docx_process_table(thing, config)
1119- else:
1120- for paragraph in doc.paragraphs:
1121- yield docx_process_simple_text(paragraph.text, config.width)
1122- for table in doc.tables:
1123- yield docx_process_table(table, config)
1124- '''
1125-
1126-
11271028# noinspection PyUnusedLocal
11281029def convert_docx_to_text (
11291030 filename : str = None ,
@@ -1211,25 +1112,12 @@ def convert_docx_to_text(
12111112 text += docx_text_from_xml (xml , config )
12121113 return text
12131114
1214- # elif docx:
1215- # with get_filelikeobject(filename, blob) as fp:
1216- # # noinspection PyUnresolvedReferences
1217- # document = docx.Document(fp)
1218- # return '\n\n'.join(
1219- # docx_docx_gen_text(document, config))
1220- # elif docx2txt:
1221- # if filename:
1222- # return docx2txt.process(filename)
1223- # else:
1224- # raise NotImplementedError("docx2txt BLOB handling not written")
1225- # else:
1226- # raise AssertionError("No DOCX-reading tool available")
1227-
12281115
12291116# =============================================================================
12301117# ODT
12311118# =============================================================================
12321119
1120+
12331121# noinspection PyUnusedLocal
12341122def convert_odt_to_text (
12351123 filename : str = None ,
@@ -1259,6 +1147,7 @@ def convert_odt_to_text(
12591147# HTML
12601148# =============================================================================
12611149
1150+
12621151# noinspection PyUnusedLocal
12631152def convert_html_to_text (
12641153 filename : str = None ,
@@ -1277,6 +1166,7 @@ def convert_html_to_text(
12771166# XML
12781167# =============================================================================
12791168
1169+
12801170# noinspection PyUnusedLocal
12811171def convert_xml_to_text (
12821172 filename : str = None ,
@@ -1295,6 +1185,7 @@ def convert_xml_to_text(
12951185# RTF
12961186# =============================================================================
12971187
1188+
12981189# noinspection PyUnresolvedReferences,PyUnusedLocal
12991190def convert_rtf_to_text (
13001191 filename : str = None ,
@@ -1314,13 +1205,6 @@ def convert_rtf_to_text(
13141205 return get_cmd_output (* args )
13151206 else :
13161207 return get_cmd_output_from_stdin (blob , * args )
1317- elif pyth : # Very memory-consuming:
1318- # https://github.com/brendonh/pyth/blob/master/pyth/plugins/rtf15/reader.py # noqa: E501
1319- with get_filelikeobject (filename , blob ) as fp :
1320- doc = pyth .plugins .rtf15 .reader .Rtf15Reader .read (fp )
1321- return pyth .plugins .plaintext .writer .PlaintextWriter .write (
1322- doc
1323- ).getvalue ()
13241208 else :
13251209 raise AssertionError ("No RTF-reading tool available" )
13261210
@@ -1332,11 +1216,6 @@ def availability_rtf() -> bool:
13321216 unrtf = tools ["unrtf" ]
13331217 if unrtf :
13341218 return True
1335- elif pyth :
1336- log .warning (
1337- "RTF conversion: unrtf missing; " "using pyth (less efficient)"
1338- )
1339- return True
13401219 else :
13411220 return False
13421221
@@ -1345,6 +1224,7 @@ def availability_rtf() -> bool:
13451224# DOC
13461225# =============================================================================
13471226
1227+
13481228# noinspection PyUnusedLocal
13491229def convert_doc_to_text (
13501230 filename : str = None ,
@@ -1378,6 +1258,7 @@ def availability_doc() -> bool:
13781258# Anything
13791259# =============================================================================
13801260
1261+
13811262# noinspection PyUnusedLocal
13821263def convert_anything_to_text (
13831264 filename : str = None ,
@@ -1523,7 +1404,7 @@ def is_text_extractor_available(extension: str) -> bool:
15231404 if info is None :
15241405 return False
15251406 availability = info [AVAILABILITY ]
1526- if type (availability ) == bool :
1407+ if type (availability ) is bool :
15271408 return availability
15281409 elif callable (availability ):
15291410 return availability ()
@@ -1551,7 +1432,6 @@ def main() -> None:
15511432 """
15521433 Command-line processor. See ``--help`` for details.
15531434 """
1554- logging .basicConfig (level = logging .DEBUG )
15551435 parser = argparse .ArgumentParser (
15561436 formatter_class = argparse .ArgumentDefaultsHelpFormatter
15571437 )
@@ -1582,7 +1462,13 @@ def main() -> None:
15821462 default = DEFAULT_MIN_COL_WIDTH ,
15831463 help = "Minimum column width for tables" ,
15841464 )
1465+ parser .add_argument (
1466+ "--verbose" ,
1467+ action = "store_true" ,
1468+ help = "Be verbose" ,
1469+ )
15851470 args = parser .parse_args ()
1471+ logging .basicConfig (level = logging .DEBUG if args .verbose else logging .INFO )
15861472 if args .availability :
15871473 for ext in args .availability :
15881474 if ext .lower () == "none" :
0 commit comments