Skip to content

Commit 271269f

Browse files
Improve grep_in_openxml
1 parent 2baec41 commit 271269f

File tree

3 files changed

+190
-74
lines changed

3 files changed

+190
-74
lines changed

cardinal_pythonlib/openxml/grep_in_openxml.py

Lines changed: 183 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -42,19 +42,19 @@
4242
import os
4343
import re
4444
from sys import argv, getdefaultencoding, stdin
45-
from typing import Pattern
45+
from typing import Pattern, Union
46+
from xml.etree import ElementTree
4647
from zipfile import BadZipFile, ZipFile
4748
import zlib
4849

4950
from rich_argparse import RawDescriptionRichHelpFormatter
5051

5152
from cardinal_pythonlib.logs import (
52-
BraceStyleAdapter,
5353
main_only_quicksetup_rootlogger,
5454
)
5555
from cardinal_pythonlib.fileops import gen_filenames
5656

57-
log = BraceStyleAdapter(logging.getLogger(__name__))
57+
log = logging.getLogger(__name__)
5858

5959

6060
def report_hit_filename(
@@ -87,7 +87,10 @@ def report_miss_filename(zipfilename: str) -> None:
8787

8888

8989
def report_line(
90-
zipfilename: str, contentsfilename: str, line: str, show_inner_file: bool
90+
zipfilename: str,
91+
contentsfilename: str,
92+
line: Union[bytes, str],
93+
show_inner_file: bool,
9194
) -> None:
9295
"""
9396
Prints a line from a file, with the ``.zip`` filename and optionally also
@@ -113,90 +116,176 @@ def parse_zip(
113116
files_with_matches: bool,
114117
files_without_match: bool,
115118
grep_inner_file_name: bool,
119+
grep_raw_text: bool,
116120
show_inner_file: bool,
117121
) -> None:
118122
"""
119123
Implement a "grep within an OpenXML file" for a single OpenXML file, which
120124
is by definition a ``.zip`` file.
121125
122126
Args:
123-
zipfilename: name of the OpenXML (zip) file
124-
regex: regular expression to match
125-
invert_match: find files that do NOT match, instead of ones that do?
126-
files_with_matches: show filenames of files with a match?
127-
files_without_match: show filenames of files with no match?
128-
grep_inner_file_name: search the names of "inner" files, rather than
129-
their contents?
130-
show_inner_file: show the names of the "inner" files, not just the
131-
"outer" (OpenXML) file?
127+
zipfilename:
128+
Name of the OpenXML (zip) file.
129+
regex:
130+
Regular expression to match.
131+
invert_match:
132+
Find files that do NOT match, instead of ones that do?
133+
files_with_matches:
134+
Show filenames of OpenXML (zip) files with a match?
135+
files_without_match:
136+
Show filenames of OpenXML (zip) files with no match?
137+
grep_inner_file_name:
138+
Search the names of "inner" files, rather than their contents?
139+
grep_raw_text:
140+
Search the raw text, not the XML node text contents.
141+
show_inner_file:
142+
Show the names of the "inner" files, not just the "outer" (OpenXML)
143+
file?
132144
"""
145+
# Check arguments
133146
assert not (files_without_match and files_with_matches)
134-
report_lines = (not files_without_match) and (not files_with_matches)
135-
report_hit_lines = report_lines and not invert_match
136-
report_miss_lines = report_lines and invert_match
137-
log.debug("Checking ZIP: " + zipfilename)
147+
assert not (grep_inner_file_name and grep_raw_text)
148+
149+
# Precalculate some reporting flags
150+
_report_lines = (not files_without_match) and (not files_with_matches)
151+
report_hit_lines = _report_lines and not invert_match
152+
report_miss_lines = _report_lines and invert_match
153+
154+
log.debug(f"Checking OpenXML ZIP: {zipfilename}")
138155
found_in_zip = False
156+
# ... Have we found something in this zip file? May be used for early
157+
# abort.
158+
159+
def _report(
160+
_found_in_zip: bool,
161+
_found_locally: bool,
162+
_contentsfilename: str,
163+
_to_report: Union[bytes, str],
164+
) -> bool:
165+
"""
166+
Reporting function. This gets called more often than you might think,
167+
including for lines that do not need reporting, but this is to simplify
168+
the handling of "invert_match" (which may require all non-match lines
169+
to be reported).
170+
171+
Arguments:
172+
_found_in_zip:
173+
Have we found a match in this ZIP file?
174+
_found_locally:
175+
Have we found a match in the current line (or inner filename)?
176+
_contentsfilename:
177+
The name of the inner file we are currently searching.
178+
_to_report:
179+
The text (usually a line, possibly the inner filename) that
180+
should be reported, if we report something. It might be
181+
matching text, or non-matching text.
182+
183+
Returns:
184+
Are we done for this ZIP file (should the outer function return)?
185+
"""
186+
if files_with_matches and _found_in_zip:
187+
report_hit_filename(
188+
zipfilename, _contentsfilename, show_inner_file
189+
)
190+
return True
191+
if (report_hit_lines and _found_locally) or (
192+
report_miss_lines and not _found_locally
193+
):
194+
report_line(
195+
zipfilename,
196+
_contentsfilename,
197+
_to_report,
198+
show_inner_file,
199+
)
200+
return False
201+
139202
try:
140203
with ZipFile(zipfilename, "r") as zf:
204+
# Iterate through inner files
141205
for contentsfilename in zf.namelist():
142-
log.debug("... checking file: " + contentsfilename)
206+
log.debug(f"... checking inner file: {contentsfilename}")
143207
if grep_inner_file_name:
208+
# ---------------------------------------------------------
209+
# Search the (inner) filename
210+
# ---------------------------------------------------------
211+
log.debug("... ... searching filename")
144212
found_in_filename = bool(regex.search(contentsfilename))
145213
found_in_zip = found_in_zip or found_in_filename
146-
if files_with_matches and found_in_zip:
147-
report_hit_filename(
148-
zipfilename, contentsfilename, show_inner_file
149-
)
214+
done = _report(
215+
_found_in_zip=found_in_zip,
216+
_found_locally=found_in_filename,
217+
_contentsfilename=contentsfilename,
218+
_to_report=contentsfilename,
219+
)
220+
if done:
150221
return
151-
if (report_hit_lines and found_in_filename) or (
152-
report_miss_lines and not found_in_filename
153-
):
154-
report_line(
155-
zipfilename,
156-
contentsfilename,
157-
contentsfilename,
158-
show_inner_file,
159-
)
160-
else:
222+
elif grep_raw_text:
223+
# ---------------------------------------------------------
224+
# Search textually, line by line
225+
# ---------------------------------------------------------
226+
# log.debug("... ... searching plain text")
161227
try:
162228
with zf.open(contentsfilename, "r") as file:
163229
try:
164230
for line in file.readlines():
165-
# log.debug("line: {!r}", line)
231+
# "line" is of type "bytes"
166232
found_in_line = bool(regex.search(line))
167233
found_in_zip = (
168234
found_in_zip or found_in_line
169235
)
170-
if files_with_matches and found_in_zip:
171-
report_hit_filename(
172-
zipfilename,
173-
contentsfilename,
174-
show_inner_file,
175-
)
236+
done = _report(
237+
_found_in_zip=found_in_zip,
238+
_found_locally=found_in_line,
239+
_contentsfilename=contentsfilename,
240+
_to_report=line,
241+
)
242+
if done:
176243
return
177-
if (
178-
report_hit_lines and found_in_line
179-
) or (
180-
report_miss_lines and not found_in_line
181-
):
182-
report_line(
183-
zipfilename,
184-
contentsfilename,
185-
line,
186-
show_inner_file,
187-
)
188244
except EOFError:
189245
pass
190246
except RuntimeError as e:
191247
log.warning(
192-
"RuntimeError whilst processing {} [{}]: probably "
193-
"encrypted contents; error was {!r}",
194-
zipfilename,
195-
contentsfilename,
196-
e,
248+
f"RuntimeError whilst processing {zipfilename} "
249+
f"[{contentsfilename}]: probably encrypted "
250+
f"contents; error was {e!r}"
251+
)
252+
else:
253+
# ---------------------------------------------------------
254+
# Search the text contents of XML
255+
# ---------------------------------------------------------
256+
# log.debug("... ... searching XML contents")
257+
try:
258+
with zf.open(contentsfilename, "r") as file:
259+
data_str = file.read()
260+
try:
261+
tree = ElementTree.fromstring(data_str)
262+
except ElementTree.ParseError:
263+
log.debug(
264+
f"... ... skipping (not XML): "
265+
f"{contentsfilename}"
266+
)
267+
for elem in tree.iter():
268+
line = elem.text
269+
if not line:
270+
continue
271+
found_in_line = bool(regex.search(line))
272+
found_in_zip = found_in_zip or found_in_line
273+
done = _report(
274+
_found_in_zip=found_in_zip,
275+
_found_locally=found_in_line,
276+
_contentsfilename=contentsfilename,
277+
_to_report=line,
278+
)
279+
if done:
280+
return
281+
except RuntimeError as e:
282+
log.warning(
283+
f"RuntimeError whilst processing {zipfilename} "
284+
f"[{contentsfilename}]: probably encrypted "
285+
f"contents; error was {e!r}"
197286
)
198287
except (zlib.error, BadZipFile) as e:
199-
log.debug("Invalid zip: {}; error was {!r}", zipfilename, e)
288+
log.debug(f"Invalid zip: {zipfilename}; error was {e!r}")
200289
if files_without_match and not found_in_zip:
201290
report_miss_filename(zipfilename)
202291

@@ -274,6 +363,11 @@ def main() -> None:
274363
action="store_true",
275364
help="Search the NAMES of the inner files, not their contents.",
276365
)
366+
parser.add_argument(
367+
"--grep_raw_text",
368+
action="store_true",
369+
help="Search the raw text, not the XML node text contents.",
370+
)
277371
parser.add_argument(
278372
"--show_inner_file",
279373
action="store_true",
@@ -303,45 +397,61 @@ def main() -> None:
303397
"command line, but not both"
304398
)
305399

400+
if args.grep_raw_text and args.grep_inner_file_name:
401+
raise ValueError(
402+
"Can't specify both --grep_raw_text and --grep_inner_file_name"
403+
)
404+
306405
# Compile regular expression
307-
if args.grep_inner_file_name:
308-
final_pattern = args.pattern
309-
else:
406+
if args.grep_raw_text:
407+
# Create a regex for type: bytes
310408
encoding = getdefaultencoding()
311409
final_pattern = args.pattern.encode(encoding)
410+
else:
411+
# Create a regex for type: str
412+
final_pattern = args.pattern
312413
flags = re.IGNORECASE if args.ignore_case else 0
313414
log.debug(
314-
"Using regular expression {!r} with flags {!r}", final_pattern, flags
415+
f"Using regular expression {final_pattern!r} with flags {flags!r}"
315416
)
316417
regex = re.compile(final_pattern, flags)
317418

318-
# Set up pool for parallel processing
319-
pool = multiprocessing.Pool(processes=args.nprocesses)
320-
321419
# Iterate through files
420+
# - Common arguments
322421
parse_kwargs = dict(
323422
regex=regex,
324423
invert_match=args.invert_match,
325424
files_with_matches=args.files_with_matches,
326425
files_without_match=args.files_without_match,
327426
grep_inner_file_name=args.grep_inner_file_name,
427+
grep_raw_text=args.grep_raw_text,
328428
show_inner_file=args.show_inner_file,
329429
)
430+
# - Filenames, as iterator
330431
if args.filenames_from_stdin:
331-
for line in stdin.readlines():
332-
zipfilename = line.strip()
333-
parallel_kwargs = {"zipfilename": zipfilename}
334-
parallel_kwargs.update(**parse_kwargs)
335-
pool.apply_async(parse_zip, [], parallel_kwargs)
432+
zipfilename_it = (line.strip() for line in stdin.readlines())
336433
else:
337-
for zipfilename in gen_filenames(
434+
zipfilename_it = gen_filenames(
338435
starting_filenames=args.filename, recursive=args.recursive
339-
):
340-
parallel_kwargs = {"zipfilename": zipfilename}
341-
parallel_kwargs.update(**parse_kwargs)
342-
pool.apply_async(parse_zip, [], parallel_kwargs)
436+
)
437+
# - Combined arguments, as iterator
438+
arg_it = (
439+
dict(zipfilename=zipfilename, **parse_kwargs)
440+
for zipfilename in zipfilename_it
441+
)
442+
# - Set up pool for parallel processing
443+
pool = multiprocessing.Pool(processes=args.nprocesses)
444+
# - Launch in parallel
445+
jobs = [pool.apply_async(parse_zip, [], kwargs) for kwargs in arg_it]
446+
# - Stop entry to the pool (close) and wait for children (join).
447+
# https://stackoverflow.com/questions/38271547/
343448
pool.close()
344449
pool.join()
450+
# - Collect results, re-raising any exceptions.
451+
# (Otherwise they will be invisible.)
452+
# https://stackoverflow.com/questions/6728236/
453+
for j in jobs:
454+
j.get()
345455

346456

347457
if __name__ == "__main__":

cardinal_pythonlib/version_string.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,5 @@
3131
3232
"""
3333

34-
VERSION_STRING = "2.1.1"
34+
VERSION_STRING = "2.1.2"
3535
# Use semantic versioning: https://semver.org/

docs/source/changelog.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -908,3 +908,9 @@ Quick links:
908908
- Add support for Outlook ``.msg`` files with attachments processed by supported
909909
document converters (``.docx``, ``.pdf``, ``.odt`` etc.) to
910910
:func:`cardinal_pythonlib.extract_text.document_to_text`.
911+
912+
**2.1.2 (IN PROGRESS)**
913+
914+
- ``cardinalpythonlib_grep_in_openxml``: new facility to search XML node text
915+
(rather than raw file text), and this is now the default. Also, behind the
916+
scenes, exceptions in subprocesses are now reported.

0 commit comments

Comments
 (0)