4242import os
4343import re
4444from sys import argv , getdefaultencoding , stdin
45- from typing import Pattern
45+ from typing import Pattern , Union
46+ from xml .etree import ElementTree
4647from zipfile import BadZipFile , ZipFile
4748import zlib
4849
4950from rich_argparse import RawDescriptionRichHelpFormatter
5051
5152from cardinal_pythonlib .logs import (
52- BraceStyleAdapter ,
5353 main_only_quicksetup_rootlogger ,
5454)
5555from cardinal_pythonlib .fileops import gen_filenames
5656
57- log = BraceStyleAdapter ( logging .getLogger (__name__ ) )
57+ log = logging .getLogger (__name__ )
5858
5959
6060def report_hit_filename (
@@ -87,7 +87,10 @@ def report_miss_filename(zipfilename: str) -> None:
8787
8888
8989def report_line (
90- zipfilename : str , contentsfilename : str , line : str , show_inner_file : bool
90+ zipfilename : str ,
91+ contentsfilename : str ,
92+ line : Union [bytes , str ],
93+ show_inner_file : bool ,
9194) -> None :
9295 """
9396 Prints a line from a file, with the ``.zip`` filename and optionally also
@@ -113,90 +116,176 @@ def parse_zip(
113116 files_with_matches : bool ,
114117 files_without_match : bool ,
115118 grep_inner_file_name : bool ,
119+ grep_raw_text : bool ,
116120 show_inner_file : bool ,
117121) -> None :
118122 """
119123 Implement a "grep within an OpenXML file" for a single OpenXML file, which
120124 is by definition a ``.zip`` file.
121125
122126 Args:
123- zipfilename: name of the OpenXML (zip) file
124- regex: regular expression to match
125- invert_match: find files that do NOT match, instead of ones that do?
126- files_with_matches: show filenames of files with a match?
127- files_without_match: show filenames of files with no match?
128- grep_inner_file_name: search the names of "inner" files, rather than
129- their contents?
130- show_inner_file: show the names of the "inner" files, not just the
131- "outer" (OpenXML) file?
127+ zipfilename:
128+ Name of the OpenXML (zip) file.
129+ regex:
130+ Regular expression to match.
131+ invert_match:
132+ Find files that do NOT match, instead of ones that do?
133+ files_with_matches:
134+ Show filenames of OpenXML (zip) files with a match?
135+ files_without_match:
136+ Show filenames of OpenXML (zip) files with no match?
137+ grep_inner_file_name:
138+ Search the names of "inner" files, rather than their contents?
139+ grep_raw_text:
140+ Search the raw text, not the XML node text contents.
141+ show_inner_file:
142+ Show the names of the "inner" files, not just the "outer" (OpenXML)
143+ file?
132144 """
145+ # Check arguments
133146 assert not (files_without_match and files_with_matches )
134- report_lines = (not files_without_match ) and (not files_with_matches )
135- report_hit_lines = report_lines and not invert_match
136- report_miss_lines = report_lines and invert_match
137- log .debug ("Checking ZIP: " + zipfilename )
147+ assert not (grep_inner_file_name and grep_raw_text )
148+
149+ # Precalculate some reporting flags
150+ _report_lines = (not files_without_match ) and (not files_with_matches )
151+ report_hit_lines = _report_lines and not invert_match
152+ report_miss_lines = _report_lines and invert_match
153+
154+ log .debug (f"Checking OpenXML ZIP: { zipfilename } " )
138155 found_in_zip = False
156+ # ... Have we found something in this zip file? May be used for early
157+ # abort.
158+
159+ def _report (
160+ _found_in_zip : bool ,
161+ _found_locally : bool ,
162+ _contentsfilename : str ,
163+ _to_report : Union [bytes , str ],
164+ ) -> bool :
165+ """
166+ Reporting function. This gets called more often than you might think,
167+ including for lines that do not need reporting, but this is to simplify
168+ the handling of "invert_match" (which may require all non-match lines
169+ to be reported).
170+
171+ Arguments:
172+ _found_in_zip:
173+ Have we found a match in this ZIP file?
174+ _found_locally:
175+ Have we found a match in a current line?
176+ _contentsfilename:
177+ The name of the inner file we are currently searching.
178+ _to_report:
179+ The text (usually a line, possibly the inner filename) that
180+ should be reported, if we report something. It might be
181+ matching text, or non-matching text.
182+
183+ Returns:
184+ Are we done for this ZIP file (should the outer function return)?
185+ """
186+ if files_with_matches and _found_in_zip :
187+ report_hit_filename (
188+ zipfilename , _contentsfilename , show_inner_file
189+ )
190+ return True
191+ if (report_hit_lines and _found_locally ) or (
192+ report_miss_lines and not _found_locally
193+ ):
194+ report_line (
195+ zipfilename ,
196+ _contentsfilename ,
197+ _to_report ,
198+ show_inner_file ,
199+ )
200+ return False
201+
139202 try :
140203 with ZipFile (zipfilename , "r" ) as zf :
204+ # Iterate through inner files
141205 for contentsfilename in zf .namelist ():
142- log .debug ("... checking file: " + contentsfilename )
206+ log .debug (f "... checking inner file: { contentsfilename } " )
143207 if grep_inner_file_name :
208+ # ---------------------------------------------------------
209+ # Search the (inner) filename
210+ # ---------------------------------------------------------
211+ log .debug ("... ... searching filename" )
144212 found_in_filename = bool (regex .search (contentsfilename ))
145213 found_in_zip = found_in_zip or found_in_filename
146- if files_with_matches and found_in_zip :
147- report_hit_filename (
148- zipfilename , contentsfilename , show_inner_file
149- )
214+ done = _report (
215+ _found_in_zip = found_in_zip ,
216+ _found_locally = found_in_filename ,
217+ _contentsfilename = contentsfilename ,
218+ _to_report = contentsfilename ,
219+ )
220+ if done :
150221 return
151- if (report_hit_lines and found_in_filename ) or (
152- report_miss_lines and not found_in_filename
153- ):
154- report_line (
155- zipfilename ,
156- contentsfilename ,
157- contentsfilename ,
158- show_inner_file ,
159- )
160- else :
222+ elif grep_raw_text :
223+ # ---------------------------------------------------------
224+ # Search textually, line by line
225+ # ---------------------------------------------------------
226+ # log.debug("... ... searching plain text")
161227 try :
162228 with zf .open (contentsfilename , "r" ) as file :
163229 try :
164230 for line in file .readlines ():
165- # log.debug( "line: {!r}", line)
231+ # "line" is of type "bytes"
166232 found_in_line = bool (regex .search (line ))
167233 found_in_zip = (
168234 found_in_zip or found_in_line
169235 )
170- if files_with_matches and found_in_zip :
171- report_hit_filename (
172- zipfilename ,
173- contentsfilename ,
174- show_inner_file ,
175- )
236+ done = _report (
237+ _found_in_zip = found_in_zip ,
238+ _found_locally = found_in_line ,
239+ _contentsfilename = contentsfilename ,
240+ _to_report = line ,
241+ )
242+ if done :
176243 return
177- if (
178- report_hit_lines and found_in_line
179- ) or (
180- report_miss_lines and not found_in_line
181- ):
182- report_line (
183- zipfilename ,
184- contentsfilename ,
185- line ,
186- show_inner_file ,
187- )
188244 except EOFError :
189245 pass
190246 except RuntimeError as e :
191247 log .warning (
192- "RuntimeError whilst processing {} [{}]: probably "
193- "encrypted contents; error was {!r}" ,
194- zipfilename ,
195- contentsfilename ,
196- e ,
248+ f"RuntimeError whilst processing { zipfilename } "
249+ f"[{ contentsfilename } ]: probably encrypted "
250+ f"contents; error was { e !r} "
251+ )
252+ else :
253+ # ---------------------------------------------------------
254+ # Search the text contents of XML
255+ # ---------------------------------------------------------
256+ # log.debug("... ... searching XML contents")
257+ try :
258+ with zf .open (contentsfilename , "r" ) as file :
259+ data_str = file .read ()
260+ try :
261+ tree = ElementTree .fromstring (data_str )
262+ except ElementTree .ParseError :
263+ log .debug (
264+ f"... ... skipping (not XML): "
265+ f"{ contentsfilename } "
266+ )
267+ for elem in tree .iter ():
268+ line = elem .text
269+ if not line :
270+ continue
271+ found_in_line = bool (regex .search (line ))
272+ found_in_zip = found_in_zip or found_in_line
273+ done = _report (
274+ _found_in_zip = found_in_zip ,
275+ _found_locally = found_in_line ,
276+ _contentsfilename = contentsfilename ,
277+ _to_report = line ,
278+ )
279+ if done :
280+ return
281+ except RuntimeError as e :
282+ log .warning (
283+ f"RuntimeError whilst processing { zipfilename } "
284+ f"[{ contentsfilename } ]: probably encrypted "
285+ f"contents; error was { e !r} "
197286 )
198287 except (zlib .error , BadZipFile ) as e :
199- log .debug ("Invalid zip: {}; error was {!r}" , zipfilename , e )
288+ log .debug (f "Invalid zip: { zipfilename } ; error was { e !r} " )
200289 if files_without_match and not found_in_zip :
201290 report_miss_filename (zipfilename )
202291
@@ -274,6 +363,11 @@ def main() -> None:
274363 action = "store_true" ,
275364 help = "Search the NAMES of the inner files, not their contents." ,
276365 )
366+ parser .add_argument (
367+ "--grep_raw_text" ,
368+ action = "store_true" ,
369+ help = "Search the raw text, not the XML node text contents." ,
370+ )
277371 parser .add_argument (
278372 "--show_inner_file" ,
279373 action = "store_true" ,
@@ -303,45 +397,61 @@ def main() -> None:
303397 "command line, but not both"
304398 )
305399
400+ if args .grep_raw_text and args .grep_inner_file_name :
401+ raise ValueError (
402+ "Can't specify both --grep_raw_text and --grep_inner_file_name"
403+ )
404+
306405 # Compile regular expression
307- if args .grep_inner_file_name :
308- final_pattern = args .pattern
309- else :
406+ if args .grep_raw_text :
407+ # Create a regex for type: bytes
310408 encoding = getdefaultencoding ()
311409 final_pattern = args .pattern .encode (encoding )
410+ else :
411+ # Create a regex for type: str
412+ final_pattern = args .pattern
312413 flags = re .IGNORECASE if args .ignore_case else 0
313414 log .debug (
314- "Using regular expression {!r} with flags {!r}" , final_pattern , flags
415+ f "Using regular expression { final_pattern !r} with flags { flags !r} "
315416 )
316417 regex = re .compile (final_pattern , flags )
317418
318- # Set up pool for parallel processing
319- pool = multiprocessing .Pool (processes = args .nprocesses )
320-
321419 # Iterate through files
420+ # - Common arguments
322421 parse_kwargs = dict (
323422 regex = regex ,
324423 invert_match = args .invert_match ,
325424 files_with_matches = args .files_with_matches ,
326425 files_without_match = args .files_without_match ,
327426 grep_inner_file_name = args .grep_inner_file_name ,
427+ grep_raw_text = args .grep_raw_text ,
328428 show_inner_file = args .show_inner_file ,
329429 )
430+ # - Filenames, as iterator
330431 if args .filenames_from_stdin :
331- for line in stdin .readlines ():
332- zipfilename = line .strip ()
333- parallel_kwargs = {"zipfilename" : zipfilename }
334- parallel_kwargs .update (** parse_kwargs )
335- pool .apply_async (parse_zip , [], parallel_kwargs )
432+ zipfilename_it = (line .strip () for line in stdin .readlines ())
336433 else :
337- for zipfilename in gen_filenames (
434+ zipfilename_it = gen_filenames (
338435 starting_filenames = args .filename , recursive = args .recursive
339- ):
340- parallel_kwargs = {"zipfilename" : zipfilename }
341- parallel_kwargs .update (** parse_kwargs )
342- pool .apply_async (parse_zip , [], parallel_kwargs )
436+ )
437+ # - Combined arguments, as iterator
438+ arg_it = (
439+ dict (zipfilename = zipfilename , ** parse_kwargs )
440+ for zipfilename in zipfilename_it
441+ )
442+ # - Set up pool for parallel processing
443+ pool = multiprocessing .Pool (processes = args .nprocesses )
444+ # - Launch in parallel
445+ jobs = [pool .apply_async (parse_zip , [], kwargs ) for kwargs in arg_it ]
446+ # - Stop entry to the pool (close) and wait for children (join).
447+ # https://stackoverflow.com/questions/38271547/
343448 pool .close ()
344449 pool .join ()
450+ # - Collect results, re-raising any exceptions.
451+ # (Otherwise they will be invisible.)
452+ # https://stackoverflow.com/questions/6728236/
453+ for j in jobs :
454+ j .get ()
345455
346456
347457if __name__ == "__main__" :
0 commit comments