From 89488779506a651c5f1da570a01e35e3f5ef58f4 Mon Sep 17 00:00:00 2001 From: Jan Aage Lavik Date: Tue, 1 Feb 2011 13:04:37 +0100 Subject: [PATCH] plotextractor: fix escaping of regular expression * Fixed an issue where label-names were not escaped when used in a regular expression. * Also includes some cosmetic changes to conform with coding standards. --- modules/miscutil/lib/plotextractor.py | 58 +++++++++---------- modules/miscutil/lib/plotextractor_config.py | 1 + .../miscutil/lib/plotextractor_converter.py | 6 +- modules/miscutil/lib/plotextractor_getter.py | 2 +- .../lib/plotextractor_output_utils.py | 14 ++--- modules/miscutil/lib/plotextractor_tests.py | 10 ++-- 6 files changed, 46 insertions(+), 45 deletions(-) diff --git a/modules/miscutil/lib/plotextractor.py b/modules/miscutil/lib/plotextractor.py index 610f3723d..fc1c56098 100644 --- a/modules/miscutil/lib/plotextractor.py +++ b/modules/miscutil/lib/plotextractor.py @@ -178,16 +178,16 @@ def main(): sys.exit(1) if squash: - squash_fd, squash_path = mkstemp(suffix = "_" + time.strftime("%Y%m%d%H%M%S") + ".xml", \ - prefix = "plotextractor_", dir = sdir) + squash_fd, squash_path = mkstemp(suffix="_" + time.strftime("%Y%m%d%H%M%S") + ".xml", \ + prefix="plotextractor_", dir=sdir) os.write(squash_fd, '\n\n') os.close(squash_fd) for tarball in tars_and_gzips: - process_single(tarball, sdir = sdir, xtract_text = xtract_text, \ - upload_plots = upload_plots, force = force, squash = squash_path, \ - yes_i_know = yes_i_know, refno_url = refno_url, \ - clean = clean) + process_single(tarball, sdir=sdir, xtract_text=xtract_text, \ + upload_plots=upload_plots, force=force, squash=squash_path, \ + yes_i_know=yes_i_know, refno_url=refno_url, \ + clean=clean) if squash: squash_fd = open(squash_path, "a") squash_fd.write("\n") @@ -196,10 +196,10 @@ def main(): if upload_plots: upload_to_site(squash_path, yes_i_know) -def process_single(tarball, sdir = CFG_TMPDIR, xtract_text = False, \ - upload_plots = False, force = False, squash = "", \ - yes_i_know = False, refno_url = "", \ - clean = False): +def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False, \ + upload_plots=False, force=False, squash="", \ + yes_i_know=False, refno_url="", \ + clean=False): """ Processes one tarball end-to-end. @@ -354,7 +354,7 @@ def get_reference_number(tarball, refno_url): else: arXiv_record = tarball - result = server.search(p = prefix + arXiv_record, of = 'id') + result = server.search(p=prefix + arXiv_record, of='id') if len(result) == 0: return None @@ -364,7 +364,7 @@ def get_reference_number(tarball, refno_url): arXiv_record = re.findall('(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))', tarball) if len(arXiv_record) > 1: arXiv_record = arXiv_record[0] - result = server.search(p = prefix + arXiv_record, of = 'id') + result = server.search(p=prefix + arXiv_record, of='id') if len(result) > 0: return str(result[0]) @@ -374,7 +374,7 @@ def get_reference_number(tarball, refno_url): tarball_mod) if len(arXiv_record) > 1: arXiv_record = arXiv_record[0] - result = server.search(p = prefix + arXiv_record, of = 'id') + result = server.search(p=prefix + arXiv_record, of='id') if len(result) > 0: return str(result[0]) @@ -408,13 +408,13 @@ def rotate_image(filename, line, sdir, image_list): degrees = str(0 - int(degrees)) dummy, dummy, cmd_err = run_process_with_timeout('mogrify -rotate %s %s' % \ - (degrees, file_loc), shell = True) + (degrees, file_loc), shell=True) if cmd_err != '': return True else: return True -def get_context(lines, backwards = False): +def get_context(lines, backwards=False): """ Given a relevant string from a TeX file, this function will extract text from it as far as it is deemed contextually relevant, either backwards or forwards @@ -500,7 +500,7 @@ def extract_context(tex_file, extracted_image_data): # Generate a list of index tuples for all matches indicies = [match.span() \ - for match in re.finditer(r"(\\(?:fig|ref)\{" + label + "\})", \ + for match in re.finditer(r"(\\(?:fig|ref)\{%s\})" % (re.escape(label),), \ lines)] for startindex, endindex in indicies: # Retrive all lines before label until beginning of file @@ -509,7 +509,7 @@ def extract_context(tex_file, extracted_image_data): text_before = lines[:startindex] else: text_before = lines[i:startindex] - context_before = get_context(text_before, backwards = True) + context_before = get_context(text_before, backwards=True) # Retrive all lines from label until end of file and get context i = endindex + CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT @@ -519,7 +519,7 @@ def extract_context(tex_file, extracted_image_data): new_image_data.append((image, caption, label, context_list)) return new_image_data -def extract_captions(tex_file, sdir, image_list, primary = True): +def extract_captions(tex_file, sdir, image_list, primary=True): """ Take the TeX file and the list of images in the tarball (which all, presumably, are used in the TeX file) and figure out which captions @@ -642,18 +642,18 @@ def extract_captions(tex_file, sdir, image_list, primary = True): ext = True else: ext = False - filenames = intelligently_find_filenames(line, ext = ext, - commas_okay = commas_okay) + filenames = intelligently_find_filenames(line, ext=ext, + commas_okay=commas_okay) # try to look ahead! sometimes there are better matches after if line_index < len(lines) - 1: filenames.extend(\ intelligently_find_filenames(lines[line_index + 1], - commas_okay = commas_okay)) + commas_okay=commas_okay)) if line_index < len(lines) - 2: filenames.extend(\ intelligently_find_filenames(lines[line_index + 2], - commas_okay = commas_okay)) + commas_okay=commas_okay)) for filename in filenames: filename = str(filename) @@ -674,14 +674,14 @@ def extract_captions(tex_file, sdir, image_list, primary = True): if index > -1: # which is the image associated to it? filenames = intelligently_find_filenames(line, - commas_okay = commas_okay) + commas_okay=commas_okay) # try the line after and the line before if line_index + 1 < len(lines): filenames.extend(intelligently_find_filenames(lines[line_index + 1], - commas_okay = commas_okay)) + commas_okay=commas_okay)) if line_index > 1: filenames.extend(intelligently_find_filenames(lines[line_index - 1], - commas_okay = commas_okay)) + commas_okay=commas_okay)) already_tried = [] for filename in filenames: @@ -721,8 +721,8 @@ def extract_captions(tex_file, sdir, image_list, primary = True): """ index = line.find(input_head) if index > -1: - new_tex_names = intelligently_find_filenames(line, TeX = True, \ - commas_okay = commas_okay) + new_tex_names = intelligently_find_filenames(line, TeX=True, \ + commas_okay=commas_okay) for new_tex_name in new_tex_names: if new_tex_name != 'ERROR': @@ -731,7 +731,7 @@ def extract_captions(tex_file, sdir, image_list, primary = True): extracted_image_data.extend(extract_captions(\ new_tex_file, sdir, \ image_list, - primary = False)) + primary=False)) """PICTURE""" @@ -1112,7 +1112,7 @@ def put_it_together(cur_image, caption, context, extracted_image_data, line_inde return (cur_image, caption, extracted_image_data) -def intelligently_find_filenames(line, TeX = False, ext = False, commas_okay = False): +def intelligently_find_filenames(line, TeX=False, ext=False, commas_okay=False): """ Find the filename in the line. We don't support all filenames! Just eps and ps for now. diff --git a/modules/miscutil/lib/plotextractor_config.py b/modules/miscutil/lib/plotextractor_config.py index 99efe0330..cf907bc0e 100644 --- a/modules/miscutil/lib/plotextractor_config.py +++ b/modules/miscutil/lib/plotextractor_config.py @@ -57,3 +57,4 @@ ## CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT -- when extracting context of plots from ## TeX sources, this is the limitation of sentences in each direction. Default 2. CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT = 2 + diff --git a/modules/miscutil/lib/plotextractor_converter.py b/modules/miscutil/lib/plotextractor_converter.py index d636c759a..aed192ca3 100644 --- a/modules/miscutil/lib/plotextractor_converter.py +++ b/modules/miscutil/lib/plotextractor_converter.py @@ -47,7 +47,7 @@ def untar(original_tarball, sdir): run_shell_command('rm %s', (tarball,)) return ([], [], None) dummy1, cmd_out, cmd_err = run_process_with_timeout('tar xvf %s -C %s' % - (tarball, sdir), shell = True) + (tarball, sdir), shell=True) if cmd_err != '': return ([], [], None) @@ -184,7 +184,7 @@ def convert_images(image_list): try: dummy1, cmd_out, cmd_err = run_process_with_timeout('convert %s %s'\ % (image_file, \ - converted_image_file), shell = True) + converted_image_file), shell=True) if cmd_err == '': ret_list.append(converted_image_file) else: @@ -206,7 +206,7 @@ def extract_text(tarball): try: os.stat(tarball + '.pdf') dummy1, dummy2, cmd_err = run_process_with_timeout('pdftotext %s %s' % \ - (tarball + '.pdf ', tarball + '.txt'), shell = True) + (tarball + '.pdf ', tarball + '.txt'), shell=True) if cmd_err != '': return - 1 write_message('generated ' + tarball + '.txt from ' + tarball + '.pdf') diff --git a/modules/miscutil/lib/plotextractor_getter.py b/modules/miscutil/lib/plotextractor_getter.py index bad1e7a71..07faebc32 100644 --- a/modules/miscutil/lib/plotextractor_getter.py +++ b/modules/miscutil/lib/plotextractor_getter.py @@ -260,7 +260,7 @@ def tarballs_by_recids(recids, sdir): for recid in list_of_ids: rec = get_record(recid) - for afieldinstance in record_get_field_instances(rec, tag = '037'): + for afieldinstance in record_get_field_instances(rec, tag='037'): if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]: arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0] arXiv_ids.append(arXiv_id) diff --git a/modules/miscutil/lib/plotextractor_output_utils.py b/modules/miscutil/lib/plotextractor_output_utils.py index f94cb0c28..a6941dea9 100644 --- a/modules/miscutil/lib/plotextractor_output_utils.py +++ b/modules/miscutil/lib/plotextractor_output_utils.py @@ -161,7 +161,7 @@ def assemble_caption(begin_line, begin_index, end_line, end_index, lines): # clean out characters not allowed in MARCXML # not allowed: & < > try: - caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash = True) + caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash=True) except: # that damn encode thing threw an error on astro-ph/0601014 sys.stderr.write(caption) sys.stderr.write(' cannot be processed\n') @@ -332,7 +332,7 @@ def create_MARC(extracted_image_data, tarball, refno): marcxml.append('') return '\n'.join(marcxml) -def get_image_location(image, sdir, image_list, recurred = False): +def get_image_location(image, sdir, image_list, recurred=False): """ This function takes a raw image name and a directory and returns the location of the (possibly converted) image @@ -446,17 +446,17 @@ def get_image_location(image, sdir, image_list, recurred = False): # agh, this calls for drastic measures for piece in image.split(' '): - res = get_image_location(piece, sdir, image_list, recurred = True) + res = get_image_location(piece, sdir, image_list, recurred=True) if res != None: return res for piece in image.split(','): - res = get_image_location(piece, sdir, image_list, recurred = True) + res = get_image_location(piece, sdir, image_list, recurred=True) if res != None: return res for piece in image.split('='): - res = get_image_location(piece, sdir, image_list, recurred = True) + res = get_image_location(piece, sdir, image_list, recurred=True) if res != None: return res @@ -492,7 +492,7 @@ def get_converted_image_name(image): return os.path.join(img_dir, converted_image) -def get_tex_location(new_tex_name, current_tex_name, recurred = False): +def get_tex_location(new_tex_name, current_tex_name, recurred=False): """ Takes the name of a TeX file and attempts to match it to an actual file in the tarball. @@ -556,6 +556,6 @@ def get_tex_location(new_tex_name, current_tex_name, recurred = False): if tex_location == None and not recurred: return get_tex_location(new_tex_name + '.tex', current_tex_name, \ - recurred = True) + recurred=True) return tex_location diff --git a/modules/miscutil/lib/plotextractor_tests.py b/modules/miscutil/lib/plotextractor_tests.py index cdf5be188..4fbd127d1 100644 --- a/modules/miscutil/lib/plotextractor_tests.py +++ b/modules/miscutil/lib/plotextractor_tests.py @@ -227,28 +227,28 @@ def test_simple_test(self): """plotextractor - intelligently_find_filenames simple""" line = 'file.eps' - filenames = intelligently_find_filenames(line, ext = True) + filenames = intelligently_find_filenames(line, ext=True) self.assertTrue(filenames == ['file.eps'], 'didn\'t find correct filenames') def test_ext_test(self): """plotextractor - intelligently_find_filenames extension""" line = 'file.eps file2' - filenames = intelligently_find_filenames(line, ext = True) + filenames = intelligently_find_filenames(line, ext=True) self.assertTrue(filenames == ['file.eps'], 'didn\'t look for extension') def test_tex_test(self): """plotextractor - intelligently_find_filenames TeX extension""" line = 'file.eps file2.tex' - filenames = intelligently_find_filenames(line, TeX = True) + filenames = intelligently_find_filenames(line, TeX=True) self.assertTrue(filenames == ['file.eps', 'file2.tex'], 'not looking for TeX ext') def test_file_equals_test(self): """plotextractor - intelligently_find_filenames equals""" line = 'file=something.eps' - filenames = intelligently_find_filenames(line, ext = True) + filenames = intelligently_find_filenames(line, ext=True) self.assertTrue(filenames == ['something.eps', 'file=something.eps'], \ 'didn\'t catch file=') @@ -264,7 +264,7 @@ def test_lots_of_filenames(self): """plotextractor - intelligently_find_filenames lots of filenames""" line = '[file.pstex]figure=something.eps,haha,anotherthing.ps' - filenames = intelligently_find_filenames(line, ext = True) + filenames = intelligently_find_filenames(line, ext=True) self.assertTrue('file.pstex' in filenames, 'didn\'t look in brackets') self.assertTrue('something.eps' in filenames, 'didn\'t find figure=') self.assertTrue('anotherthing.ps' in filenames, 'didn\'t find filename')