plotextractor: fix escaping of regular expression

* Fixed an issue where label names were not escaped before being used in a regular expression.
* Also includes some cosmetic changes (removing the spaces around `=` in keyword arguments) to conform with coding standards.
adams164 · Feb 1, 2011 · 8948877 · 8948877
1 parent 5edccee
commit 8948877
Show file tree

Hide file tree

Showing 6 changed files with 46 additions and 45 deletions.
diff --git a/modules/miscutil/lib/plotextractor.py b/modules/miscutil/lib/plotextractor.py
@@ -178,16 +178,16 @@ def main():
         sys.exit(1)
 
     if squash:
-        squash_fd, squash_path = mkstemp(suffix = "_" + time.strftime("%Y%m%d%H%M%S") + ".xml", \
-                                  prefix = "plotextractor_", dir = sdir)
+        squash_fd, squash_path = mkstemp(suffix="_" + time.strftime("%Y%m%d%H%M%S") + ".xml", \
+                                  prefix="plotextractor_", dir=sdir)
         os.write(squash_fd, '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')
         os.close(squash_fd)
 
     for tarball in tars_and_gzips:
-        process_single(tarball, sdir = sdir, xtract_text = xtract_text, \
-                       upload_plots = upload_plots, force = force, squash = squash_path, \
-                       yes_i_know = yes_i_know, refno_url = refno_url, \
-                       clean = clean)
+        process_single(tarball, sdir=sdir, xtract_text=xtract_text, \
+                       upload_plots=upload_plots, force=force, squash=squash_path, \
+                       yes_i_know=yes_i_know, refno_url=refno_url, \
+                       clean=clean)
     if squash:
         squash_fd = open(squash_path, "a")
         squash_fd.write("</collection>\n")
@@ -196,10 +196,10 @@ def main():
         if upload_plots:
             upload_to_site(squash_path, yes_i_know)
 
-def process_single(tarball, sdir = CFG_TMPDIR, xtract_text = False, \
-                   upload_plots = False, force = False, squash = "", \
-                   yes_i_know = False, refno_url = "", \
-                   clean = False):
+def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False, \
+                   upload_plots=False, force=False, squash="", \
+                   yes_i_know=False, refno_url="", \
+                   clean=False):
     """
     Processes one tarball end-to-end.
 
@@ -354,7 +354,7 @@ def get_reference_number(tarball, refno_url):
             else:
                 arXiv_record = tarball
 
-            result = server.search(p = prefix + arXiv_record, of = 'id')
+            result = server.search(p=prefix + arXiv_record, of='id')
 
             if len(result) == 0:
                 return None
@@ -364,7 +364,7 @@ def get_reference_number(tarball, refno_url):
         arXiv_record = re.findall('(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))', tarball)
         if len(arXiv_record) > 1:
             arXiv_record = arXiv_record[0]
-            result = server.search(p = prefix + arXiv_record, of = 'id')
+            result = server.search(p=prefix + arXiv_record, of='id')
 
             if len(result) > 0:
                 return str(result[0])
@@ -374,7 +374,7 @@ def get_reference_number(tarball, refno_url):
                                   tarball_mod)
         if len(arXiv_record) > 1:
             arXiv_record = arXiv_record[0]
-            result = server.search(p = prefix + arXiv_record, of = 'id')
+            result = server.search(p=prefix + arXiv_record, of='id')
 
             if len(result) > 0:
                 return str(result[0])
@@ -408,13 +408,13 @@ def rotate_image(filename, line, sdir, image_list):
     degrees = str(0 - int(degrees))
 
     dummy, dummy, cmd_err = run_process_with_timeout('mogrify -rotate %s %s' % \
-                                                     (degrees, file_loc), shell = True)
+                                                     (degrees, file_loc), shell=True)
     if cmd_err != '':
         return True
     else:
         return True
 
-def get_context(lines, backwards = False):
+def get_context(lines, backwards=False):
     """
     Given a relevant string from a TeX file, this function will extract text
     from it as far as it is deemed contextually relevant, either backwards or forwards
@@ -500,7 +500,7 @@ def extract_context(tex_file, extracted_image_data):
 
         # Generate a list of index tuples for all matches
         indicies = [match.span() \
-                    for match in re.finditer(r"(\\(?:fig|ref)\{" + label + "\})", \
+                    for match in re.finditer(r"(\\(?:fig|ref)\{%s\})" % (re.escape(label),), \
                                                           lines)]
         for startindex, endindex in indicies:
             # Retrive all lines before label until beginning of file
@@ -509,7 +509,7 @@ def extract_context(tex_file, extracted_image_data):
                 text_before = lines[:startindex]
             else:
                 text_before = lines[i:startindex]
-            context_before = get_context(text_before, backwards = True)
+            context_before = get_context(text_before, backwards=True)
 
             # Retrive all lines from label until end of file and get context
             i = endindex + CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT
@@ -519,7 +519,7 @@ def extract_context(tex_file, extracted_image_data):
         new_image_data.append((image, caption, label, context_list))
     return new_image_data
 
-def extract_captions(tex_file, sdir, image_list, primary = True):
+def extract_captions(tex_file, sdir, image_list, primary=True):
     """
     Take the TeX file and the list of images in the tarball (which all,
     presumably, are used in the TeX file) and figure out which captions
@@ -642,18 +642,18 @@ def extract_captions(tex_file, sdir, image_list, primary = True):
                 ext = True
             else:
                 ext = False
-            filenames = intelligently_find_filenames(line, ext = ext,
-                                                     commas_okay = commas_okay)
+            filenames = intelligently_find_filenames(line, ext=ext,
+                                                     commas_okay=commas_okay)
 
             # try to look ahead!  sometimes there are better matches after
             if line_index < len(lines) - 1:
                 filenames.extend(\
                           intelligently_find_filenames(lines[line_index + 1],
-                                                      commas_okay = commas_okay))
+                                                      commas_okay=commas_okay))
             if line_index < len(lines) - 2:
                 filenames.extend(\
                           intelligently_find_filenames(lines[line_index + 2],
-                                                      commas_okay = commas_okay))
+                                                      commas_okay=commas_okay))
 
             for filename in filenames:
                 filename = str(filename)
@@ -674,14 +674,14 @@ def extract_captions(tex_file, sdir, image_list, primary = True):
         if index > -1:
             # which is the image associated to it?
             filenames = intelligently_find_filenames(line,
-                                                     commas_okay = commas_okay)
+                                                     commas_okay=commas_okay)
             # try the line after and the line before
             if line_index + 1 < len(lines):
                 filenames.extend(intelligently_find_filenames(lines[line_index + 1],
-                                                      commas_okay = commas_okay))
+                                                      commas_okay=commas_okay))
             if line_index > 1:
                 filenames.extend(intelligently_find_filenames(lines[line_index - 1],
-                                                      commas_okay = commas_okay))
+                                                      commas_okay=commas_okay))
 
             already_tried = []
             for filename in filenames:
@@ -721,8 +721,8 @@ def extract_captions(tex_file, sdir, image_list, primary = True):
         """
         index = line.find(input_head)
         if index > -1:
-            new_tex_names = intelligently_find_filenames(line, TeX = True, \
-                                                         commas_okay = commas_okay)
+            new_tex_names = intelligently_find_filenames(line, TeX=True, \
+                                                         commas_okay=commas_okay)
 
             for new_tex_name in new_tex_names:
                 if new_tex_name != 'ERROR':
@@ -731,7 +731,7 @@ def extract_captions(tex_file, sdir, image_list, primary = True):
                         extracted_image_data.extend(extract_captions(\
                                                       new_tex_file, sdir, \
                                                       image_list,
-                                                      primary = False))
+                                                      primary=False))
 
         """PICTURE"""
 
@@ -1112,7 +1112,7 @@ def put_it_together(cur_image, caption, context, extracted_image_data, line_inde
 
     return (cur_image, caption, extracted_image_data)
 
-def intelligently_find_filenames(line, TeX = False, ext = False, commas_okay = False):
+def intelligently_find_filenames(line, TeX=False, ext=False, commas_okay=False):
     """
     Find the filename in the line.  We don't support all filenames!  Just eps
     and ps for now.

diff --git a/modules/miscutil/lib/plotextractor_config.py b/modules/miscutil/lib/plotextractor_config.py
@@ -57,3 +57,4 @@
 ## CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT -- when extracting context of plots from
 ## TeX sources, this is the limitation of sentences in each direction. Default 2.
 CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT = 2
+
diff --git a/modules/miscutil/lib/plotextractor_converter.py b/modules/miscutil/lib/plotextractor_converter.py
@@ -47,7 +47,7 @@ def untar(original_tarball, sdir):
         run_shell_command('rm %s', (tarball,))
         return ([], [], None)
     dummy1, cmd_out, cmd_err = run_process_with_timeout('tar xvf %s -C %s' %
-                                                        (tarball, sdir), shell = True)
+                                                        (tarball, sdir), shell=True)
 
     if cmd_err != '':
         return ([], [], None)
@@ -184,7 +184,7 @@ def convert_images(image_list):
             try:
                 dummy1, cmd_out, cmd_err = run_process_with_timeout('convert %s %s'\
                                                    % (image_file, \
-                                                      converted_image_file), shell = True)
+                                                      converted_image_file), shell=True)
                 if cmd_err == '':
                     ret_list.append(converted_image_file)
                 else:
@@ -206,7 +206,7 @@ def extract_text(tarball):
     try:
         os.stat(tarball + '.pdf')
         dummy1, dummy2, cmd_err = run_process_with_timeout('pdftotext %s %s' % \
-                                     (tarball + '.pdf ', tarball + '.txt'), shell = True)
+                                     (tarball + '.pdf ', tarball + '.txt'), shell=True)
         if cmd_err != '':
             return - 1
         write_message('generated ' + tarball + '.txt from ' + tarball + '.pdf')

diff --git a/modules/miscutil/lib/plotextractor_getter.py b/modules/miscutil/lib/plotextractor_getter.py
@@ -260,7 +260,7 @@ def tarballs_by_recids(recids, sdir):
 
     for recid in list_of_ids:
         rec = get_record(recid)
-        for afieldinstance in record_get_field_instances(rec, tag = '037'):
+        for afieldinstance in record_get_field_instances(rec, tag='037'):
             if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
                 arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
                 arXiv_ids.append(arXiv_id)

diff --git a/modules/miscutil/lib/plotextractor_output_utils.py b/modules/miscutil/lib/plotextractor_output_utils.py
@@ -161,7 +161,7 @@ def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
     # clean out characters not allowed in MARCXML
     # not allowed: & < >
     try:
-        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash = True)
+        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash=True)
     except: # that damn encode thing threw an error on astro-ph/0601014
         sys.stderr.write(caption)
         sys.stderr.write(' cannot be processed\n')
@@ -332,7 +332,7 @@ def create_MARC(extracted_image_data, tarball, refno):
     marcxml.append('</record>')
     return '\n'.join(marcxml)
 
-def get_image_location(image, sdir, image_list, recurred = False):
+def get_image_location(image, sdir, image_list, recurred=False):
     """
     This function takes a raw image name and a directory and returns the location of the
     (possibly converted) image
@@ -446,17 +446,17 @@ def get_image_location(image, sdir, image_list, recurred = False):
 
     # agh, this calls for drastic measures
     for piece in image.split(' '):
-        res = get_image_location(piece, sdir, image_list, recurred = True)
+        res = get_image_location(piece, sdir, image_list, recurred=True)
         if res != None:
             return res
 
     for piece in image.split(','):
-        res = get_image_location(piece, sdir, image_list, recurred = True)
+        res = get_image_location(piece, sdir, image_list, recurred=True)
         if res != None:
             return res
 
     for piece in image.split('='):
-        res = get_image_location(piece, sdir, image_list, recurred = True)
+        res = get_image_location(piece, sdir, image_list, recurred=True)
         if res != None:
             return res
 
@@ -492,7 +492,7 @@ def get_converted_image_name(image):
 
     return os.path.join(img_dir, converted_image)
 
-def get_tex_location(new_tex_name, current_tex_name, recurred = False):
+def get_tex_location(new_tex_name, current_tex_name, recurred=False):
     """
     Takes the name of a TeX file and attempts to match it to an actual file
     in the tarball.
@@ -556,6 +556,6 @@ def get_tex_location(new_tex_name, current_tex_name, recurred = False):
 
     if tex_location == None and not recurred:
         return get_tex_location(new_tex_name + '.tex', current_tex_name, \
-                                recurred = True)
+                                recurred=True)
 
     return tex_location
diff --git a/modules/miscutil/lib/plotextractor_tests.py b/modules/miscutil/lib/plotextractor_tests.py
@@ -227,28 +227,28 @@ def test_simple_test(self):
         """plotextractor - intelligently_find_filenames simple"""
         line = 'file.eps'
 
-        filenames = intelligently_find_filenames(line, ext = True)
+        filenames = intelligently_find_filenames(line, ext=True)
         self.assertTrue(filenames == ['file.eps'], 'didn\'t find correct filenames')
 
     def test_ext_test(self):
         """plotextractor - intelligently_find_filenames extension"""
         line = 'file.eps file2'
 
-        filenames = intelligently_find_filenames(line, ext = True)
+        filenames = intelligently_find_filenames(line, ext=True)
         self.assertTrue(filenames == ['file.eps'], 'didn\'t look for extension')
 
     def test_tex_test(self):
         """plotextractor - intelligently_find_filenames TeX extension"""
         line = 'file.eps file2.tex'
 
-        filenames = intelligently_find_filenames(line, TeX = True)
+        filenames = intelligently_find_filenames(line, TeX=True)
         self.assertTrue(filenames == ['file.eps', 'file2.tex'], 'not looking for TeX ext')
 
     def test_file_equals_test(self):
         """plotextractor - intelligently_find_filenames equals"""
         line = 'file=something.eps'
 
-        filenames = intelligently_find_filenames(line, ext = True)
+        filenames = intelligently_find_filenames(line, ext=True)
         self.assertTrue(filenames == ['something.eps', 'file=something.eps'], \
                         'didn\'t catch file=')
 
@@ -264,7 +264,7 @@ def test_lots_of_filenames(self):
         """plotextractor - intelligently_find_filenames lots of filenames"""
         line = '[file.pstex]figure=something.eps,haha,anotherthing.ps'
 
-        filenames = intelligently_find_filenames(line, ext = True)
+        filenames = intelligently_find_filenames(line, ext=True)
         self.assertTrue('file.pstex' in filenames, 'didn\'t look in brackets')
         self.assertTrue('something.eps' in filenames, 'didn\'t find figure=')
         self.assertTrue('anotherthing.ps' in filenames, 'didn\'t find filename')