Skip to content

Commit

Permalink
plotextractor: fix escaping of regular expression
Browse files Browse the repository at this point in the history
* Fixed an issue where label-names were not escaped when used
  in a regular expression.

* Also includes some cosmetic changes to conform with coding standards.
  • Loading branch information
jalavik authored and tiborsimko committed Feb 1, 2011
1 parent 5edccee commit 8948877
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 45 deletions.
58 changes: 29 additions & 29 deletions modules/miscutil/lib/plotextractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,16 +178,16 @@ def main():
sys.exit(1)

if squash:
squash_fd, squash_path = mkstemp(suffix = "_" + time.strftime("%Y%m%d%H%M%S") + ".xml", \
prefix = "plotextractor_", dir = sdir)
squash_fd, squash_path = mkstemp(suffix="_" + time.strftime("%Y%m%d%H%M%S") + ".xml", \
prefix="plotextractor_", dir=sdir)
os.write(squash_fd, '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n')
os.close(squash_fd)

for tarball in tars_and_gzips:
process_single(tarball, sdir = sdir, xtract_text = xtract_text, \
upload_plots = upload_plots, force = force, squash = squash_path, \
yes_i_know = yes_i_know, refno_url = refno_url, \
clean = clean)
process_single(tarball, sdir=sdir, xtract_text=xtract_text, \
upload_plots=upload_plots, force=force, squash=squash_path, \
yes_i_know=yes_i_know, refno_url=refno_url, \
clean=clean)
if squash:
squash_fd = open(squash_path, "a")
squash_fd.write("</collection>\n")
Expand All @@ -196,10 +196,10 @@ def main():
if upload_plots:
upload_to_site(squash_path, yes_i_know)

def process_single(tarball, sdir = CFG_TMPDIR, xtract_text = False, \
upload_plots = False, force = False, squash = "", \
yes_i_know = False, refno_url = "", \
clean = False):
def process_single(tarball, sdir=CFG_TMPDIR, xtract_text=False, \
upload_plots=False, force=False, squash="", \
yes_i_know=False, refno_url="", \
clean=False):
"""
Processes one tarball end-to-end.
Expand Down Expand Up @@ -354,7 +354,7 @@ def get_reference_number(tarball, refno_url):
else:
arXiv_record = tarball

result = server.search(p = prefix + arXiv_record, of = 'id')
result = server.search(p=prefix + arXiv_record, of='id')

if len(result) == 0:
return None
Expand All @@ -364,7 +364,7 @@ def get_reference_number(tarball, refno_url):
arXiv_record = re.findall('(([a-zA-Z\\-]+/\\d+)|(\\d+\\.\\d+))', tarball)
if len(arXiv_record) > 1:
arXiv_record = arXiv_record[0]
result = server.search(p = prefix + arXiv_record, of = 'id')
result = server.search(p=prefix + arXiv_record, of='id')

if len(result) > 0:
return str(result[0])
Expand All @@ -374,7 +374,7 @@ def get_reference_number(tarball, refno_url):
tarball_mod)
if len(arXiv_record) > 1:
arXiv_record = arXiv_record[0]
result = server.search(p = prefix + arXiv_record, of = 'id')
result = server.search(p=prefix + arXiv_record, of='id')

if len(result) > 0:
return str(result[0])
Expand Down Expand Up @@ -408,13 +408,13 @@ def rotate_image(filename, line, sdir, image_list):
degrees = str(0 - int(degrees))

dummy, dummy, cmd_err = run_process_with_timeout('mogrify -rotate %s %s' % \
(degrees, file_loc), shell = True)
(degrees, file_loc), shell=True)
if cmd_err != '':
return True
else:
return True

def get_context(lines, backwards = False):
def get_context(lines, backwards=False):
"""
Given a relevant string from a TeX file, this function will extract text
from it as far as it is deemed contextually relevant, either backwards or forwards
Expand Down Expand Up @@ -500,7 +500,7 @@ def extract_context(tex_file, extracted_image_data):

# Generate a list of index tuples for all matches
indicies = [match.span() \
for match in re.finditer(r"(\\(?:fig|ref)\{" + label + "\})", \
for match in re.finditer(r"(\\(?:fig|ref)\{%s\})" % (re.escape(label),), \
lines)]
for startindex, endindex in indicies:
# Retrieve all lines before label until beginning of file
Expand All @@ -509,7 +509,7 @@ def extract_context(tex_file, extracted_image_data):
text_before = lines[:startindex]
else:
text_before = lines[i:startindex]
context_before = get_context(text_before, backwards = True)
context_before = get_context(text_before, backwards=True)

# Retrieve all lines from label until end of file and get context
i = endindex + CFG_PLOTEXTRACTOR_CONTEXT_EXTRACT_LIMIT
Expand All @@ -519,7 +519,7 @@ def extract_context(tex_file, extracted_image_data):
new_image_data.append((image, caption, label, context_list))
return new_image_data

def extract_captions(tex_file, sdir, image_list, primary = True):
def extract_captions(tex_file, sdir, image_list, primary=True):
"""
Take the TeX file and the list of images in the tarball (which all,
presumably, are used in the TeX file) and figure out which captions
Expand Down Expand Up @@ -642,18 +642,18 @@ def extract_captions(tex_file, sdir, image_list, primary = True):
ext = True
else:
ext = False
filenames = intelligently_find_filenames(line, ext = ext,
commas_okay = commas_okay)
filenames = intelligently_find_filenames(line, ext=ext,
commas_okay=commas_okay)

# try to look ahead! sometimes there are better matches after
if line_index < len(lines) - 1:
filenames.extend(\
intelligently_find_filenames(lines[line_index + 1],
commas_okay = commas_okay))
commas_okay=commas_okay))
if line_index < len(lines) - 2:
filenames.extend(\
intelligently_find_filenames(lines[line_index + 2],
commas_okay = commas_okay))
commas_okay=commas_okay))

for filename in filenames:
filename = str(filename)
Expand All @@ -674,14 +674,14 @@ def extract_captions(tex_file, sdir, image_list, primary = True):
if index > -1:
# which is the image associated to it?
filenames = intelligently_find_filenames(line,
commas_okay = commas_okay)
commas_okay=commas_okay)
# try the line after and the line before
if line_index + 1 < len(lines):
filenames.extend(intelligently_find_filenames(lines[line_index + 1],
commas_okay = commas_okay))
commas_okay=commas_okay))
if line_index > 1:
filenames.extend(intelligently_find_filenames(lines[line_index - 1],
commas_okay = commas_okay))
commas_okay=commas_okay))

already_tried = []
for filename in filenames:
Expand Down Expand Up @@ -721,8 +721,8 @@ def extract_captions(tex_file, sdir, image_list, primary = True):
"""
index = line.find(input_head)
if index > -1:
new_tex_names = intelligently_find_filenames(line, TeX = True, \
commas_okay = commas_okay)
new_tex_names = intelligently_find_filenames(line, TeX=True, \
commas_okay=commas_okay)

for new_tex_name in new_tex_names:
if new_tex_name != 'ERROR':
Expand All @@ -731,7 +731,7 @@ def extract_captions(tex_file, sdir, image_list, primary = True):
extracted_image_data.extend(extract_captions(\
new_tex_file, sdir, \
image_list,
primary = False))
primary=False))

"""PICTURE"""

Expand Down Expand Up @@ -1112,7 +1112,7 @@ def put_it_together(cur_image, caption, context, extracted_image_data, line_inde

return (cur_image, caption, extracted_image_data)

def intelligently_find_filenames(line, TeX = False, ext = False, commas_okay = False):
def intelligently_find_filenames(line, TeX=False, ext=False, commas_okay=False):
"""
Find the filename in the line. We don't support all filenames! Just eps
and ps for now.
Expand Down
1 change: 1 addition & 0 deletions modules/miscutil/lib/plotextractor_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,4 @@
## CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT -- when extracting context of plots from
## TeX sources, this is the limit on the number of sentences in each direction. Default 2.
CFG_PLOTEXTRACTOR_CONTEXT_SENTENCE_LIMIT = 2

6 changes: 3 additions & 3 deletions modules/miscutil/lib/plotextractor_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def untar(original_tarball, sdir):
run_shell_command('rm %s', (tarball,))
return ([], [], None)
dummy1, cmd_out, cmd_err = run_process_with_timeout('tar xvf %s -C %s' %
(tarball, sdir), shell = True)
(tarball, sdir), shell=True)

if cmd_err != '':
return ([], [], None)
Expand Down Expand Up @@ -184,7 +184,7 @@ def convert_images(image_list):
try:
dummy1, cmd_out, cmd_err = run_process_with_timeout('convert %s %s'\
% (image_file, \
converted_image_file), shell = True)
converted_image_file), shell=True)
if cmd_err == '':
ret_list.append(converted_image_file)
else:
Expand All @@ -206,7 +206,7 @@ def extract_text(tarball):
try:
os.stat(tarball + '.pdf')
dummy1, dummy2, cmd_err = run_process_with_timeout('pdftotext %s %s' % \
(tarball + '.pdf ', tarball + '.txt'), shell = True)
(tarball + '.pdf ', tarball + '.txt'), shell=True)
if cmd_err != '':
return - 1
write_message('generated ' + tarball + '.txt from ' + tarball + '.pdf')
Expand Down
2 changes: 1 addition & 1 deletion modules/miscutil/lib/plotextractor_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def tarballs_by_recids(recids, sdir):

for recid in list_of_ids:
rec = get_record(recid)
for afieldinstance in record_get_field_instances(rec, tag = '037'):
for afieldinstance in record_get_field_instances(rec, tag='037'):
if 'arXiv' == field_get_subfield_values(afieldinstance, '9')[0]:
arXiv_id = field_get_subfield_values(afieldinstance, 'a')[0]
arXiv_ids.append(arXiv_id)
Expand Down
14 changes: 7 additions & 7 deletions modules/miscutil/lib/plotextractor_output_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
# clean out characters not allowed in MARCXML
# not allowed: & < >
try:
caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash = True)
caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash=True)
except: # that damn encode thing threw an error on astro-ph/0601014
sys.stderr.write(caption)
sys.stderr.write(' cannot be processed\n')
Expand Down Expand Up @@ -332,7 +332,7 @@ def create_MARC(extracted_image_data, tarball, refno):
marcxml.append('</record>')
return '\n'.join(marcxml)

def get_image_location(image, sdir, image_list, recurred = False):
def get_image_location(image, sdir, image_list, recurred=False):
"""
This function takes a raw image name and a directory and returns the location of the
(possibly converted) image
Expand Down Expand Up @@ -446,17 +446,17 @@ def get_image_location(image, sdir, image_list, recurred = False):

# agh, this calls for drastic measures
for piece in image.split(' '):
res = get_image_location(piece, sdir, image_list, recurred = True)
res = get_image_location(piece, sdir, image_list, recurred=True)
if res != None:
return res

for piece in image.split(','):
res = get_image_location(piece, sdir, image_list, recurred = True)
res = get_image_location(piece, sdir, image_list, recurred=True)
if res != None:
return res

for piece in image.split('='):
res = get_image_location(piece, sdir, image_list, recurred = True)
res = get_image_location(piece, sdir, image_list, recurred=True)
if res != None:
return res

Expand Down Expand Up @@ -492,7 +492,7 @@ def get_converted_image_name(image):

return os.path.join(img_dir, converted_image)

def get_tex_location(new_tex_name, current_tex_name, recurred = False):
def get_tex_location(new_tex_name, current_tex_name, recurred=False):
"""
Takes the name of a TeX file and attempts to match it to an actual file
in the tarball.
Expand Down Expand Up @@ -556,6 +556,6 @@ def get_tex_location(new_tex_name, current_tex_name, recurred = False):

if tex_location == None and not recurred:
return get_tex_location(new_tex_name + '.tex', current_tex_name, \
recurred = True)
recurred=True)

return tex_location
10 changes: 5 additions & 5 deletions modules/miscutil/lib/plotextractor_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,28 +227,28 @@ def test_simple_test(self):
"""plotextractor - intelligently_find_filenames simple"""
line = 'file.eps'

filenames = intelligently_find_filenames(line, ext = True)
filenames = intelligently_find_filenames(line, ext=True)
self.assertTrue(filenames == ['file.eps'], 'didn\'t find correct filenames')

def test_ext_test(self):
"""plotextractor - intelligently_find_filenames extension"""
line = 'file.eps file2'

filenames = intelligently_find_filenames(line, ext = True)
filenames = intelligently_find_filenames(line, ext=True)
self.assertTrue(filenames == ['file.eps'], 'didn\'t look for extension')

def test_tex_test(self):
"""plotextractor - intelligently_find_filenames TeX extension"""
line = 'file.eps file2.tex'

filenames = intelligently_find_filenames(line, TeX = True)
filenames = intelligently_find_filenames(line, TeX=True)
self.assertTrue(filenames == ['file.eps', 'file2.tex'], 'not looking for TeX ext')

def test_file_equals_test(self):
"""plotextractor - intelligently_find_filenames equals"""
line = 'file=something.eps'

filenames = intelligently_find_filenames(line, ext = True)
filenames = intelligently_find_filenames(line, ext=True)
self.assertTrue(filenames == ['something.eps', 'file=something.eps'], \
'didn\'t catch file=')

Expand All @@ -264,7 +264,7 @@ def test_lots_of_filenames(self):
"""plotextractor - intelligently_find_filenames lots of filenames"""
line = '[file.pstex]figure=something.eps,haha,anotherthing.ps'

filenames = intelligently_find_filenames(line, ext = True)
filenames = intelligently_find_filenames(line, ext=True)
self.assertTrue('file.pstex' in filenames, 'didn\'t look in brackets')
self.assertTrue('something.eps' in filenames, 'didn\'t find figure=')
self.assertTrue('anotherthing.ps' in filenames, 'didn\'t find filename')
Expand Down

0 comments on commit 8948877

Please sign in to comment.