Skip to content

Commit

Permalink
BibHarvest: strengthen temporary file generation
Browse files Browse the repository at this point in the history
* Refactored the OAI harvester to generate safer and more
  intuitive filenames. (fixes #321)

* Prettified and cleaned up various parts of oai_harvest_getter.

* Fixed an issue causing the fulltext (t) post-processing step to be skipped.
  • Loading branch information
jalavik authored and tiborsimko committed Mar 25, 2011
1 parent 9b93e94 commit ff3ad52
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 77 deletions.
27 changes: 7 additions & 20 deletions modules/bibharvest/lib/oai_harvest_daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,7 @@ def task_run_core():
downloaded_material_dict = {}
harvested_files_list = []
# Harvest phase
harvestpath = filepath_prefix + "_" + str(j) + "_" + \
time.strftime("%Y%m%d%H%M%S") + "_harvested"
harvestpath = "%s_%d_%s_" % (filepath_prefix, j, time.strftime("%Y%m%d%H%M%S"))
if dateflag == 1:
task_update_progress("Harvesting %s from %s to %s (%i/%i)" % \
(reponame, \
Expand Down Expand Up @@ -228,8 +227,7 @@ def task_run_core():
(reponame, \
i, \
len(active_files_list)))
updated_file = filepath_prefix + "_" + str(i) + "_" + \
time.strftime("%Y%m%d%H%M%S") + "_converted"
updated_file = "%s.converted" % (active_file.split('.')[0],)
updated_files_list.append(updated_file)
(exitcode, err_msg) = call_bibconvert(config=str(repos[0][5]),
harvestpath=active_file,
Expand Down Expand Up @@ -260,8 +258,7 @@ def task_run_core():
task_sleep_now_if_required()
task_update_progress("Extracting plots from harvested material from %s (%i/%i)" % \
(reponame, i, len(active_files_list)))
updated_file = filepath_prefix + "_" + str(i) + "_" + \
time.strftime("%Y%m%d%H%M%S") + "_extracted"
updated_file = "%s.plotextracted" % (active_file.split('.')[0],)
updated_files_list.append(updated_file)
(exitcode, err_msg) = call_plotextractor(active_file,
updated_file,
Expand Down Expand Up @@ -295,8 +292,7 @@ def task_run_core():
task_sleep_now_if_required()
task_update_progress("Extracting references from material harvested from %s (%i/%i)" % \
(reponame, i, len(active_files_list)))
updated_file = filepath_prefix + "_" + str(i) + "_" + \
time.strftime("%Y%m%d%H%M%S") + "_refextracted"
updated_file = "%s.refextracted" % (active_file.split('.')[0],)
updated_files_list.append(updated_file)
(exitcode, err_msg) = call_refextract(active_file,
updated_file,
Expand Down Expand Up @@ -332,8 +328,7 @@ def task_run_core():
task_sleep_now_if_required()
task_update_progress("Attaching fulltext to records harvested from %s (%i/%i)" % \
(reponame, i, len(active_files_list)))
updated_file = filepath_prefix + "_" + str(i) + "_" + \
time.strftime("%Y%m%d%H%M%S") + "_fulltext"
updated_file = "%s.fulltext" % (active_file.split('.')[0],)
updated_files_list.append(updated_file)
(exitcode, err_msg) = call_fulltext(active_file,
updated_file,
Expand Down Expand Up @@ -577,16 +572,8 @@ def oai_harvest_get(prefix, baseurl, harvestpath,
if setspecs:
sets = [set.strip() for set in setspecs.split(' ')]

print "Start harvesting"
oai_harvest_getter.harvest(network_location, path, http_param_dict, method, harvestpath,
harvested_files = oai_harvest_getter.harvest(network_location, path, http_param_dict, method, harvestpath,
sets, secure, user, password, cert_file, key_file)

harvest_dir, harvest_filename = os.path.split(harvestpath)
files = os.listdir(harvest_dir)
files.sort()
harvested_files = [harvest_dir + os.sep + filename for \
filename in files \
if filename.startswith(harvest_filename)]
remove_duplicates(harvested_files)
return (1, harvested_files)
except StandardError, e:
Expand Down Expand Up @@ -777,7 +764,7 @@ def call_fulltext(active_file, extracted_file, harvested_identifier_list,
all_err_msg.append(err_msg)
else:
downloaded_files[identifier]["pdf"] = pdf
if current_exitcode != 0:
if current_exitcode == 0:
fulltext_xml = ' <datafield tag="FFT" ind1=" " ind2=" ">\n' + \
' <subfield code="a">' + downloaded_files[identifier]["pdf"] + '</subfield>\n' + \
' <subfield code="t"></subfield>\n' + \
Expand Down
95 changes: 38 additions & 57 deletions modules/bibharvest/lib/oai_harvest_getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
import re
import time
import base64
import tempfile
import os
except ImportError, e:
print "Error: %s" % e
sys.exit(1)
Expand Down Expand Up @@ -82,63 +84,44 @@ def OAI_Session(server, script, http_param_dict , method="POST", output="",
in corresponding filepath, with a unique number appended at the end.
This number starts at 'resume_request_nbr'.
Returns an int corresponding to the last created 'resume_request_nbr'.
Returns a tuple containing an int corresponding to the last created 'resume_request_nbr' and
a list of harvested files.
"""

sys.stderr.write("Starting the harvesting session at %s" %
time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
sys.stderr.write("%s - %s\n" % (server,
http_request_parameters(http_param_dict)))

a = OAI_Request(server, script,
output_path, output_name = os.path.split(output)
harvested_files = []
i = resume_request_nbr
while True:
harvested_data = OAI_Request(server, script,
http_request_parameters(http_param_dict, method), method,
secure, user, password, cert_file, key_file)

rt_obj = re.search('<resumptionToken.*>(.+)</resumptionToken>',
a, re.DOTALL)

i = resume_request_nbr

while rt_obj is not None and rt_obj != "":

if output:
# Write results to a file named 'output'
if a.lower().find('<'+http_param_dict['verb'].lower()) > -1:
write_file( "%s.%07d" % (output, i), a)
# Write results to a file specified by 'output'
if harvested_data.lower().find('<'+http_param_dict['verb'].lower()) > -1:
output_fd, output_filename = tempfile.mkstemp(suffix="_%07d.harvested" % (i,), prefix=output_name, dir=output_path)
os.write(output_fd, harvested_data)
os.close(output_fd)
harvested_files.append(output_filename)
else:
# hmm, were there no records in output? Do not create
# a file and warn user
# No records in output? Do not create a file. Warn the user.
sys.stderr.write("\n<!--\n*** WARNING: NO RECORDS IN THE HARVESTED DATA: "
+ "\n" + repr(a) + "\n***\n-->\n")
+ "\n" + repr(harvested_data) + "\n***\n-->\n")
else:
sys.stdout.write(a)

i = i + 1

time.sleep(1)

http_param_dict = http_param_resume(http_param_dict, rt_obj.group(1))

a = OAI_Request(server, script,
http_request_parameters(http_param_dict, method), method,
secure, user, password, cert_file, key_file)
sys.stdout.write(harvested_data)

rt_obj = re.search('<resumptionToken.*>(.+)</resumptionToken>',
a, re.DOTALL)

if output:
# Write results to a file named 'output'
if a.lower().find('<'+http_param_dict['verb'].lower()) > -1:
write_file("%s.%07d" % (output, i), a)
harvested_data, re.DOTALL)
if rt_obj is not None and rt_obj != "":
http_param_dict = http_param_resume(http_param_dict, rt_obj.group(1))
i = i + 1
else:
# hmm, were there no records in output? Do not create
# a file and warn user
sys.stderr.write("\n<!--\n*** WARNING: NO RECORDS IN THE HARVESTED DATA: "
+ "\n" + repr(a) + "\n***\n-->\n")
else:
sys.stdout.write(a)
break

return i
return i, harvested_files

def harvest(server, script, http_param_dict , method="POST", output="",
sets=None, secure=False, user=None, password=None,
Expand All @@ -149,7 +132,7 @@ def harvest(server, script, http_param_dict , method="POST", output="",
Needed for harvesting multiple sets in one row.
Returns the number of files created by the harvesting
Returns a list of filepaths for harvested files.
Parameters:
Expand Down Expand Up @@ -202,32 +185,27 @@ def harvest(server, script, http_param_dict , method="POST", output="",
(If provided, 'key_file' must also be provided)
"""
if sets:
i = 0
resume_request_nbr = 0
all_harvested_files = []
for set in sets:
http_param_dict['set'] = set
i = OAI_Session(server, script, http_param_dict, method,
output, i, secure, user, password,
resume_request_nbr, harvested_files = OAI_Session(server, script, http_param_dict, method,
output, resume_request_nbr, secure, user, password,
cert_file, key_file)
i += 1
return i
resume_request_nbr += 1
all_harvested_files.extend(harvested_files)
return all_harvested_files
else:
OAI_Session(server, script, http_param_dict, method,
dummy, harvested_files = OAI_Session(server, script, http_param_dict, method,
output, secure=secure, user=user,
password=password, cert_file=cert_file,
key_file=key_file)
return 1

def write_file(filename="harvest", a=""):
    """Write the string 'a' to the file 'filename'.

    The file is opened in "w" mode, so it is created if missing and
    truncated if it already exists.

    Parameters:

        filename - *str* path of the file to write

               a - *str* content to write to the file

    Raises whatever open() or write() raise (e.g. IOError); the file
    handle is closed in every case.
    """
    f = open(filename, "w")
    try:
        f.write(a)
    finally:
        # Close the handle even when write() fails, so that a failed
        # harvest does not leak an open file descriptor.
        f.close()
return harvested_files

def OAI_Request(server, script, params, method="POST", secure=False,
user=None, password=None,
key_file=None, cert_file=None):
"""Handle OAI request
"""Handle OAI request. Returns harvested data.
Parameters:
Expand Down Expand Up @@ -260,6 +238,9 @@ def OAI_Request(server, script, params, method="POST", secure=False,
key in case the server to harvest requires
certificate-based authentication
(If provided, 'key_file' must also be provided)
Return:
Returns harvested data if harvest is successful.
"""

headers = {"Content-type":"application/x-www-form-urlencoded",
Expand Down

0 comments on commit ff3ad52

Please sign in to comment.