diff --git a/modules/bibmatch/doc/admin/bibmatch-admin-guide.webdoc b/modules/bibmatch/doc/admin/bibmatch-admin-guide.webdoc
index 2b1fc61de..40b5a7900 100644
--- a/modules/bibmatch/doc/admin/bibmatch-admin-guide.webdoc
+++ b/modules/bibmatch/doc/admin/bibmatch-admin-guide.webdoc
@@ -176,7 +176,15 @@ To match records in specific collection(s):
- $ bibmatch -x 'Books,Articles' < input.xml
+ $ bibmatch --collection 'Books,Articles' < input.xml
+
+
+
+To match records in restricted collection(s):
+
+
+
+ $ bibmatch --collection 'Theses' --user admin < input.xml
@@ -217,12 +225,6 @@ Command line options:
Advanced options:
- -c --config=filename load querystrings from a config file. Each line starting with QRYSTR will
- be added as a query. i.e. QRYSTR --- [title] [author]
-
- -x --collection only perform queries in certain collection(s).
- Note: matching against restricted collections does not work.
-
-m --mode=(a|e|o|p|r) perform an advanced search using special search mode.
Where mode is:
"a" all of the words,
@@ -235,5 +237,14 @@ Command line options:
Where operator is:
"a" boolean AND (default)
"o" boolean OR
+
+ -c --config=filename load querystrings from a config file. Each line starting with QRYSTR will
+ be added as a query. i.e. QRYSTR --- [title] [author]
+
+ -x --collection only perform queries in certain collection(s).
+ Note: matching against restricted collections requires authentication.
+
+ --user=USERNAME username to use when connecting to the Invenio instance. Useful when searching
+ restricted collections; requires a secure (HTTPS) server URL. You will be prompted for the password.
diff --git a/modules/bibmatch/lib/bibmatch_engine.py b/modules/bibmatch/lib/bibmatch_engine.py
index ec900c257..d2284be18 100644
--- a/modules/bibmatch/lib/bibmatch_engine.py
+++ b/modules/bibmatch/lib/bibmatch_engine.py
@@ -29,16 +29,18 @@
import os
import getopt
import re
+import getpass
from tempfile import mkstemp
from time import sleep
-from invenio.config import CFG_SITE_URL, CFG_BIBMATCH_FUZZY_WORDLIMITS, \
+from invenio.config import CFG_SITE_SECURE_URL, CFG_BIBMATCH_FUZZY_WORDLIMITS, \
CFG_BIBMATCH_QUERY_TEMPLATES, \
CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT, \
CFG_BIBMATCH_LOCAL_SLEEPTIME, \
CFG_BIBMATCH_REMOTE_SLEEPTIME, \
CFG_SITE_RECORD
-from invenio.invenio_connector import InvenioConnector
+from invenio.invenio_connector import InvenioConnector, \
+ InvenioConnectorAuthError
from invenio.bibrecord import create_records, \
record_get_field_values, record_xml_output, record_modify_controlfield, \
record_has_field, record_add_field
@@ -99,12 +101,6 @@ def usage():
Advanced options:
- -c --config=filename load querystrings from a config file. Each line starting with QRYSTR will
- be added as a query. i.e. QRYSTR --- [title] [author]
-
- -x --collection only perform queries in certain collection(s).
- Note: matching against restricted collections does not work.
-
-m --mode=(a|e|o|p|r) perform an advanced search using special search mode.
Where mode is:
"a" all of the words,
@@ -118,6 +114,15 @@ def usage():
"a" boolean AND (default)
"o" boolean OR
+ -c --config=filename load querystrings from a config file. Each line starting with QRYSTR will
+ be added as a query. i.e. QRYSTR --- [title] [author]
+
+ -x --collection only perform queries in certain collection(s).
+ Note: matching against restricted collections requires authentication.
+
+ --user=USERNAME username to use when connecting to Invenio instance. Useful when searching
+ restricted collections. You will be prompted for password.
+
QUERYSTRINGS
Querystrings determine which type of query/strategy to use when searching for the
matching records in the database.
@@ -193,7 +198,8 @@ def usage():
$ bibmatch --print-ambiguous -q title-author < input.xml > ambigmatched.xml
$ bibmatch -q "980:Thesis 773__p:\"[773__p]\" 100__a:[100__a]" -r "http://inspirebeta.net" < input.xml
- $ bibmatch -x 'Books,Articles' < input.xml
+ $ bibmatch --collection 'Books,Articles' < input.xml
+ $ bibmatch --collection 'Theses' --user admin < input.xml
""" % (sys.argv[0],)
sys.exit(1)
@@ -498,8 +504,8 @@ def match_result_output(recID_list, server_url, query, matchmode="no match"):
return "\n".join(result)
def match_records(records, qrystrs=None, search_mode=None, operator="and", verbose=1, \
- server_url=CFG_SITE_URL, modify=0, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \
- clean=False, collections=[]):
+ server_url=CFG_SITE_SECURE_URL, modify=0, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \
+ clean=False, collections=[], user="", password=""):
"""
Match passed records with existing records on a local or remote Invenio
installation. Returns which records are new (no match), which are matched,
@@ -539,17 +545,28 @@ def match_records(records, qrystrs=None, search_mode=None, operator="and", verbo
@param collections: list of collections to search, if specified
@type collections: list
+ @param user: username in case of authenticated search requests
+ @type user: string
+
+ @param password: password in case of authenticated search requests
+ @type password: string
+
@rtype: list of lists
@return an array of arrays of records, like this [newrecs,matchedrecs,
ambiguousrecs,fuzzyrecs]
"""
- server = InvenioConnector(server_url)
-
newrecs = []
matchedrecs = []
ambiguousrecs = []
fuzzyrecs = []
+ try:
+ server = InvenioConnector(server_url, user=user, password=password)
+ except InvenioConnectorAuthError as error:
+ if verbose > 0:
+ sys.stderr.write(str(error))
+ return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs]
+
## Go through each record and try to find matches using defined querystrings
record_counter = 0
querystring = Querystring(operator, clean=clean)
@@ -587,7 +604,12 @@ def match_records(records, qrystrs=None, search_mode=None, operator="and", verbo
search_params = dict(p=query, f=field, of='id', c=collections)
## Perform the search with retries
- result_recids = server.search_with_retry(**search_params)
+ try:
+ result_recids = server.search_with_retry(**search_params)
+ except InvenioConnectorAuthError as error:
+ if verbose > 0:
+ sys.stderr.write(str(error))
+ break
if (verbose > 8):
if len(result_recids) > 10:
sys.stderr.write("\nSearching with values %s result=%s\n" %
@@ -638,7 +660,12 @@ def match_records(records, qrystrs=None, search_mode=None, operator="and", verbo
for current_operator, qry in fuzzy_query_list:
current_resultset = None
search_params = dict(p=qry, f=field, of='id', c=collections)
- current_resultset = server.search_with_retry(**search_params)
+ try:
+ current_resultset = server.search_with_retry(**search_params)
+ except InvenioConnectorAuthError as error:
+ if (verbose > 0):
+ sys.stderr.write(str(error))
+ break
if (verbose > 8):
if len(current_resultset) > 10:
sys.stderr.write("\nSearching with values %s result=%s\n" %
@@ -662,23 +689,23 @@ def match_records(records, qrystrs=None, search_mode=None, operator="and", verbo
result_hitset = list(set(result_hitset) - set(current_resultset))
elif current_operator == '|':
result_hitset = list(set(result_hitset) | set(current_resultset))
-
- if result_hitset and len(result_hitset) < 10:
- # This was a fuzzy match
- query_out = " #Fuzzy# ".join([q for dummy, q in fuzzy_query_list])
- if len(result_hitset) == 1 and complete:
- if modify:
- add_recid(rec[0], result_hitset[0])
- fuzzy_results.append((result_hitset, query_out))
- if (verbose > 8):
- sys.stderr.write("Fuzzy: %s\n" % (result_hitset,))
- else:
- # We treat the result as ambiguous (uncertain) when:
- # - query is not complete
- # - more then one result
- ambiguous_results.append((result_hitset, query_out))
- if (verbose > 8):
- sys.stderr.write("Ambiguous\n")
+ else:
+ if result_hitset and len(result_hitset) < 10:
+ # This was a fuzzy match
+ query_out = " #Fuzzy# ".join([q for dummy, q in fuzzy_query_list])
+ if len(result_hitset) == 1 and complete:
+ if modify:
+ add_recid(rec[0], result_hitset[0])
+ fuzzy_results.append((result_hitset, query_out))
+ if (verbose > 8):
+ sys.stderr.write("Fuzzy: %s\n" % (result_hitset,))
+ else:
+ # We treat the result as ambiguous (uncertain) when:
+ # - query is not complete
+ # - more than one result
+ ambiguous_results.append((result_hitset, query_out))
+ if (verbose > 8):
+ sys.stderr.write("Ambiguous\n")
## Evaluate final results for record
# Add matched record iff number found is equal to one, otherwise return fuzzy, ambiguous or no match
@@ -766,7 +793,8 @@ def main():
"text-marc-output",
"alter-recid",
"clean",
- "collection="
+ "collection=",
+ "user="
])
except getopt.GetoptError, e:
@@ -781,7 +809,7 @@ def main():
records = []
batch_output = "" # print stuff in files
f_input = "" # read from where, if param "i"
- server_url = CFG_SITE_URL # url to server performing search, local by default
+ server_url = CFG_SITE_SECURE_URL # url to server performing search, local by default
modify = 0 # alter output with matched record identifiers
textmarc_output = 0 # output in MARC instead of MARCXML
field = ""
@@ -789,6 +817,8 @@ def main():
sleeptime = CFG_BIBMATCH_LOCAL_SLEEPTIME # the amount of time to sleep between queries, changes on remote queries
clean = False # should queries be sanitized?
collections = [] # only search certain collections?
+ user = ""
+ password = ""
for opt, opt_value in opts:
if opt in ["-0", "--print-new"]:
@@ -847,10 +877,12 @@ def main():
qrystrs.append((field, tmp[1]))
if opt in ["-x", "--collection"]:
colls = opt_value.split(',')
- print opt_value
for collection in colls:
if collection not in collections:
collections.append(collection)
+ if opt in ["--user"]:
+ user = opt_value
+ password = getpass.getpass()
if verbose:
sys.stderr.write("\nBibMatch: Parsing input file %s..." % (f_input,))
@@ -891,7 +923,9 @@ def main():
modify,
sleeptime,
clean,
- collections)
+ collections,
+ user,
+ password)
# set the output according to print..
# 0-newrecs 1-matchedrecs 2-ambiguousrecs 3-fuzzyrecs
diff --git a/modules/bibmatch/lib/bibmatch_regression_tests.py b/modules/bibmatch/lib/bibmatch_regression_tests.py
index bbf8e66d3..8cb9d20f6 100644
--- a/modules/bibmatch/lib/bibmatch_regression_tests.py
+++ b/modules/bibmatch/lib/bibmatch_regression_tests.py
@@ -413,6 +413,71 @@ def setUp(self):
""" % CFG_SITE_RECORD
+ # Restricted record in thesis collection
+ self.recxml5 = """
+
+
+
+ 42
+
+ eng
+
+
+ LBL-28106
+
+
+ Bertsche, K J
+ Calif. Univ. Berkeley
+
+
+ A small low energy cyclotron for radioisotope measurements
+
+
+ Berkeley, CA
+ Lawrence Berkeley Nat. Lab.
+ Nov 1989
+
+
+ 155 p
+
+
+ Thesis : Calif. Univ. Berkeley
+
+
+ SzGeCERN
+ Accelerators and Storage Rings
+
+
+ bibliography
+
+
+ REPORT
+
+
+ THESIS
+
+
+ 14
+
+
+ 1989
+
+
+ 1990-02-28
+ 50
+ 2002-03-22
+ BATCH
+
+
+ h
+ 199010n
+
+
+ THESIS
+
+
+
+ """
return
@@ -431,7 +496,8 @@ def test_check_new(self):
def test_check_ambiguous(self):
"""bibmatch - check an ambiguous record"""
records = create_records(self.recxml1)
- [dummy1, dummy2, ambigrecs, dummy3] = match_records(records, qrystrs=[("", "[100__a]")])
+ [dummy1, dummy2, ambigrecs, dummy3] = match_records(records, \
+ qrystrs=[("", "[100__a]")])
self.assertEqual(1, len(ambigrecs))
def test_check_fuzzy(self):
@@ -443,14 +509,16 @@ def test_check_fuzzy(self):
def test_check_remote(self):
"""bibmatch - check remote match (Invenio demo site)"""
records = create_records(self.recxml3)
- [dummy1, matchedrecs, dummy3, dummy4] = match_records(records, server_url="http://invenio-demo.cern.ch")
+ [dummy1, matchedrecs, dummy3, dummy4] = match_records(records, \
+ server_url="http://invenio-demo.cern.ch")
self.assertEqual(1, len(matchedrecs))
def test_check_textmarc(self):
"""bibmatch - check textmarc as input"""
marcxml = transform_input_to_marcxml("", self.textmarc)
records = create_records(marcxml)
- [dummy1, matchedrecs, dummy3, dummy4] = match_records(records, server_url="http://invenio-demo.cern.ch")
+ [dummy1, matchedrecs, dummy3, dummy4] = match_records(records, \
+ server_url="http://invenio-demo.cern.ch")
self.assertEqual(2, len(matchedrecs))
def test_check_altered(self):
@@ -474,17 +542,60 @@ def test_check_qrystr(self):
def test_check_completeness(self):
"""bibmatch - check query completeness"""
records = create_records(self.recxml4)
- [dummy1, dummy2, ambigrecs, dummy3] = match_records(records, qrystrs=[("", "[088__a] [035__a]")])
+ [dummy1, dummy2, ambigrecs, dummy3] = match_records(records, \
+ qrystrs=[("", "[088__a] [035__a]")])
self.assertEqual(1, len(ambigrecs))
def test_check_collection(self):
"""bibmatch - check collection"""
records = create_records(self.recxml3)
- [nomatchrecs, dummy1, dummy2, dummy3] = match_records(records, collections=["Articles"])
+ [nomatchrecs, dummy1, dummy2, dummy3] = match_records(records, \
+ collections=["Articles"])
self.assertEqual(1, len(nomatchrecs))
- [dummy1, matchedrecs, dummy2, dummy3] = match_records(records, collections=["Books"])
+ [dummy1, matchedrecs, dummy2, dummy3] = match_records(records, \
+ collections=["Books"])
self.assertEqual(1, len(matchedrecs))
+    def test_restricted_collections_local(self):
+        """bibmatch - check restricted collections local search"""
+        records = create_records(self.recxml5)
+        # Jekyll should have access: the thesis record is matched
+        [dummy1, matchedrecs, dummy2, dummy3] = match_records(records,
+                                            qrystrs=[("", "[088__a]")],
+                                            collections=["Theses"],
+                                            user="jekyll",
+                                            password="j123ekyll")
+        self.assertEqual(1, len(matchedrecs))
+        # Hyde should not have access: assert on THIS call's "new" list, not the stale jekyll result
+        [nomatchrecs, dummy1, dummy2, dummy3] = match_records(records,
+                                            qrystrs=[("", "[088__a]")],
+                                            collections=["Theses"],
+                                            user="hyde",
+                                            password="h123yde",
+                                            verbose=0)
+        self.assertEqual(1, len(nomatchrecs))
+
+    def test_restricted_collections_remote(self):
+        """bibmatch - check restricted collections remote search"""
+        thesis_records = create_records(self.recxml5)
+        demo_site = "https://invenio-demo.cern.ch"
+
+        # An authorized user (jekyll) must get exactly one matched record
+        new, matched, ambiguous, fuzzy = match_records(
+            thesis_records,
+            qrystrs=[("", "[088__a]")],
+            collections=["Theses"],
+            server_url=demo_site,
+            user="jekyll",
+            password="j123ekyll")
+        self.assertEqual(1, len(matched))
+
+        # An unauthorized user (hyde) must get the record back as new (no match)
+        new, matched, ambiguous, fuzzy = match_records(
+            thesis_records,
+            qrystrs=[("", "[088__a]")],
+            collections=["Theses"],
+            server_url=demo_site,
+            user="hyde",
+            password="h123yde",
+            verbose=0)
+        self.assertEqual(1, len(new))
+
TEST_SUITE = make_test_suite(BibMatchTest)
if __name__ == "__main__":
diff --git a/modules/miscutil/lib/invenio_connector.py b/modules/miscutil/lib/invenio_connector.py
index 908d259fe..720ca3e75 100644
--- a/modules/miscutil/lib/invenio_connector.py
+++ b/modules/miscutil/lib/invenio_connector.py
@@ -52,23 +52,44 @@
try:
# if we are running locally, we can optimize :-)
- from invenio.config import CFG_SITE_URL, CFG_SITE_RECORD, CFG_CERN_SITE
+ from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE
from invenio.bibtask import task_low_level_submission
from invenio.search_engine import perform_request_search, collection_restricted_p
from invenio.bibformat import format_records
- LOCAL_SITE_URL = CFG_SITE_URL
+ LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL]
except ImportError:
- LOCAL_SITE_URL = None
+ LOCAL_SITE_URLS = None
CFG_CERN_SITE = 0
CFG_CDS_URL = "http://cdsweb.cern.ch/"
+class InvenioConnectorAuthError(Exception):
+    """
+    This exception is raised by InvenioConnector when authentication fails during
+    remote or local connections.
+    """
+    def __init__(self, value):
+        """
+        Keep the message in self.value and hand it to Exception so args/repr carry it too.
+        @param value: an error string to display to the user.
+        @type value: string
+        """
+        Exception.__init__(self, value)
+        self.value = value
+    def __str__(self):
+        """
+        Return the error message (the contents of self.value) as a string.
+        @return: representation of error
+        @rtype: string
+        """
+        return str(self.value)
+
class InvenioConnector(object):
"""
Creates an connector to a server running Invenio
"""
- def __init__(self, url=LOCAL_SITE_URL, user="", password="", login_method=None, local_import_path="invenio"):
+ def __init__(self, url=LOCAL_SITE_URLS, user="", password="", login_method="Local", local_import_path="invenio"):
"""
Initialize a new instance of the server at given URL.
@@ -93,7 +114,7 @@ def __init__(self, url=LOCAL_SITE_URL, user="", password="", login_method=None,
@type local_import_path: string
"""
self.server_url = url
- self.local = self.server_url == LOCAL_SITE_URL
+ self.local = self.server_url in LOCAL_SITE_URLS
self.cached_queries = {}
self.cached_records = {}
self.cached_baskets = {}
@@ -102,6 +123,8 @@ def __init__(self, url=LOCAL_SITE_URL, user="", password="", login_method=None,
self.login_method = login_method
self.browser = None
if self.user:
+ if not self.server_url.startswith('https://'):
+ raise InvenioConnectorAuthError("You have to use a secure URL (HTTPS) to login")
self._init_browser()
self._check_credentials()
@@ -116,12 +139,14 @@ def _init_browser(self):
self.browser.select_form(nr=0)
self.browser['p_un'] = self.user
self.browser['p_pw'] = self.password
+ # Set login_method to be writable
+ self.browser.form.find_control('login_method').readonly = False
self.browser['login_method'] = self.login_method
self.browser.submit()
def _check_credentials(self):
if not 'youraccount/logout' in self.browser.response().read():
- raise ValueError("It was not possible to successfully login with the provided credentials")
+ raise InvenioConnectorAuthError("It was not possible to successfully login with the provided credentials")
def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="",
rm="", of="", ot="", p1="", f1="", m1="", op1="",
@@ -131,6 +156,8 @@ def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="",
read_cache=True):
"""
Returns records corresponding to the given search query.
+
+ @raise InvenioConnectorAuthError: if authentication fails
"""
parse_results = False
if of == "":
@@ -155,8 +182,7 @@ def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="",
# Are we running locally? If so, better directly access the
# search engine directly
- if LOCAL_SITE_URL == self.server_url and \
- of != 't':
+ if self.server_url in LOCAL_SITE_URLS and of != 't':
# See if user tries to search any restricted collection
if c != "":
if type(c) is list:
@@ -165,9 +191,10 @@ def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="",
colls = [c]
for collection in colls:
if collection_restricted_p(collection):
- sys.stderr.write("Searching local restricted collections\
- is NOT allowed. Aborting search.\n")
- return []
+ if self.user:
+ self._check_credentials()
+ continue
+ raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n")
results = perform_request_search(p=p, f=f, c=c, rg=rg, sf=sf, so=so, sp=so, rm=rm,
p1=p1, f1=f1, m1=m1, op1=op1,
p2=p2, f2=f2, m2=m2, op2=op2,
@@ -183,6 +210,9 @@ def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="",
results = self.browser.open(self.server_url + "/search?" + params)
else:
results = urllib2.urlopen(self.server_url + "/search?" + params)
+ if 'youraccount/login' in results.geturl():
+ # Current user not able to search collection
+ raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n")
else:
return self.cached_queries[params + str(parse_results)]
@@ -305,7 +335,7 @@ def upload_marcxml(self, marcxml, mode):
raise NameError, "Incorrect mode " + str(mode)
# Are we running locally? If so, submit directly
- if LOCAL_SITE_URL == self.server_url:
+ if self.server_url in LOCAL_SITE_URLS:
(code, marcxml_filepath) = tempfile.mkstemp(prefix="upload_%s" % \
time.strftime("%Y%m%d_%H%M%S_",
time.localtime()))
diff --git a/modules/miscutil/lib/invenio_connector_regression_tests.py b/modules/miscutil/lib/invenio_connector_regression_tests.py
index fa04659af..059ca9f04 100644
--- a/modules/miscutil/lib/invenio_connector_regression_tests.py
+++ b/modules/miscutil/lib/invenio_connector_regression_tests.py
@@ -24,8 +24,9 @@
import os
import unittest
-from invenio.invenio_connector import InvenioConnector
-from invenio.config import CFG_SITE_URL
+from invenio.invenio_connector import InvenioConnector, \
+ InvenioConnectorAuthError
+from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL
from invenio.testutils import make_test_suite, run_test_suite
class InvenioConnectorTest(unittest.TestCase):
@@ -52,6 +53,28 @@ def test_search_collections(self):
self.assertTrue(len(result) > 0, \
'did not get collection search results.')
+    def test_search_local_restricted_collections(self):
+        """InvenioConnector - local restricted collection search"""
+        # Without credentials, searching a restricted collection must be refused
+        anonymous = InvenioConnector(CFG_SITE_URL)
+        restricted_query = dict(p='LBL-28106', c=['Theses'], of='id')
+        self.assertRaises(InvenioConnectorAuthError,
+                          anonymous.search, **restricted_query)
+
+        # With credentials over the secure URL, the same search returns hits
+        authenticated = InvenioConnector(CFG_SITE_SECURE_URL,
+                                         user='admin', password='')
+        hits = authenticated.search(p='LBL-28106', c=['Theses'], of='id')
+        self.assertTrue(len(hits) > 0,
+                        'did not get restricted collection search results.')
+
+    def test_search_remote_restricted_collections(self):
+        """InvenioConnector - remote restricted collection search"""
+        # Without credentials, the remote restricted search must be refused
+        anonymous = InvenioConnector("http://invenio-demo.cern.ch")
+        restricted_query = dict(p='LBL-28106', c=['Theses'], of='id')
+        self.assertRaises(InvenioConnectorAuthError,
+                          anonymous.search, **restricted_query)
+
+        # With jekyll's credentials over HTTPS, the same search returns hits
+        authenticated = InvenioConnector("https://invenio-demo.cern.ch",
+                                         user='jekyll', password='j123ekyll')
+        hits = authenticated.search(p='LBL-28106', c=['Theses'], of='id')
+        self.assertTrue(len(hits) > 0,
+                        'did not get restricted collection search results.')
+
TEST_SUITE = make_test_suite(InvenioConnectorTest)
if __name__ == "__main__":