diff --git a/modules/bibmatch/doc/admin/bibmatch-admin-guide.webdoc b/modules/bibmatch/doc/admin/bibmatch-admin-guide.webdoc index 2b1fc61de..40b5a7900 100644 --- a/modules/bibmatch/doc/admin/bibmatch-admin-guide.webdoc +++ b/modules/bibmatch/doc/admin/bibmatch-admin-guide.webdoc @@ -176,7 +176,15 @@ To match records in specific collection(s):
- $ bibmatch -x 'Books,Articles' < input.xml
+ $ bibmatch --collection 'Books,Articles' < input.xml
+
+
+ +To match records in restricted collection(s): + +
+
+ $ bibmatch --collection 'Theses' --user admin < input.xml
 
@@ -217,12 +225,6 @@ Command line options: Advanced options: - -c --config=filename load querystrings from a config file. Each line starting with QRYSTR will - be added as a query. i.e. QRYSTR --- [title] [author] - - -x --collection only perform queries in certain collection(s). - Note: matching against restricted collections does not work. - -m --mode=(a|e|o|p|r) perform an advanced search using special search mode. Where mode is: "a" all of the words, @@ -235,5 +237,14 @@ Command line options: Where operator is: "a" boolean AND (default) "o" boolean OR + + -c --config=filename load querystrings from a config file. Each line starting with QRYSTR will + be added as a query. e.g. QRYSTR --- [title] [author] + + -x --collection only perform queries in certain collection(s). + Note: matching against restricted collections requires authentication. + + --user=USERNAME username to use when connecting to an Invenio instance. Useful when searching + restricted collections. You will be prompted for the password. 
diff --git a/modules/bibmatch/lib/bibmatch_engine.py b/modules/bibmatch/lib/bibmatch_engine.py index ec900c257..d2284be18 100644 --- a/modules/bibmatch/lib/bibmatch_engine.py +++ b/modules/bibmatch/lib/bibmatch_engine.py @@ -29,16 +29,18 @@ import os import getopt import re +import getpass from tempfile import mkstemp from time import sleep -from invenio.config import CFG_SITE_URL, CFG_BIBMATCH_FUZZY_WORDLIMITS, \ +from invenio.config import CFG_SITE_SECURE_URL, CFG_BIBMATCH_FUZZY_WORDLIMITS, \ CFG_BIBMATCH_QUERY_TEMPLATES, \ CFG_BIBMATCH_FUZZY_EMPTY_RESULT_LIMIT, \ CFG_BIBMATCH_LOCAL_SLEEPTIME, \ CFG_BIBMATCH_REMOTE_SLEEPTIME, \ CFG_SITE_RECORD -from invenio.invenio_connector import InvenioConnector +from invenio.invenio_connector import InvenioConnector, \ + InvenioConnectorAuthError from invenio.bibrecord import create_records, \ record_get_field_values, record_xml_output, record_modify_controlfield, \ record_has_field, record_add_field @@ -99,12 +101,6 @@ def usage(): Advanced options: - -c --config=filename load querystrings from a config file. Each line starting with QRYSTR will - be added as a query. i.e. QRYSTR --- [title] [author] - - -x --collection only perform queries in certain collection(s). - Note: matching against restricted collections does not work. - -m --mode=(a|e|o|p|r) perform an advanced search using special search mode. Where mode is: "a" all of the words, @@ -118,6 +114,15 @@ def usage(): "a" boolean AND (default) "o" boolean OR + -c --config=filename load querystrings from a config file. Each line starting with QRYSTR will + be added as a query. i.e. QRYSTR --- [title] [author] + + -x --collection only perform queries in certain collection(s). + Note: matching against restricted collections requires authentication. + + --user=USERNAME username to use when connecting to Invenio instance. Useful when searching + restricted collections. You will be prompted for password. 
+ QUERYSTRINGS Querystrings determine which type of query/strategy to use when searching for the matching records in the database. @@ -193,7 +198,8 @@ def usage(): $ bibmatch --print-ambiguous -q title-author < input.xml > ambigmatched.xml $ bibmatch -q "980:Thesis 773__p:\"[773__p]\" 100__a:[100__a]" -r "http://inspirebeta.net" < input.xml - $ bibmatch -x 'Books,Articles' < input.xml + $ bibmatch --collection 'Books,Articles' < input.xml + $ bibmatch --collection 'Theses' --user admin < input.xml """ % (sys.argv[0],) sys.exit(1) @@ -498,8 +504,8 @@ def match_result_output(recID_list, server_url, query, matchmode="no match"): return "\n".join(result) def match_records(records, qrystrs=None, search_mode=None, operator="and", verbose=1, \ - server_url=CFG_SITE_URL, modify=0, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \ - clean=False, collections=[]): + server_url=CFG_SITE_SECURE_URL, modify=0, sleeptime=CFG_BIBMATCH_LOCAL_SLEEPTIME, \ + clean=False, collections=[], user="", password=""): """ Match passed records with existing records on a local or remote Invenio installation. 
Returns which records are new (no match), which are matched, @@ -539,17 +545,28 @@ def match_records(records, qrystrs=None, search_mode=None, operator="and", verbo @param collections: list of collections to search, if specified @type collections: list + @param user: username in case of authenticated search requests + @type user: string + + @param password: password in case of authenticated search requests + @type password: string + @rtype: list of lists @return an array of arrays of records, like this [newrecs,matchedrecs, ambiguousrecs,fuzzyrecs] """ - server = InvenioConnector(server_url) - newrecs = [] matchedrecs = [] ambiguousrecs = [] fuzzyrecs = [] + try: + server = InvenioConnector(server_url, user=user, password=password) + except InvenioConnectorAuthError as error: + if verbose > 0: + sys.stderr.write(str(error)) + return [newrecs, matchedrecs, ambiguousrecs, fuzzyrecs] + ## Go through each record and try to find matches using defined querystrings record_counter = 0 querystring = Querystring(operator, clean=clean) @@ -587,7 +604,12 @@ def match_records(records, qrystrs=None, search_mode=None, operator="and", verbo search_params = dict(p=query, f=field, of='id', c=collections) ## Perform the search with retries - result_recids = server.search_with_retry(**search_params) + try: + result_recids = server.search_with_retry(**search_params) + except InvenioConnectorAuthError as error: + if verbose > 0: + sys.stderr.write(str(error)) + break if (verbose > 8): if len(result_recids) > 10: sys.stderr.write("\nSearching with values %s result=%s\n" % @@ -638,7 +660,12 @@ def match_records(records, qrystrs=None, search_mode=None, operator="and", verbo for current_operator, qry in fuzzy_query_list: current_resultset = None search_params = dict(p=qry, f=field, of='id', c=collections) - current_resultset = server.search_with_retry(**search_params) + try: + current_resultset = server.search_with_retry(**search_params) + except InvenioConnectorAuthError as error: + if 
(verbose > 0): + sys.stderr.write(str(error)) + break if (verbose > 8): if len(current_resultset) > 10: sys.stderr.write("\nSearching with values %s result=%s\n" % @@ -662,23 +689,23 @@ def match_records(records, qrystrs=None, search_mode=None, operator="and", verbo result_hitset = list(set(result_hitset) - set(current_resultset)) elif current_operator == '|': result_hitset = list(set(result_hitset) | set(current_resultset)) - - if result_hitset and len(result_hitset) < 10: - # This was a fuzzy match - query_out = " #Fuzzy# ".join([q for dummy, q in fuzzy_query_list]) - if len(result_hitset) == 1 and complete: - if modify: - add_recid(rec[0], result_hitset[0]) - fuzzy_results.append((result_hitset, query_out)) - if (verbose > 8): - sys.stderr.write("Fuzzy: %s\n" % (result_hitset,)) - else: - # We treat the result as ambiguous (uncertain) when: - # - query is not complete - # - more then one result - ambiguous_results.append((result_hitset, query_out)) - if (verbose > 8): - sys.stderr.write("Ambiguous\n") + else: + if result_hitset and len(result_hitset) < 10: + # This was a fuzzy match + query_out = " #Fuzzy# ".join([q for dummy, q in fuzzy_query_list]) + if len(result_hitset) == 1 and complete: + if modify: + add_recid(rec[0], result_hitset[0]) + fuzzy_results.append((result_hitset, query_out)) + if (verbose > 8): + sys.stderr.write("Fuzzy: %s\n" % (result_hitset,)) + else: + # We treat the result as ambiguous (uncertain) when: + # - query is not complete + # - more than one result + ambiguous_results.append((result_hitset, query_out)) + if (verbose > 8): + sys.stderr.write("Ambiguous\n") ## Evaluate final results for record # Add matched record iff number found is equal to one, otherwise return fuzzy, ambiguous or no match @@ -766,7 +793,8 @@ def main(): "text-marc-output", "alter-recid", "clean", - "collection=" + "collection=", + "user=" ]) except getopt.GetoptError, e: @@ -781,7 +809,7 @@ def main(): records = [] batch_output = "" # print stuff in files 
f_input = "" # read from where, if param "i" - server_url = CFG_SITE_URL # url to server performing search, local by default + server_url = CFG_SITE_SECURE_URL # url to server performing search, local by default modify = 0 # alter output with matched record identifiers textmarc_output = 0 # output in MARC instead of MARCXML field = "" @@ -789,6 +817,8 @@ def main(): sleeptime = CFG_BIBMATCH_LOCAL_SLEEPTIME # the amount of time to sleep between queries, changes on remote queries clean = False # should queries be sanitized? collections = [] # only search certain collections? + user = "" + password = "" for opt, opt_value in opts: if opt in ["-0", "--print-new"]: @@ -847,10 +877,12 @@ def main(): qrystrs.append((field, tmp[1])) if opt in ["-x", "--collection"]: colls = opt_value.split(',') - print opt_value for collection in colls: if collection not in collections: collections.append(collection) + if opt in ["--user"]: + user = opt_value + password = getpass.getpass() if verbose: sys.stderr.write("\nBibMatch: Parsing input file %s..." % (f_input,)) @@ -891,7 +923,9 @@ def main(): modify, sleeptime, clean, - collections) + collections, + user, + password) # set the output according to print.. # 0-newrecs 1-matchedrecs 2-ambiguousrecs 3-fuzzyrecs diff --git a/modules/bibmatch/lib/bibmatch_regression_tests.py b/modules/bibmatch/lib/bibmatch_regression_tests.py index bbf8e66d3..8cb9d20f6 100644 --- a/modules/bibmatch/lib/bibmatch_regression_tests.py +++ b/modules/bibmatch/lib/bibmatch_regression_tests.py @@ -413,6 +413,71 @@ def setUp(self): """ % CFG_SITE_RECORD + # Restricted record in thesis collection + self.recxml5 = """ + + + + 42 + + eng + + + LBL-28106 + + + Bertsche, K J + Calif. Univ. Berkeley + + + A small low energy cyclotron for radioisotope measurements + + + Berkeley, CA + Lawrence Berkeley Nat. Lab. + Nov 1989 + + + 155 p + + + Thesis : Calif. Univ. 
Berkeley + + + SzGeCERN + Accelerators and Storage Rings + + + bibliography + + + REPORT + + + THESIS + + + 14 + + + 1989 + + + 1990-02-28 + 50 + 2002-03-22 + BATCH + + + h + 199010n + + + THESIS + + + + """ return @@ -431,7 +496,8 @@ def test_check_new(self): def test_check_ambiguous(self): """bibmatch - check an ambiguous record""" records = create_records(self.recxml1) - [dummy1, dummy2, ambigrecs, dummy3] = match_records(records, qrystrs=[("", "[100__a]")]) + [dummy1, dummy2, ambigrecs, dummy3] = match_records(records, \ + qrystrs=[("", "[100__a]")]) self.assertEqual(1, len(ambigrecs)) def test_check_fuzzy(self): @@ -443,14 +509,16 @@ def test_check_fuzzy(self): def test_check_remote(self): """bibmatch - check remote match (Invenio demo site)""" records = create_records(self.recxml3) - [dummy1, matchedrecs, dummy3, dummy4] = match_records(records, server_url="http://invenio-demo.cern.ch") + [dummy1, matchedrecs, dummy3, dummy4] = match_records(records, \ + server_url="http://invenio-demo.cern.ch") self.assertEqual(1, len(matchedrecs)) def test_check_textmarc(self): """bibmatch - check textmarc as input""" marcxml = transform_input_to_marcxml("", self.textmarc) records = create_records(marcxml) - [dummy1, matchedrecs, dummy3, dummy4] = match_records(records, server_url="http://invenio-demo.cern.ch") + [dummy1, matchedrecs, dummy3, dummy4] = match_records(records, \ + server_url="http://invenio-demo.cern.ch") self.assertEqual(2, len(matchedrecs)) def test_check_altered(self): @@ -474,17 +542,60 @@ def test_check_qrystr(self): def test_check_completeness(self): """bibmatch - check query completeness""" records = create_records(self.recxml4) - [dummy1, dummy2, ambigrecs, dummy3] = match_records(records, qrystrs=[("", "[088__a] [035__a]")]) + [dummy1, dummy2, ambigrecs, dummy3] = match_records(records, \ + qrystrs=[("", "[088__a] [035__a]")]) self.assertEqual(1, len(ambigrecs)) def test_check_collection(self): """bibmatch - check collection""" records = 
create_records(self.recxml3) - [nomatchrecs, dummy1, dummy2, dummy3] = match_records(records, collections=["Articles"]) + [nomatchrecs, dummy1, dummy2, dummy3] = match_records(records, \ + collections=["Articles"]) self.assertEqual(1, len(nomatchrecs)) - [dummy1, matchedrecs, dummy2, dummy3] = match_records(records, collections=["Books"]) + [dummy1, matchedrecs, dummy2, dummy3] = match_records(records, \ + collections=["Books"]) self.assertEqual(1, len(matchedrecs)) + def test_restricted_collections_local(self): + """bibmatch - check restricted collections local search""" + records = create_records(self.recxml5) + # Jekyll should have access + [dummy1, matchedrecs, dummy2, dummy3] = match_records(records, \ + qrystrs=[("", "[088__a]")], \ + collections=["Theses"], \ + user="jekyll", + password="j123ekyll") + self.assertEqual(1, len(matchedrecs)) + # Hyde should not have access + [nomatchrecs, dummy1, dummy2, dummy3] = match_records(records, \ + qrystrs=[("", "[088__a]")], \ + collections=["Theses"], \ + user="hyde", \ + password="h123yde", + verbose=0) + self.assertEqual(1, len(matchedrecs)) + + def test_restricted_collections_remote(self): + """bibmatch - check restricted collections remote search""" + records = create_records(self.recxml5) + # Jekyll should have access + [dummy1, matchedrecs, dummy2, dummy3] = match_records(records, \ + qrystrs=[("", "[088__a]")], \ + collections=["Theses"], \ + server_url="https://invenio-demo.cern.ch", \ + user="jekyll", \ + password="j123ekyll") + self.assertEqual(1, len(matchedrecs)) + # Hyde should not have access + [nomatchrecs, dummy1, dummy2, dummy3] = match_records(records, \ + qrystrs=[("", "[088__a]")], \ + collections=["Theses"], \ + server_url="https://invenio-demo.cern.ch", \ + user="hyde", \ + password="h123yde", + verbose=0) + self.assertEqual(1, len(nomatchrecs)) + TEST_SUITE = make_test_suite(BibMatchTest) if __name__ == "__main__": diff --git a/modules/miscutil/lib/invenio_connector.py 
b/modules/miscutil/lib/invenio_connector.py index 908d259fe..720ca3e75 100644 --- a/modules/miscutil/lib/invenio_connector.py +++ b/modules/miscutil/lib/invenio_connector.py @@ -52,23 +52,44 @@ try: # if we are running locally, we can optimize :-) - from invenio.config import CFG_SITE_URL, CFG_SITE_RECORD, CFG_CERN_SITE + from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL, CFG_SITE_RECORD, CFG_CERN_SITE from invenio.bibtask import task_low_level_submission from invenio.search_engine import perform_request_search, collection_restricted_p from invenio.bibformat import format_records - LOCAL_SITE_URL = CFG_SITE_URL + LOCAL_SITE_URLS = [CFG_SITE_URL, CFG_SITE_SECURE_URL] except ImportError: - LOCAL_SITE_URL = None + LOCAL_SITE_URLS = None CFG_CERN_SITE = 0 CFG_CDS_URL = "http://cdsweb.cern.ch/" +class InvenioConnectorAuthError(Exception): + """ + This exception is raised by InvenioConnector when authentication fails during + remote or local connections. + """ + def __init__(self, value): + """ + Set the internal "value" attribute to that of the passed "value" parameter. + @param value: an error string to display to the user. + @type value: string + """ + Exception.__init__(self) + self.value = value + def __str__(self): + """ + Return oneself as a string (actually, return the contents of self.value). + @return: representation of error + @rtype: string + """ + return str(self.value) + class InvenioConnector(object): """ Creates an connector to a server running Invenio """ - def __init__(self, url=LOCAL_SITE_URL, user="", password="", login_method=None, local_import_path="invenio"): + def __init__(self, url=LOCAL_SITE_URLS, user="", password="", login_method="Local", local_import_path="invenio"): """ Initialize a new instance of the server at given URL. 
@@ -93,7 +114,7 @@ def __init__(self, url=LOCAL_SITE_URL, user="", password="", login_method=None, @type local_import_path: string """ self.server_url = url - self.local = self.server_url == LOCAL_SITE_URL + self.local = self.server_url in LOCAL_SITE_URLS self.cached_queries = {} self.cached_records = {} self.cached_baskets = {} @@ -102,6 +123,8 @@ def __init__(self, url=LOCAL_SITE_URL, user="", password="", login_method=None, self.login_method = login_method self.browser = None if self.user: + if not self.server_url.startswith('https://'): + raise InvenioConnectorAuthError("You have to use a secure URL (HTTPS) to login") self._init_browser() self._check_credentials() @@ -116,12 +139,14 @@ def _init_browser(self): self.browser.select_form(nr=0) self.browser['p_un'] = self.user self.browser['p_pw'] = self.password + # Set login_method to be writable + self.browser.form.find_control('login_method').readonly = False self.browser['login_method'] = self.login_method self.browser.submit() def _check_credentials(self): if not 'youraccount/logout' in self.browser.response().read(): - raise ValueError("It was not possible to successfully login with the provided credentials") + raise InvenioConnectorAuthError("It was not possible to successfully login with the provided credentials") def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="", rm="", of="", ot="", p1="", f1="", m1="", op1="", @@ -131,6 +156,8 @@ def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="", read_cache=True): """ Returns records corresponding to the given search query. + + @raise InvenioConnectorAuthError: if authentication fails """ parse_results = False if of == "": @@ -155,8 +182,7 @@ def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="", # Are we running locally? 
If so, better directly access the # search engine directly - if LOCAL_SITE_URL == self.server_url and \ - of != 't': + if self.server_url in LOCAL_SITE_URLS and of != 't': # See if user tries to search any restricted collection if c != "": if type(c) is list: @@ -165,9 +191,10 @@ def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="", colls = [c] for collection in colls: if collection_restricted_p(collection): - sys.stderr.write("Searching local restricted collections\ - is NOT allowed. Aborting search.\n") - return [] + if self.user: + self._check_credentials() + continue + raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n") results = perform_request_search(p=p, f=f, c=c, rg=rg, sf=sf, so=so, sp=so, rm=rm, p1=p1, f1=f1, m1=m1, op1=op1, p2=p2, f2=f2, m2=m2, op2=op2, @@ -183,6 +210,9 @@ def search(self, p="", f="", c="", rg=10, sf="", so="d", sp="", results = self.browser.open(self.server_url + "/search?" + params) else: results = urllib2.urlopen(self.server_url + "/search?" + params) + if 'youraccount/login' in results.geturl(): + # Current user not able to search collection + raise InvenioConnectorAuthError("You are trying to search a restricted collection. Please authenticate yourself.\n") else: return self.cached_queries[params + str(parse_results)] @@ -305,7 +335,7 @@ def upload_marcxml(self, marcxml, mode): raise NameError, "Incorrect mode " + str(mode) # Are we running locally? 
If so, submit directly - if LOCAL_SITE_URL == self.server_url: + if self.server_url in LOCAL_SITE_URLS: (code, marcxml_filepath) = tempfile.mkstemp(prefix="upload_%s" % \ time.strftime("%Y%m%d_%H%M%S_", time.localtime())) diff --git a/modules/miscutil/lib/invenio_connector_regression_tests.py b/modules/miscutil/lib/invenio_connector_regression_tests.py index fa04659af..059ca9f04 100644 --- a/modules/miscutil/lib/invenio_connector_regression_tests.py +++ b/modules/miscutil/lib/invenio_connector_regression_tests.py @@ -24,8 +24,9 @@ import os import unittest -from invenio.invenio_connector import InvenioConnector -from invenio.config import CFG_SITE_URL +from invenio.invenio_connector import InvenioConnector, \ + InvenioConnectorAuthError +from invenio.config import CFG_SITE_URL, CFG_SITE_SECURE_URL from invenio.testutils import make_test_suite, run_test_suite class InvenioConnectorTest(unittest.TestCase): @@ -52,6 +53,28 @@ def test_search_collections(self): self.assertTrue(len(result) > 0, \ 'did not get collection search results.') + def test_search_local_restricted_collections(self): + """InvenioConnector - local restricted collection search""" + server = InvenioConnector(CFG_SITE_URL) + search_params = dict(p='LBL-28106', c=['Theses'], of='id') + self.assertRaises(InvenioConnectorAuthError, server.search, **search_params) + + server = InvenioConnector(CFG_SITE_SECURE_URL, user='admin', password='') + result = server.search(p='LBL-28106', c=['Theses'], of='id') + self.assertTrue(len(result) > 0, \ + 'did not get restricted collection search results.') + + def test_search_remote_restricted_collections(self): + """InvenioConnector - remote restricted collection search""" + server = InvenioConnector("http://invenio-demo.cern.ch") + search_params = dict(p='LBL-28106', c=['Theses'], of='id') + self.assertRaises(InvenioConnectorAuthError, server.search, **search_params) + + server = InvenioConnector("https://invenio-demo.cern.ch", user='jekyll', password='j123ekyll') + 
result = server.search(p='LBL-28106', c=['Theses'], of='id') + self.assertTrue(len(result) > 0, \ + 'did not get restricted collection search results.') + TEST_SUITE = make_test_suite(InvenioConnectorTest) if __name__ == "__main__":