From 5edcceee3e79ee30c4d0edf4e2311bce20794b6d Mon Sep 17 00:00:00 2001 From: Tibor Simko Date: Mon, 31 Jan 2011 23:10:29 +0100 Subject: [PATCH] WebSearch: fix structured regexp query parsing * Fix parsing of some structured regexp queries of the form `field:/pattern/` that was not working properly when field was either a MARC tag or a refersto/citedby operator. (closes #470) --- modules/websearch/lib/search_engine.py | 10 +++++----- modules/websearch/lib/search_engine_tests.py | 10 ++++++++++ modules/websearch/lib/websearch_regression_tests.py | 5 +---- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/modules/websearch/lib/search_engine.py b/modules/websearch/lib/search_engine.py index f78e478f9..3edda789f 100644 --- a/modules/websearch/lib/search_engine.py +++ b/modules/websearch/lib/search_engine.py @@ -691,15 +691,15 @@ def create_basic_search_units(req, p, f, m=None, of='hb'): opfts.append([oi, "%" + pi + "%", fi, 'a']) else: # unbalanced quotes, so fall back to WRD query: opfts.append([oi, pi, fi, 'w']) + elif pi.startswith('/') and pi.endswith('/'): + # B3b - pi has slashes around => do regexp search + opfts.append([oi, pi[1:-1], fi, 'r']) elif fi and str(fi[0]).isdigit() and str(fi[0]).isdigit(): - # B3b - fi exists and starts by two digits => do ACC search + # B3c - fi exists and starts by two digits => do ACC search opfts.append([oi, pi, fi, 'a']) elif fi and not get_index_id_from_field(fi) and get_field_name(fi): - # B3c - logical field fi exists but there is no WRD index for fi => try ACC search + # B3d - logical field fi exists but there is no WRD index for fi => try ACC search opfts.append([oi, pi, fi, 'a']) - elif pi.startswith('/') and pi.endswith('/'): - # B3d - pi has slashes around => do regexp search - opfts.append([oi, pi[1:-1], fi, 'r']) else: # B3e - general case => do WRD search pi = strip_accents(pi) # strip accents for 'w' mode, FIXME: delete when not needed diff --git a/modules/websearch/lib/search_engine_tests.py b/modules/websearch/lib/search_engine_tests.py index 519fbdf4c..c3f211327 100644 --- a/modules/websearch/lib/search_engine_tests.py +++ b/modules/websearch/lib/search_engine_tests.py @@ -234,6 +234,16 @@ def test_parsing_structured_regexp_query(self): self._check("title:/(one|two)/", '', None, [['+', '(one|two)', 'title', 'r']]) + def test_parsing_structured_regexp_marc_query(self): + "search engine - parsing structured regexp MARC query" + self._check("245__a:/(one|two)/", '', None, + [['+', '(one|two)', '245__a', 'r']]) + + def test_parsing_structured_regexp_refersto_query(self): + "search engine - parsing structured regexp refersto query" + self._check("refersto:/(one|two)/", '', None, + [['+', '(one|two)', 'refersto', 'r']]) + def test_parsing_combined_structured_query_in_a_field(self): "search engine - parsing structured query in a field" self._check("title:muon author:ellis", 'abstract', None, diff --git a/modules/websearch/lib/websearch_regression_tests.py b/modules/websearch/lib/websearch_regression_tests.py index c3a80e60e..3039b31f5 100644 --- a/modules/websearch/lib/websearch_regression_tests.py +++ b/modules/websearch/lib/websearch_regression_tests.py @@ -1295,12 +1295,9 @@ def test_many_marc_tags_partial_phrase_query(self): def test_single_marc_tag_regexp_query(self): """websearch - single MARC tag, regexp query""" - # NOTE: regexp queries for physical MARC tags (e.g. 245:/and/) - # are not treated by the search engine by purpose. But maybe - # we should support them?! self.assertEqual([], test_web_page_content(CFG_SITE_URL + '/search?of=id&p=245%3A%2Fand%2F', - expected_text="[]")) + expected_text="[1, 8, 9, 14, 15, 20, 22, 24, 28, 33, 47, 48, 49, 51, 53, 64, 69, 71, 79, 82, 83, 85, 91, 96]")) class WebSearchExtSysnoQueryTest(unittest.TestCase): """Test of queries using external system numbers."""