From 242d330771c4dc124f8a07d5ce68d31dd37ccf54 Mon Sep 17 00:00:00 2001 From: Jeremy Date: Sat, 4 Oct 2014 14:32:37 -0400 Subject: [PATCH] (#41) - don't re-quote quoted phrases Issue #41: when the original search term is quoted and the synonym being expanded is for that quoted item, don't re-quote it when doing constructPhraseQueries. Removes the issue where doubled quotes appear in the result. Also, when doing constructPhraseQueries, only quote phrases, not single-term synonyms. --- ...mExpandingExtendedDismaxQParserPlugin.java | 30 ++++++-- test/015-test-issue-41.py | 71 +++++++++++++++++++ 2 files changed, 94 insertions(+), 7 deletions(-) create mode 100644 test/015-test-issue-41.py diff --git a/src/main/java/org/apache/solr/search/SynonymExpandingExtendedDismaxQParserPlugin.java b/src/main/java/org/apache/solr/search/SynonymExpandingExtendedDismaxQParserPlugin.java index c4d0f8a..96c3f2e 100644 --- a/src/main/java/org/apache/solr/search/SynonymExpandingExtendedDismaxQParserPlugin.java +++ b/src/main/java/org/apache/solr/search/SynonymExpandingExtendedDismaxQParserPlugin.java @@ -29,6 +29,8 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.HashMap; import java.util.LinkedList; import java.util.List; @@ -431,9 +433,12 @@ private void applySynonymQueries(Query query, List synonymQueries, float */ private List generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) throws IOException { + String origQuery = getQueryStringFromParser(); + int queryLen = origQuery.length(); + // TODO: make the token stream reusable?
TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME, - new StringReader(getQueryStringFromParser())); + new StringReader(origQuery)); SortedSetMultimap startPosToTextsInQuery = TreeMultimap.create(); @@ -460,10 +465,20 @@ private List generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams synonymBag.add(termToAdd); } - if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")) { - // make a phrase out of the synonym - termToAdd = new StringBuilder(termToAdd).insert(0,'"').append('"').toString(); - } + //Don't quote single-term synonyms + if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM") && + termToAdd.contains(" ")) + { + //Don't Quote when original is already surrounded by quotes + if( offsetAttribute.startOffset()==0 || + offsetAttribute.endOffset() == queryLen || + origQuery.charAt(offsetAttribute.startOffset()-1)!='"' || + origQuery.charAt(offsetAttribute.endOffset())!='"') + { + // make a phrase out of the synonym + termToAdd = new StringBuilder(termToAdd).insert(0,'"').append('"').toString(); + } + } if (!bag) { // create a graph of all possible synonym combinations, // e.g. dog bite, hound bite, dog nibble, hound nibble, etc. @@ -596,7 +611,8 @@ private List buildUpAlternateQueries(SolrParams solrParams, List result = new ArrayList(); + //Make sure result is unique + HashSet result = new LinkedHashSet(); for (AlternateQuery alternateQuery : alternateQueries) { @@ -607,7 +623,7 @@ private List buildUpAlternateQueries(SolrParams solrParams, List(result); } /** diff --git a/test/015-test-issue-41.py b/test/015-test-issue-41.py new file mode 100644 index 0000000..5df6445 --- /dev/null +++ b/test/015-test-issue-41.py @@ -0,0 +1,71 @@ +# +# Basic unit tests for HON-Lucene-Synonyms +# +# Test that phrase queries synonyms do not get double quoted +# when search contains quotes.
And do not quote single term synonyms +# + +from urllib2 import * +import unittest, solr, urllib, time +class TestBasic(unittest.TestCase): + + # + # We have the synonyms: + # + # dog, pooch, hound, canis familiaris, man's best friend + # + + url = 'http://localhost:8983/solr' + test_data = [ \ + {'id': '1', 'name': "I have a dog."}, \ + {'id': '2', 'name': "I have a pooch."}, \ + {'id': '3', 'name': "I have a hound."}, \ + {'id': '4', 'name': "I have a canis."}, \ + ] + solr_connection = None + + def setUp(self): + self.solr_connection = solr.SolrConnection(self.url) + self.solr_connection.delete_query('*:*') + self.solr_connection.add_many(self.test_data) + self.solr_connection.commit() + + def tearDown(self): + self.solr_connection.delete_query('*:*') + self.solr_connection.commit() + + def test_queries(self): + + self.tst_query('"dog"', 10) + self.tst_query('"pooch"', 10) + self.tst_query('"hound"', 10) + self.tst_query('"canis familiaris"', 10) + + self.tst_query('dog', 4) + self.tst_query('pooch', 4) + self.tst_query('hound', 4) + self.tst_query('canis familiaris', 4) + + + def tst_query(self, query, quote_cnt): + #URL-encode the query so spaces and quotes are safe in the request URL + query = urllib.quote_plus(query) + connstr = self.url +'/select?q='+query+'&fl=*,score&qf=name&defType=synonym_edismax&synonyms=true&synonyms.constructPhrases=true&debugQuery=on' + #Add wt=python so the response body can be read with eval(). NOTE(review): eval() executes whatever the server returns, so only run this against a trusted local Solr + conn = urlopen(connstr+'&wt=python') + rsp = eval( conn.read() ) + #print "number of matches=", rsp['response']['numFound'] + print rsp['debug']['expandedSynonyms'] + + #Count the double-quote characters across all entries of the expandedSynonyms debug element + cnt = 0 + for synonym in rsp['debug']['expandedSynonyms']: + #print synonym + cnt += synonym.count('"') + print 'Quotes found count = ', cnt + + self.assertEqual(cnt, quote_cnt) + +if __name__ == '__main__': + unittest.main() +