Skip to content

Commit

Permalink
(#41) - don't re-quote quoted phrases
Browse files Browse the repository at this point in the history
Issue #41: when the original search term is quoted and the synonym being expanded is the quoted item, don't re-quote it when doing constructPhraseQueries. This removes the issue where doubled quotes appear in the result. Also, when doing constructPhraseQueries, only quote phrases, not single-term synonyms.
  • Loading branch information
rpialum authored and nolanlawson committed Oct 4, 2014
1 parent f2fe904 commit 242d330
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
Expand Down Expand Up @@ -431,9 +433,12 @@ private void applySynonymQueries(Query query, List<Query> synonymQueries, float
*/
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) throws IOException {

String origQuery = getQueryStringFromParser();
int queryLen = origQuery.length();

// TODO: make the token stream reusable?
TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
new StringReader(getQueryStringFromParser()));
new StringReader(origQuery);

SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

Expand All @@ -460,10 +465,20 @@ private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams
synonymBag.add(termToAdd);
}

if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")) {
// make a phrase out of the synonym
termToAdd = new StringBuilder(termToAdd).insert(0,'"').append('"').toString();
}
//Don't quote single-term synonyms
if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM") &&
termToAdd.contains(" "))
{
//Don't Quote when original is already surrounded by quotes
if( offsetAttribute.startOffset()==0 ||
offsetAttribute.endOffset() == queryLen ||
origQuery.charAt(offsetAttribute.startOffset()-1)!='"' ||
origQuery.charAt(offsetAttribute.endOffset())!='"')
{
// make a phrase out of the synonym
termToAdd = new StringBuilder(termToAdd).insert(0,'"').append('"').toString();
}
}
if (!bag) {
// create a graph of all possible synonym combinations,
// e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
Expand Down Expand Up @@ -596,7 +611,8 @@ private List<String> buildUpAlternateQueries(SolrParams solrParams, List<List<Te
}
}

List<String> result = new ArrayList<String>();
//Make sure result is unique
HashSet<String> result = new LinkedHashSet<String>();

for (AlternateQuery alternateQuery : alternateQueries) {

Expand All @@ -607,7 +623,7 @@ private List<String> buildUpAlternateQueries(SolrParams solrParams, List<List<Te

result.add(sb.toString());
}
return result;
return new ArrayList<String>(result);
}

/**
Expand Down
71 changes: 71 additions & 0 deletions test/015-test-issue-41.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#
# Basic unit tests for HON-Lucene-Synonyms
#
# Test that phrase-query synonyms do not get double-quoted
# when the search contains quotes, and that single-term synonyms are not quoted
#

from urllib2 import *
import unittest, solr, urllib, time
class TestBasic(unittest.TestCase):

#
# We have the synonyms:
#
# dog, pooch, hound, canis familiaris, man's best friend
#

url = 'http://localhost:8983/solr'
test_data = [ \
{'id': '1', 'name': "I have a dog."}, \
{'id': '2', 'name': "I have a pooch."}, \
{'id': '3', 'name': "I have a hound."}, \
{'id': '4', 'name': "I have a canis."}, \
]
solr_connection = None

def setUp(self):
self.solr_connection = solr.SolrConnection(self.url)
self.solr_connection.delete_query('*:*')
self.solr_connection.add_many(self.test_data)
self.solr_connection.commit()

def tearDown(self):
self.solr_connection.delete_query('*:*')
self.solr_connection.commit()

def test_queries(self):

self.tst_query('"dog"', 10)
self.tst_query('"pooch"', 10)
self.tst_query('"hound"', 10)
self.tst_query('"canis familiaris"', 10)

self.tst_query('dog', 4)
self.tst_query('pooch', 4)
self.tst_query('hound', 4)
self.tst_query('canis familiaris', 4)


def tst_query(self, query, quote_cnt):
#Properly format spaces in the query
query = urllib.quote_plus(query)
connstr = self.url +'/select?q='+query+'&fl=*,score&qf=name&defType=synonym_edismax&synonyms=true&synonyms.constructPhrases=true&debugQuery=on'
#Add wt=python so response is formatted as python readable
conn = urlopen(connstr+'&wt=python')
rsp = eval( conn.read() )
#print "number of matches=", rsp['response']['numFound']
print rsp['debug']['expandedSynonyms']

#Count the number of quotes in our expandedSynonyms Debug element
cnt = 0
for str in rsp['debug']['expandedSynonyms']:
#print str
cnt += str.count('"')
print 'Quotes found count = ', cnt

self.assertEqual(cnt, quote_cnt)

# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 242d330

Please sign in to comment.