Skip to content

Commit 63a02a8

Browse files
committed
Add tasks to benchmark Lucene's intervals
This change adds some tasks to benchmark the IntervalQuery added in Lucene 7.x. The tasks are derived from the SpanNear query tasks and run an ordered intervals query with a max width of 10. I ran the wikinightly benchmark with these new tasks and got the following result:
```
Task                QPS baseline  StdDev   QPS patch  StdDev   Pct diff
SpanNear                    4.66  (0.0%)        4.64  (0.0%)     -0.3% (  0% -  0%)
IntervalsOrdered            4.52  (0.0%)        4.66  (0.0%)      3.0% (  2% -  2%)
SloppyPhrase                2.51  (0.0%)        2.59  (0.0%)      3.2% (  3% -  3%)
```
1 parent 0820138 commit 63a02a8

10 files changed

+277
-29
lines changed

src/main/perf/CreateQueries.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,7 @@ private static void processShingles(IndexReader r, String field, Writer queriesO
447447
queriesOut.write("HighPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
448448
queriesOut.write("HighSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
449449
queriesOut.write("HighSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
450+
queriesOut.write("HighOrderedIntervals: ordered//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
450451
upto++;
451452
counter++;
452453
if (counter >= NUM_QUERIES) {
@@ -465,6 +466,7 @@ private static void processShingles(IndexReader r, String field, Writer queriesO
465466
queriesOut.write("MedPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
466467
queriesOut.write("MedSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
467468
queriesOut.write("MedSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
469+
queriesOut.write("MedOrderedIntervals: ordered//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
468470
upto++;
469471
counter++;
470472
if (counter >= NUM_QUERIES) {
@@ -483,6 +485,7 @@ private static void processShingles(IndexReader r, String field, Writer queriesO
483485
queriesOut.write("LowPhrase: \"" + tf.term.utf8ToString() + "\" # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
484486
queriesOut.write("LowSloppyPhrase: \"" + tf.term.utf8ToString() + "\"~4 # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
485487
queriesOut.write("LowSpanNear: near//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
488+
queriesOut.write("LowOrderedIntervals: ordered//" + tf.term.utf8ToString() + " # freq=" + tf.df + "|" + df1 + "|" + df2 + "\n");
486489
upto++;
487490
counter++;
488491
if (counter >= NUM_QUERIES) {

src/main/perf/TaskParser.java

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@
3636
import org.apache.lucene.search.spans.SpanNearQuery;
3737
import org.apache.lucene.search.spans.SpanQuery;
3838
import org.apache.lucene.search.spans.SpanTermQuery;
39-
39+
import org.apache.lucene.search.intervals.Intervals;
40+
import org.apache.lucene.search.intervals.IntervalQuery;
4041
import java.util.ArrayList;
4142
import java.util.List;
4243
import java.util.Random;
@@ -193,7 +194,22 @@ public Task parseOneTask(String line) throws ParseException {
193194
doHilite = false;
194195
}
195196

196-
if (text.startsWith("near//")) {
197+
198+
if (text.startsWith("ordered//")) {
199+
final int spot3 = text.indexOf(' ');
200+
if (spot3 == -1) {
201+
throw new RuntimeException("failed to parse query=" + text);
202+
}
203+
query = new IntervalQuery(fieldName,
204+
Intervals.maxwidth(10,
205+
Intervals.ordered(
206+
Intervals.term(text.substring(9, spot3)),
207+
Intervals.term(text.substring(spot3+1).trim())
208+
)
209+
));
210+
sort = null;
211+
group = null;
212+
} else if (text.startsWith("near//")) {
197213
final int spot3 = text.indexOf(' ');
198214
if (spot3 == -1) {
199215
throw new RuntimeException("failed to parse query=" + text);

src/python/nightlyBench.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@
66
# The ASF licenses this file to You under the Apache License, Version 2.0
77
# (the "License"); you may not use this file except in compliance with
88
# the License. You may obtain a copy of the License at
9-
#
9+
#
1010
# http://www.apache.org/licenses/LICENSE-2.0
11-
#
11+
#
1212
# Unless required by applicable law or agreed to in writing, software
1313
# distributed under the License is distributed on an "AS IS" BASIS,
1414
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -600,9 +600,9 @@ def runNRTTest(r, indexPath, runLogDir):
600600

601601
min, max, mean, stdDev = stats.getStats(times)
602602
message('NRT reopen time (msec) mean=%.4f stdDev=%.4f' % (mean, stdDev))
603-
603+
604604
checkIndex(r, indexPath, '%s/checkIndex.nrt.log' % runLogDir)
605-
605+
606606
return mean, stdDev
607607

608608
def run():
@@ -692,7 +692,7 @@ def run():
692692
print('Unable to read /sys/kernel/mm/transparent_hugepage/enabled')
693693
else:
694694
print('transparent_hugepages: %s' % s)
695-
695+
696696
runCommand('%s clean > clean.log 2>&1' % constants.ANT_EXE)
697697

698698
r = benchUtil.RunAlgs(constants.JAVA_COMMAND, True, True)
@@ -779,7 +779,7 @@ def run():
779779
index=index,
780780
directory=DIR_IMPL,
781781
commitPoint='multi')
782-
782+
783783
#c = benchUtil.Competitor(id, 'trunk.nightly', index, DIR_IMPL, 'StandardAnalyzerNoStopWords', 'multi', constants.WIKI_MEDIUM_TASKS_FILE)
784784

785785
if REAL:
@@ -827,15 +827,15 @@ def run():
827827
if REAL:
828828
resultsNow = []
829829
for iter in xrange(JVM_COUNT):
830-
seed = rand.randint(-10000000, 1000000)
830+
seed = rand.randint(-10000000, 1000000)
831831
resultsNow.append(r.runSimpleSearchBench(iter, id, comp, coldRun, seed, staticSeed, filter=None))
832832
else:
833833
resultsNow = ['%s/%s/modules/benchmark/%s.%s.x.%d' % (constants.BASE_DIR, NIGHTLY_DIR, id, comp.name, iter) for iter in xrange(20)]
834834
message('done search (%s)' % (now()-t0))
835835
resultsPrev = []
836836

837837
searchResults = searchHeap = None
838-
838+
839839
for fname in resultsNow:
840840
prevFName = fname + '.prev'
841841
if os.path.exists(prevFName):
@@ -965,7 +965,7 @@ def makeGraphs():
965965

966966
tup = cPickle.loads(open(resultsFile).read())
967967
# print 'RESULTS: %s' % resultsFile
968-
968+
969969
timeStamp, \
970970
medNumDocs, medIndexTimeSec, medBytesIndexed, \
971971
bigNumDocs, bigIndexTimeSec, bigBytesIndexed, \
@@ -984,7 +984,7 @@ def makeGraphs():
984984
searchHeaps = tup[11]
985985
else:
986986
searchHeaps = None
987-
987+
988988
timeStampString = '%04d-%02d-%02d %02d:%02d:%02d' % \
989989
(timeStamp.year,
990990
timeStamp.month,
@@ -1117,7 +1117,7 @@ def writeCheckIndexTimeHTML():
11171117
if os.path.exists('%s/%s/results.debug.pk' % (constants.NIGHTLY_LOG_DIR, subDir)):
11181118
# Skip debug runs
11191119
continue
1120-
1120+
11211121
tup = subDir.split('.')
11221122
if len(tup) != 6:
11231123
#print('skip %s' % subDir)
@@ -1126,7 +1126,7 @@ def writeCheckIndexTimeHTML():
11261126
if tup[:3] == ['2015', '04', '04']:
11271127
# Hide disastrously slow CheckIndex time after auto-prefix first landed
11281128
continue
1129-
1129+
11301130
if os.path.exists(checkIndexTimeFile):
11311131
# Already previously computed & cached:
11321132
seconds = int(open(checkIndexTimeFile, 'r').read())
@@ -1157,7 +1157,7 @@ def writeCheckIndexTimeHTML():
11571157
#print("tup %s" % tup)
11581158
chartData.append('%s-%s-%s %s:%s:%s,%s' % (tuple(tup) + (seconds,)))
11591159
#print("added %s" % chartData[-1])
1160-
1160+
11611161
with open('%s/checkIndexTime.html' % constants.NIGHTLY_REPORTS_DIR, 'wb') as f:
11621162
w = f.write
11631163
header(w, 'Lucene nightly CheckIndex time')
@@ -1181,7 +1181,7 @@ def writeCheckIndexTimeHTML():
11811181
w('</ul>')
11821182
w('<br><a href="index.html">Back to all results</a><br>')
11831183
footer(w)
1184-
1184+
11851185
def header(w, title):
11861186
w('<html>')
11871187
w('<head>')
@@ -1192,7 +1192,7 @@ def header(w, title):
11921192
w('<script type="text/javascript" src="dygraph-combined-dev.js"></script>\n')
11931193
w('</head>')
11941194
w('<body>')
1195-
1195+
11961196
def footer(w):
11971197
w('<br><em>[last updated: %s; send questions to <a href="mailto:lucene@mikemccandless.com">Mike McCandless</a>]</em>' % now())
11981198
w('</div>')
@@ -1202,7 +1202,7 @@ def footer(w):
12021202
def writeOneLine(w, seen, cat, desc):
12031203
seen.add(cat)
12041204
w('<br>&nbsp;&nbsp;&nbsp;&nbsp;<a href="%s.html">%s</a>' % (cat, desc))
1205-
1205+
12061206
def writeIndexHTML(searchChartData, days):
12071207
f = open('%s/index.html' % constants.NIGHTLY_REPORTS_DIR, 'wb')
12081208
w = f.write
@@ -1231,6 +1231,7 @@ def writeIndexHTML(searchChartData, days):
12311231
writeOneLine(w, done, 'Phrase', 'Exact phrase')
12321232
writeOneLine(w, done, 'SloppyPhrase', 'Sloppy (~4) phrase')
12331233
writeOneLine(w, done, 'SpanNear', 'Span near (~10)')
1234+
writeOneLine(w, done, 'IntervalsOrdered', 'Ordered intervals (MAXWIDTH/10)')
12341235

12351236
w('<br><br><b>FuzzyQuery:</b>')
12361237
writeOneLine(w, done, 'Fuzzy1', 'Edit distance 1')
@@ -1240,10 +1241,10 @@ def writeIndexHTML(searchChartData, days):
12401241
writeOneLine(w, done, 'Term', 'TermQuery')
12411242
writeOneLine(w, done, 'Respell', 'Respell (DirectSpellChecker)')
12421243
writeOneLine(w, done, 'PKLookup', 'Primary key lookup')
1243-
writeOneLine(w, done, 'Wildcard', 'WildcardQuery')
1244-
writeOneLine(w, done, 'Prefix3', 'PrefixQuery (3 leading characters)')
1245-
writeOneLine(w, done, 'IntNRQ', 'Numeric range filtering on last-modified-datetime')
1246-
1244+
writeOneLine(w, done, 'Wildcard', 'WildcardQuery')
1245+
writeOneLine(w, done, 'Prefix3', 'PrefixQuery (3 leading characters)')
1246+
writeOneLine(w, done, 'IntNRQ', 'Numeric range filtering on last-modified-datetime')
1247+
12471248
w('<br><br><b>Faceting:</b>')
12481249
writeOneLine(w, done, 'TermDateFacets', 'Term query + date hierarchy')
12491250
writeOneLine(w, done, 'BrowseDateTaxoFacets', 'All dates hierarchy')
@@ -1270,7 +1271,7 @@ def writeIndexHTML(searchChartData, days):
12701271
w('<br>&nbsp;&nbsp;&nbsp;&nbsp;<a href="sparseResults.html">Sparse vs dense doc values performance on NYC taxi ride corpus</a>')
12711272
w('<br>&nbsp;&nbsp;&nbsp;&nbsp;<a href="antcleantest.html">"ant clean test" time in lucene</a>')
12721273
w('<br>&nbsp;&nbsp;&nbsp;&nbsp;<a href="checkIndexTime.html">CheckIndex time</a>')
1273-
1274+
12741275
l = searchChartData.keys()
12751276
lx = []
12761277
for s in l:
@@ -1310,6 +1311,7 @@ def writeIndexHTML(searchChartData, days):
13101311
'Phrase': 'PhraseQuery (exact)',
13111312
'SloppyPhrase': 'PhraseQuery (sloppy)',
13121313
'SpanNear': 'SpanNearQuery',
1314+
'IntervalsOrdered': 'IntervalsQuery (ordered)',
13131315
'AndHighHigh': 'BooleanQuery (AND, high freq, high freq term)',
13141316
'AndHighMed': 'BooleanQuery (AND, high freq, medium freq term)',
13151317
'OrHighHigh': 'BooleanQuery (OR, high freq, high freq term)',
@@ -1397,7 +1399,7 @@ def writeIndexingHTML(medChartData, bigChartData, gcTimesChartData):
13971399
w(' <li> Test does <b>not wait for merges on close</b> (calls <tt>IW.close(false)</tt>)')
13981400
w(' <li> Analyzer is <tt>StandardAnalyzer</tt>, but we <b>index all stop words</b>')
13991401
w(' <li> Test indexes full <a href="http://en.wikipedia.org/wiki/Wikipedia:Database_download">Wikipedia English XML export</a> (1/15/2011), from a pre-created line file (one document per line), on a different drive from the one that stores the index')
1400-
w(' <li> %d indexing threads\n' % constants.INDEX_NUM_THREADS)
1402+
w(' <li> %d indexing threads\n' % constants.INDEX_NUM_THREADS)
14011403
w(' <li> %s MB RAM buffer\n' % INDEXING_RAM_BUFFER_MB)
14021404
w(' <li> Java command-line: <tt>%s</tt>\n' % constants.JAVA_COMMAND)
14031405
w(' <li> Java version: <tt>%s</tt>\n' % htmlEscape(os.popen('java -version 2>&1').read().strip()))
@@ -1418,7 +1420,7 @@ def writeNRTHTML(nrtChartData):
14181420
w('<br>')
14191421
w(getOneGraphHTML('NRT', nrtChartData, "Milliseconds", "Time (msec) to open a new reader", errorBars=True))
14201422
writeKnownChanges(w)
1421-
1423+
14221424
w('<b>Notes</b>:\n')
14231425
w('<ul>\n')
14241426
w(' <li> Test starts from full Wikipedia index, then use <tt>IW.updateDocument</tt> (so we stress deletions)')
@@ -1501,15 +1503,15 @@ def getOneGraphHTML(id, data, yLabel, title, errorBars=True, pctOffset=5):
15011503
maxY = max([float(x.split(',')[1]) for x in data[1:]])
15021504
options.append('valueRange:[0,%.3f]' % (maxY*1.25))
15031505
#options.append('includeZero: true')
1504-
1506+
15051507
if errorBars:
15061508
options.append('errorBars: true')
15071509
options.append('sigma: 1')
15081510

15091511
options.append('showRoller: false')
15101512

15111513
w(' {%s}' % ', '.join(options))
1512-
1514+
15131515
if 0:
15141516
if errorBars:
15151517
w(' {errorBars: true, valueRange:[0,%.3f], sigma:1, title:"%s", ylabel:"%s", xlabel:"Date"}' % (maxY*1.25, title, yLabel))
@@ -1597,12 +1599,11 @@ def sendEmail(toEmailAddr, subject, messageText):
15971599
if not DEBUG and REAL:
15981600
import socket
15991601
sendEmail('mail@mikemccandless.com', 'Nightly Lucene bench FAILED (%s)' % socket.gethostname(), '')
1600-
1602+
16011603
# scp -rp /lucene/reports.nightly mike@10.17.4.9:/usr/local/apache2/htdocs
16021604

16031605
# TO CLEAN
16041606
# - rm -rf /p/lucene/indices/trunk.nightly.index.prev/
16051607
# - rm -rf /lucene/logs.nightly/*
16061608
# - rm -rf /lucene/reports.nightly/*
16071609
# - rm -f /lucene/trunk.nightly/modules/benchmark/*.x
1608-

0 commit comments

Comments
 (0)