Skip to content

Vectorize filterCompetitiveHits #14896

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jdk.incubator.vector.FloatVector#fma(**)
jdk.incubator.vector.DoubleVector#fma(**)
jdk.incubator.vector.VectorOperators#FMA

@defaultMessage Potentially slow on some CPUs, please check the CPU has feature: Unsupported on NEON
@defaultMessage Potentially slow on some CPUs, please check Constants.HAS_FAST_COMPRESS: Unsupported on NEON
jdk.incubator.vector.ByteVector#compress(**)
jdk.incubator.vector.IntVector#compress(**)
jdk.incubator.vector.ShortVector#compress(**)
Expand All @@ -18,3 +18,6 @@ jdk.incubator.vector.IntVector#expand(**)
jdk.incubator.vector.ShortVector#expand(**)
jdk.incubator.vector.LongVector#expand(**)
jdk.incubator.vector.VectorOperators#EXPAND_BITS

@defaultMessage Potentially slow on some CPUs, please check Constants.HAS_SVE and Constants.HAS_AVX2
jdk.incubator.vector.VectorMask#cast(**)
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ Optimizations

* GITHUB#14935: Speed up PostingsEnum#nextPostings when block is encoded as bitset. (Guo Feng)

* GITHUB#14896: Vectorize filterCompetitiveHits (Ge Song, Adrien Grand)

Changes in Runtime Behavior
---------------------
* GITHUB#14823: Decrease TieredMergePolicy's default number of segments per
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.jmh;

import java.util.Arrays;
import java.util.SplittableRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.IntSupplier;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.VectorUtil;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

/**
 * JMH benchmark comparing several ways of filtering (doc, score) pairs in place, keeping only the
 * pairs whose score is greater than or equal to {@link #minScoreInclusive}.
 *
 * <p>Every benchmark method compacts {@link #docs} and {@link #scores} in place and returns the
 * number of retained pairs, which is why the input is regenerated before every invocation.
 */
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 3, time = 1)
@Measurement(iterations = 5, time = 1)
@Fork(
    value = 1,
    jvmArgsAppend = {
      "-Xmx1g",
      "-Xms1g",
      "-XX:+AlwaysPreTouch",
      "--add-modules",
      "jdk.incubator.vector"
    })
public class CompetitiveBenchmark {

  // Fixed seed: two instances driven through the same setup sequence see identical data.
  private final SplittableRandom R = new SplittableRandom(0);

  @Param("128")
  int size;

  double[] scores;
  int[] docs;

  // Scores generated by nextDouble() lie in the range [0, 1), so this parameter can be tuned to
  // see how performance changes depending on how selective the filter is.
  @Param({"0", "0.2", "0.4", "0.5", "0.8"})
  double minScoreInclusive;

  /** Allocates the buffers once per trial. */
  @Setup(Level.Trial)
  public void setUpTrial() {
    scores = new double[size];
    docs = new int[size];
  }

  /** Refills the buffers before every invocation, since the benchmarks overwrite them. */
  @Setup(Level.Invocation)
  public void setUpInvocation() {
    for (int i = 0; i < size; i++) {
      docs[i] = R.nextInt(Integer.MAX_VALUE);
      scores[i] = R.nextDouble();
    }
  }

  /** Reference implementation: copies a pair forward only when it passes the filter. */
  @Benchmark
  public int baseline() {
    int newSize = 0;
    for (int i = 0; i < size; ++i) {
      if (scores[i] >= minScoreInclusive) {
        docs[newSize] = docs[i];
        scores[newSize] = scores[i];
        newSize++;
      }
    }
    return newSize;
  }

  /** Always copies the pair forward, then advances the write position by 0 or 1. */
  @Benchmark
  public int branchlessCandidate() {
    int newSize = 0;
    for (int i = 0; i < size; ++i) {
      int inc = scores[i] >= minScoreInclusive ? 1 : 0;
      docs[newSize] = docs[i];
      scores[newSize] = scores[i];
      newSize += inc;
    }
    return newSize;
  }

  // This is an attempt to get the update of newSize compiled to a cmov,
  // see https://github.com/apache/lucene/pull/14906
  @Benchmark
  public int branchlessCandidateCmov() {
    int newSize = 0;
    for (int i = 0; i < size; ++i) {
      int doc = docs[i];
      double score = scores[i];
      docs[newSize] = doc;
      scores[newSize] = score;
      if (score >= minScoreInclusive) {
        newSize++;
      }
    }
    return newSize;
  }

  /** Delegates to {@link VectorUtil#filterByScore(int[], double[], double, int)}. */
  @Benchmark
  public int vectorizedCandidate() {
    return VectorUtil.filterByScore(docs, scores, minScoreInclusive, size);
  }

  /**
   * Sanity check: verifies that every candidate retains exactly the same pairs as {@link
   * #baseline()}.
   *
   * <p>The check runs for each benchmarked threshold. Leaving {@link #minScoreInclusive} at its
   * default of 0 would make the check vacuous, because every generated score is {@code >= 0} and
   * nothing would ever be filtered out. Each candidate also gets its own freshly generated
   * instance, because the implementations overwrite their input buffers.
   *
   * @param args ignored
   * @throws IllegalArgumentException if a candidate disagrees with the baseline
   */
  public static void main(String[] args) {
    final int checkedSize = 128;
    for (double minScore : new double[] {0, 0.2, 0.4, 0.5, 0.8}) {
      CompetitiveBenchmark baseline = newInstance(checkedSize, minScore);
      int baselineSize = baseline.baseline();

      CompetitiveBenchmark branchless = newInstance(checkedSize, minScore);
      verify(baseline, baselineSize, branchless, branchless::branchlessCandidate);

      CompetitiveBenchmark vectorized = newInstance(checkedSize, minScore);
      verify(baseline, baselineSize, vectorized, vectorized::vectorizedCandidate);

      CompetitiveBenchmark cmov = newInstance(checkedSize, minScore);
      verify(baseline, baselineSize, cmov, cmov::branchlessCandidateCmov);
    }
  }

  /** Creates an instance with generated input, ready for a single benchmark method call. */
  private static CompetitiveBenchmark newInstance(int size, double minScoreInclusive) {
    CompetitiveBenchmark benchmark = new CompetitiveBenchmark();
    benchmark.size = size;
    benchmark.minScoreInclusive = minScoreInclusive;
    benchmark.setUpTrial();
    benchmark.setUpInvocation();
    return benchmark;
  }

  /** Runs {@code implementation} on {@code candidate} and compares the result to the baseline. */
  private static void verify(
      CompetitiveBenchmark baseline,
      int baselineSize,
      CompetitiveBenchmark candidate,
      IntSupplier implementation) {
    int candidateSize = implementation.getAsInt();

    if (baselineSize != candidateSize) {
      throw new IllegalArgumentException(
          "incorrect size for minScoreInclusive="
              + baseline.minScoreInclusive
              + ", baseline: "
              + baselineSize
              + ", candidate: "
              + candidateSize);
    }

    if (Arrays.equals(baseline.docs, 0, baselineSize, candidate.docs, 0, candidateSize)
        == false) {
      throw new IllegalArgumentException(
          "incorrect docs,"
              + "\nbaseline: "
              + Arrays.toString(ArrayUtil.copyOfSubArray(baseline.docs, 0, baselineSize))
              + "\ncandidate: "
              + Arrays.toString(ArrayUtil.copyOfSubArray(candidate.docs, 0, candidateSize)));
    }

    if (Arrays.equals(baseline.scores, 0, baselineSize, candidate.scores, 0, candidateSize)
        == false) {
      throw new IllegalArgumentException("incorrect scores");
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -309,4 +309,20 @@ private float quantizeFloat(float v, byte[] dest, int destIndex) {
return minQuantile * (v - minQuantile / 2.0F) + (dx - dxq) * dxq;
}
}

/**
 * Scalar implementation: walks the first {@code upTo} pairs and compacts those whose score is
 * greater than or equal to {@code minScoreInclusive} to the front of both buffers, keeping their
 * relative order.
 *
 * @return the number of retained pairs
 */
@Override
public int filterByScore(
    int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo) {
  int kept = 0;
  int read = 0;
  while (read < upTo) {
    final int candidateDoc = docBuffer[read];
    final double candidateScore = scoreBuffer[read];
    read++;
    // Store unconditionally; the slot is simply overwritten by the next pair when this one is
    // rejected, so only the position update depends on the comparison.
    docBuffer[kept] = candidateDoc;
    scoreBuffer[kept] = candidateScore;
    if (candidateScore >= minScoreInclusive) {
      kept++;
    }
  }
  return kept;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,18 @@ float recalculateScalarQuantizationOffset(
float alpha,
float minQuantile,
float maxQuantile);

/**
 * Filters both {@code docBuffer} and {@code scoreBuffer} by {@code minScoreInclusive}. The
 * entries of {@code docBuffer} and {@code scoreBuffer} at the same index form a pair; pairs whose
 * score is not greater than or equal to {@code minScoreInclusive} are filtered out of the arrays.
 *
 * <p>NOTE(review): the scalar implementation compacts the retained pairs to the front of both
 * buffers in their original order; confirm that every implementation guarantees the same.
 *
 * @param docBuffer buffer holding the docs (or any other values that form a pair with the
 *     entries of {@code scoreBuffer})
 * @param scoreBuffer buffer holding the scores to compare with {@code minScoreInclusive}
 * @param minScoreInclusive minimum score a pair needs in order not to be filtered out
 * @param upTo where the filter should end
 * @return how many pairs are left after filtering
 */
int filterByScore(int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo);
}
14 changes: 3 additions & 11 deletions lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.VectorUtil;

/** Util class for Scorer related methods */
class ScorerUtil {
Expand Down Expand Up @@ -155,17 +156,8 @@ static void filterCompetitiveHits(
return;
}

int newSize = 0;
for (int i = 0; i < buffer.size; ++i) {
int doc = buffer.docs[i];
double score = buffer.scores[i];
buffer.docs[newSize] = doc;
buffer.scores[newSize] = score;
if (score >= minRequiredScore) {
newSize += 1;
}
}
buffer.size = newSize;
buffer.size =
VectorUtil.filterByScore(buffer.docs, buffer.scores, minRequiredScore, buffer.size);
}

/**
Expand Down
22 changes: 22 additions & 0 deletions lucene/core/src/java/org/apache/lucene/util/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,23 @@ private static boolean is64Bit() {
private static final boolean HAS_SSE4A =
HotspotVMOptions.get("UseXmmI2F").map(Boolean::valueOf).orElse(false);

/**
 * true for CPUs with AVX support of at least AVX2, i.e. when the HotSpot {@code UseAVX} option is
 * 2 or higher. A missing option is treated as 0, so this is false in that case.
 */
private static final boolean HAS_AVX2 =
HotspotVMOptions.get("UseAVX").map(Integer::valueOf).orElse(0) >= 2;

/**
 * true for ARM CPUs with SVE support, i.e. when the HotSpot {@code UseSVE} option is 1 or higher.
 * A missing option is treated as 0, so this is false in that case.
 */
private static final boolean HAS_SVE =
HotspotVMOptions.get("UseSVE").map(Integer::valueOf).orElse(0) >= 1;

/** true iff we know VFMA has faster throughput than separate vmul/vadd. */
public static final boolean HAS_FAST_VECTOR_FMA = hasFastVectorFMA();

/** true iff we know FMA has faster throughput than separate mul/add. */
public static final boolean HAS_FAST_SCALAR_FMA = hasFastScalarFMA();

/**
 * true iff we know that the vector compress operation has faster throughput than moving elements
 * one by one.
 */
public static final boolean HAS_FAST_COMPRESS = hasFastCompress();

private static boolean hasFastVectorFMA() {
if (HAS_FMA) {
String value = getSysProp("lucene.useVectorFMA", "auto");
Expand Down Expand Up @@ -152,6 +163,17 @@ private static boolean hasFastScalarFMA() {
return false;
}

/**
 * Computes {@link #HAS_FAST_COMPRESS}: true on aarch64 when SVE is available, or on amd64 when
 * AVX2 is available; false on every other combination.
 */
private static boolean hasFastCompress() {
  return (OS_ARCH.equals("aarch64") && HAS_SVE) || (OS_ARCH.equals("amd64") && HAS_AVX2);
}

/**
* The default {@link ReadAdvice} used for opening index files. It will be {@link
* ReadAdvice#RANDOM} by default, unless set by system property {@code
Expand Down
21 changes: 21 additions & 0 deletions lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -376,4 +376,25 @@ public static float recalculateOffset(
return IMPL.recalculateScalarQuantizationOffset(
vector, oldAlpha, oldMinQuantile, scale, alpha, minQuantile, maxQuantile);
}

/**
 * Filters the (doc, score) pairs stored at matching indexes of {@code docBuffer} and {@code
 * scoreBuffer}: pairs whose score is not greater than or equal to {@code minScoreInclusive} are
 * filtered out of the arrays. The buffers are validated here and the filtering itself is
 * delegated to the active vectorization implementation.
 *
 * @param docBuffer buffer holding the docs (or any other values that form a pair with the
 *     entries of {@code scoreBuffer})
 * @param scoreBuffer buffer holding the scores to compare with {@code minScoreInclusive}
 * @param minScoreInclusive minimum score a pair needs in order not to be filtered out
 * @param upTo where the filter should end
 * @return how many pairs are left after filtering
 * @throws IllegalArgumentException if the two buffers differ in length or are shorter than
 *     {@code upTo}
 */
public static int filterByScore(
    int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo) {
  final int docCapacity = docBuffer.length;
  final boolean sameLength = docCapacity == scoreBuffer.length;
  if (sameLength && docCapacity >= upTo) {
    return IMPL.filterByScore(docBuffer, scoreBuffer, minScoreInclusive, upTo);
  }
  throw new IllegalArgumentException(
      "docBuffer and scoreBuffer should keep same length and at least as long as upTo");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ final class PanamaVectorConstants {
static final boolean ENABLE_INTEGER_VECTORS;

static final VectorSpecies<Integer> PRERERRED_INT_SPECIES;
static final VectorSpecies<Double> PREFERRED_DOUBLE_SPECIES;

static {
// default to platform supported bitsize
Expand All @@ -46,6 +47,8 @@ final class PanamaVectorConstants {

PRERERRED_INT_SPECIES =
VectorSpecies.of(int.class, VectorShape.forBitSize(PREFERRED_VECTOR_BITSIZE));
PREFERRED_DOUBLE_SPECIES =
VectorSpecies.of(double.class, VectorShape.forBitSize(PREFERRED_VECTOR_BITSIZE));
}

private PanamaVectorConstants() {}
Expand Down
Loading
Loading