Skip to content

New IndexReaderFunctions.positionLength from the norm #14433

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions lucene/core/src/java/org/apache/lucene/search/LongValues.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,20 @@ public abstract class LongValues {
* @return true if there is a value for this document
*/
public abstract boolean advanceExact(int doc) throws IOException;

/**
 * A shared {@link LongValues} that never has a value: {@link #advanceExact(int)} returns {@code
 * false} for every document, and {@link #longValue()} is unsupported.
 */
public static final LongValues EMPTY =
    new LongValues() {
      @Override
      public boolean advanceExact(int doc) throws IOException {
        // No document is ever positioned on a value.
        return false;
      }

      @Override
      public long longValue() throws IOException {
        // advanceExact never reports a value, so there is nothing to return here.
        throw new UnsupportedOperationException();
      }
    };
}
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ protected Similarity(boolean discountOverlaps) {
*
* <p><b>WARNING</b>: The default implementation is used by Lucene's supplied Similarity classes,
* which means you can change the Similarity at runtime without reindexing. If you override this
* method, you'll need to re-index documents for it to take effect.
* method, you'll need to re-index documents for it to take effect. Also be sure to override
* {@link #decodeNorm(long)}.
*
* <p>Matches in longer fields are less precise, so implementations of this method usually set
* smaller values when <code>state.getLength()</code> is large, and larger values when <code>
Expand Down Expand Up @@ -161,6 +162,18 @@ public long computeNorm(FieldInvertState state) {
return SmallFloat.intToByte4(numTerms);
}

/**
 * Reverses the encoding applied by {@link #computeNorm(FieldInvertState)}. What the decoded
 * number means depends on the Similarity; for the default implementation it is the field length
 * measured in positions, approximated.
 *
 * @lucene.experimental
 * @param norm the raw norm, as read from {@link
 *     org.apache.lucene.index.NumericDocValues#longValue()} of the norm.
 * @return the decoded norm value
 */
public long decodeNorm(long norm) {
  // Only the low 8 bits are meaningful to the byte4 decoder, so narrow before decoding.
  final byte encoded = (byte) norm;
  return SmallFloat.byte4ToInt(encoded);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this method as it is impossible for someone to implement correctly if they customize just one field. The other method is per-field, this one is not.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also the name is wrong, there's nothing that requires this to be a position length. For some scoring methods it is something else such as the number of unique terms.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should add the field name as an arg. The name is intentional -- if a Similarity can't decode the norm to a position length, it can throw UnsupportedOperationException.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't agree with this being in the similarity api, sorry, that's too hacky.


/**
* Compute any collection-level weight (e.g. IDF, average document length, etc) needed for scoring
* a query.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@

import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
Expand All @@ -30,10 +32,11 @@
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LongValues;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.search.similarities.Similarity;

/**
* Class exposing static helper methods for generating DoubleValuesSource instances over some
* IndexReader statistics
* Static helper methods for generating {@link DoubleValuesSource} and {@link LongValuesSource}
* instances over some IndexReader statistics
*/
public final class IndexReaderFunctions {

Expand Down Expand Up @@ -301,6 +304,18 @@ public static DoubleValuesSource docCount(String field) {
return new IndexReaderDoubleValuesSource(r -> r.getDocCount(field), "docCount(" + field + ")");
}

/**
 * Creates a value source over the norm of the given field, decoded through the searcher's {@link
 * Similarity}. With the default encoding the decoded value is the field's position length,
 * approximated. Documents in a segment that has no norms for the field have no value.
 *
 * @param field the field whose norms are read; must not be {@code null}
 * @return a source of decoded norm values for {@code field}
 * @see Similarity#computeNorm(FieldInvertState)
 * @see Similarity#decodeNorm(long)
 * @see org.apache.lucene.index.LeafReader#getNormValues(String)
 */
public static LongValuesSource norm(String field) {
  final LongValuesSource normSource = new NormValuesSource(field);
  return normSource;
}

@FunctionalInterface
private interface ReaderFunction {
double apply(IndexReader reader) throws IOException;
Expand Down Expand Up @@ -413,4 +428,65 @@ public boolean isCacheable(LeafReaderContext ctx) {
return false;
}
}

/**
 * A {@link LongValuesSource} over a field's norms, decoded with a {@link Similarity}.
 *
 * <p>Instances are immutable. A freshly created source has no Similarity; {@link
 * #rewrite(IndexSearcher)} returns a copy bound to the searcher's Similarity, so one source can be
 * shared between searchers and threads without one rewrite overwriting another's state.
 */
private static class NormValuesSource extends LongValuesSource {
  private final String field;

  /** The Similarity used for decoding, or {@code null} until the source has been rewritten. */
  private final Similarity similarity;

  private NormValuesSource(String field) {
    this(field, null);
  }

  private NormValuesSource(String field, Similarity similarity) {
    this.field = Objects.requireNonNull(field);
    this.similarity = similarity;
  }

  /**
   * Binds this source to the Similarity of {@code searcher}.
   *
   * @return this instance if it is already bound to that Similarity, otherwise a new bound copy
   */
  @Override
  public LongValuesSource rewrite(IndexSearcher searcher) throws IOException {
    final Similarity searcherSimilarity = searcher.getSimilarity(); // isn't field-specific
    if (searcherSimilarity == similarity) {
      return this;
    }
    return new NormValuesSource(field, searcherSimilarity);
  }

  /**
   * Returns the decoded norms of this segment, or {@link LongValues#EMPTY} when the segment has
   * no norms for the field.
   *
   * @throws IllegalStateException if the segment has norms but this source was never rewritten
   */
  @Override
  public LongValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
    final NumericDocValues norms = ctx.reader().getNormValues(field);
    if (norms == null) {
      return LongValues.EMPTY;
    }
    if (similarity == null) {
      // Fail here with a clear message instead of a bare NullPointerException in longValue().
      throw new IllegalStateException(
          "norm(" + field + ") must be rewritten against an IndexSearcher before use");
    }

    final Similarity decoder = similarity;
    return new LongValues() {
      @Override
      public long longValue() throws IOException {
        return decoder.decodeNorm(norms.longValue());
      }

      @Override
      public boolean advanceExact(int doc) throws IOException {
        return norms.advanceExact(doc);
      }
    };
  }

  @Override
  public boolean needsScores() {
    return false;
  }

  // NOTE(review): values depend on the bound Similarity as well as the segment; confirm that
  // per-segment caching cannot be shared between searchers with different Similarities.
  @Override
  public boolean isCacheable(LeafReaderContext ctx) {
    return true;
  }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof NormValuesSource that)) {
      return false;
    }
    // Sources bound to different Similarities may decode differently, so they are not equal.
    return field.equals(that.field) && Objects.equals(similarity, that.similarity);
  }

  @Override
  public int hashCode() {
    return Objects.hash(getClass(), field, similarity);
  }

  @Override
  public String toString() {
    return "norm(" + field + ")";
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,13 @@ public void testDocCount() throws Exception {
assertCacheable(vs, false);
}

/** Checks the values, string form and cacheability of {@code IndexReaderFunctions.norm}. */
public void testNorm() throws Exception {
  final LongValuesSource normSource = IndexReaderFunctions.norm("text");

  // Hits are asserted through the double view of the long-valued source.
  final DoubleValuesSource normAsDoubles = normSource.toDoubleValuesSource();
  assertHits(normAsDoubles, new float[] {6, 2});

  assertEquals("norm(text)", normSource.toString());
  assertCacheable(normSource, true);
}

void assertCacheable(DoubleValuesSource vs, boolean expected) throws Exception {
Query q = new FunctionScoreQuery(new MatchAllDocsQuery(), vs);
Weight w = searcher.createWeight(q, ScoreMode.COMPLETE, 1);
Expand Down