Skip to content

Commit

Permalink
Reduce FST block size for BlockTreeTermsWriter (#12604)
Browse files Browse the repository at this point in the history
  • Loading branch information
gf2121 authored Oct 4, 2023
1 parent 75da338 commit 9605289
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 0 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ Optimizations

* GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)

* GITHUB#12604: Estimate the block size of FST BytesStore in BlockTreeTermsWriter
to reduce GC load during indexing. (Guo Feng)

Changes in runtime behavior
---------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.util.packed.PackedInts;

/*
TODO:
Expand Down Expand Up @@ -490,10 +491,22 @@ public void compileIndex(
}
}

long estimateSize = prefix.length;
for (PendingBlock block : blocks) {
if (block.subIndices != null) {
for (FST<BytesRef> subIndex : block.subIndices) {
estimateSize += subIndex.numBytes();
}
}
}
int estimateBitsRequired = PackedInts.bitsRequired(estimateSize);
int pageBits = Math.min(15, Math.max(6, estimateBitsRequired));

final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
final FSTCompiler<BytesRef> fstCompiler =
new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
.shouldShareNonSingletonNodes(false)
.bytesPageBits(pageBits)
.build();
// if (DEBUG) {
// System.out.println(" compile index for prefix=" + prefix);
Expand Down
4 changes: 4 additions & 0 deletions lucene/core/src/java/org/apache/lucene/util/fst/FST.java
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,10 @@ void finish(long newStartNode) throws IOException {
bytes.finish();
}

/**
 * Returns the current position reported by the {@code bytes} store of this FST.
 *
 * <p>NOTE(review): presumably this equals the number of bytes written to the store so far;
 * confirm against the store's {@code getPosition()} contract.
 *
 * @return the value of {@code bytes.getPosition()}
 */
public long numBytes() {
  final long position = bytes.getPosition();
  return position;
}

/**
 * Returns the {@code emptyOutput} value held by this FST.
 *
 * <p>NOTE(review): presumably this is the output associated with the empty input and may be
 * {@code null} when none was set; confirm against where {@code emptyOutput} is assigned.
 *
 * @return the current {@code emptyOutput}
 */
public T getEmptyOutput() {
  final T output = emptyOutput;
  return output;
}
Expand Down

0 comments on commit 9605289

Please sign in to comment.