Skip to content

Commit

Permalink
Allow FST builder to use different writer (apache#12543)
Browse files Browse the repository at this point in the history
  • Loading branch information
dungba88 committed Nov 6, 2023
1 parent fdc23e3 commit 09ab6dd
Show file tree
Hide file tree
Showing 12 changed files with 761 additions and 197 deletions.
46 changes: 42 additions & 4 deletions lucene/core/src/java/org/apache/lucene/util/ByteBlockPool.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ public final class ByteBlockPool implements Accountable {

/** Abstract class for allocating and freeing byte blocks. */
public abstract static class Allocator {
// TODO: ByteBlockPool assumes the blockSize is always {@link BYTE_BLOCK_SIZE}, but this class
// allows arbitrary values of blockSize. We should make them consistent.
protected final int blockSize;

protected Allocator(int blockSize) {
Expand Down Expand Up @@ -215,19 +217,38 @@ void setBytesRef(BytesRefBuilder builder, BytesRef result, long offset, int leng

/** Appends the bytes in the provided {@link BytesRef} at the current position. */
public void append(final BytesRef bytes) {
  // Delegate to the ranged overload, which does the block-by-block copy itself; the
  // previously declared local offset / bytes-left counters were never read here.
  append(bytes.bytes, bytes.offset, bytes.length);
}

/**
* Append the provided byte array at the current position.
*
* @param bytes the byte array to write
*/
public void append(final byte[] bytes) {
  // Whole-array convenience form: start at index 0 and cover every element.
  final int wholeLength = bytes.length;
  append(bytes, 0, wholeLength);
}

/**
* Append some portion of the provided byte array at the current position.
*
* @param bytes the byte array to write
* @param offset the offset of the byte array
* @param length the number of bytes to write
*/
public void append(final byte[] bytes, int offset, int length) {
int bytesLeft = length;
while (bytesLeft > 0) {
int bufferLeft = BYTE_BLOCK_SIZE - byteUpto;
if (bytesLeft < bufferLeft) {
// fits within current buffer
System.arraycopy(bytes.bytes, offset, buffer, byteUpto, bytesLeft);
System.arraycopy(bytes, offset, buffer, byteUpto, bytesLeft);
byteUpto += bytesLeft;
break;
} else {
// fill up this buffer and move to next one
if (bufferLeft > 0) {
System.arraycopy(bytes.bytes, offset, buffer, byteUpto, bufferLeft);
System.arraycopy(bytes, offset, buffer, byteUpto, bufferLeft);
}
nextBuffer();
bytesLeft -= bufferLeft;
Expand Down Expand Up @@ -256,6 +277,18 @@ public void readBytes(final long offset, final byte[] bytes, int bytesOffset, in
}
}

/**
* Read a single byte at the given offset
*
* @param offset the offset to read
* @return the byte
*/
public byte readByte(final long offset) {
  // High bits of the offset select the block, low bits select the byte inside that block.
  final byte[] block = buffers[(int) (offset >> BYTE_BLOCK_SHIFT)];
  return block[(int) (offset & BYTE_BLOCK_MASK)];
}

@Override
public long ramBytesUsed() {
long size = BASE_RAM_BYTES;
Expand All @@ -269,4 +302,9 @@ public long ramBytesUsed() {
}
return size;
}

/** Returns the current position of this byte pool, as an absolute offset. */
public long getPosition() {
  // Widen to long BEFORE multiplying. blockSize is an int (and bufferUpto presumably is
  // too -- confirm), so an int-only product would silently wrap around for large pools
  // and only then be converted to the long return type.
  return (long) bufferUpto * allocator.blockSize + byteUpto;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;

import java.io.IOException;
import org.apache.lucene.util.ByteBlockPool;

/** Reads in reverse from a ByteBlockPool. */
final class ByteBlockPoolReverseBytesReader extends FST.BytesReader {

  /** Pool that backs every read performed by this reader. */
  private final ByteBlockPool buf;

  /** The difference between the FST node address and the hash table copied node address. */
  private long posDelta;

  /** Pool offset of the next byte to read; it decreases as bytes are consumed. */
  private long pos;

  public ByteBlockPoolReverseBytesReader(ByteBlockPool buf) {
    this.buf = buf;
  }

  /** Returns the byte at the current offset and then steps one byte backwards. */
  @Override
  public byte readByte() {
    final long at = pos--;
    return buf.readByte(at);
  }

  /**
   * Fills {@code b[offset .. offset + len)} in ascending index order while walking the pool
   * backwards, one byte at a time.
   */
  @Override
  public void readBytes(byte[] b, int offset, int len) {
    int dst = offset;
    int remaining = len;
    while (remaining > 0) {
      b[dst] = readByte();
      dst++;
      remaining--;
    }
  }

  /** Skipping moves towards lower offsets, because this reader runs in reverse. */
  @Override
  public void skipBytes(long numBytes) throws IOException {
    pos = pos - numBytes;
  }

  /** Returns the internal offset shifted by the configured delta. */
  @Override
  public long getPosition() {
    return posDelta + pos;
  }

  /** Stores the given position after removing the configured delta from it. */
  @Override
  public void setPosition(long pos) {
    this.pos = pos - posDelta;
  }

  @Override
  public boolean reversed() {
    return true;
  }

  /** Sets the delta applied by {@link #getPosition()} and {@link #setPosition(long)}. */
  public void setPosDelta(long posDelta) {
    this.posDelta = posDelta;
  }
}
23 changes: 17 additions & 6 deletions lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
// TODO: merge with PagedBytes, except PagedBytes doesn't
// let you read while writing which FST needs

class BytesStore extends DataOutput implements FSTReader {
class BytesStore extends DataOutput implements FSTWriter {

private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(BytesStore.class)
Expand Down Expand Up @@ -359,6 +359,7 @@ public void truncate(long newLen) {
assert newLen == getPosition();
}

@Override
public void finish() {
if (current != null) {
byte[] lastBuffer = new byte[nextWrite];
Expand All @@ -368,6 +369,20 @@ public void finish() {
}
}

/** Writes all of our bytes to the target {@link FSTWriter}. */
public void flushTo(FSTWriter out) throws IOException {
  // TODO: when the target FSTWriter is itself a BytesStore the bytes are written twice:
  // once while reversing them here and once more while copying into the target.
  // Maybe both steps should be combined into a single reverseAndWriteTo()?
  reverse(0, getPosition() - 1);
  finish();
  for (final byte[] chunk : blocks) {
    final int chunkLength = chunk.length;
    out.writeBytes(chunk, 0, chunkLength);
  }
  // TODO: instead of truncating, we could keep the first block and reset nextWrite to 0 to
  // reduce GC
  truncate(0);
}

/** Writes all of our bytes to the target {@link DataOutput}. */
@Override
public void writeTo(DataOutput out) throws IOException {
Expand Down Expand Up @@ -444,11 +459,7 @@ public boolean reversed() {

@Override
public FST.BytesReader getReverseBytesReader() {
return getReverseReader(true);
}

FST.BytesReader getReverseReader(boolean allowSingle) {
if (allowSingle && blocks.size() == 1) {
if (blocks.size() == 1) {
return new ReverseBytesReader(blocks.get(0));
}
return new FST.BytesReader() {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;

import java.io.Closeable;
import java.io.IOException;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.RamUsageEstimator;

/**
* An {@link FSTWriter} which writes to a DataOutput
*
* @lucene.experimental
*/
public class DataOutputFSTWriter implements FSTWriter {

  private static final long BASE_RAM_BYTES_USED =
      RamUsageEstimator.shallowSizeOfInstance(DataOutputFSTWriter.class);

  /** Destination that receives every byte handed to this writer. */
  private final DataOutput dataOutput;

  /** Set to {@code true} by {@link #finish()}; starts out {@code false}. */
  protected boolean finish;

  /** Running count of the bytes passed to the write methods. */
  private long size;

  /**
   * Creates a writer that forwards everything to the given output.
   *
   * @param dataOutput the data output to write to
   */
  public DataOutputFSTWriter(DataOutput dataOutput) {
    this.dataOutput = dataOutput;
  }

  /**
   * Returns the shallow size of this instance, plus the wrapped output's own usage when that
   * output is {@link Accountable}.
   */
  @Override
  public long ramBytesUsed() {
    if (!(dataOutput instanceof Accountable)) {
      return BASE_RAM_BYTES_USED;
    }
    return BASE_RAM_BYTES_USED + ((Accountable) dataOutput).ramBytesUsed();
  }

  /** Returns the number of bytes counted by the write methods so far. */
  @Override
  public long size() {
    return size;
  }

  @Override
  public void writeByte(byte b) throws IOException {
    // Count first, then forward -- same order as the bulk variant below.
    size += 1;
    dataOutput.writeByte(b);
  }

  @Override
  public void writeBytes(byte[] b, int offset, int length) throws IOException {
    size += length;
    dataOutput.writeBytes(b, offset, length);
  }

  /** Marks this writer as finished and closes the wrapped output if it is {@link Closeable}. */
  @Override
  public void finish() throws IOException {
    finish = true;
    if (!(dataOutput instanceof Closeable)) {
      return;
    }
    ((Closeable) dataOutput).close();
  }

  @Override
  public void writeTo(DataOutput out) throws IOException {
    // Supporting this would technically be possible, since the DataOutput has already been
    // closed by this point. However, saving an FST that was already written to one DataOutput
    // into yet another DataOutput would be a rather strange use case.
    throw new UnsupportedOperationException("writeTo(DataOutput) is not supported by this class");
  }

  @Override
  public FST.BytesReader getReverseBytesReader() {
    // Supporting this would technically be possible, since the DataOutput has already been
    // closed by this point. Ideally, though, the FSTWriter/FSTCompiler should only write the
    // FST to the DataOutput, and some later process can construct the FST using the FSTStore
    // method.
    throw new UnsupportedOperationException(
        "getReverseBytesReader() is not supported by this class");
  }
}
6 changes: 3 additions & 3 deletions lucene/core/src/java/org/apache/lucene/util/fst/FST.java
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
*/
public final class FST<T> implements Accountable {

final FSTMetadata<T> metadata;
private final FSTMetadata<T> metadata;

/** Specifies allowed range of each int input label for this FST. */
public enum INPUT_TYPE {
Expand Down Expand Up @@ -128,9 +128,9 @@ public static final class Arc<T> {

// *** Arc fields.

private int label;
int label;

private T output;
T output;

private long target;

Expand Down
Loading

0 comments on commit 09ab6dd

Please sign in to comment.