Skip to content

Add codec support for Lucene 6 and 7 versions #81258

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits
Dec 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions server/src/main/java/org/elasticsearch/index/IndexModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,13 @@ public Settings getSettings() {
return indexSettings.getSettings();
}

/**
 * Returns the {@link IndexSettings} for this index.
 *
 * @return the index-level settings this module was created with
 */
public IndexSettings indexSettings() {
return indexSettings;
}

/**
* Returns the index this module is associated with
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ public Status needsField(FieldInfo fieldInfo) {
if (IgnoredFieldMapper.NAME.equals(fieldInfo.name)) {
return Status.YES;
}
// support _uid for loading older indices
if ("_uid".equals(fieldInfo.name)) {
return Status.YES;
}
// All these fields are single-valued so we can stop when the set is
// empty
return requiredFields.isEmpty() ? Status.STOP : Status.NO;
Expand Down Expand Up @@ -103,9 +107,18 @@ public void binaryField(FieldInfo fieldInfo, BytesRef value) {

@Override
public void stringField(FieldInfo fieldInfo, String value) {
    if ("_uid".equals(fieldInfo.name)) {
        // 5.x-only: the _uid stored field holds "type#id". The type is not
        // allowed to contain '#', ids can, so split on the first '#'.
        int delimiterIndex = value.indexOf('#');
        // type = value.substring(0, delimiterIndex);
        id = value.substring(delimiterIndex + 1);
    } else if (IdFieldMapper.NAME.equals(fieldInfo.name)) {
        // only applies to 5.x indices that have single_type = true, where _id
        // is stored as a plain string rather than a binary value
        id = value;
    } else {
        // any other stored string field is collected as a regular value
        addValue(fieldInfo.name, value);
    }
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.xcontent.XContentParserUtils;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.index.store.StoreFileMetadata;
import org.elasticsearch.xcontent.ParseField;
import org.elasticsearch.xcontent.ToXContentFragment;
Expand All @@ -41,6 +42,7 @@ public static class FileInfo implements Writeable {
public static final String SERIALIZE_WRITER_UUID = "serialize_writer_uuid";

private final String name;
@Nullable
private final ByteSizeValue partSize;
private final long partBytes;
private final int numberOfParts;
Expand All @@ -53,7 +55,7 @@ public static class FileInfo implements Writeable {
* @param metadata the files meta data
* @param partSize size of the single chunk
*/
public FileInfo(String name, StoreFileMetadata metadata, ByteSizeValue partSize) {
public FileInfo(String name, StoreFileMetadata metadata, @Nullable ByteSizeValue partSize) {
this.name = Objects.requireNonNull(name);
this.metadata = metadata;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,10 @@

package org.elasticsearch.xpack.lucene.bwc;

import org.apache.lucene.backward_codecs.lucene70.Lucene70Codec;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
import org.elasticsearch.Build;
import org.elasticsearch.Version;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.index.IndexModule;
Expand All @@ -28,6 +20,7 @@
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.plugins.IndexStorePlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.xpack.lucene.bwc.codecs.BWCCodec;

import java.io.IOException;
import java.io.UncheckedIOException;
Expand All @@ -38,83 +31,69 @@ public class OldLuceneVersions extends Plugin implements IndexStorePlugin {

@Override
public void onIndexModule(IndexModule indexModule) {
if (Build.CURRENT.isSnapshot()) {
if (indexModule.indexSettings().getIndexVersionCreated().before(Version.CURRENT.minimumIndexCompatibilityVersion())) {
indexModule.addIndexEventListener(new IndexEventListener() {
@Override
public void afterFilesRestoredFromRepository(IndexShard indexShard) {
maybeConvertToNewFormat(indexShard);
convertToNewFormat(indexShard);
}
});
}
}

private static void maybeConvertToNewFormat(IndexShard indexShard) {
/**
* The trick used to allow newer Lucene versions to read older Lucene indices is to convert the old directory to a directory that new
* Lucene versions happily operate on. The way newer Lucene versions happily comply with reading older data is to put in place a
* segments file that the newer Lucene version can open, using codecs that allow reading everything from the old files, making it
* available under the newer interfaces. The way this works is to read in the old segments file using a special class
* {@link OldSegmentInfos} that supports reading older Lucene {@link SegmentInfos}, and then write out an updated segments file that
* newer Lucene versions can understand.
*/
private static void convertToNewFormat(IndexShard indexShard) {
indexShard.store().incRef();
try {
try {
Version version = getLuceneVersion(indexShard.store().directory());
// Lucene version in [7.0.0, 8.0.0)
if (version != null
&& version.onOrAfter(Version.fromBits(7, 0, 0))
&& version.onOrAfter(Version.fromBits(8, 0, 0)) == false) {
final OldSegmentInfos oldSegmentInfos = OldSegmentInfos.readLatestCommit(indexShard.store().directory(), 7);
final SegmentInfos segmentInfos = convertLucene7x(oldSegmentInfos);
// write upgraded segments file
segmentInfos.commit(indexShard.store().directory());
final OldSegmentInfos oldSegmentInfos = OldSegmentInfos.readLatestCommit(indexShard.store().directory(), 6);
final SegmentInfos segmentInfos = convertToNewerLuceneVersion(oldSegmentInfos);
// write upgraded segments file
segmentInfos.commit(indexShard.store().directory());

// validate that what we have written can be read using standard path
// TODO: norelease: remove this when development completes
SegmentInfos segmentInfos1 = SegmentInfos.readLatestCommit(indexShard.store().directory());
// what we have written can be read using standard path
assert SegmentInfos.readLatestCommit(indexShard.store().directory()) != null;

// clean older segments file
Lucene.pruneUnreferencedFiles(segmentInfos1.getSegmentsFileName(), indexShard.store().directory());
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
// clean older segments file
Lucene.pruneUnreferencedFiles(segmentInfos.getSegmentsFileName(), indexShard.store().directory());
} catch (IOException e) {
throw new UncheckedIOException(e);
} finally {
indexShard.store().decRef();
}
}

/**
 * Probes the directory for the Lucene version that wrote the most recent segments_N file.
 *
 * @param directory the index directory to probe
 * @return the Lucene {@link Version} recorded in the latest segments_N header, or {@code null} when there is no
 *         commit or the header cannot be parsed
 * @throws IOException if listing the directory fails
 */
private static Version getLuceneVersion(Directory directory) throws IOException {
    final String segmentFileName = SegmentInfos.getLastCommitSegmentsFileName(directory);
    if (segmentFileName != null) {
        long generation = SegmentInfos.generationFromSegmentsFileName(segmentFileName);
        try (ChecksumIndexInput input = directory.openChecksumInput(segmentFileName, IOContext.READ)) {
            // segments files start with a codec header: magic, "segments" codec name, format version
            CodecUtil.checkHeader(input, "segments", 0, Integer.MAX_VALUE);
            byte[] id = new byte[StringHelper.ID_LENGTH];
            input.readBytes(id, 0, id.length);
            CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX));

            // the header encodes the writing Lucene version as three vInts (major, minor, bugfix);
            // the index-created version vInt that follows is not needed here, so it is not read
            return Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
        } catch (Exception ignored) {
            // best-effort probe: any parse failure means the version cannot be determined
        }
    }
    return null;
}

private static SegmentInfos convertLucene7x(OldSegmentInfos oldSegmentInfos) {
private static SegmentInfos convertToNewerLuceneVersion(OldSegmentInfos oldSegmentInfos) {
final SegmentInfos segmentInfos = new SegmentInfos(org.apache.lucene.util.Version.LATEST.major);
segmentInfos.setNextWriteGeneration(oldSegmentInfos.getGeneration() + 1);
final Map<String, String> map = new HashMap<>(oldSegmentInfos.getUserData());
map.put(Engine.HISTORY_UUID_KEY, UUIDs.randomBase64UUID());
map.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, Long.toString(SequenceNumbers.NO_OPS_PERFORMED));
map.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(SequenceNumbers.NO_OPS_PERFORMED));
map.put(Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID, "-1");
if (map.containsKey(Engine.HISTORY_UUID_KEY) == false) {
map.put(Engine.HISTORY_UUID_KEY, UUIDs.randomBase64UUID());
}
if (map.containsKey(SequenceNumbers.LOCAL_CHECKPOINT_KEY) == false) {
map.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, Long.toString(SequenceNumbers.NO_OPS_PERFORMED));
}
if (map.containsKey(SequenceNumbers.MAX_SEQ_NO) == false) {
map.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(SequenceNumbers.NO_OPS_PERFORMED));
}
if (map.containsKey(Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID) == false) {
map.put(Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID, "-1");
}
segmentInfos.setUserData(map, true);
for (SegmentCommitInfo infoPerCommit : oldSegmentInfos.asList()) {
SegmentInfo info = infoPerCommit.info;
SegmentInfo newInfo = wrap(info);
final SegmentInfo newInfo = BWCCodec.wrap(infoPerCommit.info);

segmentInfos.add(
new SegmentCommitInfo(
newInfo,
infoPerCommit.getDelCount(),
0,
infoPerCommit.getSoftDelCount(),
infoPerCommit.getDelGen(),
infoPerCommit.getFieldInfosGen(),
infoPerCommit.getDocValuesGen(),
Expand All @@ -125,31 +104,6 @@ private static SegmentInfos convertLucene7x(OldSegmentInfos oldSegmentInfos) {
return segmentInfos;
}

/**
 * Wraps an old {@link SegmentInfo} into one that the current Lucene version accepts, swapping in the
 * backwards-compatibility codec where needed.
 */
static SegmentInfo wrap(SegmentInfo segmentInfo) {
    // Use Version.LATEST instead of original version, otherwise SegmentCommitInfo will bark when processing (N-1 limitation)
    // TODO: alternatively store the original version information in attributes?
    byte[] segmentId = segmentInfo.getId();
    if (segmentId == null) {
        segmentId = StringHelper.randomId();
    }
    final Codec codec;
    if (segmentInfo.getCodec() instanceof Lucene70Codec) {
        codec = new BWCLucene70Codec();
    } else {
        codec = segmentInfo.getCodec();
    }
    final SegmentInfo wrapped = new SegmentInfo(
        segmentInfo.dir,
        org.apache.lucene.util.Version.LATEST,
        org.apache.lucene.util.Version.LATEST,
        segmentInfo.name,
        segmentInfo.maxDoc(),
        segmentInfo.getUseCompoundFile(),
        codec,
        segmentInfo.getDiagnostics(),
        segmentId,
        segmentInfo.getAttributes(),
        null
    );
    wrapped.setFiles(segmentInfo.files());
    return wrapped;
}

@Override
public Map<String, DirectoryFactory> getDirectoryFactories() {
return Map.of();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Modifications copyright (C) 2021 Elasticsearch B.V.
*/

package org.elasticsearch.xpack.lucene.bwc;
Expand Down Expand Up @@ -60,6 +62,12 @@
@SuppressWarnings("CheckStyle")
@SuppressForbidden(reason = "Lucene class")
public class OldSegmentInfos implements Cloneable, Iterable<SegmentCommitInfo> {

/**
* Adds the {@link Version} that committed this segments_N file, as well as the {@link Version}
* of the oldest segment, since 5.3+
*/
public static final int VERSION_53 = 6;
/**
* The version that added information about the Lucene version at the time when the index has been
* created.
Expand Down Expand Up @@ -209,13 +217,16 @@ static final OldSegmentInfos readCommit(Directory directory, ChecksumIndexInput
if (magic != CodecUtil.CODEC_MAGIC) {
throw new IndexFormatTooOldException(input, magic, CodecUtil.CODEC_MAGIC, CodecUtil.CODEC_MAGIC);
}
format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_70, VERSION_CURRENT);
format = CodecUtil.checkHeaderNoMagic(input, "segments", VERSION_53, VERSION_CURRENT);
byte[] id = new byte[StringHelper.ID_LENGTH];
input.readBytes(id, 0, id.length);
CodecUtil.checkIndexHeaderSuffix(input, Long.toString(generation, Character.MAX_RADIX));

Version luceneVersion = Version.fromBits(input.readVInt(), input.readVInt(), input.readVInt());
int indexCreatedVersion = input.readVInt();
int indexCreatedVersion = 6;
if (format >= VERSION_70) {
indexCreatedVersion = input.readVInt();
}
if (luceneVersion.major < indexCreatedVersion) {
throw new CorruptIndexException(
"Creation version ["
Expand Down Expand Up @@ -252,7 +263,7 @@ static final OldSegmentInfos readCommit(Directory directory, ChecksumIndexInput
} catch (Throwable t) {
priorE = t;
} finally {
if (format >= VERSION_70) { // oldest supported version
if (format >= VERSION_53) { // oldest supported version
CodecUtil.checkFooter(input, priorE);
} else {
throw IOUtils.rethrowAlways(priorE);
Expand Down Expand Up @@ -283,6 +294,14 @@ private static void parseSegmentInfos(Directory directory, DataInput input, OldS
long totalDocs = 0;
for (int seg = 0; seg < numSegments; seg++) {
String segName = input.readString();
if (format < VERSION_70) {
byte hasID = input.readByte();
if (hasID == 0) {
throw new IndexFormatTooOldException(input, "Segment is from Lucene 4.x");
} else if (hasID != 1) {
throw new CorruptIndexException("invalid hasID byte, got: " + hasID, input);
}
}
byte[] segmentID = new byte[StringHelper.ID_LENGTH];
input.readBytes(segmentID, 0, segmentID.length);
Codec codec = readCodec(input);
Expand Down
Loading