Skip to content

Commit

Permalink
LUCENE-9740: scan affix stream once. (apache#2327)
Browse files Browse the repository at this point in the history
  • Loading branch information
dweiss authored Feb 9, 2021
1 parent f93cbb3 commit 061b3f2
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
Expand Down Expand Up @@ -70,6 +70,9 @@

/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
public class Dictionary {
// Derived from woorm/ openoffice dictionaries.
// See TestAllDictionaries.testMaxPrologueNeeded.
static final int MAX_PROLOGUE_SCAN_WINDOW = 30 * 1024;

static final char[] NOFLAGS = new char[0];

Expand Down Expand Up @@ -227,26 +230,37 @@ public Dictionary(
this.needsInputCleaning = ignoreCase;
this.needsOutputCleaning = false; // set if we have an OCONV

Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
Path aff = Files.createTempFile(tempPath, "affix", "aff");

BufferedInputStream aff1 = null;
InputStream aff2 = null;
boolean success = false;
try {
// Copy contents of the affix stream to a temp file.
try (OutputStream os = Files.newOutputStream(aff)) {
affix.transferTo(os);
try (BufferedInputStream affixStream =
new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
@Override
public void close() throws IOException {
// TODO: maybe we should consume and close it? Why does it need to stay open?
// Don't close the affix stream as per javadoc.
}
}) {
// I assume we don't support other BOMs (utf16, etc.)? We trivially could,
// by adding maybeConsume() with a proper bom... but I don't see the hunspell repo
// containing any such exotic examples.
Charset streamCharset;
if (maybeConsume(affixStream, BOM_UTF8)) {
streamCharset = StandardCharsets.UTF_8;
} else {
streamCharset = DEFAULT_CHARSET;
}

// pass 1: get encoding & flag
aff1 = new BufferedInputStream(Files.newInputStream(aff));
readConfig(aff1);
/*
* pass 1: look for encoding & flag. This is simple but works. We just prefetch
* a large enough chunk of the input and scan through it. The buffered data will
* be subsequently reused anyway so nothing is wasted.
*/
affixStream.mark(MAX_PROLOGUE_SCAN_WINDOW);
byte[] prologue = affixStream.readNBytes(MAX_PROLOGUE_SCAN_WINDOW - 1);
affixStream.reset();
readConfig(new ByteArrayInputStream(prologue), streamCharset);

// pass 2: parse affixes
aff2 = new BufferedInputStream(Files.newInputStream(aff));
FlagEnumerator flagEnumerator = new FlagEnumerator();
readAffixFile(aff2, decoder, flagEnumerator);
readAffixFile(affixStream, decoder, flagEnumerator);

// read dictionary entries
IndexOutput unsorted = mergeDictionaries(tempDir, tempFileNamePrefix, dictionaries, decoder);
Expand All @@ -255,14 +269,6 @@ public Dictionary(
flagLookup = flagEnumerator.finish();
aliases = null; // no longer needed
morphAliases = null; // no longer needed
success = true;
} finally {
IOUtils.closeWhileHandlingException(aff1, aff2);
if (success) {
Files.delete(aff);
} else {
IOUtils.deleteFilesIgnoringExceptions(aff);
}
}
}

Expand Down Expand Up @@ -349,6 +355,7 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
if (line.isEmpty()) continue;

String firstWord = line.split("\\s")[0];
// TODO: convert to a switch?
if ("AF".equals(firstWord)) {
parseAlias(line);
} else if ("AM".equals(firstWord)) {
Expand Down Expand Up @@ -458,6 +465,12 @@ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, Flag
checkCompoundPatterns.add(
new CheckCompoundPattern(reader.readLine(), flagParsingStrategy, this));
}
} else if ("SET".equals(firstWord)) {
// We could add some sanity-checking of whether the SET command is identical to what
// was parsed in readConfig. This would handle cases where the directives appear too far
// into the file or are duplicated (both are incorrect, I assume).
} else if ("FLAG".equals(firstWord)) {
// Similar for FLAG.
}
}

Expand Down Expand Up @@ -791,31 +804,36 @@ private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};

/** Parses the encoding and flag format specified in the provided InputStream */
private void readConfig(BufferedInputStream stream) throws IOException, ParseException {
// I assume we don't support other BOMs (utf16, etc.)? We trivially could,
// by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
// any such exotic examples.
Charset streamCharset;
if (maybeConsume(stream, BOM_UTF8)) {
streamCharset = StandardCharsets.UTF_8;
} else {
streamCharset = DEFAULT_CHARSET;
}

// TODO: can these flags change throughout the file? If not then we can abort sooner. And
// then we wouldn't even need to create a temp file for the affix stream - a large enough
// leading buffer (BufferedInputStream) would be sufficient?
/**
 * Scans the affix prologue for the {@code SET} (charset) and {@code FLAG} directives.
 *
 * <p>Updates {@code decoder} from the SET argument and {@code flagParsingStrategy} from the
 * FLAG line. The FLAG line is parsed only after the scan so that the final charset is used
 * even when FLAG precedes SET in the stream. Scanning stops early once both are seen.
 *
 * @param stream the (prologue portion of the) affix stream
 * @param streamCharset charset used to read the prologue lines
 * @throws IOException if reading the stream fails
 * @throws ParseException if a directive cannot be parsed
 */
private void readConfig(InputStream stream, Charset streamCharset)
    throws IOException, ParseException {
  LineNumberReader in = new LineNumberReader(new InputStreamReader(stream, streamCharset));
  boolean seenCharset = false;
  String flagDirective = null;

  for (String current = in.readLine(); current != null; current = in.readLine()) {
    if (current.isBlank()) {
      continue;
    }

    String keyword = current.split("\\s")[0];
    if ("SET".equals(keyword)) {
      decoder = getDecoder(singleArgument(in, current));
      seenCharset = true;
    } else if ("FLAG".equals(keyword)) {
      // Keep the raw line; parsing is deferred until the decoder's charset is final.
      flagDirective = current;
    } else {
      continue;
    }

    // Both directives found: nothing else in the prologue matters, stop scanning.
    if (seenCharset && flagDirective != null) {
      break;
    }
  }

  if (flagDirective != null) {
    flagParsingStrategy = getFlagParsingStrategy(flagDirective, decoder.charset());
  }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.RamUsageTester;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Ignore;

Expand Down Expand Up @@ -135,6 +136,9 @@ public void testMaxPrologueNeeded() throws Exception {
(flag, positions) -> {
long max = positions.stream().mapToLong(v -> v).max().orElse(0);
System.out.printf(Locale.ROOT, "Flag %s at maximum offset %s%n", flag, max);
Assert.assertTrue(
"Flags beyond max prologue scan window: " + max,
max < Dictionary.MAX_PROLOGUE_SCAN_WINDOW);
});

if (failTest.get()) {
Expand Down

0 comments on commit 061b3f2

Please sign in to comment.