TSDB: unicode dimensions #83681


Merged: 4 commits, Feb 9, 2022
@@ -36,7 +36,7 @@ public class LuceneDocument implements Iterable<IndexableField> {
* for generating the _tsid field. The map will be used by {@link TimeSeriesIdFieldMapper}
* to build the _tsid field for the document.
*/
private SortedMap<String, BytesReference> dimensionBytes;
private SortedMap<BytesRef, BytesReference> dimensionBytes;
Member Author:

I switched to BytesRef here to make it really obvious that these are sorted by UTF-8.

Member Author:

Ulterior motive: I'll want them in UTF-8 in more than one place.
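To see why sorting by UTF-8 bytes matters, here is a minimal sketch (illustrative, not code from this PR) of how Java's String order and Lucene's BytesRef order disagree for code points outside the BMP:

```java
import org.apache.lucene.util.BytesRef;

public class Utf8OrderSketch {
    public static void main(String[] args) {
        String fire = new String(new int[] { 0x1F525 }, 0, 1); // U+1F525, a surrogate pair in UTF-16
        String replacement = "\uFFFD";                         // U+FFFD, a single BMP char

        // String order compares UTF-16 code units: the high surrogate 0xD83D
        // sorts before 0xFFFD, so the fire emoji comes first.
        System.out.println(fire.compareTo(replacement) < 0);   // true

        // BytesRef order compares unsigned UTF-8 bytes: 0xF0 sorts after 0xEF,
        // so the two orderings disagree.
        System.out.println(new BytesRef(fire).compareTo(new BytesRef(replacement)) > 0); // true
    }
}
```

Keying the map by BytesRef pins the _tsid to UTF-8 byte order regardless of what ordering String would give.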


LuceneDocument(String path, LuceneDocument parent) {
fields = new ArrayList<>();
@@ -114,16 +114,17 @@ public IndexableField getByKey(Object key) {
* to build the _tsid field for the document.
*/
public void addDimensionBytes(String fieldName, BytesReference tsidBytes) {
BytesRef fieldNameBytes = new BytesRef(fieldName);
if (dimensionBytes == null) {
// It is a {@link TreeMap} so that it is ordered by field name.
dimensionBytes = new TreeMap<>();
} else if (dimensionBytes.containsKey(fieldName)) {
} else if (dimensionBytes.containsKey(fieldNameBytes)) {
throw new IllegalArgumentException("Dimension field [" + fieldName + "] cannot be a multi-valued field.");
}
dimensionBytes.put(fieldName, tsidBytes);
dimensionBytes.put(fieldNameBytes, tsidBytes);
}

public SortedMap<String, BytesReference> getDimensionBytes() {
public SortedMap<BytesRef, BytesReference> getDimensionBytes() {
if (dimensionBytes == null) {
return Collections.emptySortedMap();
}
@@ -33,6 +33,7 @@
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.function.Supplier;
@@ -141,12 +142,12 @@ public void postParse(DocumentParserContext context) throws IOException {
assert fieldType().isIndexed() == false;

// SortedMap is expected to be sorted by key (field name)
SortedMap<String, BytesReference> dimensionFields = context.doc().getDimensionBytes();
SortedMap<BytesRef, BytesReference> dimensionFields = context.doc().getDimensionBytes();
BytesReference timeSeriesId = buildTsidField(dimensionFields);
context.doc().add(new SortedDocValuesField(fieldType().name(), timeSeriesId.toBytesRef()));
}

public static BytesReference buildTsidField(SortedMap<String, BytesReference> dimensionFields) throws IOException {
public static BytesReference buildTsidField(SortedMap<BytesRef, BytesReference> dimensionFields) throws IOException {
if (dimensionFields == null || dimensionFields.isEmpty()) {
throw new IllegalArgumentException("Dimension fields are missing.");
}
@@ -166,19 +167,22 @@ protected String contentType() {
return CONTENT_TYPE;
}

public static void encodeTsid(StreamOutput out, SortedMap<String, BytesReference> dimensionFields) throws IOException {
public static void encodeTsid(StreamOutput out, SortedMap<BytesRef, BytesReference> dimensionFields) throws IOException {
out.writeVInt(dimensionFields.size());
for (Map.Entry<String, BytesReference> entry : dimensionFields.entrySet()) {
String fieldName = entry.getKey();
BytesRef fieldNameBytes = new BytesRef(fieldName);
int len = fieldNameBytes.length;
if (len > DIMENSION_NAME_LIMIT) {
for (Map.Entry<BytesRef, BytesReference> entry : dimensionFields.entrySet()) {
BytesRef fieldName = entry.getKey();
if (fieldName.length > DIMENSION_NAME_LIMIT) {
throw new IllegalArgumentException(
"Dimension name must be less than [" + DIMENSION_NAME_LIMIT + "] bytes but [" + fieldName + "] was [" + len + "]."
String.format(
Locale.ROOT,
"Dimension name must be less than [%d] bytes but [%s] was [%s].",
DIMENSION_NAME_LIMIT,
fieldName.utf8ToString(),
fieldName.length
Member Author:

The formatter made a mess of this when I made the parameters a little longer, so I switched it to String.format so a human can read the message.

)
);
}
// Write field name in utf-8 instead of writeString's utf-16-ish thing
out.writeBytesRef(fieldNameBytes);
out.writeBytesRef(fieldName);
entry.getValue().writeTo(out);
}

@@ -193,7 +197,7 @@ public static Map<String, Object> decodeTsid(StreamInput in) {
Map<String, Object> result = new LinkedHashMap<String, Object>(size);

for (int i = 0; i < size; i++) {
String name = in.readString();
String name = in.readBytesRef().utf8ToString();
Member Author:

Here's the bug that made this not work: BytesRef and String don't always serialize the same way.
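A hedged sketch of the write/read symmetry the fix restores (the stream classes are the ones already used in this PR; the field name is an arbitrary example): the name is written with writeBytesRef, so it has to be read back with readBytesRef rather than readString.

```java
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;

public class TsidNameRoundTrip {
    public static void main(String[] args) throws Exception {
        try (BytesStreamOutput out = new BytesStreamOutput()) {
            // Write the raw UTF-8 bytes of a non-ASCII field name, as encodeTsid does.
            out.writeBytesRef(new BytesRef("\u2615"));

            // Read it back with the matching method, as decodeTsid now does.
            StreamInput in = out.bytes().streamInput();
            String name = in.readBytesRef().utf8ToString();
            System.out.println(name); // ☕
        }
    }
}
```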

Contributor:

I wonder if there is value in hiding the fact that it's BytesRef all the way down, and whether this can hurt us beyond the serialization issue.

Member Author:

Are you thinking this should be its own class? Like one that just wraps BytesRef or something?

Contributor:

Well, to be completely honest, I was wondering why we have two different maps, one keyed by String and one by BytesRef, that essentially represent the same data. But I like your interpretation much better: there is quite a bit of complexity here that could benefit from some encapsulation.

Contributor:

Just to be clear, maybe we need a TimeSeriesId class to represent this sort of data.

Member Author:

Would you like me to do that in this PR or in a follow-up?

Contributor:

Yeah, let's not mix refactoring with the actual change.
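For what it's worth, a purely hypothetical sketch of the TimeSeriesId wrapper floated above, deferred to the follow-up; the class name and methods here are assumptions rather than code from this PR:

```java
import java.util.Collections;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesReference;

// Hypothetical wrapper: owns the UTF-8-keyed dimension map instead of passing SortedMap around.
final class TimeSeriesId {
    private final SortedMap<BytesRef, BytesReference> dimensions = new TreeMap<>();

    void add(String fieldName, BytesReference value) {
        BytesRef key = new BytesRef(fieldName);
        if (dimensions.containsKey(key)) {
            throw new IllegalArgumentException("Dimension field [" + fieldName + "] cannot be a multi-valued field.");
        }
        dimensions.put(key, value);
    }

    SortedMap<BytesRef, BytesReference> dimensions() {
        return Collections.unmodifiableSortedMap(dimensions);
    }
}
```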


int type = in.read();
switch (type) {
@@ -706,9 +706,9 @@ public BytesRef parseBytesRef(Object value) {
}

Map<?, ?> m = (Map<?, ?>) value;
SortedMap<String, BytesReference> dimensionFields = new TreeMap<>();
SortedMap<BytesRef, BytesReference> dimensionFields = new TreeMap<>();
for (Map.Entry<?, ?> entry : m.entrySet()) {
String k = (String) entry.getKey();
BytesRef k = new BytesRef(entry.getKey().toString());
Object v = entry.getValue();
BytesReference bytes;

@@ -17,6 +17,8 @@
import org.elasticsearch.xcontent.XContentBuilder;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.test.MapMatcher.assertMap;
import static org.elasticsearch.test.MapMatcher.matchesMap;
@@ -123,6 +125,23 @@ public void testStrings() throws IOException {
);
}

public void testUnicodeKeys() throws IOException {
String fire = new String(new int[] { 0x1F525 }, 0, 1);
String coffee = "\u2615";
Contributor:

☕ is a good dimension

DocumentMapper docMapper = createDocumentMapper("a", mapping(b -> {
b.startObject(fire).field("type", "keyword").field("time_series_dimension", true).endObject();
b.startObject(coffee).field("type", "keyword").field("time_series_dimension", true).endObject();
}));

ParsedDocument doc = parseDocument(docMapper, b -> b.field(fire, "hot").field(coffee, "good"));
Map<String, Object> tsid = TimeSeriesIdFieldMapper.decodeTsid(
new ByteArrayStreamInput(doc.rootDoc().getBinaryValue("_tsid").bytes)
);
assertMap(tsid, matchesMap().entry(coffee, "good").entry(fire, "hot"));
// Also make sure the keys are in order
assertThat(List.copyOf(tsid.keySet()), equalTo(List.of(coffee, fire)));
}

public void testKeywordTooLong() throws IOException {
DocumentMapper docMapper = createDocumentMapper(
"a",
@@ -17,6 +17,7 @@
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.core.CheckedConsumer;
@@ -80,15 +81,15 @@ public void testStandAloneTimeSeriesWithSum() throws IOException {
public static void writeTS(RandomIndexWriter iw, long timestamp, Object[] dimensions, Object[] metrics) throws IOException {
final List<IndexableField> fields = new ArrayList<>();
fields.add(new SortedNumericDocValuesField(DataStreamTimestampFieldMapper.DEFAULT_PATH, timestamp));
final SortedMap<String, BytesReference> dimensionFields = new TreeMap<>();
final SortedMap<BytesRef, BytesReference> dimensionFields = new TreeMap<>();
for (int i = 0; i < dimensions.length; i += 2) {
final BytesReference reference;
if (dimensions[i + 1] instanceof Number) {
reference = TimeSeriesIdFieldMapper.encodeTsidValue(((Number) dimensions[i + 1]).longValue());
} else {
reference = TimeSeriesIdFieldMapper.encodeTsidValue(dimensions[i + 1].toString());
}
dimensionFields.put(dimensions[i].toString(), reference);
dimensionFields.put(new BytesRef(dimensions[i].toString()), reference);
}
for (int i = 0; i < metrics.length; i += 2) {
if (metrics[i + 1] instanceof Integer || metrics[i + 1] instanceof Long) {