-
Notifications
You must be signed in to change notification settings - Fork 25.3k
TSDB: unicode dimensions #83681
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
TSDB: unicode dimensions #83681
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,7 @@ | |
import java.util.Collections; | ||
import java.util.LinkedHashMap; | ||
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
import java.util.SortedMap; | ||
import java.util.function.Supplier; | ||
|
@@ -141,12 +142,12 @@ public void postParse(DocumentParserContext context) throws IOException { | |
assert fieldType().isIndexed() == false; | ||
|
||
// SortedMap is expected to be sorted by key (field name) | ||
SortedMap<String, BytesReference> dimensionFields = context.doc().getDimensionBytes(); | ||
SortedMap<BytesRef, BytesReference> dimensionFields = context.doc().getDimensionBytes(); | ||
BytesReference timeSeriesId = buildTsidField(dimensionFields); | ||
context.doc().add(new SortedDocValuesField(fieldType().name(), timeSeriesId.toBytesRef())); | ||
} | ||
|
||
public static BytesReference buildTsidField(SortedMap<String, BytesReference> dimensionFields) throws IOException { | ||
public static BytesReference buildTsidField(SortedMap<BytesRef, BytesReference> dimensionFields) throws IOException { | ||
if (dimensionFields == null || dimensionFields.isEmpty()) { | ||
throw new IllegalArgumentException("Dimension fields are missing."); | ||
} | ||
|
@@ -166,19 +167,22 @@ protected String contentType() { | |
return CONTENT_TYPE; | ||
} | ||
|
||
public static void encodeTsid(StreamOutput out, SortedMap<String, BytesReference> dimensionFields) throws IOException { | ||
public static void encodeTsid(StreamOutput out, SortedMap<BytesRef, BytesReference> dimensionFields) throws IOException { | ||
out.writeVInt(dimensionFields.size()); | ||
for (Map.Entry<String, BytesReference> entry : dimensionFields.entrySet()) { | ||
String fieldName = entry.getKey(); | ||
BytesRef fieldNameBytes = new BytesRef(fieldName); | ||
int len = fieldNameBytes.length; | ||
if (len > DIMENSION_NAME_LIMIT) { | ||
for (Map.Entry<BytesRef, BytesReference> entry : dimensionFields.entrySet()) { | ||
BytesRef fieldName = entry.getKey(); | ||
if (fieldName.length > DIMENSION_NAME_LIMIT) { | ||
throw new IllegalArgumentException( | ||
"Dimension name must be less than [" + DIMENSION_NAME_LIMIT + "] bytes but [" + fieldName + "] was [" + len + "]." | ||
String.format( | ||
Locale.ROOT, | ||
"Dimension name must be less than [%d] bytes but [%s] was [%s].", | ||
DIMENSION_NAME_LIMIT, | ||
fieldName.utf8ToString(), | ||
fieldName.length | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The formatter made a mess of this when I made the parameters a little longer so I switched it to String.format so human can read the message. |
||
) | ||
); | ||
} | ||
// Write field name in utf-8 instead of writeString's utf-16-ish thing | ||
out.writeBytesRef(fieldNameBytes); | ||
out.writeBytesRef(fieldName); | ||
entry.getValue().writeTo(out); | ||
} | ||
|
||
|
@@ -193,7 +197,7 @@ public static Map<String, Object> decodeTsid(StreamInput in) { | |
Map<String, Object> result = new LinkedHashMap<String, Object>(size); | ||
|
||
for (int i = 0; i < size; i++) { | ||
String name = in.readString(); | ||
String name = in.readBytesRef().utf8ToString(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here's the bug that made this not work - sometimes BytesRef and String don't write the same. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if there is a value in hiding the fact that it's BytesRef all the way down and can this hurt us beyond serialization thing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you thinking this should be it's own class? Like one that just wraps There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, if I am completely honest, I was thinking why do we have 2 different maps with String in one case and BytesRef in another case that essentially represent the same data. But I think I like your interpretation much better. It has quite a bit of complexity that could probably benefit from some encapsulation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to be clear, maybe we need a TimeSeriesId class to represent this sort of data. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you like me to do it in this or a followup? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, let's not mix refactoring with the actual change. |
||
|
||
int type = in.read(); | ||
switch (type) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,8 @@ | |
import org.elasticsearch.xcontent.XContentBuilder; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
import static org.elasticsearch.test.MapMatcher.assertMap; | ||
import static org.elasticsearch.test.MapMatcher.matchesMap; | ||
|
@@ -123,6 +125,23 @@ public void testStrings() throws IOException { | |
); | ||
} | ||
|
||
public void testUnicodeKeys() throws IOException { | ||
String fire = new String(new int[] { 0x1F525 }, 0, 1); | ||
String coffee = "\u2615"; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ☕ is a good dimension |
||
DocumentMapper docMapper = createDocumentMapper("a", mapping(b -> { | ||
b.startObject(fire).field("type", "keyword").field("time_series_dimension", true).endObject(); | ||
b.startObject(coffee).field("type", "keyword").field("time_series_dimension", true).endObject(); | ||
})); | ||
|
||
ParsedDocument doc = parseDocument(docMapper, b -> b.field(fire, "hot").field(coffee, "good")); | ||
Map<String, Object> tsid = TimeSeriesIdFieldMapper.decodeTsid( | ||
new ByteArrayStreamInput(doc.rootDoc().getBinaryValue("_tsid").bytes) | ||
); | ||
assertMap(tsid, matchesMap().entry(coffee, "good").entry(fire, "hot")); | ||
// Also make sure the keys are in order | ||
assertThat(List.copyOf(tsid.keySet()), equalTo(List.of(coffee, fire))); | ||
} | ||
|
||
public void testKeywordTooLong() throws IOException { | ||
DocumentMapper docMapper = createDocumentMapper( | ||
"a", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I switched to bytes ref here to make it really obvious these are sorted by utf-8.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ulterior motive: I'll want them in utf-8 in more than one place.