Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 424: Add support for handling of vector fields. #489

Merged
Next commit
Issue 424: Add support for handling of vector fields. Signed-off-by: Dmitry Goldenberg <dgoldenberg@integralads.com>

Signed-off-by: Dmitry Goldenberg <dgoldenberg@integralads.com>
  • Loading branch information
dgoldenberg-ias committed Sep 12, 2024
commit b08dba545feb0b0cfd1f983c0bfd28941d2db9a9
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ metastore_db
out/
localRepo/
.vscode/
*.jar
*.jar
.DS_Store
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ public Object readValue(Parser parser, String value, FieldType esType) {
return longValue(value, parser);
case HALF_FLOAT:
case FLOAT:
case KNN_VECTOR:
return floatValue(value, parser);
case SCALED_FLOAT:
case DOUBLE:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
import static org.opensearch.hadoop.serialization.FieldType.INTEGER;
import static org.opensearch.hadoop.serialization.FieldType.JOIN;
import static org.opensearch.hadoop.serialization.FieldType.KEYWORD;
import static org.opensearch.hadoop.serialization.FieldType.KNN_VECTOR;
import static org.opensearch.hadoop.serialization.FieldType.LONG;
import static org.opensearch.hadoop.serialization.FieldType.NESTED;
import static org.opensearch.hadoop.serialization.FieldType.OBJECT;
Expand Down Expand Up @@ -147,7 +148,7 @@ public void testPrimitivesParsing() throws Exception {
MappingSet mappings = getMappingsForResource("primitives.json");
Mapping mapping = ensureAndGet("index", "primitives", mappings);
Field[] props = mapping.getFields();
assertEquals(16, props.length);
assertEquals(17, props.length);
assertEquals("field01", props[0].name());
assertEquals(BOOLEAN, props[0].type());
assertEquals("field02", props[1].name());
Expand Down Expand Up @@ -180,6 +181,8 @@ public void testPrimitivesParsing() throws Exception {
assertEquals(DATE_NANOS, props[14].type());
assertEquals("field16", props[15].name());
assertEquals(WILDCARD, props[15].type());
assertEquals("field17", props[16].name());
assertEquals(KNN_VECTOR, props[16].type());
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
},
"field16" : {
"type" : "wildcard"
},
"field17" : {
"type" : "knn_vector"
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@
},
"field16" : {
"type" : "wildcard"
},
"field17" : {
"type" : "knn_vector"
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,14 @@
*/
package org.opensearch.spark.serialization

import java.io.IOException
import java.util.Collections
import java.util.Date
import java.util.{List => JList}
import scala.collection.JavaConverters.asScalaBufferConverter
import scala.collection.Seq
import scala.collection.mutable.LinkedHashMap
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.Map
import org.opensearch.hadoop.serialization.FieldType.BINARY
import org.opensearch.hadoop.serialization.FieldType.BOOLEAN
Expand All @@ -56,6 +58,7 @@ import org.opensearch.hadoop.serialization.FieldType.STRING
import org.opensearch.hadoop.serialization.FieldType.TEXT
import org.opensearch.hadoop.serialization.FieldType.TOKEN_COUNT
import org.opensearch.hadoop.serialization.FieldType.WILDCARD
import org.opensearch.hadoop.serialization.FieldType.KNN_VECTOR
import org.opensearch.hadoop.serialization.Parser.Token.VALUE_BOOLEAN
import org.opensearch.hadoop.serialization.Parser.Token.VALUE_NULL
import org.opensearch.hadoop.serialization.Parser.Token.VALUE_NUMBER
Expand Down Expand Up @@ -103,6 +106,7 @@ class ScalaValueReader extends AbstractValueReader with SettingsAware {
case BINARY => binaryValue(Option(parser.binaryValue()).getOrElse(value.getBytes()))
case DATE => date(value, parser)
case DATE_NANOS => dateNanos(value, parser)
case KNN_VECTOR => floatValue(value, parser)
// GEO is ambiguous so use the JSON type instead to differentiate between doubles (a lot in GEO_SHAPE) and strings
case GEO_POINT | GEO_SHAPE => {
if (parser.currentToken() == VALUE_NUMBER) doubleValue(value, parser) else textValue(value, parser)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ import org.opensearch.hadoop.serialization.FieldType.GEO_SHAPE
import org.opensearch.hadoop.serialization.FieldType.INTEGER
import org.opensearch.hadoop.serialization.FieldType.JOIN
import org.opensearch.hadoop.serialization.FieldType.KEYWORD
import org.opensearch.hadoop.serialization.FieldType.KNN_VECTOR
import org.opensearch.hadoop.serialization.FieldType.LONG
import org.opensearch.hadoop.serialization.FieldType.NESTED
import org.opensearch.hadoop.serialization.FieldType.NULL
Expand Down Expand Up @@ -169,6 +170,7 @@ private[sql] object SchemaUtils {
case WILDCARD => StringType
case DATE => if (cfg.getMappingDateRich) TimestampType else StringType
case DATE_NANOS => if (cfg.getMappingDateRich) TimestampType else StringType
case KNN_VECTOR => DataTypes.createArrayType(FloatType)
case OBJECT => convertToStruct(field, geoInfo, absoluteName, arrayIncludes, arrayExcludes, cfg)
case NESTED => DataTypes.createArrayType(convertToStruct(field, geoInfo, absoluteName, arrayIncludes, arrayExcludes, cfg))
case JOIN => convertToStruct(field, geoInfo, absoluteName, arrayIncludes, arrayExcludes, cfg)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ import org.opensearch.hadoop.serialization.FieldType.GEO_SHAPE
import org.opensearch.hadoop.serialization.FieldType.INTEGER
import org.opensearch.hadoop.serialization.FieldType.JOIN
import org.opensearch.hadoop.serialization.FieldType.KEYWORD
import org.opensearch.hadoop.serialization.FieldType.KNN_VECTOR
import org.opensearch.hadoop.serialization.FieldType.LONG
import org.opensearch.hadoop.serialization.FieldType.NESTED
import org.opensearch.hadoop.serialization.FieldType.NULL
Expand Down Expand Up @@ -169,6 +170,7 @@ private[sql] object SchemaUtils {
case WILDCARD => StringType
case DATE => if (cfg.getMappingDateRich) TimestampType else StringType
case DATE_NANOS => if (cfg.getMappingDateRich) TimestampType else StringType
case KNN_VECTOR => DataTypes.createArrayType(FloatType)
case OBJECT => convertToStruct(field, geoInfo, absoluteName, arrayIncludes, arrayExcludes, cfg)
case NESTED => DataTypes.createArrayType(convertToStruct(field, geoInfo, absoluteName, arrayIncludes, arrayExcludes, cfg))
case JOIN => convertToStruct(field, geoInfo, absoluteName, arrayIncludes, arrayExcludes, cfg)
Expand Down