Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
## Issue
Fixes langchain4j#1747

## Change
Fixed this bug for Milvus and Chroma.

Did not manage to fix it for Pinecone because Pinecone can store
`Integer`/`Long` values only as `Double`. We could theoretically store them as
`String`s, but then metadata filtering on `Integer`/`Long` values would
not work. For now I've added a note to the `PineconeEmbeddingStore`
Javadoc and the Pinecone documentation describing the bug and a possible
workaround.

## General checklist
- [X] There are no breaking changes
- [X] I have added unit and integration tests for my change
- [x] I have manually run all the unit and integration tests in the
module I have added/changed, and they are all green
- [x] I have manually run all the unit and integration tests in the
[core](https://github.com/langchain4j/langchain4j/tree/main/langchain4j-core)
and
[main](https://github.com/langchain4j/langchain4j/tree/main/langchain4j)
modules, and they are all green
- [X] I have added/updated the
[documentation](https://github.com/langchain4j/langchain4j/tree/main/docs/docs)
- [ ] I have added an example in the [examples
repo](https://github.com/langchain4j/langchain4j-examples) (only for
"big" features)
- [ ] I have added/updated [Spring Boot
starter(s)](https://github.com/langchain4j/langchain4j-spring) (if
applicable)
  • Loading branch information
dliubarskyi authored Oct 21, 2024
1 parent 6f6040d commit b6f2cd6
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 7 deletions.
8 changes: 8 additions & 0 deletions docs/docs/integrations/embedding-stores/pinecone.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@ https://www.pinecone.io/
</dependency>
```

## Known Issues

- https://github.com/langchain4j/langchain4j/issues/1948
Pinecone stores all numbers as [floating-point values](https://docs.pinecone.io/guides/data/filter-with-metadata#supported-metadata-types),
which means `Integer` and `Long` values (e.g., 1746714878034235396) stored in `Metadata`
may be corrupted and returned as incorrect numbers!
Possible workaround: convert `Integer`/`Long` values to `String` before storing them in `Metadata`.
Please note that in this case metadata filtering might not work properly!

## APIs

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package dev.langchain4j.store.embedding.chroma;

import static com.google.gson.FieldNamingPolicy.LOWER_CASE_WITH_UNDERSCORES;
import static com.google.gson.ToNumberPolicy.LONG_OR_DOUBLE;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
Expand Down Expand Up @@ -32,7 +33,10 @@ private ChromaClient(Builder builder) {
httpClientBuilder.addInterceptor(new ChromaResponseLoggingInterceptor());
}

Gson gson = new GsonBuilder().setFieldNamingPolicy(LOWER_CASE_WITH_UNDERSCORES).create();
Gson gson = new GsonBuilder()
.setFieldNamingPolicy(LOWER_CASE_WITH_UNDERSCORES)
.setObjectToNumberStrategy(LONG_OR_DOUBLE)
.create();

Retrofit retrofit = new Retrofit.Builder()
.baseUrl(Utils.ensureTrailingForwardSlash(builder.baseUrl))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ void should_add_embedding_with_segment_with_metadata() {
assertThat(match.embedded().metadata().getLong("long_minus_1")).isEqualTo(-1L);
assertThat(match.embedded().metadata().getLong("long_0")).isEqualTo(0L);
assertThat(match.embedded().metadata().getLong("long_1")).isEqualTo(1L);
if (testLong1746714878034235396()) {
assertThat(match.embedded().metadata().getLong("long_1746714878034235396")).isEqualTo(1746714878034235396L);
}
assertThat(match.embedded().metadata().getLong("long_max")).isEqualTo(Long.MAX_VALUE);

assertThat(match.embedded().metadata().getFloat("float_min")).isEqualTo(-Float.MAX_VALUE);
Expand All @@ -84,6 +87,10 @@ void should_add_embedding_with_segment_with_metadata() {
.build()).matches()).isEqualTo(relevant);
}

/**
 * Controls whether the metadata round-trip test includes the {@code long} value
 * {@code 1746714878034235396}.
 * <p>
 * When this returns {@code true}, the value is put into the test metadata under the key
 * {@code "long_1746714878034235396"} and asserted on after retrieval.
 * Subclasses can override this to return {@code false} to skip that value.
 *
 * @return {@code true} by default
 */
protected boolean testLong1746714878034235396() {
return true;
}

protected Metadata createMetadata() {

Metadata metadata = new Metadata();
Expand All @@ -104,6 +111,9 @@ protected Metadata createMetadata() {
metadata.put("long_minus_1", -1L);
metadata.put("long_0", 0L);
metadata.put("long_1", 1L);
if (testLong1746714878034235396()) {
metadata.put("long_1746714878034235396", 1746714878034235396L);
}
metadata.put("long_max", Long.MAX_VALUE);

metadata.put("float_min", -Float.MAX_VALUE);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package dev.langchain4j.store.embedding.milvus;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonObject;
import com.google.gson.reflect.TypeToken;
import dev.langchain4j.data.document.Metadata;
Expand All @@ -22,6 +23,7 @@
import java.util.List;
import java.util.Map;

import static com.google.gson.ToNumberPolicy.LONG_OR_DOUBLE;
import static dev.langchain4j.internal.Utils.isNullOrBlank;
import static dev.langchain4j.internal.Utils.isNullOrEmpty;
import static dev.langchain4j.store.embedding.milvus.CollectionOperationsExecutor.queryForVectors;
Expand All @@ -31,7 +33,10 @@

class Mapper {

private static final Gson GSON = new Gson();
private static final Gson GSON = new GsonBuilder()
.setObjectToNumberStrategy(LONG_OR_DOUBLE)
.create();

private static final Type MAP_TYPE = new TypeToken<Map<String, Object>>() {
}.getType();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ static void beforeAll() {
Arrays.asList("key text NULL", "name text NULL", "age float NULL", "city varchar null", "country varchar null",
"string_empty varchar null", "string_space varchar null", "string_abc varchar null", "uuid uuid null",
"integer_min int null", "integer_minus_1 int null", "integer_0 int null", "integer_1 int null", "integer_max int null",
"long_min bigint null", "long_minus_1 bigint null", "long_0 bigint null", "long_1 bigint null", "long_max bigint null",
"long_min bigint null", "long_minus_1 bigint null", "long_0 bigint null", "long_1 bigint null", "long_1746714878034235396 bigint null", "long_max bigint null",
"float_min float null", "float_minus_1 float null", "float_0 float null", "float_1 float null", "float_123 float null", "float_max float null",
"double_minus_1 float8 null", "double_0 float8 null", "double_1 float8 null", "double_123 float8 null"
))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
import dev.langchain4j.data.document.Metadata;
import dev.langchain4j.data.embedding.Embedding;
import dev.langchain4j.data.segment.TextSegment;
import dev.langchain4j.store.embedding.*;
import dev.langchain4j.store.embedding.CosineSimilarity;
import dev.langchain4j.store.embedding.EmbeddingMatch;
import dev.langchain4j.store.embedding.EmbeddingSearchRequest;
import dev.langchain4j.store.embedding.EmbeddingSearchResult;
import dev.langchain4j.store.embedding.EmbeddingStore;
import dev.langchain4j.store.embedding.RelevanceScore;
import io.pinecone.clients.Index;
import io.pinecone.clients.Pinecone;
import io.pinecone.unsigned_indices_model.QueryResponseWithUnsignedIndices;
Expand All @@ -14,22 +19,34 @@
import org.openapitools.client.model.IndexList;
import org.openapitools.client.model.IndexModel;

import java.util.*;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import static dev.langchain4j.internal.Utils.isNullOrEmpty;
import static dev.langchain4j.internal.Utils.randomUUID;
import static dev.langchain4j.internal.ValidationUtils.ensureNotEmpty;
import static dev.langchain4j.store.embedding.pinecone.PineconeHelper.metadataToStruct;
import static dev.langchain4j.store.embedding.pinecone.PineconeHelper.structToMetadata;
import static io.pinecone.commons.IndexInterface.buildUpsertVectorWithUnsignedIndices;
import static java.util.Collections.emptyList;
import static java.util.Collections.singletonList;
import static java.util.Comparator.comparingDouble;
import static java.util.stream.Collectors.toList;

/**
* Represents a <a href="https://www.pinecone.io/">Pinecone</a> index as an embedding store.
* <p>Current implementation assumes the index uses the cosine distance metric.</p>
* <p>
* Current implementation assumes the index uses the cosine distance metric.
* <p>
* <b>WARNING! There is a known <a href="https://github.com/langchain4j/langchain4j/issues/1948">bug</a>:
* Pinecone stores all numbers as floating-point values,
* which means {@link Integer} and {@link Long} values (e.g., 1746714878034235396) stored in {@link Metadata}
* may be corrupted and returned as incorrect numbers!
* Possible workaround: convert {@link Integer}/{@link Long} values to {@link String} before storing them in {@link Metadata}.
* Please note that in this case metadata filtering might not work properly!</b>
*/
public class PineconeEmbeddingStore implements EmbeddingStore<TextSegment> {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,9 @@ protected static Stream<Arguments> should_filter_by_metadata() {
}
);
}

// Opts this test class out of the 1746714878034235396 long metadata check
// until the issue linked in the TODO below is resolved.
@Override
protected boolean testLong1746714878034235396() {
return false; // TODO remove after https://github.com/langchain4j/langchain4j/issues/1948 is fixed
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class LocalWeaviateEmbeddingStoreIT extends EmbeddingStoreIT {
"long_minus_1",
"long_0",
"long_1",
"long_1746714878034235396",
"long_max",
"float_min",
"float_minus_1",
Expand Down

0 comments on commit b6f2cd6

Please sign in to comment.