Skip to content

Commit

Permalink
Remove dynamic loading from Pinecone, Vespa and Weaviate (langchain4j…
Browse files Browse the repository at this point in the history
  • Loading branch information
dliubarskyi authored Sep 21, 2023
1 parent 8851de2 commit ad30b21
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 520 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import io.pinecone.PineconeConnection;
import io.pinecone.PineconeConnectionConfig;
import io.pinecone.proto.*;
import lombok.Builder;

import java.util.Collection;
import java.util.List;
Expand All @@ -24,22 +23,32 @@
import static java.util.stream.Collectors.toList;

/**
* This is an internal implementation. Please use PineconeEmbeddingStore.
* Represents a <a href="https://www.pinecone.io/">Pinecone</a> index as an embedding store.
* Current implementation assumes the index uses the cosine distance metric.
*/
public class PineconeEmbeddingStoreImpl implements EmbeddingStore<TextSegment> {
public class PineconeEmbeddingStore implements EmbeddingStore<TextSegment> {

private static final String DEFAULT_NAMESPACE = "default"; // do not change, will break backward compatibility!
private static final String METADATA_TEXT_SEGMENT = "text_segment"; // do not change, will break backward compatibility!

private final PineconeConnection connection;
private final String nameSpace;

@Builder
public PineconeEmbeddingStoreImpl(String apiKey,
String environment,
String projectId,
String index,
String nameSpace) {
/**
* Creates an instance of PineconeEmbeddingStore.
*
* @param apiKey The Pinecone API key.
* @param environment The environment (e.g., "northamerica-northeast1-gcp").
* @param projectId The ID of the project (e.g., "19a129b"). This is <b>not</b> a project name.
* The ID can be found in the Pinecone URL: https://app.pinecone.io/organizations/.../projects/...:{projectId}/indexes.
* @param index The name of the index (e.g., "test").
* @param nameSpace (Optional) Namespace. If not provided, "default" will be used.
*/
public PineconeEmbeddingStore(String apiKey,
String environment,
String projectId,
String index,
String nameSpace) {

PineconeClientConfig configuration = new PineconeClientConfig()
.withApiKey(apiKey)
Expand Down Expand Up @@ -186,4 +195,62 @@ private static EmbeddingMatch<TextSegment> toEmbeddingMatch(Vector vector, Embed
textSegmentValue == null ? null : TextSegment.from(textSegmentValue.getStringValue())
);
}

/**
 * Entry point of the fluent construction API.
 *
 * @return a new, empty {@link Builder}; every call produces a separate instance
 */
public static Builder builder() {
    final Builder freshBuilder = new Builder();
    return freshBuilder;
}

/**
 * Fluent builder that gathers the arguments of the {@link PineconeEmbeddingStore} constructor.
 * Every setter stores the supplied value and returns this same builder, so calls can be chained.
 * Values that are never set stay {@code null} and are handed to the constructor as such.
 */
public static class Builder {

    private String apiKey;
    private String environment;
    private String projectId;
    private String index;
    private String nameSpace;

    /**
     * Sets the Pinecone API key.
     *
     * @param apiKey The Pinecone API key.
     * @return this builder
     */
    public Builder apiKey(String apiKey) {
        this.apiKey = apiKey;
        return this;
    }

    /**
     * Sets the Pinecone environment.
     *
     * @param environment The environment (e.g., "northamerica-northeast1-gcp").
     * @return this builder
     */
    public Builder environment(String environment) {
        this.environment = environment;
        return this;
    }

    /**
     * Sets the Pinecone project ID.
     *
     * @param projectId The ID of the project (e.g., "19a129b"). This is <b>not</b> a project name.
     *                  The ID can be found in the Pinecone URL:
     *                  https://app.pinecone.io/organizations/.../projects/...:{projectId}/indexes.
     * @return this builder
     */
    public Builder projectId(String projectId) {
        this.projectId = projectId;
        return this;
    }

    /**
     * Sets the name of the Pinecone index.
     *
     * @param index The name of the index (e.g., "test").
     * @return this builder
     */
    public Builder index(String index) {
        this.index = index;
        return this;
    }

    /**
     * Sets the namespace.
     *
     * @param nameSpace (Optional) Namespace. If not provided, "default" will be used.
     * @return this builder
     */
    public Builder nameSpace(String nameSpace) {
        this.nameSpace = nameSpace;
        return this;
    }

    /**
     * Creates the store by passing the collected values to the {@link PineconeEmbeddingStore} constructor.
     *
     * @return a new {@link PineconeEmbeddingStore}
     */
    public PineconeEmbeddingStore build() {
        final PineconeEmbeddingStore store =
                new PineconeEmbeddingStore(apiKey, environment, projectId, index, nameSpace);
        return store;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,25 @@
import java.util.stream.Collectors;
import lombok.Builder;
import lombok.SneakyThrows;
import retrofit2.Call;
import retrofit2.Response;

public class VespaEmbeddingStoreImpl implements EmbeddingStore<TextSegment> {
/**
* Represents the <a href="https://vespa.ai/">Vespa</a> search engine and vector database.
* The example server configuration contains a cosine similarity search rank profile, but other Vespa nearest-neighbor
* search methods are supported too. Read more <a href="https://docs.vespa.ai/en/nearest-neighbor-search.html">here</a>.
*/
public class VespaEmbeddingStore implements EmbeddingStore<TextSegment> {

private static final Duration DEFAULT_TIMEOUT = Duration.ofSeconds(5);
private static final String DEFAULT_NAMESPACE = "namespace";
private static final String DEFAULT_DOCUMENT_TYPE = "langchain4j";
private static final boolean DEFAULT_AVOID_DUPS = true;
private static final String FIELD_NAME_TEXT_SEGMENT = "text_segment";
private static final String FIELD_NAME_VECTOR = "vector";
public static final String FIELD_NAME_DOCUMENT_ID = "documentid";
public static final String DEFAULT_RANK_PROFILE = "cosine_similarity";
public static final double DEFAULT_MIN_SCORE = 0.0;
public static final int DEFAULT_TARGET_HITS = 10;
private static final String FIELD_NAME_DOCUMENT_ID = "documentid";
private static final String DEFAULT_RANK_PROFILE = "cosine_similarity";
private static final double DEFAULT_MIN_SCORE = 0.0;
private static final int DEFAULT_TARGET_HITS = 10;

private final String url;
private final Path keyPath;
Expand All @@ -55,8 +59,30 @@ public class VespaEmbeddingStoreImpl implements EmbeddingStore<TextSegment> {

private VespaQueryApi queryApi;

/**
* Creates a new VespaEmbeddingStore instance.
*
* @param url server url, local or cloud one. The latter you can find under Endpoint of your Vespa
* application, e.g. https://alexey-heezer.langchain4j.mytenant346.aws-us-east-1c.dev.z.vespa-app.cloud/
* @param keyPath local path to the SSL private key file in PEM format. Read
* <a href="https://cloud.vespa.ai/en/getting-started-java">docs</a> for details.
* @param certPath local path to the SSL certificate file in PEM format. Read
* <a href="https://cloud.vespa.ai/en/getting-started-java">docs</a> for details.
* @param timeout for Vespa Java client in <code>java.time.Duration</code> format.
* @param namespace required for document ID generation, find more details
* <a href="https://docs.vespa.ai/en/documents.html#namespace">here</a>.
* @param documentType document type, used for document ID generation and for data querying, find more details
* <a href="https://docs.vespa.ai/en/documents.html#namespace">here</a>.
* @param rankProfile rank profile from your .sd schema. Provided example schema configures cosine similarity match
* @param targetHits sets the number of hits (10 is default) exposed to the real Vespa's first-phase ranking
* function per content node, find more details
* <a href="https://docs.vespa.ai/en/nearest-neighbor-search.html#querying-using-nearestneighbor-query-operator">here</a>.
* @param avoidDups if true (default), then <code>VespaEmbeddingStore</code> will generate a hashed ID based on
* provided text segment, which avoids duplicated entries in DB.
* If false, then random ID will be generated.
*/
@Builder
public VespaEmbeddingStoreImpl(
public VespaEmbeddingStore(
String url,
String keyPath,
String certPath,
Expand All @@ -83,6 +109,13 @@ public String add(Embedding embedding) {
return add(null, embedding, null);
}

/**
* Adds a new embedding with provided ID to the store.
*
* @param id "user-specified" part of document ID, find more details
* <a href="https://docs.vespa.ai/en/documents.html#namespace">here</a>
* @param embedding the embedding to add
*/
@Override
public void add(String id, Embedding embedding) {
add(id, embedding, null);
Expand Down Expand Up @@ -138,11 +171,19 @@ public void onError(FeedException error) {
return ids;
}

/**
 * {@inheritDoc}
 * <p>
 * Convenience overload: delegates to the three-argument variant with {@code DEFAULT_MIN_SCORE}
 * as the minimum score.
 * The score inside {@link EmbeddingMatch} is Vespa relevance according to provided rank profile.
 */
@Override
public List<EmbeddingMatch<TextSegment>> findRelevant(Embedding referenceEmbedding, int maxResults) {
    final double minScore = DEFAULT_MIN_SCORE;
    return findRelevant(referenceEmbedding, maxResults, minScore);
}

/**
* {@inheritDoc}
* The score inside {@link EmbeddingMatch} is Vespa relevance according to provided rank profile.
*/
@Override
@SneakyThrows
public List<EmbeddingMatch<TextSegment>> findRelevant(Embedding referenceEmbedding, int maxResults, double minScore) {
Expand All @@ -165,7 +206,7 @@ public List<EmbeddingMatch<TextSegment>> findRelevant(Embedding referenceEmbeddi
.getRoot()
.getChildren()
.stream()
.map(VespaEmbeddingStoreImpl::toEmbeddingMatch)
.map(VespaEmbeddingStore::toEmbeddingMatch)
.collect(Collectors.toList());
} else {
throw new RuntimeException("Request failed");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@
import lombok.Builder;

/**
* This is an internal implementation. Please use WeaviateEmbeddingStore.
* Represents the <a href="https://weaviate.io/">Weaviate</a> vector database.
* Current implementation assumes the cosine distance metric is used.
*/
public class WeaviateEmbeddingStoreImpl implements EmbeddingStore<TextSegment> {
public class WeaviateEmbeddingStore implements EmbeddingStore<TextSegment> {

private static final String DEFAULT_CLASS = "Default";
private static final Double DEFAULT_MIN_CERTAINTY = 0.0;
Expand All @@ -42,8 +43,21 @@ public class WeaviateEmbeddingStoreImpl implements EmbeddingStore<TextSegment> {
private boolean avoidDups;
private String consistencyLevel;

/**
* Creates a new WeaviateEmbeddingStore instance.
*
* @param apiKey your Weaviate API key
* @param scheme the scheme of the cluster URL, e.g. "https". Find it under Details of your Weaviate cluster.
* @param host the host of the cluster URL, e.g. "langchain4j-4jw7ufd9.weaviate.network".
* Find it under Details of your Weaviate cluster.
* @param objectClass the object class you want to store, e.g. "MyGreatClass"
* @param avoidDups if true (default), then <code>WeaviateEmbeddingStore</code> will generate a hashed ID based on
* provided text segment, which avoids duplicated entries in DB.
* If false, then random ID will be generated.
* @param consistencyLevel Consistency level: ONE, QUORUM (default) or ALL. Find more details <a href="https://weaviate.io/developers/weaviate/concepts/replication-architecture/consistency#tunable-write-consistency">here</a>.
*/
@Builder
public WeaviateEmbeddingStoreImpl(
public WeaviateEmbeddingStore(
String apiKey,
String scheme,
String host,
Expand All @@ -68,6 +82,14 @@ public String add(Embedding embedding) {
return id;
}

/**
* Adds a new embedding with provided ID to the store.
*
* @param id the ID of the embedding to add, in UUID format, as required by Weaviate.
* See <a href="https://weaviate.io/developers/weaviate/manage-data/create#id">Weaviate docs</a> and
* <a href="https://en.wikipedia.org/wiki/Universally_unique_identifier">UUID on Wikipedia</a>
* @param embedding the embedding to add
*/
@Override
public void add(String id, Embedding embedding) {
addAll(singletonList(id), singletonList(embedding), null);
Expand All @@ -88,11 +110,19 @@ public List<String> addAll(List<Embedding> embeddings, List<TextSegment> embedde
return addAll(null, embeddings, embedded);
}

/**
 * {@inheritDoc}
 * <p>
 * Convenience overload: delegates to the three-argument variant with {@code DEFAULT_MIN_CERTAINTY}
 * as the minimum certainty.
 * The score inside {@link EmbeddingMatch} is Weaviate's certainty.
 */
@Override
public List<EmbeddingMatch<TextSegment>> findRelevant(Embedding referenceEmbedding, int maxResults) {
    final Double minCertainty = DEFAULT_MIN_CERTAINTY;
    return findRelevant(referenceEmbedding, maxResults, minCertainty);
}

/**
* {@inheritDoc}
* The score inside {@link EmbeddingMatch} is Weaviate's certainty.
*/
@Override
public List<EmbeddingMatch<TextSegment>> findRelevant(
Embedding referenceEmbedding,
Expand Down Expand Up @@ -144,7 +174,7 @@ public List<EmbeddingMatch<TextSegment>> findRelevant(

List<Map<String, ?>> resItems = ((Map.Entry<String, List<Map<String, ?>>>) resItemsPart.get()).getValue();

return resItems.stream().map(WeaviateEmbeddingStoreImpl::toEmbeddingMatch).collect(Collectors.toList());
return resItems.stream().map(WeaviateEmbeddingStore::toEmbeddingMatch).collect(Collectors.toList());
}

private List<String> addAll(List<String> ids, List<Embedding> embeddings, List<TextSegment> embedded) {
Expand Down
Loading

0 comments on commit ad30b21

Please sign in to comment.