Skip to content

Commit eeef663

Browse files
committed
feat(notebooks): port 04_vectorizers.ipynb
- HuggingFace Sentence Transformers (sentence-transformers/all-mpnet-base-v2)
- OpenAI via LangChain4j (text-embedding-ada-002)
- Cohere via LangChain4j (embed-english-v3.0)
- VoyageAI via LangChain4j (voyage-law-2)
- Custom vectorizer implementation
- Vector search demo ("That is a happy cat" → "That is a happy dog")
1 parent 6672f6d commit eeef663

File tree

1 file changed

+80
-65
lines changed

1 file changed

+80
-65
lines changed

notebooks/04_vectorizers.ipynb

Lines changed: 80 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,7 @@
1111
"2. LangChain4j Integration (OpenAI, Cohere, VoyageAI, Azure, etc.)\n",
1212
"3. Custom vectorizers\n",
1313
"\n",
14-
"Before running this notebook, be sure to:\n",
15-
"1. Have Java 17+ installed\n",
16-
"2. Have a running Redis Stack instance with RediSearch > 2.4 active\n",
17-
"\n",
18-
"For example, you can run Redis Stack locally with Docker:\n",
14+
"Before running this notebook, be sure to have a running Redis Stack instance. You can start it with Docker:\n",
1915
"\n",
2016
"```bash\n",
2117
"docker run -d -p 6379:6379 -p 8001:8001 redis/redis-stack:latest\n",
@@ -24,38 +20,47 @@
2420
"This will run Redis on port 6379 and RedisInsight at http://localhost:8001."
2521
]
2622
},
27-
{
28-
"cell_type": "markdown",
29-
"metadata": {},
30-
"source": [
31-
"## Setup\n",
32-
"\n",
33-
"First, add the RedisVL4j JAR and its dependencies to the classpath.\n",
34-
"For local development, you can build the project with `./gradlew :core:build` and find the JAR in `core/build/libs/`."
35-
]
36-
},
3723
{
3824
"cell_type": "code",
3925
"execution_count": null,
4026
"metadata": {},
4127
"outputs": [],
4228
"source": [
43-
"// Add JARs to classpath - adjust paths as needed\n",
44-
"%jars /path/to/redisvl4j/core/build/libs/*.jar\n",
45-
"\n",
46-
"// Import necessary classes\n",
29+
"// Load Maven dependencies\n",
30+
"%maven redis.clients:jedis:5.2.0\n",
31+
"%maven org.slf4j:slf4j-nop:2.0.16\n",
32+
"%maven com.fasterxml.jackson.core:jackson-databind:2.18.0\n",
33+
"%maven com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.18.0\n",
34+
"%maven com.github.f4b6a3:ulid-creator:5.2.3\n",
35+
"%maven dev.langchain4j:langchain4j:0.36.2\n",
36+
"%maven dev.langchain4j:langchain4j-open-ai:0.36.2\n",
37+
"%maven dev.langchain4j:langchain4j-cohere:0.36.2\n",
38+
"%maven dev.langchain4j:langchain4j-voyage-ai:0.36.2\n",
39+
"%maven com.microsoft.onnxruntime:onnxruntime:1.16.3\n",
40+
"%maven com.squareup.okhttp3:okhttp:4.12.0\n",
41+
"%maven com.google.code.gson:gson:2.10.1\n",
42+
"%maven ai.djl.huggingface:tokenizers:0.30.0\n",
43+
"\n",
44+
"// Note: RedisVL JAR is in classpath (loaded automatically by Docker container)\n",
45+
"\n",
46+
"// Import RedisVL classes\n",
4747
"import com.redis.vl.utils.vectorize.*;\n",
4848
"import com.redis.vl.index.SearchIndex;\n",
4949
"import com.redis.vl.schema.IndexSchema;\n",
5050
"import com.redis.vl.schema.VectorField;\n",
5151
"import com.redis.vl.query.VectorQuery;\n",
52+
"\n",
53+
"// Import Redis client\n",
5254
"import redis.clients.jedis.UnifiedJedis;\n",
53-
"import redis.clients.jedis.search.schemafields.VectorField.VectorAlgorithm;\n",
54-
"import java.util.List;\n",
55-
"import java.util.Map;\n",
56-
"import java.util.HashMap;\n",
57-
"import java.util.ArrayList;\n",
58-
"import java.util.Arrays;"
55+
"import redis.clients.jedis.HostAndPort;\n",
56+
"\n",
57+
"// Import LangChain4J\n",
58+
"import dev.langchain4j.model.openai.OpenAiEmbeddingModel;\n",
59+
"import dev.langchain4j.model.cohere.CohereEmbeddingModel;\n",
60+
"import dev.langchain4j.model.voyageai.VoyageAiEmbeddingModel;\n",
61+
"\n",
62+
"// Import Java standard libraries\n",
63+
"import java.util.*;"
5964
]
6065
},
6166
{
@@ -104,7 +109,7 @@
104109
"source": [
105110
"// Create a vectorizer using HuggingFace Sentence Transformers\n",
106111
"// This model runs locally - no API key needed!\n",
107-
"var hf = new SentenceTransformersVectorizer(\"sentence-transformers/all-mpnet-base-v2\");\n",
112+
"BaseVectorizer hf = new SentenceTransformersVectorizer(\"sentence-transformers/all-mpnet-base-v2\");\n",
108113
"\n",
109114
"// Embed a single sentence\n",
110115
"float[] test = hf.embed(\"This is a test sentence.\");\n",
@@ -141,11 +146,9 @@
141146
"metadata": {},
142147
"outputs": [],
143148
"source": [
144-
"import dev.langchain4j.model.openai.OpenAiEmbeddingModel;\n",
145-
"\n",
146149
"// Get API key from environment\n",
147150
"String apiKey = System.getenv(\"OPENAI_API_KEY\");\n",
148-
"if (apiKey == null) {\n",
151+
"if (apiKey == null || apiKey.isEmpty()) {\n",
149152
" System.out.println(\"Skipping OpenAI example - OPENAI_API_KEY not set\");\n",
150153
"} else {\n",
151154
" // Create OpenAI embedding model\n",
@@ -155,7 +158,7 @@
155158
" .build();\n",
156159
" \n",
157160
" // Wrap in LangChain4JVectorizer\n",
158-
" var oai = new LangChain4JVectorizer(\"text-embedding-ada-002\", openaiModel);\n",
161+
" BaseVectorizer oai = new LangChain4JVectorizer(\"text-embedding-ada-002\", openaiModel);\n",
159162
" \n",
160163
" // Embed a sentence\n",
161164
" float[] openaiTest = oai.embed(\"This is a test sentence.\");\n",
@@ -185,18 +188,16 @@
185188
"metadata": {},
186189
"outputs": [],
187190
"source": [
188-
"import dev.langchain4j.model.cohere.CohereEmbeddingModel;\n",
189-
"\n",
190191
"String cohereApiKey = System.getenv(\"COHERE_API_KEY\");\n",
191-
"if (cohereApiKey == null) {\n",
192+
"if (cohereApiKey == null || cohereApiKey.isEmpty()) {\n",
192193
" System.out.println(\"Skipping Cohere example - COHERE_API_KEY not set\");\n",
193194
"} else {\n",
194195
" var cohereModel = CohereEmbeddingModel.builder()\n",
195196
" .apiKey(cohereApiKey)\n",
196197
" .modelName(\"embed-english-v3.0\")\n",
197198
" .build();\n",
198199
" \n",
199-
" var co = new LangChain4JVectorizer(\"embed-english-v3.0\", cohereModel);\n",
200+
" BaseVectorizer co = new LangChain4JVectorizer(\"embed-english-v3.0\", cohereModel);\n",
200201
" \n",
201202
" float[] cohereTest = co.embed(\"This is a test sentence.\");\n",
202203
" System.out.println(\"Cohere Vector dimensions: \" + cohereTest.length);\n",
@@ -221,18 +222,16 @@
221222
"metadata": {},
222223
"outputs": [],
223224
"source": [
224-
"import dev.langchain4j.model.voyageai.VoyageAiEmbeddingModel;\n",
225-
"\n",
226225
"String voyageApiKey = System.getenv(\"VOYAGE_API_KEY\");\n",
227-
"if (voyageApiKey == null) {\n",
226+
"if (voyageApiKey == null || voyageApiKey.isEmpty()) {\n",
228227
" System.out.println(\"Skipping VoyageAI example - VOYAGE_API_KEY not set\");\n",
229228
"} else {\n",
230229
" var voyageModel = VoyageAiEmbeddingModel.builder()\n",
231230
" .apiKey(voyageApiKey)\n",
232231
" .modelName(\"voyage-law-2\")\n",
233232
" .build();\n",
234233
" \n",
235-
" var vo = new LangChain4JVectorizer(\"voyage-law-2\", voyageModel);\n",
234+
" BaseVectorizer vo = new LangChain4JVectorizer(\"voyage-law-2\", voyageModel);\n",
236235
" \n",
237236
" float[] voyageTest = vo.embed(\"This is a test sentence.\");\n",
238237
" System.out.println(\"VoyageAI Vector dimensions: \" + voyageTest.length);\n",
@@ -271,13 +270,15 @@
271270
" \n",
272271
" @Override\n",
273272
" protected List<float[]> generateEmbeddingsBatch(List<String> texts, int batchSize) {\n",
274-
" return texts.stream()\n",
275-
" .map(this::generateEmbedding)\n",
276-
" .collect(java.util.stream.Collectors.toList());\n",
273+
" List<float[]> results = new ArrayList<>();\n",
274+
" for (String text : texts) {\n",
275+
" results.add(generateEmbedding(text));\n",
276+
" }\n",
277+
" return results;\n",
277278
" }\n",
278279
"}\n",
279280
"\n",
280-
"var customVectorizer = new CustomVectorizer();\n",
281+
"BaseVectorizer customVectorizer = new CustomVectorizer();\n",
281282
"float[] customEmbed = customVectorizer.embed(\"This is a test sentence.\");\n",
282283
"System.out.println(\"Custom vectorizer dimensions: \" + customEmbed.length);\n",
283284
"System.out.println(\"First 10 values: \" + Arrays.toString(Arrays.copyOfRange(customEmbed, 0, 10)));"
@@ -302,23 +303,31 @@
302303
"outputs": [],
303304
"source": [
304305
"// Connect to Redis\n",
305-
"var redis = new UnifiedJedis(\"redis://localhost:6379\");\n",
306-
"\n",
307-
"// Create the schema - matching the Python notebook YAML\n",
308-
"var schema = IndexSchema.builder()\n",
309-
" .name(\"vectorizers\")\n",
310-
" .prefix(\"doc\")\n",
311-
" .storageType(IndexSchema.StorageType.HASH)\n",
312-
" .addTextField(\"sentence\", textField -> {})\n",
313-
" .addVectorField(\"embedding\", 768, vectorField ->\n",
314-
" vectorField\n",
315-
" .algorithm(VectorAlgorithm.FLAT)\n",
316-
" .distanceMetric(VectorField.DistanceMetric.COSINE)\n",
317-
" .dataType(VectorField.VectorDataType.FLOAT32))\n",
318-
" .build();\n",
306+
"UnifiedJedis jedis = new UnifiedJedis(new HostAndPort(\"redis-stack\", 6379));\n",
307+
"\n",
308+
"// Create the schema from a Map (matching the Python notebook YAML)\n",
309+
"Map<String, Object> schema = Map.of(\n",
310+
" \"index\", Map.of(\n",
311+
" \"name\", \"vectorizers\",\n",
312+
" \"prefix\", \"doc\"\n",
313+
" ),\n",
314+
" \"fields\", List.of(\n",
315+
" Map.of(\"name\", \"sentence\", \"type\", \"text\"),\n",
316+
" Map.of(\n",
317+
" \"name\", \"embedding\",\n",
318+
" \"type\", \"vector\",\n",
319+
" \"attrs\", Map.of(\n",
320+
" \"dims\", 768,\n",
321+
" \"distance_metric\", \"cosine\",\n",
322+
" \"algorithm\", \"flat\",\n",
323+
" \"datatype\", \"float32\"\n",
324+
" )\n",
325+
" )\n",
326+
" )\n",
327+
");\n",
319328
"\n",
320329
"// Create the index\n",
321-
"var index = new SearchIndex(schema, redis);\n",
330+
"SearchIndex index = SearchIndex.fromDict(schema, jedis);\n",
322331
"index.create(true); // overwrite if exists\n",
323332
"System.out.println(\"Index created: \" + index.getName());"
324333
]
@@ -342,8 +351,9 @@
342351
"}\n",
343352
"\n",
344353
"// Load data into the index\n",
345-
"index.load(data);\n",
346-
"System.out.println(\"Loaded \" + data.size() + \" documents\");"
354+
"List<String> keys = index.load(data);\n",
355+
"System.out.println(\"Loaded \" + data.size() + \" documents\");\n",
356+
"System.out.println(\"Keys: \" + keys);"
347357
]
348358
},
349359
{
@@ -356,17 +366,17 @@
356366
"float[] queryEmbedding = hf.embed(\"That is a happy cat\");\n",
357367
"\n",
358368
"// Create and execute a vector query\n",
359-
"var query = VectorQuery.builder()\n",
369+
"VectorQuery query = VectorQuery.builder()\n",
360370
" .vector(queryEmbedding)\n",
361371
" .field(\"embedding\")\n",
362-
" .returnFields(List.of(\"sentence\"))\n",
372+
" .returnFields(\"sentence\", \"vector_distance\")\n",
363373
" .numResults(3)\n",
364374
" .build();\n",
365375
"\n",
366376
"List<Map<String, Object>> results = index.query(query);\n",
367377
"\n",
368378
"System.out.println(\"\\nSearch results for: 'That is a happy cat'\");\n",
369-
"for (var doc : results) {\n",
379+
"for (Map<String, Object> doc : results) {\n",
370380
" System.out.println(doc.get(\"sentence\") + \" - Distance: \" + doc.get(\"vector_distance\"));\n",
371381
"}"
372382
]
@@ -386,7 +396,8 @@
386396
"source": [
387397
"// Cleanup\n",
388398
"index.delete(true);\n",
389-
"System.out.println(\"Index deleted\");"
399+
"jedis.close();\n",
400+
"System.out.println(\"Index deleted and connection closed\");"
390401
]
391402
},
392403
{
@@ -412,8 +423,12 @@
412423
"name": "java"
413424
},
414425
"language_info": {
415-
"name": "java",
416-
"version": "17"
426+
"codemirror_mode": "java",
427+
"file_extension": ".jshell",
428+
"mimetype": "text/x-java-source",
429+
"name": "Java",
430+
"pygments_lexer": "java",
431+
"version": "21+35"
417432
}
418433
},
419434
"nbformat": 4,

0 commit comments

Comments
 (0)