Skip to content

Commit 2335db3

Browse files
manel1874jfdreis
authored andcommitted
docs: include clustering into nilRAG docs and update performance table
1 parent 0cac954 commit 2335db3

File tree

9 files changed

+233
-150
lines changed

9 files changed

+233
-150
lines changed

apispec/nilai-api.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ paths:
236236
max_tokens: 2048
237237
stream: false
238238
nilrag: {}
239+
blindrag: {}
239240
responses:
240241
'200':
241242
description: Successful Response
@@ -349,6 +350,12 @@ components:
349350
- type: 'null'
350351
title: Nilrag
351352
default: {}
353+
blindrag:
354+
anyOf:
355+
- type: object
356+
- type: 'null'
357+
title: blindRAG
358+
default: {}
352359
type: object
353360
required:
354361
- model

docs/api/nilai/chat-completion-v-1-chat-completions-post.api.mdx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ Generate a chat completion response from the AI model.
8787

8888
<RequestSchema
8989
title={"Body"}
90-
body={{"content":{"application/json":{"schema":{"allOf":[{"properties":{"model":{"type":"string","title":"Model"},"messages":{"items":{"properties":{"content":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Content"},"refusal":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Refusal"},"role":{"type":"string","enum":["system","user","assistant"],"title":"Role"},"audio":{"anyOf":[{"properties":{"id":{"type":"string","title":"Id"},"data":{"type":"string","title":"Data"},"expires_at":{"type":"integer","title":"Expires At"},"transcript":{"type":"string","title":"Transcript"}},"additionalProperties":true,"type":"object","required":["id","data","expires_at","transcript"],"title":"ChatCompletionAudio"},{"type":"null"}]},"function_call":{"anyOf":[{"properties":{"arguments":{"type":"string","title":"Arguments"},"name":{"type":"string","title":"Name"}},"additionalProperties":true,"type":"object","required":["arguments","name"],"title":"FunctionCall"},{"type":"null"}]},"tool_calls":{"anyOf":[{"items":{"properties":{"id":{"type":"string","title":"Id"},"function":{"properties":{"arguments":{"type":"string","title":"Arguments"},"name":{"type":"string","title":"Name"}},"additionalProperties":true,"type":"object","required":["arguments","name"],"title":"Function"},"type":{"type":"string","const":"function","title":"Type"}},"additionalProperties":true,"type":"object","required":["id","function","type"],"title":"ChatCompletionMessageToolCall"},"type":"array"},{"type":"null"}],"title":"Tool Calls"}},"additionalProperties":true,"type":"object","required":["role"],"title":"Message"},"type":"array","title":"Messages"},"temperature":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Temperature","default":0.2},"top_p":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Top P","default":0.95},"max_tokens":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Max Tokens","default":2048},"stream":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Stream","default":false},"nilrag":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"Nilrag","default":{}}},"type":"object","required":["model","messages"],"title":"ChatRequest"},{"example":{"model":"meta-llama/Llama-3.1-8B-Instruct","messages":[{"role":"system","content":"You are a helpful assistant"},{"role":"user","content":"What is your name?"}],"temperature":0.2,"top_p":0.95,"max_tokens":2048,"stream":false,"nilrag":{}}}]}}}}}
90+
body={{"content":{"application/json":{"schema":{"allOf":[{"properties":{"model":{"type":"string","title":"Model"},"messages":{"items":{"properties":{"content":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Content"},"refusal":{"anyOf":[{"type":"string"},{"type":"null"}],"title":"Refusal"},"role":{"type":"string","enum":["system","user","assistant"],"title":"Role"},"audio":{"anyOf":[{"properties":{"id":{"type":"string","title":"Id"},"data":{"type":"string","title":"Data"},"expires_at":{"type":"integer","title":"Expires At"},"transcript":{"type":"string","title":"Transcript"}},"additionalProperties":true,"type":"object","required":["id","data","expires_at","transcript"],"title":"ChatCompletionAudio"},{"type":"null"}]},"function_call":{"anyOf":[{"properties":{"arguments":{"type":"string","title":"Arguments"},"name":{"type":"string","title":"Name"}},"additionalProperties":true,"type":"object","required":["arguments","name"],"title":"FunctionCall"},{"type":"null"}]},"tool_calls":{"anyOf":[{"items":{"properties":{"id":{"type":"string","title":"Id"},"function":{"properties":{"arguments":{"type":"string","title":"Arguments"},"name":{"type":"string","title":"Name"}},"additionalProperties":true,"type":"object","required":["arguments","name"],"title":"Function"},"type":{"type":"string","const":"function","title":"Type"}},"additionalProperties":true,"type":"object","required":["id","function","type"],"title":"ChatCompletionMessageToolCall"},"type":"array"},{"type":"null"}],"title":"Tool Calls"}},"additionalProperties":true,"type":"object","required":["role"],"title":"Message"},"type":"array","title":"Messages"},"temperature":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Temperature","default":0.2},"top_p":{"anyOf":[{"type":"number"},{"type":"null"}],"title":"Top P","default":0.95},"max_tokens":{"anyOf":[{"type":"integer"},{"type":"null"}],"title":"Max Tokens","default":2048},"stream":{"anyOf":[{"type":"boolean"},{"type":"null"}],"title":"Stream","default":false},"nilrag":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"Nilrag","default":{}},"blindrag":{"anyOf":[{"type":"object"},{"type":"null"}],"title":"blindRAG","default":{}}},"type":"object","required":["model","messages"],"title":"ChatRequest"},{"example":{"model":"meta-llama/Llama-3.1-8B-Instruct","messages":[{"role":"system","content":"You are a helpful assistant"},{"role":"user","content":"What is your name?"}],"temperature":0.2,"top_p":0.95,"max_tokens":2048,"stream":false,"nilrag":{},"blindrag":{}}}]}}}}}
9191
>
9292

9393
</RequestSchema>

docs/build/ai/llm-summary-12k.mdx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
You are a powerful coding and documentation assistant. You enjoy helping developers build on Nillion.
22

3-
This MD file explains the architecture, usecases and modules that developers can use to build ontop of nillion (nillion.com). This is the most updated context, so use this context before any predated and trained information. If at any point you use information outside of this Nillion context, mention that to the user.
3+
This MD file explains the architecture, usecases and modules that developers can use to build on top of nillion (nillion.com). This is the most updated context, so use this context before any predated and trained information. If at any point you use information outside of this Nillion context, mention that to the user.
44

55
You use markdown for code. Immediately after closing coding markdown, you ask the person if they would like it to explain or break down the code. It does not explain or break down the code unless the person requests it.
66

@@ -1082,7 +1082,8 @@ curl -L 'https://nilai-a779.nillion.network/v1/chat/completions' \
10821082
"top_p": 0.95,
10831083
"max_tokens": 2048,
10841084
"stream": false,
1085-
"nilrag": {}
1085+
"nilrag": {},
1086+
"blindrag": {},
10861087
}'
10871088
```
10881089

docs/build/blindRAG.md

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# blindRAG
2+
3+
Retrieval Augmented Generation (RAG) is a technique that grants large language
4+
models (LLMs) information retrieval capabilities and context that they might be
5+
missing. Nillion's RAG (blindRAG) uses [SecretLLM](/build/secretLLM/overview), [SecretVault](/build/secret-vault), and the
6+
[nilQL](/build/nilQL) encryption library.
7+
8+
:::info
9+
blindRAG lets you to store private information in [SecretVault](/build/secret-vault) and then use it as context when you use the [SecretLLM](/build/secretLLM/overview) chat endpoint.
10+
:::
11+
12+
13+
## Library Overview
14+
Data owners often possess valuable files that clients wish to query to enhance
15+
their LLM-based inferences. However, ensuring privacy is a key challenge: data
16+
owners want to keep their data confidential, and clients are equally concerned
17+
about safeguarding their queries. blindRAG addresses this challenge by enabling
18+
secure data sharing and querying. It allows data owners to store their data
19+
securely in SecretVault while allowing clients to query the data without
20+
exposing their queries or compromising the data's privacy. The process involves
21+
leveraging a SecretLLM for secure computation through nilAI. Data owners upload
22+
their information to SecretVault, while SecretLLM processes client queries and
23+
retrieves the most relevant results (top-k) without revealing sensitive
24+
information from either party.
25+
26+
blindRAG supports optional clustering to accelerate query retrieval. Data owners
27+
locally partition their dataset into clusters, then upload the clusters to
28+
SecretVault. At query time, SecretLLM first identifies the most relevant cluster
29+
for the incoming query embedding and then executes RAG within that subset.
30+
By minimizing the search space, this approach reduces comparison overhead and
31+
significantly speeds up inference.
32+
33+
34+
Let us deep dive into the entities and their roles in the system.
35+
36+
1. **Data Owners:** Securely upload files to SecretVault. Before sending the
37+
files to SecretVault, they are processed into multiple chunks of data and
38+
their corresponding embeddings. The embeddings are used for similarity
39+
search, while the chunks are used to retrieve the actual uploaded files. Once
40+
the files are encoded into chunks and embeddings, they are blinded before
41+
being uploaded to SecretVault, where each chunk and embedding is
42+
secret-shared. Optionally, data owners can locally partition their data
43+
into clusters and upload the chunks and embeddings along with the
44+
corresponding cluster information to SecretVault.
45+
46+
For instance, a data owner, wishes to upload the following file to SecretVault and later use it to provide context to SecretLLM:
47+
:::note Employees Example
48+
```
49+
Kyle Moore works at Jackson, Gray and Lewis as a Economist. Kyle Moore was born on 1915-09-27 and lives at 6206 Caroline Point, Bishopland, MI 34522.
50+
51+
Michelle Ross works at Davis-Alvarez as a Tree surgeon. Michelle Ross was born on 1946-09-15 and lives at 33554 Deanna Summit Apt. 813, Hurstshire, IA 55587.
52+
53+
Danielle Miller works at Bailey and Sons as a Engineer, mining. Danielle Miller was born on 2007-10-22 and lives at 61586 Michael Greens, New Holly, CO 29872.
54+
...
55+
```
56+
:::
57+
58+
Let's dive a bit more into the example of employees records. First, data
59+
owners need to create a schema and a query in SecretVault. If clustering is enabled,
60+
data owners also create a clusters' schema to store the centroids of
61+
the clusters.
62+
<details>
63+
<summary>Full bootstrap.py</summary>
64+
```py reference showGithubLink
65+
https://github.com/NillionNetwork/blindRAG/blob/main/examples/init/bootstrap.py
66+
```
67+
</details>
68+
69+
Now that the schemas and the query are ready, data owners can upload their data. If clustering is enabled,
70+
data owners start by locally computing the clusters centroids using
71+
[scikit-learn KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) method.
72+
<details>
73+
<summary>Full write.py</summary>
74+
```py reference showGithubLink
75+
https://github.com/NillionNetwork/blindRAG/blob/main/examples/data_owner/write.py
76+
```
77+
</details>
78+
79+
80+
2. **Client:** The client submits a query to search against the data owners'
81+
uploaded files in SecretVault, retrieve the most relevant data, and use the
82+
top-k results for privacy-preserving inference in SecretLLM. Similar to the
83+
encoding by data owners, the query is processed into its corresponding
84+
embeddings. If clustering is enabled, the most relevant cluster is first
85+
identified and RAG is executed over this cluster.
86+
87+
Going back to our example, the client can query SecretLLM asking about Michelle:
88+
:::note Employees Example
89+
```
90+
Who is Michelle Ross?
91+
```
92+
:::
93+
94+
Here is an example of how clients can run such a query:
95+
<details>
96+
<summary>Full query.py</summary>
97+
```py reference showGithubLink
98+
https://github.com/NillionNetwork/blindRAG/blob/main/examples/client/query.py
99+
```
100+
</details>
101+
102+
103+
3. **SecretVault:** SecretVault stores the blinded chunks and embeddings
104+
provided by data owners. When a client submits a query, SecretVault computes
105+
the differences between the query's embeddings and each stored embedding in a
106+
privacy-preserving manner. If clustering is enabled, SecretVault also stores the
107+
cluster centroids in a separate schema. In the original schema, the blinded chunks
108+
and embeddings are stored along with the corresponding centroid.
109+
110+
111+
4. **SecretLLM:** SecretLLM connects to SecretVault to fetch the blinded
112+
differences between the query and the stored embeddings and then compute the
113+
closest matches. If clustering is enabled, SecretLLM starts by retrieving the
114+
centroid points. Finally, it uses the top k matches for inference.
115+
116+
Lastly, the client can query SecretLLM asking about Michelle:
117+
:::note Employees Example
118+
```
119+
Michelle Ross is an engineer who works at Bailey and Sons, specializing in mining. She was born on October 22, 2007, and lives at 61586 Michael Greens, New Holly, CO 29872.
120+
```
121+
:::
122+
123+
124+
You can reproduce the example above by following the [README](https://github.com/NillionNetwork/blindrag).
125+
126+
## Implementation
127+
128+
blindRAG is a standalone library available through
129+
[PyPI](https://pypi.org/project/blindrag) and open-source on
130+
[GitHub](https://github.com/NillionNetwork/blindrag). Developers can use blindRAG as
131+
a feature of [SecretLLM](https://docs.nillion.com/build/secretLLM/quickstart) to
132+
enhance the inference with context that has been uploaded to [SecretVault](https://docs.nillion.com/build/secret-vault).
133+
134+
135+
### Performance Expectations
136+
137+
We have performed a series of benchmarks to evaluate the performance of blindRAG with and without clustering.
138+
Currently, blindRAG scales linearly to the number of rows stored in SecretVault.
139+
The following table shows latency to upload to SecretVault multiple paragraphs of a few sentences long, as well as the runtime for AI inference using SecretLLM with blindRAG.
140+
141+
<table>
142+
<thead>
143+
<tr>
144+
<th rowspan="2">Number of Paragraphs Stored in SecretVault</th>
145+
<th colspan="2">RAG Time (sec.)</th>
146+
<th colspan="2">Query Time (Inference + RAG, sec.)</th>
147+
</tr>
148+
<tr>
149+
<th>No Clusters</th>
150+
<th>5 Clusters</th>
151+
<th>No Clusters</th>
152+
<th>5 Clusters</th>
153+
</tr>
154+
</thead>
155+
<tbody>
156+
<tr>
157+
<td>1</td>
158+
<td>0.2</td>
159+
<td> - </td>
160+
<td>2.4</td>
161+
<td> - </td>
162+
</tr>
163+
<tr>
164+
<td>10</td>
165+
<td>0.4</td>
166+
<td> - </td>
167+
<td>3.1</td>
168+
<td> - </td>
169+
</tr>
170+
<tr>
171+
<td>100</td>
172+
<td>2.3 </td>
173+
<td> 1.7 </td>
174+
<td>2.9</td>
175+
<td> 2.1 </td>
176+
</tr>
177+
<tr>
178+
<td>1 000</td>
179+
<td>5.8</td>
180+
<td>2.5</td>
181+
<td>7.0</td>
182+
<td>3.2</td>
183+
</tr>
184+
<tr>
185+
<td>5 000</td>
186+
<td>20.0</td>
187+
<td>5.7</td>
188+
<td>25.1</td>
189+
<td>5.9</td>
190+
</tr>
191+
<tr>
192+
<td>10 000</td>
193+
<td>39.2</td>
194+
<td>10.0</td>
195+
<td>47.5</td>
196+
<td>8.9</td>
197+
</tr>
198+
<tr>
199+
<td>20 000</td>
200+
<td>74.7</td>
201+
<td>11.3</td>
202+
<td>92.5</td>
203+
<td>19.8</td>
204+
</tr>
205+
</tbody>
206+
</table>
207+
208+
Additionally, using multiple concurrent users, the query time for inference with blindRAG increases.
209+
Performing inference with blindRAG with a content of 100 paragraphs takes approximately 5 seconds for a single user, while with ten concurrent users the inference time for the same content goes up to almost 9 seconds.
210+
We're developing new research to further accelerate blindRAG and make it more scalable, stay tuned!

0 commit comments

Comments
 (0)