Skip to content
This repository was archived by the owner on Jan 2, 2025. It is now read-only.

Commit 262a688

Browse files
committed
deduplicate w.r.t. mean pooled vector
1 parent 4e6fe0e commit 262a688

File tree

2 files changed

+19
-4
lines changed

2 files changed

+19
-4
lines changed

server/bleep/src/semantic.rs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ pub use schema::{Embedding, Payload};
3030

3131
const COLLECTION_NAME: &str = "documents";
3232
const SCORE_THRESHOLD: f32 = 0.3;
33+
const EMBEDDING_DIM: usize = 384;
3334

3435
#[derive(Error, Debug)]
3536
pub enum SemanticError {
@@ -149,7 +150,7 @@ fn collection_config() -> CreateCollection {
149150
collection_name: COLLECTION_NAME.to_string(),
150151
vectors_config: Some(VectorsConfig {
151152
config: Some(vectors_config::Config::Params(VectorParams {
152-
size: 384,
153+
size: EMBEDDING_DIM as u64,
153154
distance: Distance::Cosine.into(),
154155
..Default::default()
155156
})),
@@ -396,8 +397,9 @@ impl Semantic {
396397
.collect::<Vec<_>>()
397398
})?;
398399

399-
// TODO: Deduplicate with respect to all vectors
400-
let target_vector = vectors.first().unwrap().clone();
400+
// deduplicate with mmr with respect to the mean of query vectors
401+
// TODO: implement a more robust multi-vector deduplication strategy
402+
let target_vector = mean_pool(vectors);
401403
Ok(deduplicate_snippets(results, target_vector, limit))
402404
}
403405

@@ -629,6 +631,19 @@ fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
629631
dot(a, b) / (norm(a) * norm(b))
630632
}
631633

634+
// Calculate the element-wise mean of the embeddings
635+
fn mean_pool(embeddings: Vec<Vec<f32>>) -> Vec<f32> {
636+
let len = embeddings.len() as f32;
637+
let mut result = vec![0.0; EMBEDDING_DIM];
638+
for embedding in embeddings {
639+
for (i, v) in embedding.iter().enumerate() {
640+
result[i] += v;
641+
}
642+
}
643+
result.iter_mut().for_each(|v| *v /= len);
644+
result
645+
}
646+
632647
// returns a list of indices to preserve from `snippets`
633648
//
634649
// query_embedding: the embedding of the query terms

server/bleep/src/webserver/answer/prompts.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ pub fn hypothetical_document_prompt(query: &str) -> String {
261261
format!(
262262
r#"Write three code snippets that could hypothetically be returned by a code search engine as the answer to the query: {query}
263263
264-
- All three snippets should be written in any one of these languages: [Rust, Typescript, TSX, YAML]
264+
- Write these snippets in a variety of programming languages
265265
- The snippets should not be too similar to one another
266266
- Each snippet should be between 5 and 10 lines long
267267
- Surround the snippets in triple backticks

0 commit comments

Comments
 (0)