# good resources
# https://opensearch.org/blog/improving-document-retrieval-with-sparse-semantic-encoders/
# https://huggingface.co/opensearch-project/opensearch-neural-sparse-encoding-v1

Postgrex.Types.define(Example.PostgrexTypes, Pgvector.extensions(), [])
{:ok, pid} = Postgrex.start_link(database: "pgvector_example", types: Example.PostgrexTypes)

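# assumes a local Postgres server with an existing pgvector_example database
# (e.g. created with createdb pgvector_example) and pgvector available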
Postgrex.query!(pid, "CREATE EXTENSION IF NOT EXISTS vector", [])
Postgrex.query!(pid, "DROP TABLE IF EXISTS documents", [])
Postgrex.query!(
  pid,
  "CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding sparsevec(30522))",
  []
)
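
# the model and tokenizer are downloaded from Hugging Face and cached on first run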
model_id = "opensearch-project/opensearch-neural-sparse-encoding-v1"
{:ok, model_info} = Bumblebee.load_model({:hf, model_id})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, model_id})

defmodule Example do
  def fetch_embeddings(model_info, tokenizer, input) do
    inputs = Bumblebee.apply_tokenizer(tokenizer, input)
    outputs = Axon.predict(model_info.model, model_info.params, inputs)

    # max-pool the token logits over the sequence axis, masking out padding
    values =
      Nx.reduce_max(Nx.multiply(outputs[:logits], Nx.new_axis(inputs["attention_mask"], -1)),
        axes: [1]
      )

    # log-saturation: log(1 + relu(values))
    values = Nx.log(Nx.add(1, Nx.max(values, 0)))

    # zero the entries for special tokens ([CLS], [SEP], etc.) so they do not
    # contribute to the embedding (a simple masking approach for the original TODO)
    special_token_ids =
      for t <- Bumblebee.Tokenizer.all_special_tokens(tokenizer),
          do: Bumblebee.Tokenizer.token_to_id(tokenizer, t)

    vocab_size = Nx.axis_size(values, 1)
    keep = Nx.tensor(for i <- 0..(vocab_size - 1), do: if(i in special_token_ids, do: 0, else: 1))
    values = Nx.multiply(values, keep)

    # Pgvector.SparseVector.new/1 accepts these dense lists and keeps only non-zero entries
    values |> Nx.to_list()
  end
end

input = [
  "The dog is barking",
  "The cat is purring",
  "The bear is growling"
]

embeddings = Example.fetch_embeddings(model_info, tokenizer, input)

for {content, embedding} <- Enum.zip(input, embeddings) do
  Postgrex.query!(pid, "INSERT INTO documents (content, embedding) VALUES ($1, $2)", [
    content,
    embedding |> Pgvector.SparseVector.new()
  ])
end

query = "forest"

query_embedding =
  Example.fetch_embeddings(model_info, tokenizer, [query])
  |> List.first()
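
# <#> computes the negative inner product, so the best matches sort first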
result =
  Postgrex.query!(pid, "SELECT id, content FROM documents ORDER BY embedding <#> $1 LIMIT 5", [
    query_embedding |> Pgvector.SparseVector.new()
  ])

for [id, content] <- result.rows do
  IO.puts("#{id}: #{content}")
end