forked from elebumm/RedditVideoMakerBot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ai_methods.py
64 lines (50 loc) · 2.73 KB
/
ai_methods.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (presumably shaped
            ``(batch, seq_len, hidden)`` -- confirm against the model used).
        attention_mask: Tensor marking real tokens with non-zero values and
            padding with zeros; it is broadcast over the embedding dimension.

    Returns:
        A tensor with dimension 1 reduced away: the mask-weighted sum of the
        token embeddings divided by the per-feature mask total.
    """
    hidden_states = model_output[0]

    # Stretch the mask across the last (feature) axis so it lines up
    # element-for-element with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = (hidden_states * mask).sum(1)
    # Clamp so a fully masked row divides by a tiny number instead of zero.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / mask_total
# This function sorts the given threads based on their total similarity with the given keywords
def sort_by_similarity(thread_objects, keywords):
    """Rank threads by their summed cosine similarity to the given keywords.

    Every thread is represented by its ``title`` and ``selftext`` joined with
    a single space. Threads and keywords are embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model, and each thread's score
    is the sum of its cosine similarities to every keyword embedding.

    Args:
        thread_objects: Iterable (may be a generator) of objects exposing
            ``title`` and ``selftext`` string attributes.
        keywords: Text(s) handed to the tokenizer as one batch -- presumably
            a list of keyword strings; confirm against the caller.

    Returns:
        A ``(threads, scores)`` tuple: ``threads`` is a list of the input
        objects ordered from most to least similar, and ``scores`` is a 1-D
        tensor of the matching total scores in the same order. If there are
        no threads the result is ``([], empty tensor)``; if there are no
        keywords the threads keep their input order and all scores are zero.
    """
    # Materialise the (possibly lazy) iterable so it can be indexed and
    # reordered after scoring.
    thread_objects = list(thread_objects)

    # Degenerate inputs: return before loading the model, and avoid handing
    # the tokenizer an empty batch.
    if not thread_objects:
        return [], torch.zeros(0)
    if not keywords:
        return thread_objects, torch.zeros(len(thread_objects))

    # Initialize tokenizer + model.
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    def embed(texts):
        """Tokenize ``texts`` and return one mean-pooled embedding per text."""
        encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            output = model(**encoded)
        return mean_pooling(output, encoded["attention_mask"])

    threads_sentences = [
        " ".join([thread.title, thread.selftext]) for thread in thread_objects
    ]
    threads_embeddings = embed(threads_sentences)
    keywords_embeddings = embed(keywords)

    # Compare every keyword w/ every thread embedding and accumulate the
    # per-thread totals. The embeddings are already tensors, so they are used
    # directly (re-wrapping them in torch.tensor() copies and warns).
    total_scores = torch.zeros(threads_embeddings.shape[0])
    cosine_similarity = torch.nn.CosineSimilarity()
    for keyword_embedding in keywords_embeddings:
        # expand_as gives a broadcast view of the keyword vector against all
        # thread rows without allocating a repeated copy.
        total_scores += cosine_similarity(
            keyword_embedding.unsqueeze(0).expand_as(threads_embeddings),
            threads_embeddings,
        )

    similarity_scores, indices = torch.sort(total_scores, descending=True)

    # Reorder with plain list indexing so the thread objects are returned
    # untouched (a numpy round-trip may coerce or unpack sequence-like items).
    ranked_threads = [thread_objects[i] for i in indices.tolist()]
    return ranked_threads, similarity_scores