
Commit

add test to models
Ubuntu committed Jul 30, 2020
1 parent b646912 commit 811fa61
Showing 6 changed files with 161 additions and 55 deletions.
52 changes: 40 additions & 12 deletions MedSemanticSearch/medsearch/models/base.py
@@ -1,8 +1,10 @@
"""Model class to be extended by specific types of models """
import json
from pathlib import Path
from datetime import datetime
from typing import Callable, Dict, Optional


DIRNAME = Path(__file__).parents[1].resolve()/'weights'

class ModelBase:
@@ -18,16 +20,8 @@ def __init__(self, dataset_cls:type, network_fn:Callable,
network_args={}
if network_fn is not None:
self.network = network_fn(**network_args)

def load_weights(self, filename):
pass

def save_weights(self, obj, filename):
with open(f'{filename}.json', 'w') as outfile:
json.dump(obj, outfile)

class TorchModelBase(ModelBase):

def __init__(self,
dataset_cls:type=None,
tokenizer_cls:Callable=None,
@@ -37,12 +31,27 @@ def __init__(self,
tokenizer_args:Dict=None):
super().__init__(dataset_cls, network_fn, dataset_args, network_args)

if tokenizer_args is None:
    tokenizer_args = {}
if tokenizer_cls is not None:
self.tokenizer = tokenizer_cls(**tokenizer_args)

def evaluate(self):
    pass

def loss(self):
    pass

def metrics(self):
    pass

@property
def weights_filename(self):
    DIRNAME.mkdir(parents=True, exist_ok=True)
    return str(DIRNAME/f'{self.name}_weights_{datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.pt')

def load_weights(self):
    pass

def save_weights(self):
    pass

class TensorflowModelBase(ModelBase):
@@ -53,5 +62,24 @@ def __init__(self,
dataset_args:Dict=None,
network_args:Dict=None):
super().__init__(dataset_cls, network_fn, dataset_args, network_args)
def fit(self, dataset, bs, epochs, callbacks):
    pass

def evaluate(self):
    pass

def loss(self):
    pass

def metrics(self):
    pass

@property
def weights_filename(self):
    DIRNAME.mkdir(parents=True, exist_ok=True)
    return str(DIRNAME/f"{self.name}_weights.h5")

def load_weights(self):
    pass

def save_weights(self):
    pass
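
Both subclasses leave load_weights and save_weights as stubs, and weights_filename depends on a self.name attribute that subclasses are presumably expected to define. A minimal sketch of how a Torch subclass might fill the stubs, assuming self.network is a torch.nn.Module (the filename argument on load is added here purely for illustration):

import torch

class ExampleTorchModel(TorchModelBase):
    name = "example"  # consumed by weights_filename

    def save_weights(self):
        # weights_filename creates the weights dir and returns a timestamped .pt path
        torch.save(self.network.state_dict(), self.weights_filename)

    def load_weights(self, filename:str):
        self.network.load_state_dict(torch.load(filename))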
53 changes: 53 additions & 0 deletions MedSemanticSearch/medsearch/models/clustering_model.py
@@ -0,0 +1,53 @@
import numpy as np
from dataclasses import dataclass, field
from typing import Union, List, Tuple, Callable, Dict, Optional

from medsearch.models.base import TorchModelBase
from medsearch.datasets.dataset import SemanticCorpusDataset
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

class ClusteringModel(TorchModelBase):
def __init__(self,
dataset_cls:type=SemanticCorpusDataset,
network_fn:Callable=SentenceTransformer,
dataset_args:Dict=None,
network_args:Dict=None):
super().__init__(dataset_cls, None, network_fn, dataset_args, network_args)


def word_embeddings(self, corpus):
self.embedder = lambda txt: np.array(self.network.encode(txt))
self.corpus_embed = self.embedder(corpus)

def get_similarity_vecs(self, n_clusters:int=5):
clustering_model = KMeans(n_clusters=n_clusters)
clustering_model.fit(self.corpus_embed)
cluster_assignment = clustering_model.labels_
return cluster_assignment


def run_test():
list_of_models:Dict = {1:'roberta-base-nli-stsb-mean-tokens',
2:'bert-base-nli-stsb-mean-tokens'}
model = ClusteringModel(
dataset_args={"batch":1000},
network_args={"model_name_or_path":list_of_models[1]})
data = model.data.load_one_batch()
corpus = [(f'{t} <SEP> {a}')[:512] for t,a in zip(data.title, data.paperAbstract)]

model.word_embeddings(corpus)
num_clusters=5
cluster_assignment = model.get_similarity_vecs(num_clusters)
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print(f"============ Cluster {i+1} =========================")
    print(cluster[:3])  # a few example sentences from this cluster
    print("\n ===================================")


if __name__ == "__main__":
run_test()
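
A possible refinement, not part of this commit: KMeans labels reshuffle between runs, so pinning random_state and summarizing each cluster by the sentence nearest its centroid gives stable, readable output. A sketch assuming word_embeddings() has already populated model.corpus_embed:

import numpy as np
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5, random_state=0).fit(model.corpus_embed)
for c in range(km.n_clusters):
    members = np.where(km.labels_ == c)[0]
    dists = np.linalg.norm(model.corpus_embed[members] - km.cluster_centers_[c], axis=1)
    print(f"Cluster {c+1}:", corpus[members[np.argmin(dists)]])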
28 changes: 15 additions & 13 deletions MedSemanticSearch/medsearch/models/scaNN.py
@@ -1,23 +1,16 @@
"""ScaNN Accelerate Vector Similarity Search by approximating the Maximum Inner Product Search
with an Anisotropic Loss function https://arxiv.org/abs/1908.10396"""

import os
import h5py
import requests
import tempfile
import numpy as np
import scann
from medsearch.models.base import ModelBase
from typing import Union, List, Tuple, Callable, Dict, Optional

_DOC="""ScaNN Accelerate Vector Similarity Search by approximating the Maximum Inner Product Search
with an Anisotropic Loss function https://arxiv.org/abs/1908.10396"""
import scann
import numpy as np
from medsearch.models.base import ModelBase

def get_glove_example():
with tempfile.TemporaryDirectory() as tmp:
response = requests.get("http://ann-benchmarks.com/glove-100-angular.hdf5")
loc = os.path.join(tmp, "glove.hdf5")
with open(loc, 'wb') as f:
f.write(response.content)
glove_h5py = h5py.File(loc)
return glove_h5py

class ScaNN(ModelBase):
def __init__(self, dataset_cls:type,
@@ -59,3 +52,12 @@ def compute_recall(neighbors, true_neighbors):
return total / true_neighbors.size


def get_glove_example():
with tempfile.TemporaryDirectory() as tmp:
response = requests.get("http://ann-benchmarks.com/glove-100-angular.hdf5")
loc = os.path.join(tmp, "glove.hdf5")
with open(loc, 'wb') as f:
f.write(response.content)
glove_h5py = h5py.File(loc)
return glove_h5py
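
The ScaNN class body is collapsed in this view. For reference, a minimal sketch of the kind of searcher the GloVe example feeds, following the scann library's documented builder API (the tree/score_ah/reorder values here are illustrative, not the commit's):

glove = get_glove_example()
dataset = glove['train'][:]
dataset = dataset / np.linalg.norm(dataset, axis=1)[:, np.newaxis]  # normalize for dot product

searcher = (scann.scann_ops_pybind.builder(dataset, 10, "dot_product")
            .tree(num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000)
            .score_ah(2, anisotropic_quantization_threshold=0.2)
            .reorder(100)
            .build())
neighbors, distances = searcher.search_batched(glove['test'][:])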

46 changes: 22 additions & 24 deletions MedSemanticSearch/medsearch/models/sentence_transformer_model.py
@@ -1,10 +1,10 @@
import numpy as np
from dataclasses import dataclass, field
from typing import Union, List, Tuple, Callable, Dict, Optional

from medsearch.models.base import TorchModelBase
from medsearch.models.utils import cosine_similarity
from medsearch.datasets.dataset import SemanticCorpusDataset
from sentence_transformers import SentenceTransformer

class SentenceTransformerModel(TorchModelBase):
def __init__(self,
@@ -14,18 +14,16 @@ def __init__(self,
network_args:Dict=None):
super().__init__(dataset_cls, None, network_fn, dataset_args, network_args)

def word_embeddings(self, corpus:List[str]):
    self.embedder = lambda txt: np.array(self.network.encode(txt))
    self.corpus_embed = self.embedder(corpus)

def get_similarity_vecs(self, query:Union[str,List[str]], topk:int=10):
    self.query_embed = self.embedder(query)
    scores = cosine_similarity(self.query_embed, self.corpus_embed)[0]
    results = zip(range(len(scores)), scores)
    results = sorted(results, key=lambda x: x[1], reverse=True)
    return results[:topk]



@@ -38,16 +36,16 @@ def run_test():
network_args={"model_name_or_path":list_of_models[1]})
data = model.data.load_one_batch()
corpus = [(f'{t} <SEP> {a}')[:512] for t,a in zip(data.title, data.paperAbstract)]

queries = ["breast cancer"]
model.word_embeddings(queries, corpus)
sentences, scores = model.get_similarity_vecs()

print(f"queries: {queries}")
for i, (st, sc) in enumerate(zip(sentences,scores)):
print(f"similar paper {i} Score : {sc}")
print(f"{st}")
print(f"-------------------------------------")
queries = ["breast cancer", 'brain damage', 'heart attack']
model.word_embeddings(corpus)

for query in queries:
results = model.get_similarity_vecs(query)
print(f"========== Queries: {query} ================")
for i, (st, sc) in enumerate(results):
print(f"Similar paper {i} Score : {sc}")
print(f"{corpus[st]}")
print(f"-------------------------------------")

if __name__ == "__main__":
run_test()
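
Encoding the corpus dominates run time here. A hypothetical convenience, not in this commit, is caching the corpus embeddings to disk so repeated runs skip the encode step; the path name is illustrative:

import numpy as np

emb_path = "corpus_embed.npy"
try:
    model.corpus_embed = np.load(emb_path)
    # get_similarity_vecs still needs the embedder for the query
    model.embedder = lambda txt: np.array(model.network.encode(txt))
except FileNotFoundError:
    model.word_embeddings(corpus)
    np.save(emb_path, model.corpus_embed)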
16 changes: 10 additions & 6 deletions MedSemanticSearch/medsearch/models/tfidf_model.py
@@ -1,9 +1,13 @@
import numpy as np
import pandas as pd

from tqdm import tqdm
from typing import Union, List, Tuple, Callable, Dict, Optional

from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from medsearch.models.base import ModelBase
from medsearch.datasets.dataset import SemanticCorpusDataset

@@ -26,12 +30,12 @@ def fit(self,corpus:List[str]):
X = np.asarray(V.fit_transform(corpus).astype(np.float32).todense())
return X, V

def dotSimilarity(self, X:np.ndarray, nTake:int=50) -> List[List[int]]:
S = X @ X.T
simVec = np.argsort(S, axis=1)[:, :-nTake-1:-1]  # top-nTake neighbors per row, most similar first
return simVec.tolist()

def svmSimilarity(self, X:np.ndarray, nTake:int=40) -> List[List[int]]:
n,_= X.shape
IX = np.zeros((n,nTake), dtype=np.int64)
for i in tqdm(range(n)):
@@ -47,7 +51,7 @@ def svmSimilarity(self, X, nTake=40):
return IX.tolist()


def build_search_index(self, data:pd.DataFrame, v:TfidfVectorizer):

# construct a reverse index for supporting search
vocab = v.vocabulary_
@@ -70,7 +74,7 @@ def makedict(s, forceidf=None):
idfd[w] = idfval
return idfd

def merge_dicts(dlist:List[Dict]) -> Dict:
m = {}
for d in dlist:
for k, v in d.items():
@@ -82,7 +86,7 @@ def merge_dicts(dlist):
search_dict = [merge_dicts([t,s]) for t,s in zip(dict_title, dict_summary)]
return search_dict

def run_test(save_dicts:bool=True):
model = TfidfModel(dataset_args={"batch":5000})
df = model.data.load_one_batch()
corpus = [f'{t} <SEP> {a}' for t,a in zip(df.title, df.paperAbstract)]
@@ -94,7 +98,7 @@ def modelExampleTest(save_dicts:bool=True):
model.save_weights(IX, model.data.data_dirname().parent/'sim_vecs')

if __name__ == "__main__":
modelExampleTest()
run_test()



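svmSimilarity's loop body is collapsed in this view. What such an exemplar-SVM ranking typically looks like is sketched below, with one classifier per row treating that row as the lone positive; the hyperparameters are illustrative guesses, not the commit's values:

import numpy as np
from sklearn import svm

def svm_rank_row(X:np.ndarray, i:int, nTake:int) -> np.ndarray:
    y = np.zeros(X.shape[0])
    y[i] = 1  # row i is the only positive example
    clf = svm.LinearSVC(class_weight='balanced', C=0.1, max_iter=10000, tol=1e-6)
    clf.fit(X, y)
    scores = clf.decision_function(X)  # larger score = more similar to row i
    return np.argsort(-scores)[:nTake]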
21 changes: 21 additions & 0 deletions MedSemanticSearch/medsearch/models/utils.py
@@ -0,0 +1,21 @@
import numpy as np
import torch
from torch import Tensor
from typing import Union

def cosine_similarity(a:Union[Tensor,np.ndarray], b:Union[Tensor,np.ndarray]) -> Tensor:
    """
    Computes the cosine similarity cos_sim(a[i], b[j]) for all i and j.
    :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
    """
    if isinstance(a, np.ndarray): a = torch.tensor(a)
    if isinstance(b, np.ndarray): b = torch.tensor(b)
    if len(a.shape) == 1:
        a = a.unsqueeze(0)
    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    a_norm = a / a.norm(dim=1)[:, None]
    b_norm = b / b.norm(dim=1)[:, None]
    res = torch.mm(a_norm, b_norm.transpose(0, 1))
    return res

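A quick sanity check for the helper above, using toy vectors whose cosine similarities are known (identical rows score 1, orthogonal rows 0):

import numpy as np

a = np.array([[1.0, 0.0], [0.0, 1.0]])
b = np.array([[1.0, 0.0]])
print(cosine_similarity(a, b))  # tensor([[1.], [0.]], dtype=torch.float64)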