AstridEmbed.py
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
import pandas as pd
import misc_utils
from string_dataset_helpers import TripletStringDataset, StringSelectivityDataset
import EmbeddingLearner
import SupervisedSelectivityEstimator
embedding_learner_configs, frequency_configs, selectivity_learner_configs = None, None, None
#This function gives a single place to change all the necessary configurations.
#Please see misc_utils for some additional descriptions of what these attributes mean
def setup_configs():
    global embedding_learner_configs, frequency_configs, selectivity_learner_configs
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    embedding_learner_configs = misc_utils.AstridEmbedLearnerConfigs(embedding_dimension=64, batch_size=128,
        num_epochs=32, margin=0.2, device=device, lr=0.001, channel_size=8)
    path = "datasets/dblp/"
    #This assumes that the prepare_dataset function was called to output the files.
    #If not, please change the file names appropriately.
    file_name_prefix = "dblp_titles"
    query_type = "prefix"
    frequency_configs = misc_utils.StringFrequencyConfigs(
        string_list_file_name=path + file_name_prefix + ".csv",
        selectivity_file_name=path + file_name_prefix + "_" + query_type + "_counts.csv",
        triplets_file_name=path + file_name_prefix + "_" + query_type + "_triplets.csv"
    )
    selectivity_learner_configs = misc_utils.SelectivityEstimatorConfigs(
        embedding_dimension=64, batch_size=128, num_epochs=64, device=device, lr=0.001,
        #min_val and max_val will be updated in compute_normalized_selectivities
        min_val=0.0, max_val=1.0,
        embedding_model_file_name=path + file_name_prefix + "_" + query_type + "_embedding_model.pth",
        selectivity_model_file_name=path + file_name_prefix + "_" + query_type + "_selectivity_model.pth"
    )
    return embedding_learner_configs, frequency_configs, selectivity_learner_configs
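
#With the default configs above, the following files are expected/produced under datasets/dblp/
#(names are derived from file_name_prefix and query_type; adjust if your layout differs):
#  dblp_titles.csv                          -- list of strings used to build the vocabulary
#  dblp_titles_prefix_counts.csv            -- (string, selectivity) pairs for prefix queries
#  dblp_titles_prefix_triplets.csv          -- (Anchor, Positive, Negative) training triplets
#  dblp_titles_prefix_embedding_model.pth   -- saved embedding model (output)
#  dblp_titles_prefix_selectivity_model.pth -- saved selectivity estimator (output)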

#This function trains and returns the embedding model
def train_astrid_embedding_model(string_helper, model_output_file_name=None):
    global embedding_learner_configs, frequency_configs
    #Sometimes special strings such as nan, or strings that start with a number, confuse Pandas'
    #type inference, so explicitly cast the triplet columns to str.
    df = pd.read_csv(frequency_configs.triplets_file_name)
    df["Anchor"] = df["Anchor"].astype(str)
    df["Positive"] = df["Positive"].astype(str)
    df["Negative"] = df["Negative"].astype(str)
    triplet_dataset = TripletStringDataset(df, string_helper)
    train_loader = DataLoader(triplet_dataset, batch_size=embedding_learner_configs.batch_size, shuffle=True)
    embedding_model = EmbeddingLearner.train_embedding_model(embedding_learner_configs, train_loader, string_helper)
    if model_output_file_name is not None:
        torch.save(embedding_model.state_dict(), model_output_file_name)
    return embedding_model
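
#Note on the objective above: the (Anchor, Positive, Negative) triplets and the margin in
#embedding_learner_configs suggest a triplet-style loss (the anchor is pulled toward the positive
#string and pushed away from the negative by at least the margin); the exact formulation is
#implemented in EmbeddingLearner and may differ in detail.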

#This function performs min-max scaling over log-transformed selectivities.
#Typically, the selectivities are very skewed.
#This transformation reduces the skew and makes it easier for the deep learning model to learn.
def compute_normalized_selectivities(df):
    global selectivity_learner_configs
    normalized_selectivities, min_val, max_val = misc_utils.normalize_labels(df["selectivity"])
    df["normalized_selectivities"] = normalized_selectivities
    #namedtuples are immutable - so replace the configs with an updated instance
    selectivity_learner_configs = selectivity_learner_configs._replace(min_val=min_val, max_val=max_val)
    return df
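
#For intuition, a minimal sketch of the normalization assumed above (the actual implementation
#lives in misc_utils.normalize_labels / misc_utils.unnormalize_torch and may differ in detail):
#  log_sel    = log(selectivity)
#  normalized = (log_sel - min_val) / (max_val - min_val)   #min_val/max_val over the log values
#and unnormalization inverts it:
#  selectivity = exp(normalized * (max_val - min_val) + min_val)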

#This function trains and returns the selectivity estimator.
def train_selectivity_estimator(train_df, string_helper, embedding_model, model_output_file_name=None):
    global selectivity_learner_configs, frequency_configs
    string_dataset = StringSelectivityDataset(train_df, string_helper, embedding_model)
    train_loader = DataLoader(string_dataset, batch_size=selectivity_learner_configs.batch_size, shuffle=True)
    selectivity_model = SupervisedSelectivityEstimator.train_selEst_model(selectivity_learner_configs, train_loader, string_helper)
    if model_output_file_name is not None:
        torch.save(selectivity_model.state_dict(), model_output_file_name)
    return selectivity_model

#This is a helper function to get selectivity estimates for an iterable of strings
def get_selectivity_for_strings(strings, embedding_model, selectivity_model, string_helper):
    global selectivity_learner_configs
    from SupervisedSelectivityEstimator import SelectivityEstimator
    embedding_model.eval()
    selectivity_model.eval()
    strings_as_tensors = []
    with torch.no_grad():
        for string in strings:
            string_as_tensor = string_helper.string_to_tensor(string)
            #By default the embedding model expects a tensor of [batch_size, alphabet_size, max_string_length],
            #so add a "fake" batch dimension that converts the 2D matrix into a 3D tensor
            string_as_tensor = string_as_tensor.view(-1, *string_as_tensor.shape)
            strings_as_tensors.append(embedding_model(string_as_tensor).numpy())
        strings_as_tensors = np.concatenate(strings_as_tensors)
        #normalized_predictions are between 0 and 1 after the log and min-max scaling;
        #denormalized_predictions are the corresponding frequencies between 0 and N
        normalized_predictions = selectivity_model(torch.tensor(strings_as_tensors))
        denormalized_predictions = misc_utils.unnormalize_torch(normalized_predictions, selectivity_learner_configs.min_val,
            selectivity_learner_configs.max_val)
    return normalized_predictions, denormalized_predictions
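
#Example usage (hypothetical query strings; assumes the models and string_helper are already loaded):
#  norm, denorm = get_selectivity_for_strings(["data", "database systems"],
#                                             embedding_model, selectivity_model, string_helper)
#  print(denorm)  #estimated match counts for the two query strings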

def load_embedding_model(model_file_name, string_helper):
    from EmbeddingLearner import EmbeddingCNNNetwork
    embedding_model = EmbeddingCNNNetwork(string_helper, embedding_learner_configs)
    embedding_model.load_state_dict(torch.load(model_file_name))
    return embedding_model

def load_selectivity_estimation_model(model_file_name, string_helper):
    from SupervisedSelectivityEstimator import SelectivityEstimator
    selectivity_model = SelectivityEstimator(string_helper, selectivity_learner_configs)
    selectivity_model.load_state_dict(torch.load(model_file_name))
    return selectivity_model

def main():
    global embedding_learner_configs, frequency_configs, selectivity_learner_configs
    random_seed = 1234
    misc_utils.initialize_random_seeds(random_seed)
    #Set the configs. Assign to the module-level variables so that updates made later
    #(e.g. min_val/max_val in compute_normalized_selectivities) are visible here as well.
    embedding_learner_configs, frequency_configs, selectivity_learner_configs = setup_configs()
    embedding_model_file_name = selectivity_learner_configs.embedding_model_file_name
    selectivity_model_file_name = selectivity_learner_configs.selectivity_model_file_name
    string_helper = misc_utils.setup_vocabulary(frequency_configs.string_list_file_name)

    #You can comment/uncomment the following two lines based on whether you
    #want to train from scratch or just reload a previously trained embedding model.
    embedding_model = train_astrid_embedding_model(string_helper, embedding_model_file_name)
    #embedding_model = load_embedding_model(embedding_model_file_name, string_helper)

    #Load the input file and split it into a 50-50 train/test split
    df = pd.read_csv(frequency_configs.selectivity_file_name)
    #Strings that start with numbers, or special strings such as nan,
    #can confuse Pandas' type inference, so explicitly cast the column to str.
    df["string"] = df["string"].astype(str)
    df = compute_normalized_selectivities(df)
    train_indices, test_indices = train_test_split(df.index, random_state=random_seed, test_size=0.5)
    train_df, test_df = df.iloc[train_indices], df.iloc[test_indices]

    #You can comment/uncomment the following two lines based on whether you
    #want to train from scratch or just reload a previously trained selectivity estimator.
    selectivity_model = train_selectivity_estimator(train_df, string_helper,
        embedding_model, selectivity_model_file_name)
    #selectivity_model = load_selectivity_estimation_model(selectivity_model_file_name, string_helper)

    #Get the predictions from the learned model and compute basic summary statistics
    normalized_predictions, denormalized_predictions = get_selectivity_for_strings(
        test_df["string"].values, embedding_model, selectivity_model, string_helper)
    actual = torch.tensor(test_df["normalized_selectivities"].values)
    test_q_error = misc_utils.compute_qerrors(normalized_predictions, actual,
        selectivity_learner_configs.min_val, selectivity_learner_configs.max_val)
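    #q-error is the standard accuracy metric for selectivity estimation; a common definition
    #(assumed here - the exact computation lives in misc_utils.compute_qerrors) is
    #  q_error = max(estimate / actual, actual / estimate)
    #typically computed on the denormalized counts, so 1.0 means a perfect prediction.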
print("Test data: Mean q-error loss ", np.mean(test_q_error))
print("Test data: Summary stats of Loss: Percentile: [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99] ", [np.quantile(test_q_error, q) for q in [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]])

if __name__ == "__main__":
    main()