Add time sensitive approach #151

Open · wants to merge 1 commit into base: 149-reproduce
28 changes: 21 additions & 7 deletions run_experiment_ts_disambiguation.py
@@ -68,12 +68,19 @@ def eval_sense(lemma,
vector_col=vector_col,
df_train = df_train, axis=1)

# Inspired by https://recordlinkage.readthedocs.io/en/latest/ref-compare.html#recordlinkage.compare.Numeric
df_test[f"bert_ts_weighted_centroid_sense_gauss_{vector_col}"] = df_test.apply(wsd.bert_ts_sense_centroid_vector,
senseid2label= senseid2label,
ts_method='weighted_gauss',
vector_col=vector_col,
df_train = df_train, axis=1)

# TO DO: uncomment this after merging with dev
#df_test[f"bert_ts_weighted_centroid_sense_{vector_col}"] = df_test.apply(wsd.bert_ts_sense_centroid_vector,
# senseid2label= senseid2label,
# ts_method='weighted_past',
# vector_col=vector_col,
# df_train = df_train, axis=1)
df_test[f"bert_ts_weighted_centroid_sense_past_{vector_col}"] = df_test.apply(wsd.bert_ts_sense_centroid_vector,
senseid2label= senseid2label,
ts_method='weighted_past',
vector_col=vector_col,
df_train = df_train, axis=1)
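
For readers skimming the diff: each of these columns is produced by a row-wise `df.apply`, with the shared inputs (`senseid2label`, `ts_method`, `vector_col`, `df_train`) forwarded as keyword arguments to `wsd.bert_ts_sense_centroid_vector`. A minimal sketch of the pattern with a toy stand-in function (the stand-in and data below are invented for illustration):

```python
import numpy as np
import pandas as pd

# Toy stand-in for wsd.bert_ts_sense_centroid_vector: any function that
# takes the row first and the shared keyword arguments after it fits here.
def toy_ts_method(row, df_train, ts_method, vector_col):
    # predict the label of the training quotation nearest in time
    nearest = (df_train.year - row.year).abs().idxmin()
    return df_train.loc[nearest, "label"]

df_train = pd.DataFrame({"year": [1760, 1900],
                         "label": ["0", "1"],
                         "vector": [np.ones(3), np.zeros(3)]})
df_test = pd.DataFrame({"year": [1770, 1895]})

df_test["pred"] = df_test.apply(toy_ts_method,
                                df_train=df_train,
                                ts_method="nearest",
                                vector_col="vector", axis=1)
print(df_test)  # rows predicted "0" and "1" respectively
```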



@@ -118,7 +125,8 @@ def run(lemma,
f"bert_centroid_sense_{vector_col}",
f"bert_ts_nearest_centroid_sense_{vector_col}",
f"bert_ts_weighted_centroid_sense_{vector_col}",
# f"bert_ts_weighted_past_centroid_sense_{vector_col}",
f"bert_ts_weighted_centroid_sense_past_{vector_col}",
f"bert_ts_weighted_centroid_sense_gauss_{vector_col}",
]
for vector_col in vector_cols]
bert_methods = [i for tm in bert_methods for i in tm]
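
The comprehension above builds one list of column names per vector column and then flattens the list of lists; the same idiom in miniature (column names shortened for illustration):

```python
vector_cols = ["vec_a", "vec_b"]
bert_methods = [[f"bert_centroid_sense_{vc}",
                 f"bert_ts_weighted_centroid_sense_gauss_{vc}"]
                for vc in vector_cols]
# flatten the nested lists into one flat list of column names
bert_methods = [i for tm in bert_methods for i in tm]
print(bert_methods)  # four column names, two per vector column
```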
@@ -132,7 +140,7 @@
def run_experiment(END):
RELATIONS = ['seed','synonym']
EVAL_MODE = 'lemma_etal'
WEMB_MODEL = Word2Vec.load("models/w2v_004/w2v_words.model")
WEMB_MODEL = Word2Vec.load("models/word2vec/w2v_1760_1900/w2v_words.model")
TRAIN_ON_DEV = True

# arguments that change from experiment to experiment
@@ -153,10 +161,15 @@ def run_experiment(END):
words = [['anger',"NN"],["apple","NN"],["art","NN"],["democracy","NN"],
["happiness","NN"],["labour","NN"],["machine","NN"],["man","NN"],
["nation","NN"],["power","NN"],["slave","NN"],['woman','NN']]

# words = [["machine","NN"]]

errors = []

for lemma, pos in words:

print(lemma, pos)

quotations_path = f"./data/sfrel_quotations_{lemma}_{pos}.pickle"
lemma_senses = pd.read_pickle(f'./data/lemma_senses_{lemma}_{pos}.pickle')

@@ -184,6 +197,7 @@ def run_experiment(END):
except Exception as e:
print(sense,e)
errors.append(sense)

print("Done.")
print("Errors with the following senses:")
print(errors)
61 changes: 58 additions & 3 deletions tasks/wsd.py
@@ -200,12 +200,61 @@ def weighted(df,year,vector_col,level='label') -> pd.Series:
"""
# 1 over the distance in years
df['temp_dist'] = (1 / (abs(year - df.year) + 1))

# normalize, so weights add up to one
df['temp_dist'] = df['temp_dist'] / sum(df['temp_dist'])
# time weighted vector (tw_vector) is the product of the vector and the weight
df['tw_vector'] = df[vector_col] * df['temp_dist']
# sum vectors by label (sum or mean??)
return df.groupby(level)['tw_vector'].apply(np.sum,axis=0)
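
As a sanity check on the inverse-distance weighting above: for a query year of 1850 and training quotations from 1849 and 1860, the raw weights are 1/2 and 1/11, which normalize to roughly 0.85 and 0.15. A standalone sketch of just this weighting step (toy vectors invented for illustration):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"year": [1849, 1860],
                   "label": ["0", "1"],
                   "vector": [np.array([1.0, 0.0]), np.array([0.0, 1.0])]})
year = 1850

df["temp_dist"] = 1 / (abs(year - df.year) + 1)           # 0.50 and ~0.09
df["temp_dist"] = df["temp_dist"] / sum(df["temp_dist"])  # ~0.85 and ~0.15
df["tw_vector"] = df["vector"] * df["temp_dist"]
print(df.groupby("label")["tw_vector"].apply(np.sum, axis=0))
```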


def weighted_gauss(df,year,vector_col,level='label', offset=0.5, scale=0.5) -> pd.Series:
"""This function weights vector representation of
target words by their distance to the year
of the query vector. This is repeated for each
sense_id or label (i.e. value of `level` argument).

It returns sense level or binary time weighted centroid vectors

Arguments:
df (pd.DataFrame): the training data from which to construct
the time sensitive embedding
year (int): year of the vector to disambiguate
vector_col (str): name of the column in which the target vector is stored
level (str): use 'label' for binary centroid vector,
use `sense_id` for sense level centroid vectors

Returns:
as element of type pd.Series with index=level and
values the centroid vector (in this the weighted vectors
averaged by the specified level)

"""

# Date of latest quotation in the training data:
max_date = df.year.max()

# Date of first quotation in the training data:
min_date = df.year.min()

# Date of the target quotation:
origin = year

offset = offset * (max_date-min_date)
scale = scale * (max_date-min_date)

# Gauss-based weighting:
# Inspired by: https://recordlinkage.readthedocs.io/en/latest/ref-compare.html#recordlinkage.compare.Numeric
d = (abs(df.year - origin)).clip(offset, None)
expr = '2**(-((d-offset)/scale)**2)'
df['temp_dist'] = pd.eval(expr)

# Time weighted vector (tw_vector) is the product of the vector and the weight
df['tw_vector'] = df[vector_col] * df['temp_dist']

# sum vectors by label (sum or mean??)
return df.groupby(level)['tw_vector'].apply(np.sum,axis=0)
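
To make the kernel concrete: with the defaults `offset=0.5` and `scale=0.5` and training years spanning 1760-1900 (a 140-year range), the effective offset and scale are both 70 years, so any quotation within 70 years of the query keeps the full weight of 1.0, and one 140 years away drops to 2^-1 = 0.5. A standalone sketch of just the weighting step (years invented for illustration):

```python
import pandas as pd

years = pd.Series([1760, 1830, 1900])
origin = 1760                            # year of the target quotation

span = years.max() - years.min()         # 140
offset, scale = 0.5 * span, 0.5 * span   # 70, 70

d = (years - origin).abs().clip(offset, None)  # 70, 70, 140
weights = 2 ** (-((d - offset) / scale) ** 2)  # 1.0, 1.0, 0.5
print(weights.tolist())
```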

def weighted_past(df,year,vector_col,level='label') -> pd.Series:
"""This function weights vector representation of
@@ -313,7 +362,7 @@ class as "0" or "1" as string

vector, year = row[vector_col],row.year

ts_methods = ['weighted','nearest','weighted_past']
ts_methods = ['weighted','nearest','weighted_past','weighted_gauss']


if ts_method=='weighted':
@@ -325,6 +374,9 @@ class as "0" or "1" as string
elif ts_method=='weighted_past':
# time-weighted centroid using only past quotations
centroid_vectors = weighted_past(df_train,year,vector_col)
elif ts_method=='weighted_gauss':
# Gaussian time-weighted centroid vectors
centroid_vectors = weighted_gauss(df_train,year,vector_col)
else:
assert ts_method in ts_methods, f'ts_method should be one of the following options {ts_methods}'

@@ -369,7 +421,7 @@ class as "0" or "1" string
# if lemma doesn't appear in train return '0'
if not df_train_lemma.shape[0]: return '0'

ts_methods = ['nearest','weighted','weighted_past']
ts_methods = ['nearest','weighted','weighted_past','weighted_gauss']

if ts_method=='weighted':
# weight vector by distance
@@ -380,6 +432,9 @@ class as "0" or "1" string
elif ts_method=='weighted_past':
# sense-level time-weighted centroids using only past quotations
centroid_vectors = weighted_past(df_train_lemma,row.year,vector_col,level='sense_id')
elif ts_method=='weighted_gauss':
# Gaussian time-weighted sense-level centroid vectors
centroid_vectors = weighted_gauss(df_train_lemma,row.year,vector_col,level='sense_id')
else:
assert ts_method in ts_methods, f'ts_method should be one of the following options {ts_methods}'

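
For reviewers weighing the two kernels against each other: the inverse-distance weighting used by `weighted` decays sharply near the query year, while the Gaussian keeps a flat plateau of `offset` years before decaying (and, unlike `weighted`, is not normalized to sum to one before the vectors are combined). A quick standalone comparison (distances and parameters invented for illustration):

```python
import numpy as np

dist = np.array([0, 35, 70, 105, 140])         # years from the query
inverse = 1 / (dist + 1)                       # kernel used by `weighted`

offset, scale = 70, 70                         # 0.5 * a 140-year range
d = np.clip(dist, offset, None)
gauss = 2.0 ** (-((d - offset) / scale) ** 2)  # kernel used by `weighted_gauss`

for row in zip(dist, inverse, gauss):
    print("dist=%3d  inverse=%.3f  gauss=%.3f" % row)
```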