
Commit ba11107 (1 parent: 1d6dd6f)

Manhattan distance combiners
mLSTM transformers; slightly enhanced sts_bechmark_light.py

File tree

8 files changed: +153,903 -5 lines

sts_bechmark_light.py (+5 -4)
@@ -449,12 +449,12 @@ def _build_distance_estimator(X, y, w2v, PoS, NER, regressor, verbose=1):
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
-        ("sent_tfidf", Pipeline([
+        ("sent_tfidf_cosine", Pipeline([
             ("pairs", PairTransformer(element_transformer=Pipeline([
-                ("1st_verb", FuncTransformer(func=get_text)),
+                ("get_text", FuncTransformer(func=get_text)),
                 ("shaper", Shaper(newshape=(-1,))),
                 ("tf-idf", TfidfVectorizer(analyzer="char_wb",
-                                           ngram_range=(2, 3),
+                                           ngram_range=(2, 4),
                                            dtype=np.float32,
                                            decode_error="replace",
                                            stop_words="english"))
@@ -550,7 +550,8 @@ def _build_distance_estimator(X, y, w2v, PoS, NER, regressor, verbose=1):
 
     if args.verbose == 1:
         print score
-
+    # recent run with default parameter values:
+    # {'test_score': 0.73496312643370554, 'dev_score': 0.79295106912391955}
     pickle.dump(score,
                 open("score.pickle", "wb"),
                 protocol=pickle.HIGHEST_PROTOCOL)
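
The renamed sent_tfidf_cosine step widens the character n-gram range from (2, 3) to (2, 4). A minimal sketch of what this branch computes on one sentence pair, using plain scikit-learn instead of the project's PairTransformer/FuncTransformer wrappers (the explicit pairing and cosine scoring below are an assumption for illustration, not the repository's code):

# Sketch: char_wb TF-IDF over a sentence pair, scored by cosine similarity.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pair = ("A man is riding an electric bicycle.",
        "A man is riding a bicycle.")

vectorizer = TfidfVectorizer(analyzer="char_wb",
                             ngram_range=(2, 4),
                             dtype=np.float32,
                             decode_error="replace",
                             stop_words="english")
tfidf = vectorizer.fit_transform(pair)          # one row per sentence
print(cosine_similarity(tfidf[0], tfidf[1]))    # high for near-paraphrases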

utils/LSTM/__init__.py (+18)
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of sensim
+
+"""Helpers for sentence semantic similarity model.
+
+.. Author:: Hussein AL-NATSHEH <hussein.al-natsheh@cnrs.fr>
+
+"""
+
+"""Helper functions."""
+
+from .mlstm import mlstm_transformer
+from .mlstm import mlstm_element_transformer
+
+
+__all__ = ("mlstm_transformer",
+           "mlstm_element_transformer")

utils/LSTM/mlstm.py (+117)
@@ -0,0 +1,117 @@
+# coding: utf-8
+
+# License: BSD 3 clause
+# 2018
+"""
+.. coauthor:: Haitham Selawi <haitham.selawi@mawdoo3.com>
+.. coauthor:: Hussein Al-Natsheh <hussein.al-natsheh@cnrs.fr>
+
+"""
+import keras.backend as K
+from keras.models import load_model
+from keras.models import model_from_yaml
+from keras.preprocessing.sequence import pad_sequences
+from keras.preprocessing.text import Tokenizer
+
+import numpy as np
+import pandas as pd
+import pickle
+
+from scipy.stats import pearsonr
+
+path = "../sensim/utils/LSTM/"
+
+tokenizer = pickle.load(open(path + 'tokenizer.p', 'rb'))
+
+yaml_file = open(path + 'model.yaml', 'r')
+loaded_model_yaml = yaml_file.read()
+yaml_file.close()
+
+loaded_model = model_from_yaml(loaded_model_yaml)
+loaded_model.load_weights(path + 'model.h5')
+loaded_model.compile(loss='mse', optimizer='adam')
+
+maxlen = loaded_model.get_layer(index=0).input_shape[1]
+
+# A transformer that takes a vector of sentence pairs and returns
+# the numpy array encoding of each pair.
+def mlstm_transformer(X):
+    Xt = np.zeros(len(X), dtype=np.object)
+    for (i, x) in enumerate(X):
+        xt = tokenizer.texts_to_sequences(x)
+        xt = pad_sequences(xt, maxlen=maxlen)
+        xt = loaded_model.predict(xt)
+        Xt[i] = xt
+    return Xt
+
+# Element-wise version of mlstm_transformer.
+def mlstm_element_transformer(x):
+    xt = tokenizer.texts_to_sequences(x)
+    xt = pad_sequences(xt, maxlen=maxlen)
+    xt = loaded_model.predict(xt)
+    return xt
+
+def estimate(X):
+    Xt = mlstm_transformer(X)
+    y = np.zeros(len(X), dtype=np.float32)
+    for (i, xt) in enumerate(Xt):
+        y[i] = 5 * np.exp(-np.linalg.norm(xt[0] - xt[1], ord=1))
+    return y
+
+def read_tsv(path_to_file):
+    df = pd.DataFrame()
+    line_nb = 0
+    with open(path_to_file) as file:
+        for line in file.readlines():
+            for num, cell in enumerate(line.split('\t')):
+                df.set_value(line_nb, 'column_{}'.format(num), cell.strip())
+            line_nb = line_nb + 1
+    return df
+
+def df_2_dset(dframe, sent1_col="Sent1", sent2_col="Sent2", score_col="Score"):
+    X = dframe.as_matrix(columns=[sent1_col, sent2_col])
+    y = dframe[score_col].values
+    return X, y
+
+def load_sts_benchmark_dataset(dframe_file):
+    dframe = read_tsv(dframe_file)
+    dframe["Score"] = np.array(dframe['column_4'], dtype=np.float32)
+    X, y = df_2_dset(dframe, sent1_col="column_5", sent2_col="column_6")
+    return X, y
+
+def sts_score(X, y, decimals=2):
+    y_est = estimate(X)
+    y_est[np.where(y_est > 5)] = 5
+    y_est[np.where(y_est < 0)] = 0
+    if decimals is not None:
+        y_est = np.round(y_est, decimals=decimals)
+    pickle.dump(y_est, open("y_est.p", "wb"))
+    print ("y_est is pickled at y_est.p")
+    return pearsonr(y_est, y)[0]
+
+X_test, y_test = load_sts_benchmark_dataset(path + 'sts-test.csv')
+score = sts_score(X_test, y_test)  # , decimals=None)
+print score
+pickle.dump(score, open("score.p", "wb"))
+
+
+"""
+# Example
+# from the test set: gold score 3.5, estimated as 3.62
+line1_1 = 'A man is riding an electric bicycle.'
+line1_2 = 'A man is riding a bicycle.'
+
+# from the test set: gold score 2.0, estimated as 2.31
+line2_1 = 'A man is slicing a bun.'
+line2_2 = 'A man is slicing a tomato.'
+
+X = [[line1_1, line1_2],
+     [line2_1, line2_2]
+    ]
+
+
+Xt = mlstm_transformer(X)
+
+for xt in Xt:
+    print(5 * np.exp(-np.linalg.norm(xt[0] - xt[1], ord=1)))
+"""

utils/LSTM/model.h5 (binary, 17.8 MB; not shown)

utils/LSTM/model.yaml (+79)
@@ -0,0 +1,79 @@
+backend: !!python/unicode 'tensorflow'
+class_name: Model
+config:
+  input_layers:
+  - [input_words, 0, 0]
+  layers:
+  - class_name: InputLayer
+    config:
+      batch_input_shape: !!python/tuple [null, 56]
+      dtype: int32
+      name: input_words
+      sparse: false
+    inbound_nodes: []
+    name: input_words
+  - class_name: Embedding
+    config:
+      activity_regularizer: null
+      batch_input_shape: !!python/tuple [null, null]
+      dtype: float32
+      embeddings_constraint: null
+      embeddings_initializer:
+        class_name: RandomUniform
+        config: {maxval: 0.05, minval: -0.05, seed: null}
+      embeddings_regularizer: null
+      input_dim: 15349
+      input_length: null
+      mask_zero: false
+      name: word_embedding
+      output_dim: 300
+      trainable: false
+    inbound_nodes:
+    - - - input_words
+        - 0
+        - 0
+        - {}
+    name: word_embedding
+  - class_name: LSTM
+    config:
+      activation: tanh
+      activity_regularizer: null
+      bias_constraint: null
+      bias_initializer:
+        class_name: Zeros
+        config: {}
+      bias_regularizer: null
+      dropout: 0.0
+      go_backwards: false
+      implementation: 1
+      kernel_constraint: null
+      kernel_initializer:
+        class_name: VarianceScaling
+        config: {distribution: uniform, mode: fan_avg, scale: 1.0, seed: null}
+      kernel_regularizer: null
+      name: bidirectional
+      recurrent_activation: hard_sigmoid
+      recurrent_constraint: null
+      recurrent_dropout: 0.0
+      recurrent_initializer:
+        class_name: Orthogonal
+        config: {gain: 1.0, seed: null}
+      recurrent_regularizer: null
+      return_sequences: false
+      return_state: false
+      stateful: false
+      trainable: true
+      unit_forget_bias: true
+      units: 50
+      unroll: false
+      use_bias: true
+    inbound_nodes:
+    - - - word_embedding
+        - 0
+        - 0
+        - {}
+    name: bidirectional
+  name: model_2
+  output_layers:
+  - [bidirectional, 0, 0]
+keras_version: 2.1.4
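
Read off the config above, the serialized graph is a 56-token input, a frozen 15349 x 300 embedding, and a single 50-unit LSTM. A functional-API sketch of the equivalent model (reconstructed from the YAML, not code from the commit; keras 2.1.x syntax assumed):

# Equivalent model to the YAML config, rebuilt with the functional API.
from keras.layers import Input, Embedding, LSTM
from keras.models import Model

inp = Input(shape=(56,), dtype='int32', name='input_words')
emb = Embedding(input_dim=15349, output_dim=300,
                trainable=False, name='word_embedding')(inp)
# Despite its name, "bidirectional" is a plain forward LSTM in the config.
out = LSTM(units=50, activation='tanh',
           recurrent_activation='hard_sigmoid',
           name='bidirectional')(emb)
model = Model(inputs=inp, outputs=out, name='model_2')
model.compile(loss='mse', optimizer='adam')  # as mlstm.py does after loading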
