
Commit ba11107 (1 parent: 1d6dd6f)

Manhattan distance combiners
mLSTM transformers; slightly enhanced sts_bechmark_light.py

File tree

8 files changed: +153,903 -5 lines

sts_bechmark_light.py (+5 -4)
@@ -449,12 +449,12 @@ def _build_distance_estimator(X, y, w2v, PoS, NER, regressor, verbose=1):
             ('sd', SolveDuplicate()),
             ('ac', AvgPOSCombiner()),
         ])),
-        ("sent_tfidf", Pipeline([
+        ("sent_tfidf_cosine", Pipeline([
             ("pairs", PairTransformer(element_transformer=Pipeline([
-                ("1st_verb", FuncTransformer(func=get_text)),
+                ("get_text", FuncTransformer(func=get_text)),
                 ("shaper", Shaper(newshape=(-1,))),
                 ("tf-idf", TfidfVectorizer(analyzer="char_wb",
-                                           ngram_range=(2, 3),
+                                           ngram_range=(2, 4),
                                            dtype=np.float32,
                                            decode_error="replace",
                                            stop_words="english"))
@@ -550,7 +550,8 @@ def _build_distance_estimator(X, y, w2v, PoS, NER, regressor, verbose=1):
 
     if args.verbose == 1:
         print score
-
+    # recent run with default parameter values:
+    # {'test_score': 0.73496312643370554, 'dev_score': 0.79295106912391955}
     pickle.dump(score,
                 open("score.pickle", "wb"),
                 protocol=pickle.HIGHEST_PROTOCOL)
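
The renamed sent_tfidf_cosine step widens the character n-gram range from (2, 3) to (2, 4). A minimal sketch of what this branch computes on one sentence pair, using plain scikit-learn instead of the project's PairTransformer/FuncTransformer wrappers (the explicit pairing and cosine scoring below are an assumption for illustration, not the repository's code):

# Sketch: char_wb TF-IDF over a sentence pair, scored by cosine similarity.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pair = ("A man is riding an electric bicycle.",
        "A man is riding a bicycle.")

vectorizer = TfidfVectorizer(analyzer="char_wb",
                             ngram_range=(2, 4),
                             dtype=np.float32,
                             decode_error="replace",
                             stop_words="english")
tfidf = vectorizer.fit_transform(pair)          # one row per sentence
print(cosine_similarity(tfidf[0], tfidf[1]))    # high for near-paraphrases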

utils/LSTM/__init__.py (+18)
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of sensim
+
+"""Helpers for sentence semantic similarity model.
+
+.. Author:: Hussein AL-NATSHEH <hussein.al-natsheh@cnrs.fr>
+
+"""
+
+"""Helper functions."""
+
+from .mlstm import mlstm_transformer
+from .mlstm import mlstm_element_transformer
+
+
+__all__ = ("mlstm_transformer",
+           "mlstm_element_transformer")

utils/LSTM/mlstm.py (+117)
@@ -0,0 +1,117 @@
+# coding: utf-8
+
+# License: BSD 3 clause
+# 2018
+"""
+.. coauthor:: Haitham Selawi <haitham.selawi@mawdoo3.com>
+.. coauthor:: Hussein Al-Natsheh <hussein.al-natsheh@cnrs.fr>
+
+"""
+import keras.backend as K
+from keras.models import load_model
+from keras.models import model_from_yaml
+from keras.preprocessing.sequence import pad_sequences
+from keras.preprocessing.text import Tokenizer
+
+import numpy as np
+import pandas as pd
+import pickle
+
+from scipy.stats import pearsonr
+
+path = "../sensim/utils/LSTM/"
+
+tokenizer = pickle.load(open(path + 'tokenizer.p', 'rb'))
+
+yaml_file = open(path + 'model.yaml', 'r')
+loaded_model_yaml = yaml_file.read()
+yaml_file.close()
+
+loaded_model = model_from_yaml(loaded_model_yaml)
+loaded_model.load_weights(path + 'model.h5')
+loaded_model.compile(loss='mse', optimizer='adam')
+
+maxlen = loaded_model.get_layer(index=0).input_shape[1]
+
+# A transformer that takes a vector of sentence pairs and returns
+# the numpy array encoding of each pair.
+def mlstm_transformer(X):
+    Xt = np.zeros(len(X), dtype=np.object)
+    for (i, x) in enumerate(X):
+        xt = tokenizer.texts_to_sequences(x)
+        xt = pad_sequences(xt, maxlen=maxlen)
+        xt = loaded_model.predict(xt)
+        Xt[i] = xt
+    return Xt
+
+# Element-wise version of mlstm_transformer.
+def mlstm_element_transformer(x):
+    xt = tokenizer.texts_to_sequences(x)
+    xt = pad_sequences(xt, maxlen=maxlen)
+    xt = loaded_model.predict(xt)
+    return xt
+
+def estimate(X):
+    Xt = mlstm_transformer(X)
+    y = np.zeros(len(X), dtype=np.float32)
+    for (i, xt) in enumerate(Xt):
+        y[i] = 5 * np.exp(-np.linalg.norm(xt[0] - xt[1], ord=1))
+    return y
+
+def read_tsv(path_to_file):
+    df = pd.DataFrame()
+    line_nb = 0
+    with open(path_to_file) as file:
+        for line in file.readlines():
+            for num, cell in enumerate(line.split('\t')):
+                df.set_value(line_nb, 'column_{}'.format(num), cell.strip())
+            line_nb = line_nb + 1
+    return df
+
+def df_2_dset(dframe, sent1_col="Sent1", sent2_col="Sent2", score_col="Score"):
+    X = dframe.as_matrix(columns=[sent1_col, sent2_col])
+    y = dframe[score_col].values
+    return X, y
+
+def load_sts_benchmark_dataset(dframe_file):
+    dframe = read_tsv(dframe_file)
+    dframe["Score"] = np.array(dframe['column_4'], dtype=np.float32)
+    X, y = df_2_dset(dframe, sent1_col="column_5", sent2_col="column_6")
+    return X, y
+
+def sts_score(X, y, decimals=2):
+    y_est = estimate(X)
+    y_est[np.where(y_est > 5)] = 5
+    y_est[np.where(y_est < 0)] = 0
+    if decimals is not None:
+        y_est = np.round(y_est, decimals=decimals)
+    pickle.dump(y_est, open("y_est.p", "wb"))
+    print ("y_est is pickled at y_est.p")
+    return pearsonr(y_est, y)[0]
+
+X_test, y_test = load_sts_benchmark_dataset(path + 'sts-test.csv')
+score = sts_score(X_test, y_test)  # , decimals=None)
+print score
+pickle.dump(score, open("score.p", "wb"))
+
+
+"""
+# Example
+# from the test set: gold score 3.5, estimated as 3.62
+line1_1 = 'A man is riding an electric bicycle.'
+line1_2 = 'A man is riding a bicycle.'
+
+# from the test set: gold score 2.0, estimated as 2.31
+line2_1 = 'A man is slicing a bun.'
+line2_2 = 'A man is slicing a tomato.'
+
+X = [[line1_1, line1_2],
+     [line2_1, line2_2]
+    ]
+
+
+Xt = mlstm_transformer(X)
+
+for xt in Xt:
+    print(5 * np.exp(-np.linalg.norm(xt[0] - xt[1], ord=1)))
+"""

utils/LSTM/model.h5 (binary, 17.8 MB; not shown)

utils/LSTM/model.yaml (+79)
@@ -0,0 +1,79 @@
+backend: !!python/unicode 'tensorflow'
+class_name: Model
+config:
+  input_layers:
+  - [input_words, 0, 0]
+  layers:
+  - class_name: InputLayer
+    config:
+      batch_input_shape: !!python/tuple [null, 56]
+      dtype: int32
+      name: input_words
+      sparse: false
+    inbound_nodes: []
+    name: input_words
+  - class_name: Embedding
+    config:
+      activity_regularizer: null
+      batch_input_shape: !!python/tuple [null, null]
+      dtype: float32
+      embeddings_constraint: null
+      embeddings_initializer:
+        class_name: RandomUniform
+        config: {maxval: 0.05, minval: -0.05, seed: null}
+      embeddings_regularizer: null
+      input_dim: 15349
+      input_length: null
+      mask_zero: false
+      name: word_embedding
+      output_dim: 300
+      trainable: false
+    inbound_nodes:
+    - - - input_words
+        - 0
+        - 0
+        - {}
+    name: word_embedding
+  - class_name: LSTM
+    config:
+      activation: tanh
+      activity_regularizer: null
+      bias_constraint: null
+      bias_initializer:
+        class_name: Zeros
+        config: {}
+      bias_regularizer: null
+      dropout: 0.0
+      go_backwards: false
+      implementation: 1
+      kernel_constraint: null
+      kernel_initializer:
+        class_name: VarianceScaling
+        config: {distribution: uniform, mode: fan_avg, scale: 1.0, seed: null}
+      kernel_regularizer: null
+      name: bidirectional
+      recurrent_activation: hard_sigmoid
+      recurrent_constraint: null
+      recurrent_dropout: 0.0
+      recurrent_initializer:
+        class_name: Orthogonal
+        config: {gain: 1.0, seed: null}
+      recurrent_regularizer: null
+      return_sequences: false
+      return_state: false
+      stateful: false
+      trainable: true
+      unit_forget_bias: true
+      units: 50
+      unroll: false
+      use_bias: true
+    inbound_nodes:
+    - - - word_embedding
+        - 0
+        - 0
+        - {}
+    name: bidirectional
+  name: model_2
+  output_layers:
+  - [bidirectional, 0, 0]
+keras_version: 2.1.4
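
Read off the config above, the serialized graph is a 56-token input, a frozen 15349 x 300 embedding, and a single 50-unit LSTM. A functional-API sketch of the equivalent model (reconstructed from the YAML, not code from the commit; keras 2.1.x syntax assumed):

# Equivalent model to the YAML config, rebuilt with the functional API.
from keras.layers import Input, Embedding, LSTM
from keras.models import Model

inp = Input(shape=(56,), dtype='int32', name='input_words')
emb = Embedding(input_dim=15349, output_dim=300,
                trainable=False, name='word_embedding')(inp)
# Despite its name, "bidirectional" is a plain forward LSTM in the config.
out = LSTM(units=50, activation='tanh',
           recurrent_activation='hard_sigmoid',
           name='bidirectional')(emb)
model = Model(inputs=inp, outputs=out, name='model_2')
model.compile(loss='mse', optimizer='adam')  # as mlstm.py does after loading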
