
Commit 041383f

Author: Thomas Kaltenbrunner
Merged PR 3308: save tokenised training data during training; minor optimisations
Related work items: #5885

2 parents b845236 + 8cab38c · commit 041383f

File tree

5 files changed: +32 −31 lines changed


src/embedding/chat_process.py

Lines changed: 11 additions & 2 deletions
@@ -2,7 +2,7 @@

 import logging
 from pathlib import Path
-
+import time
 import aiohttp

 import ai_training.chat_process as ait_c
@@ -53,22 +53,30 @@ async def chat_request(self, msg: ait_c.ChatRequestMessage):
         await self.setup_chat_session()

         # tokenize
+        t_start = time.time()
         x_tokens_testset = [
             await self.entity_wrapper.tokenize(msg.question, sw_size='xlarge')
         ]
         self.logger.debug("x_tokens_testset: {}".format(x_tokens_testset))
         self.logger.debug("x_tokens_testset: {}".format(
             len(x_tokens_testset[0])))
+        self.logger.debug("tokenizing: {}s".format(time.time() - t_start))

         # get question entities
+        t_start = time.time()
         msg_entities = await self.entity_wrapper.extract_entities(msg.question)
         self.logger.debug("msg_entities: {}".format(msg_entities))
+        self.logger.debug("msg_entities: {}s".format(time.time() - t_start))

         # get string match
+        t_start = time.time()
         sm_pred, sm_prob = await self.get_string_match(msg, msg_entities, x_tokens_testset)
+        self.logger.debug("string_match: {}s".format(time.time() - t_start))

         # entity matcher
+        t_start = time.time()
         er_pred, er_prob = await self.get_entity_match(msg, msg_entities, x_tokens_testset)
+        self.logger.debug("entity_match: {}s".format(time.time() - t_start))

         # if SM proba larger take that
         if sm_prob[0] > er_prob[0] and sm_prob[0] > STRING_PROBA_THRES:
@@ -80,8 +88,10 @@ async def chat_request(self, msg: ait_c.ChatRequestMessage):
             self.logger.info("er wins: {}".format(y_pred))
         # if both ER and SM fail completely - EMB to the rescue!
         elif x_tokens_testset[0][0] != 'UNK':
+            t_start = time.time()
             y_pred, y_prob = await self.get_embedding_match(msg, msg_entities, x_tokens_testset)
             self.logger.info("default emb: {}".format(y_pred))
+            self.logger.debug("embedding: {}s".format(time.time() - t_start))
         else:
             y_pred = [""]
             y_prob = [0.0]
@@ -178,4 +188,3 @@ async def setup_chat_session(self):
         self.cls.load_model(ai_path / MODEL_FILE)
         self.entity_wrapper.load_data(ai_path / DATA_FILE)
         self.string_match.load_train_data(ai_path / TRAIN_FILE)
-        await self.string_match.tokenize_train_data()
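Note: this diff times each stage with a repeated t_start = time.time() / logger.debug(...) pair. A small context manager could fold that pattern into one reusable helper. The sketch below is illustrative only (log_duration is not part of this codebase); it mirrors the "<stage>: <seconds>s" log format used above.

import logging
import time
from contextlib import contextmanager

@contextmanager
def log_duration(logger, label):
    # Time the enclosed block and log it in the same
    # "<label>: <seconds>s" format as the diff above.
    t_start = time.time()
    try:
        yield
    finally:
        logger.debug("{}: {}s".format(label, time.time() - t_start))

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('chat_process')

# Usage: one `with` block replaces the paired t_start/debug lines per stage.
with log_duration(logger, "tokenizing"):
    tokens = "This is a question".lower().split()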

src/embedding/entity_wrapper.py

Lines changed: 2 additions & 5 deletions
@@ -119,11 +119,8 @@ def find_matches(self, train_ents, test_match):
         for i, tr_ents in train_ents:
             num_matches = 0
             self.logger.debug("train sample ents: {}".format(tr_ents))
-            for ent in tr_ents:
-                tmp_ent = self.split_entities(ent)
-                for e in tmp_ent:
-                    if e not in ['the'] and e in test_match:
-                        num_matches += 1
+            num_matches += sum(1 if e not in ['the'] and e in test_match else 0
+                               for ent in tr_ents for e in self.split_entities(ent))
             if num_matches > max_matches:
                 max_matches = num_matches
                 matched_labels = [(i, self.train_labels[i])]
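Note: the rewritten num_matches accumulation is behaviour-preserving; the generator expression visits exactly the same (ent, e) pairs as the old nested loops. A standalone check, with a simplified stand-in for split_entities (assumed here to split an entity string into lowercase words):

def split_entities(ent):
    # Stand-in for EntityWrapper.split_entities (assumed behaviour).
    return ent.lower().split()

tr_ents = ["New York", "the Thames"]
test_match = ["new", "york", "thames"]

# Old form: explicit nested loops.
num_matches_loop = 0
for ent in tr_ents:
    for e in split_entities(ent):
        if e not in ['the'] and e in test_match:
            num_matches_loop += 1

# New form from the diff: a single generator expression.
num_matches_gen = sum(1 if e not in ['the'] and e in test_match else 0
                      for ent in tr_ents for e in split_entities(ent))

assert num_matches_loop == num_matches_gen == 3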

src/embedding/string_match.py

Lines changed: 8 additions & 22 deletions
@@ -10,15 +10,17 @@ class StringMatch:
     def __init__(self, entity_wrapper):
         self.logger = logging.getLogger('string_match')
         self.train_data = None
-        self.tok_train = []
+        self.tok_train = None
         self.entity_wrapper = entity_wrapper
         self.stopword_size = 'small'
         self.filter_entities = 'False'
+        self.custom_ents_samples = None

     def load_train_data(self, file_path):
         with file_path.open('rb') as f:
-            self.train_data = dill.load(f)
-        self.tok_train = []
+            tmp = dill.load(f)
+        self.train_data = tmp[0]
+        self.tok_train = tmp[1]

     def save_train_data(self, data, file_name):
         if not isinstance(data, list):
@@ -28,14 +30,6 @@ def save_train_data(self, data, file_name):
         with open(file_name, 'wb') as f:
             dill.dump(data, f)

-    async def tokenize_train_data(self):
-        for q in self.train_data:
-            tok = await self.entity_wrapper.tokenize(
-                q[0],
-                filter_ents=self.filter_entities,
-                sw_size=self.stopword_size)
-            self.tok_train.append(tok)
-
     async def get_string_match(self, q, subset_idx=None,
                                all_larger_zero=False):
         self.logger.info("searching for word matches")
@@ -49,17 +43,9 @@ async def get_string_match(self, q, subset_idx=None,
         tok_q = await self.entity_wrapper.tokenize(
             q, filter_ents=self.filter_entities, sw_size=self.stopword_size)

-        # search for intent-like entities first
-        if "@" in q:
-            match_probas = [
-                1.0 if "@" in t[0] else 0.0 for t in self.train_data
-            ]
-        # otherwise do string match
-        else:
-            match_probas = [
-                self.__jaccard_similarity(tok_q, t)
-                if '@' not in ' '.join(t) else 0.0 for t in tok_train
-            ]
+        match_probas = [
+            self.__jaccard_similarity(tok_q, t) for t in tok_train
+        ]

         self.logger.info("match_probas: {}".format(match_probas))
         max_proba = max(match_probas)
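Note: with the intent-entity branch removed, every score now comes from self.__jaccard_similarity(tok_q, t). That method is not shown in this diff; the sketch below assumes the usual set-based definition (intersection over union) purely for illustration.

def jaccard_similarity(tokens_a, tokens_b):
    # Intersection-over-union of the two token sets: 1.0 for
    # identical sets, 0.0 when nothing is shared.
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)

tok_q = ["this", "be", "perfect", "string", "match"]
tok_train = [["this", "be", "london", "today"],
             ["this", "be", "perfect", "string", "match"]]
match_probas = [jaccard_similarity(tok_q, t) for t in tok_train]
print(match_probas)  # the exact match scores 1.0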

src/embedding/tests/test_embedding_chat.py

Lines changed: 5 additions & 0 deletions
@@ -99,6 +99,11 @@ async def mocked_chat(mocker, loop):
         ("This is London today for entity match", "entity wins with London today"),
         ("This is a perfect string match", "string wins"),
         ("This is the question for embedding word1 word2", "embedding wins")]
+    chat.string_match.tok_train = [
+        ["this", "be", "london", "today", "for", "entity", "match"],
+        ["this", "be", "perfect", "string", "match"],
+        ["this", "be", "question", "for", "embedding", "word1", "word2"]
+    ]
     # mock out the load methods
     mocker.patch("embedding.text_classifier_class.EmbeddingComparison.load_model")
     mocker.patch.object(chat.entity_wrapper, "load_data")

src/embedding/training_process.py

Lines changed: 6 additions & 2 deletions
@@ -97,8 +97,6 @@ async def train(self, msg, topic: ait.Topic, callback):
         temp_data_file = tempdir_path / DATA_FILE
         temp_train_file = tempdir_path / TRAIN_FILE

-        self.string_match.save_train_data(q_and_a, temp_train_file)
-
         self.logger.info("Extracting entities...")
         q_entities, a_entities = [], []
         for question, answer in q_and_a:
@@ -114,10 +112,16 @@ async def train(self, msg, topic: ait.Topic, callback):
             "Entities saved to {}, tokenizing...".format(temp_data_file))

         x_tokens = []
+        x_tokens_save = []
         for question in x:
             tokens = await self.entity_wrapper.tokenize(question, sw_size='xlarge')
             x_tokens.append(tokens)
+            tokens = await self.entity_wrapper.tokenize(question,
+                                                        sw_size='small',
+                                                        filter_ents='False')
+            x_tokens_save.append(tokens)
         self.report_progress(0.3)
+        self.string_match.save_train_data([q_and_a, x_tokens_save], temp_train_file)

         x_tokens_set = list(set([w for l in x_tokens for w in l]))
