Merged PR 3136: improve QA matching if multiple QA pairs have the same number of entities

Thomas Kaltenbrunner · Thomas Kaltenbrunner · commit ea17a3e0c59a · 2018-07-30T15:15:53.000Z
In this case, subset the training examples to those which have the same number of matches and run them through the embedding matcher. Take the sample with the highest score as the match.

Related work items: #5576
diff --git a/src/embedding/chat_process.py b/src/embedding/chat_process.py
@@ -61,14 +61,20 @@ async def chat_request(self, msg: ait_c.ChatRequestMessage):
         self.cls.update_w2v(vecs)
         yPred, yProbs = self.cls.predict(x_tokens_testset)
         if yProbs[0] < THRESHOLD or len(x_tokens_testset) < 3:
-            matched_answer = self.entity_wrapper.match_entities(
+            matched_answers = self.entity_wrapper.match_entities(
                 msg.question)
-            self.logger.info("matched_entities: {}".format(matched_answer))
-            if matched_answer:
-                self.logger.info("substituting {} for entity match {}".format(
-                    yPred, matched_answer))
-                yPred = [matched_answer]
-                yProbs = [ENTITY_MATCH_PROBA]
+            self.logger.info("matched_entities: {}".format(matched_answers))
+            if matched_answers:
+                if len(matched_answers) > 1:
+                    train_idx = [e[0] for e in matched_answers]
+                    yPred, yProbs = self.cls.predict(x_tokens_testset, subset_idx=train_idx)
+                    self.logger.info("multiple entity matches {}; pick {}".format(
+                        matched_answers, yPred))
+                else:
+                    self.logger.info("substituting {} for entity match {}".format(
+                        yPred, matched_answers))
+                    yPred = [matched_answers[0][1]]
+                    yProbs = [ENTITY_MATCH_PROBA]
         resp = ait_c.ChatResponseMessage(msg, yPred[0], float(yProbs[0]))
         return resp
 
diff --git a/src/embedding/entity_wrapper.py b/src/embedding/entity_wrapper.py
@@ -78,6 +78,9 @@ def match_entities(self, test_q):
                     if e not in ['the'] and e in test_match:
                         num_matches += 1
             if num_matches > max_matches:
+                max_matches = num_matches
+                matched_labels = [(i, self.train_labels[i])]
+            elif num_matches == max_matches and max_matches > 0:
                 matched_labels.append((i, self.train_labels[i]))
         return matched_labels
 
diff --git a/src/embedding/tests/test_embedding_chat.py b/src/embedding/tests/test_embedding_chat.py
@@ -64,6 +64,7 @@ async def mocked_chat(mocker, loop):
             'start': 8,
             'end': 18
         }]]
+
     chat.entity_wrapper.train_labels = ["You said London today",
                                             "You said Paris Fred Bloggs"]
     
@@ -127,7 +128,7 @@ async def test_chat_request_entity_no_match2(mocker, mocked_chat):
 
     msg = ait_c.ChatRequestMessage("This question has entities London today in it", None, None, update_state=True)
     response = await mocked_chat.chat_request(msg)
-    assert response.answer[0][1] == "You said London today"
+    assert response.answer == "You said London today"
     assert response.score == embedding.chat_process.ENTITY_MATCH_PROBA
     assert response.topic_out is None
     assert response.history is None
diff --git a/src/embedding/tests/test_embedding_training.py b/src/embedding/tests/test_embedding_training.py
@@ -45,6 +45,7 @@ async def mocked_train(mocker, loop):
         training.entity_wrapper,
         "get_from_er_server",
         new=get_from_er_server)
+
     training.entity_wrapper.train_entities = [
         [{
             'category': 'sys.places',
@@ -68,6 +69,7 @@ async def mocked_train(mocker, loop):
             'start': 8,
             'end': 18
         }]]
+
     training.entity_wrapper.train_labels = ["You said London today",
                                             "You said Paris Fred Bloggs"]
     
@@ -126,7 +128,6 @@ async def test_er_match_entities_2(mocked_train):
     assert matched_label[0][1] == "You said Paris Fred Bloggs"
 
 
-
 async def test_train_success(mocked_train, mocker):
     DUMMY_AIID = "123456"
     DUMMY_TRAINING_DATA = """
diff --git a/src/embedding/text_classifier_class.py b/src/embedding/text_classifier_class.py
@@ -110,21 +110,26 @@ def fit(self, X, y):
         self.X = X
         self.classes = list(set(y))
 
-    def predict(self, X, scale_probas=False):
+    def predict(self, X, scale_probas=False, subset_idx=None):
+        if subset_idx:
+            train_x = self.X_tfidf[subset_idx]
+            train_y = self.y[subset_idx]
+        else:
+            train_x = self.X_tfidf
+            train_y = self.y
         target_tfidf = self.vectorizer.transform(X)
         target_tfidf = target_tfidf - self.pca.components_[0]
-
         # compute cosine similarity
-        cossim = np.dot(target_tfidf, self.X_tfidf.T) / (
-            np.outer(np.linalg.norm(target_tfidf, axis=1), np.linalg.norm(self.X_tfidf, axis=1)))
+        cossim = np.dot(target_tfidf, train_x.T) / (
+            np.outer(np.linalg.norm(target_tfidf, axis=1), np.linalg.norm(train_x, axis=1)))
         # self.logger.info("cossim: {}".format(cossim))
         cossim = np.where(cossim < 0., 0., cossim)
-
+        if subset_idx:
+            self.logger.info("cossims: {}".format(cossim))
         # most similar vector is the predicted class
         preds = np.argmax(cossim, 1)
-        preds = [self.y[i] for i in preds]
+        preds = [train_y[i] for i in preds]
         probs = self.downscale_probas(np.max(cossim, axis=1))
-
         return preds, list(probs)
 
     def save_model(self, file_path: Path):