
Commit 20d73e3

Merge pull request #231 from microsoft/axsuarez/DialogPrompts

tests for tokenizer

2 parents 39bcb74 + 80eecad

3 files changed: +68 −6 lines changed

libraries/botbuilder-dialogs/botbuilder/dialogs/choices/__init__.py

3 additions & 1 deletion

@@ -16,6 +16,7 @@
 from .model_result import ModelResult
 from .sorted_value import SortedValue
 from .token import Token
+from .tokenizer import Tokenizer

 __all__ = [
     "Channel",
@@ -28,5 +29,6 @@
     "ListStyle",
     "ModelResult",
     "SortedValue",
-    "Token"
+    "Token",
+    "Tokenizer"
 ]
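
With the re-export in place, `Tokenizer` can be imported directly from the `choices` package instead of via the deep module path. A minimal sketch of what this enables (assuming only the package layout shown in this diff):

# Both import paths now resolve to the same class.
from botbuilder.dialogs.choices import Tokenizer
from botbuilder.dialogs.choices.tokenizer import Tokenizer as DeepTokenizer

assert Tokenizer is DeepTokenizer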

libraries/botbuilder-dialogs/botbuilder/dialogs/choices/tokenizer.py

5 additions & 5 deletions

@@ -28,7 +28,7 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
         i: int = 0

         while i < length:
-            # Get botht he UNICODE value of the current character and the complete character itself
+            # Get both the UNICODE value of the current character and the complete character itself
             # which can potentially be multiple segments
             code_point = ord(text[i])
             char = chr(code_point)
@@ -45,11 +45,11 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
                 token = None
                 tokens.append(Token(
                     start = i,
-                    end = i + (len(char) - 1),
+                    end = i,
                     text = char,
                     normalized = char
                 ))
-            elif token == None:
+            elif token is None:
                 # Start a new token
                 token = Token(
                     start = i,
@@ -61,9 +61,9 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
                 # Add onto current token
                 token.text += char

-            i += len(char)
+            i += 1

-        Tokenizer._append_token(tokens, token, length)
+        Tokenizer._append_token(tokens, token, length - 1)

         return tokens
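
The fixes above read like adaptations of logic ported from a UTF-16-based implementation, where a supplementary-plane character such as an emoji spans two code units. Python 3 strings are sequences of code points, so `char = chr(code_point)` is always a length-1 string: `len(char)` is a constant 1, `i += 1` is the honest increment, `end = i` marks the inclusive end of a one-character token, and the trailing `_append_token` call now passes the inclusive end index `length - 1`. A quick sketch of the code-point semantics the fix relies on (standalone Python, not SDK code):

# In Python 3, even a supplementary-plane emoji is a single character.
burst = '\U0001F4A5'             # 💥 COLLISION SYMBOL
print(len(burst))                # 1 (it would be 2 UTF-16 code units in JavaScript or C#)
print(ord(burst) > 0xFFFF)       # True: outside the Basic Multilingual Plane
print(chr(ord(burst)) == burst)  # True: chr(ord(c)) round-trips to the same one-char string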

60 additions & 0 deletions

@@ -0,0 +1,60 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import aiounittest
+from botbuilder.dialogs.choices import Tokenizer
+
+
+def _assert_token(token, start, end, text, normalized=None):
+    assert token.start == start, f"Invalid token.start of '{token.start}' for '{text}' token."
+    assert token.end == end, f"Invalid token.end of '{token.end}' for '{text}' token."
+    assert token.text == text, f"Invalid token.text of '{token.text}' for '{text}' token."
+    assert token.normalized == (normalized or text), f"Invalid token.normalized of '{token.normalized}' for '{text}' token."
+
+
+class TokenizerTests(aiounittest.AsyncTestCase):
+    def test_should_break_on_spaces(self):
+        tokens = Tokenizer.default_tokenizer('how now brown cow')
+        assert len(tokens) == 4
+        _assert_token(tokens[0], 0, 2, 'how')
+        _assert_token(tokens[1], 4, 6, 'now')
+        _assert_token(tokens[2], 8, 12, 'brown')
+        _assert_token(tokens[3], 14, 16, 'cow')
+
+    def test_should_break_on_punctuation(self):
+        tokens = Tokenizer.default_tokenizer('how-now.brown:cow?')
+        assert len(tokens) == 4
+        _assert_token(tokens[0], 0, 2, 'how')
+        _assert_token(tokens[1], 4, 6, 'now')
+        _assert_token(tokens[2], 8, 12, 'brown')
+        _assert_token(tokens[3], 14, 16, 'cow')
+
+    def test_should_tokenize_single_character_tokens(self):
+        tokens = Tokenizer.default_tokenizer('a b c d')
+        assert len(tokens) == 4
+        _assert_token(tokens[0], 0, 0, 'a')
+        _assert_token(tokens[1], 2, 2, 'b')
+        _assert_token(tokens[2], 4, 4, 'c')
+        _assert_token(tokens[3], 6, 6, 'd')
+
+    def test_should_return_a_single_token(self):
+        tokens = Tokenizer.default_tokenizer('food')
+        assert len(tokens) == 1
+        _assert_token(tokens[0], 0, 3, 'food')
+
+    def test_should_return_no_tokens(self):
+        tokens = Tokenizer.default_tokenizer('.?-()')
+        assert len(tokens) == 0
+
+    def test_should_return_the_normalized_and_original_text_for_a_token(self):
+        tokens = Tokenizer.default_tokenizer('fOoD')
+        assert len(tokens) == 1
+        _assert_token(tokens[0], 0, 3, 'fOoD', 'food')
+
+    def test_should_break_on_emojis(self):
+        tokens = Tokenizer.default_tokenizer('food 💥👍😀')
+        assert len(tokens) == 4
+        _assert_token(tokens[0], 0, 3, 'food')
+        _assert_token(tokens[1], 5, 5, '💥')
+        _assert_token(tokens[2], 6, 6, '👍')
+        _assert_token(tokens[3], 7, 7, '😀')
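
Together with the tokenizer fix, these tests pin down the contract: start and end are inclusive code-point indices, breaking characters and emoji split tokens, and normalized appears to be the lower-cased text. A quick interactive check of that contract (a sketch; assumes botbuilder-dialogs with this commit is installed):

from botbuilder.dialogs.choices import Tokenizer

tokens = Tokenizer.default_tokenizer('fOoD 💥')
print([(t.start, t.end, t.text, t.normalized) for t in tokens])
# Expected per the tests above: [(0, 3, 'fOoD', 'food'), (5, 5, '💥', '💥')]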
