
Commit 20d73e3

Merge pull request #231 from microsoft/axsuarez/DialogPrompts

tests for tokenizer

2 parents 39bcb74 + 80eecad

3 files changed: +68 −6 lines changed

libraries/botbuilder-dialogs/botbuilder/dialogs/choices/__init__.py

3 additions & 1 deletion

@@ -16,6 +16,7 @@
 from .model_result import ModelResult
 from .sorted_value import SortedValue
 from .token import Token
+from .tokenizer import Tokenizer

 __all__ = [
     "Channel",
@@ -28,5 +29,6 @@
     "ListStyle",
     "ModelResult",
     "SortedValue",
-    "Token"
+    "Token",
+    "Tokenizer"
 ]
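
With the re-export in place, `Tokenizer` can be imported directly from the `choices` package instead of via the deep module path. A minimal sketch of what this enables (assuming only the package layout shown in this diff):

# Both import paths now resolve to the same class.
from botbuilder.dialogs.choices import Tokenizer
from botbuilder.dialogs.choices.tokenizer import Tokenizer as DeepTokenizer

assert Tokenizer is DeepTokenizer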

libraries/botbuilder-dialogs/botbuilder/dialogs/choices/tokenizer.py

5 additions & 5 deletions

@@ -28,7 +28,7 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
         i: int = 0

         while i < length:
-            # Get botht he UNICODE value of the current character and the complete character itself
+            # Get both the UNICODE value of the current character and the complete character itself
             # which can potentially be multiple segments
             code_point = ord(text[i])
             char = chr(code_point)
@@ -45,11 +45,11 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
                 token = None
                 tokens.append(Token(
                     start = i,
-                    end = i + (len(char) - 1),
+                    end = i,
                     text = char,
                     normalized = char
                 ))
-            elif token == None:
+            elif token is None:
                 # Start a new token
                 token = Token(
                     start = i,
@@ -61,9 +61,9 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
                 # Add onto current token
                 token.text += char

-            i += len(char)
+            i += 1

-        Tokenizer._append_token(tokens, token, length)
+        Tokenizer._append_token(tokens, token, length - 1)

         return tokens
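
The fixes above read like adaptations of logic ported from a UTF-16-based implementation, where a supplementary-plane character such as an emoji spans two code units. Python 3 strings are sequences of code points, so `char = chr(code_point)` is always a length-1 string: `len(char)` is a constant 1, `i += 1` is the honest increment, `end = i` marks the inclusive end of a one-character token, and the trailing `_append_token` call now passes the inclusive end index `length - 1`. A quick sketch of the code-point semantics the fix relies on (standalone Python, not SDK code):

# In Python 3, even a supplementary-plane emoji is a single character.
burst = '\U0001F4A5'             # 💥 COLLISION SYMBOL
print(len(burst))                # 1 (it would be 2 UTF-16 code units in JavaScript or C#)
print(ord(burst) > 0xFFFF)       # True: outside the Basic Multilingual Plane
print(chr(ord(burst)) == burst)  # True: chr(ord(c)) round-trips to the same one-char string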

60 additions & 0 deletions

@@ -0,0 +1,60 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import aiounittest
+from botbuilder.dialogs.choices import Tokenizer
+
+
+def _assert_token(token, start, end, text, normalized=None):
+    assert token.start == start, f"Invalid token.start of '{token.start}' for '{text}' token."
+    assert token.end == end, f"Invalid token.end of '{token.end}' for '{text}' token."
+    assert token.text == text, f"Invalid token.text of '{token.text}' for '{text}' token."
+    assert token.normalized == (normalized or text), f"Invalid token.normalized of '{token.normalized}' for '{text}' token."
+
+
+class TokenizerTests(aiounittest.AsyncTestCase):
+    def test_should_break_on_spaces(self):
+        tokens = Tokenizer.default_tokenizer('how now brown cow')
+        assert len(tokens) == 4
+        _assert_token(tokens[0], 0, 2, 'how')
+        _assert_token(tokens[1], 4, 6, 'now')
+        _assert_token(tokens[2], 8, 12, 'brown')
+        _assert_token(tokens[3], 14, 16, 'cow')
+
+    def test_should_break_on_punctuation(self):
+        tokens = Tokenizer.default_tokenizer('how-now.brown:cow?')
+        assert len(tokens) == 4
+        _assert_token(tokens[0], 0, 2, 'how')
+        _assert_token(tokens[1], 4, 6, 'now')
+        _assert_token(tokens[2], 8, 12, 'brown')
+        _assert_token(tokens[3], 14, 16, 'cow')
+
+    def test_should_tokenize_single_character_tokens(self):
+        tokens = Tokenizer.default_tokenizer('a b c d')
+        assert len(tokens) == 4
+        _assert_token(tokens[0], 0, 0, 'a')
+        _assert_token(tokens[1], 2, 2, 'b')
+        _assert_token(tokens[2], 4, 4, 'c')
+        _assert_token(tokens[3], 6, 6, 'd')
+
+    def test_should_return_a_single_token(self):
+        tokens = Tokenizer.default_tokenizer('food')
+        assert len(tokens) == 1
+        _assert_token(tokens[0], 0, 3, 'food')
+
+    def test_should_return_no_tokens(self):
+        tokens = Tokenizer.default_tokenizer('.?-()')
+        assert len(tokens) == 0
+
+    def test_should_return_the_normalized_and_original_text_for_a_token(self):
+        tokens = Tokenizer.default_tokenizer('fOoD')
+        assert len(tokens) == 1
+        _assert_token(tokens[0], 0, 3, 'fOoD', 'food')
+
+    def test_should_break_on_emojis(self):
+        tokens = Tokenizer.default_tokenizer('food 💥👍😀')
+        assert len(tokens) == 4
+        _assert_token(tokens[0], 0, 3, 'food')
+        _assert_token(tokens[1], 5, 5, '💥')
+        _assert_token(tokens[2], 6, 6, '👍')
+        _assert_token(tokens[3], 7, 7, '😀')
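
Together with the tokenizer fix, these tests pin down the contract: start and end are inclusive code-point indices, breaking characters and emoji split tokens, and normalized appears to be the lower-cased text. A quick interactive check of that contract (a sketch; assumes botbuilder-dialogs with this commit is installed):

from botbuilder.dialogs.choices import Tokenizer

tokens = Tokenizer.default_tokenizer('fOoD 💥')
print([(t.start, t.end, t.text, t.normalized) for t in tokens])
# Expected per the tests above: [(0, 3, 'fOoD', 'food'), (5, 5, '💥', '💥')]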
