@@ -16,6 +16,7 @@
from .model_result import ModelResult
from .sorted_value import SortedValue
from .token import Token
from .tokenizer import Tokenizer

__all__ = [
"Channel",
@@ -28,5 +29,6 @@
"ListStyle",
"ModelResult",
"SortedValue",
"Token"
"Token",
"Tokenizer"
]
@@ -28,7 +28,7 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
i: int = 0

while i < length:
# Get botht he UNICODE value of the current character and the complete character itself
# Get both the UNICODE value of the current character and the complete character itself
# which can potentially be multiple segments
code_point = ord(text[i])
char = chr(code_point)
@@ -45,11 +45,11 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
token = None
tokens.append(Token(
start = i,
end = i + (len(char) - 1),
end = i,
text = char,
normalized = char
))
elif token == None:
elif token is None:
# Start a new token
token = Token(
start = i,
@@ -61,9 +61,9 @@ def default_tokenizer(text: str, locale: str = None) -> [Token]:
# Add onto current token
token.text += char

i += len(char)
i += 1

Tokenizer._append_token(tokens, token, length)
Tokenizer._append_token(tokens, token, length - 1)

return tokens

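The index fixes in this hunk (end = i and i += 1) work because Python 3 strings are sequences of Unicode code points, so even an emoji occupies a single index. A minimal illustrative snippet, independent of the repository code:

# Illustrative only: each emoji is one code point in a Python 3 str,
# so advancing the loop by 1 and recording end = i lands on whole characters.
text = 'food 💥👍😀'
assert len('💥') == 1                # a single code point, unlike UTF-16-based runtimes
assert text[5] == chr(ord(text[5]))  # ord()/chr() round-trip the emoji at index 5
assert text.index('😀') == 7         # matches the offsets asserted in the tests below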
@@ -0,0 +1,60 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import aiounittest
from botbuilder.dialogs.choices import Tokenizer


def _assert_token(token, start, end, text, normalized=None):
assert token.start == start, f"Invalid token.start of '{token.start}' for '{text}' token."
assert token.end == end, f"Invalid token.end of '{token.end}' for '{text}' token."
assert token.text == text, f"Invalid token.text of '{token.text}' for '{text}' token."
assert token.normalized == (normalized or text), f"Invalid token.normalized of '{token.normalized}' for '{text}' token."


class TokenizerTests(aiounittest.AsyncTestCase):
def test_should_break_on_spaces(self):
tokens = Tokenizer.default_tokenizer('how now brown cow')
assert len(tokens) == 4
_assert_token(tokens[0], 0, 2, 'how')
_assert_token(tokens[1], 4, 6, 'now')
_assert_token(tokens[2], 8, 12, 'brown')
_assert_token(tokens[3], 14, 16, 'cow')

def test_should_break_on_punctuation(self):
tokens = Tokenizer.default_tokenizer('how-now.brown:cow?')
assert len(tokens) == 4
_assert_token(tokens[0], 0, 2, 'how')
_assert_token(tokens[1], 4, 6, 'now')
_assert_token(tokens[2], 8, 12, 'brown')
_assert_token(tokens[3], 14, 16, 'cow')

def test_should_tokenize_single_character_tokens(self):
tokens = Tokenizer.default_tokenizer('a b c d')
assert len(tokens) == 4
_assert_token(tokens[0], 0, 0, 'a')
_assert_token(tokens[1], 2, 2, 'b')
_assert_token(tokens[2], 4, 4, 'c')
_assert_token(tokens[3], 6, 6, 'd')

def test_should_return_a_single_token(self):
tokens = Tokenizer.default_tokenizer('food')
assert len(tokens) == 1
_assert_token(tokens[0], 0, 3, 'food')

def test_should_return_no_tokens(self):
tokens = Tokenizer.default_tokenizer('.?-()')
assert len(tokens) == 0

def test_should_return_the_normalized_and_original_text_for_a_token(self):
tokens = Tokenizer.default_tokenizer('fOoD')
assert len(tokens) == 1
_assert_token(tokens[0], 0, 3, 'fOoD', 'food')

def test_should_break_on_emojis(self):
tokens = Tokenizer.default_tokenizer('food 💥👍😀')
assert len(tokens) == 4
_assert_token(tokens[0], 0, 3, 'food')
_assert_token(tokens[1], 5, 5, '💥')
_assert_token(tokens[2], 6, 6, '👍')
_assert_token(tokens[3], 7, 7, '😀')
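
A minimal usage sketch, assuming only the import path and the Token fields (start, end, text, normalized) shown in the tests above:

from botbuilder.dialogs.choices import Tokenizer

# Tokenize a phrase and inspect the offsets and text captured for each token.
tokens = Tokenizer.default_tokenizer('How now, Brown Cow?')
for token in tokens:
    print(token.start, token.end, token.text, token.normalized)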