Tokenizers cpp 1251 #1379

Status: Closed. Wants to merge 35 commits.
Commits (35):

9a18111  feat(models): Add models.json blocks for Granite Code 3b and 8b (gabe-l-hart, Sep 23, 2024)
35da73e  feat: Initial model params for granite code 3b (gabe-l-hart, Sep 25, 2024)
8b3a541  fix(model config): Fix model configs for Granite Code (gabe-l-hart, Oct 31, 2024)
1b5a069  feat(granite): Add model params for granite-code-8b (gabe-l-hart, Oct 31, 2024)
5c5b6f3  fix(deps): Add tokenizers to the deps explicitly (gabe-l-hart, Nov 7, 2024)
798f985  feat(tokenizer): Add basic support for jinja2 template rendering for … (gabe-l-hart, Nov 7, 2024)
9bc90b0  fix(chat): Add HFTokenizerChatFormatter and use it for HF tokenizers (gabe-l-hart, Nov 7, 2024)
1b6d63e  fix(deps): Add jinja2 as an explicit dep (gabe-l-hart, Nov 7, 2024)
1d2c9fe  refactor(tokenizer): Add const qualifiers to const methods in tokeniz… (gabe-l-hart, Oct 4, 2024)
82ede30  feat(tokenizers): Add submodule for nlohmann/json (gabe-l-hart, Oct 4, 2024)
2e5b95a  feat(tokenizer): Add stub of TokenizersTokenizer (gabe-l-hart, Oct 4, 2024)
6b5a48a  feat(tokenize_tool): Add a simple tool to round-trip tokenize some st… (gabe-l-hart, Oct 4, 2024)
c1a0ba9  feat(tokenizers): Mostly complete impl of loading tokenizer.json/toke… (gabe-l-hart, Oct 4, 2024)
b022b02  fix(tokenize_tool): Fix logic for decoding if encoded is empty (gabe-l-hart, Oct 4, 2024)
10cae38  feat(tokenize)!: Add gtest submodule (gabe-l-hart, Oct 17, 2024)
6396b4e  feat(tokenize): Initial work for pre-tokenizer class hierarchy with t… (gabe-l-hart, Oct 17, 2024)
a7871c2  chore(gitignore): Ignore the build dir (standard CMake build dir name) (gabe-l-hart, Oct 24, 2024)
d25d3e6  test: Extract testing into a helper function (gabe-l-hart, Oct 24, 2024)
62bcbc5  WIP!: Initial work towards making the tiktoken implementation a gener… (gabe-l-hart, Oct 24, 2024)
7fbfb6b  feat(tokenizer): Add Digit pre tokenizer (gabe-l-hart, Oct 25, 2024)
0f1ba98  feat(tokenizers): Add a vendored copy of the unicode support from lla… (gabe-l-hart, Nov 11, 2024)
fd1a7cb  fix(tokenizers): Change the return type of pre_tokenize to allow owne… (gabe-l-hart, Nov 11, 2024)
59a2bdb  feat(tokenizers): Add byte-level pre-tokenizer (gabe-l-hart, Nov 11, 2024)
34bea83  fix(tokenizers): Use pass-by-value for string views (gabe-l-hart, Nov 11, 2024)
f1663d5  feat(tokenizers): add SequencePreTokenizer (gabe-l-hart, Nov 11, 2024)
941de94  feat(tokenizers): Add the PreTokenizerConfig factory class (gabe-l-hart, Nov 12, 2024)
6d3f47d  fix(tokenizers): Fix naming for inheriting from Tiktoken (gabe-l-hart, Nov 12, 2024)
927f226  fix(tokenizer): TokenizersTokenizer -> HFTokenizer (gabe-l-hart, Nov 12, 2024)
599e0c6  fix(tokenizer): Remove unnecessary templates on Tiktoken impl methods (gabe-l-hart, Nov 12, 2024)
01bacb4  feat(tokenizer): Split Tiktoken out into BPETokenizerBase and Tiktoken (gabe-l-hart, Nov 13, 2024)
221811b  feat(tokenizers): Add json parsing to PreTokenizerConfig factory (gabe-l-hart, Nov 15, 2024)
9f898fe  feat(tokenizers): Use BPETokenizerBase and PreTokenizers in HFTokenizer (gabe-l-hart, Nov 15, 2024)
a29a0fb  feat(tokenizers): Add a minimal implementation of the HF decoder suite (gabe-l-hart, Nov 15, 2024)
7084831  fix(comment): Better comment in pre_tokenizer.h (gabe-l-hart, Nov 15, 2024)
b8c7941  feat(tokenizers): Add decoding support to HFTokenizer (gabe-l-hart, Nov 15, 2024)
Files changed (5):

.gitignore (1 addition, 0 deletions)

@@ -18,6 +18,7 @@ torchao-build/*
 runner-et/cmake-out/*
 runner-aoti/cmake-out/*
 cmake-out/
+build/

 # pte files
 *.pte
.gitmodules (6 additions, 0 deletions)

@@ -7,3 +7,9 @@
 [submodule "tokenizer/third-party/sentencepiece"]
   path = tokenizer/third-party/sentencepiece
   url = https://github.com/google/sentencepiece.git
+[submodule "tokenizer/third-party/json"]
+  path = tokenizer/third-party/json
+  url = https://github.com/nlohmann/json.git
+[submodule "tokenizer/third-party/googletest"]
+  path = tokenizer/third-party/googletest
+  url = https://github.com/google/googletest.git
install/requirements.txt (4 additions, 0 deletions)

@@ -9,6 +9,10 @@ gguf
 # Tiktoken tokenizer for Llama 3 and other advanced models
 tiktoken

+# Tokenizers and jinja2 for other non-llama models that use HF tokenizers
+tokenizers
+jinja2
+
 # Miscellaneous
 snakeviz
 sentencepiece
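
The two new Python deps mirror what this PR implements in C++: `tokenizers` loads a Hugging Face tokenizer.json, and `jinja2` renders the chat template string that ships in tokenizer_config.json. A minimal sketch of how they combine; the file paths and message content are placeholders, not files from this repo:

```python
import json

import jinja2
from tokenizers import Tokenizer

# Placeholder paths: point these at a real HF model directory.
tok = Tokenizer.from_file("model_dir/tokenizer.json")
with open("model_dir/tokenizer_config.json") as f:
    config = json.load(f)

# HF models carry their chat template as a jinja2 string in the config.
template = jinja2.Template(config["chat_template"])
prompt = template.render(
    messages=[{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
)
ids = tok.encode(prompt).ids  # token ids for the rendered prompt
```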
tokenizer/CMakeLists.txt (41 additions, 2 deletions)

@@ -11,9 +11,18 @@ add_library(
   tokenizer
   tokenizer.h
   sentencepiece.cpp
-  tiktoken.cpp)
+  tiktoken.cpp
+  hf_tokenizers.cpp
+  pre_tokenizer.cpp
+  token_decoder.cpp
+  # llama.cpp unicode
+  unicode-data.cpp
+  unicode.cpp)

-target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src)
+target_include_directories(
+  tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
+                   third-party/sentencepiece/src
+                   third-party/json/single_include)

 # add RE2 as subdirectory
 set(ABSL_ENABLE_INSTALL ON)
@@ -27,3 +36,33 @@ add_subdirectory(third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

 target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static)
+
+# Add tokenizer_tool binary for testing
+set(binary_dir "${CMAKE_BINARY_DIR}/bin")
+OPTION(BUILD_TOKENIZE_TOOL "Build the tokenize_tool binary" OFF)
+if (BUILD_TOKENIZE_TOOL)
+  add_executable(tokenize_tool tokenize_tool.cpp)
+  set_target_properties(tokenize_tool PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${binary_dir})
+  target_include_directories(tokenize_tool PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+  target_link_libraries(tokenize_tool tokenizer)
+endif()
+
+# Add unit tests
+OPTION(BUILD_UNIT_TESTS "Enable unit testing" OFF)
+if (BUILD_UNIT_TESTS)
+  add_subdirectory(third-party/googletest)
+  enable_testing()
+  set(test_binary_dir "${binary_dir}/test")
+
+  file(GLOB test_source_files tests/*_test.cpp)
+  foreach(test_source_file ${test_source_files})
+    get_filename_component(test_name ${test_source_file} NAME_WE)
+    message(STATUS "Configuring unit test ${test_name}")
+    add_executable(${test_name} tests/main.cpp ${test_source_file})
+    set_target_properties(${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${test_binary_dir})
+    target_include_directories(${test_name} PRIVATE ${CMAKE_SOURCE_DIR})
+    target_include_directories(${test_name} PRIVATE third-party/googletest/googletest/include)
+    target_link_libraries(${test_name} gtest tokenizer)
+    add_test(${test_name} "${test_binary_dir}/${test_name}")
+  endforeach()
+endif()
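
For context on the tokenize_tool target: commit 6b5a48a describes it as a simple tool to round-trip tokenize strings. The tool itself is C++, but a rough Python analogue of a round-trip check, using the `tokenizers` package added to requirements.txt above, might look like this (the path and sample text are placeholders):

```python
# Rough Python analogue of the C++ tokenize_tool round-trip check.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("model_dir/tokenizer.json")  # placeholder path

text = "The quick brown fox"
ids = tok.encode(text).ids
decoded = tok.decode(ids)

print(ids)      # token ids produced by encode
print(decoded)  # should round-trip back to (a normalization of) the input
```

On the test side, the glob-driven loop gives each tests/*_test.cpp its own gtest binary and registers it with add_test, so a single ctest invocation runs the whole suite when BUILD_UNIT_TESTS is ON.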
tokenizer/hf_tokenizer.py (27 additions, 1 deletion)

@@ -5,11 +5,12 @@
 # LICENSE file in the root directory of this source tree.

 # Standard
-from typing import List, Optional
+from typing import Dict, List, Optional
 import json
 import os

 # Third Party
+import jinja2
 from tokenizers import Tokenizer

 # Local
@@ -37,6 +38,9 @@ def __init__(self, file_path: str):
         # Load the tokenizer itself
         self._tokenizer = Tokenizer.from_file(tokenizer_path)

+        # Load the chat template if we have a config path
+        self._chat_template: Optional[jinja2.Template] = None
+
         # If available, parse bos/eos tokens from the tokenizer config
         self._bos_id, self._eos_id = None, None
         if tokenizer_config_path is not None:
@@ -48,6 +52,8 @@
                 self._bos_id = self._tokenizer.token_to_id(bos_token)
             if eos_token is not None:
                 self._eos_id = self._tokenizer.token_to_id(eos_token)
+            if chat_template_str := tok_config.get("chat_template"):
+                self._chat_template = jinja2.Template(chat_template_str)

         # If no eos/bos tokens found, go looking for them!
         if None in [self._bos_id, self._eos_id]:
@@ -70,6 +76,8 @@ def _look_for_special_token(added_tokens: dict, search_strs: List[str]) -> Optio
         if len(candidate_toks) == 1:
             return candidate_toks[0]["id"]

+    ## Interface ##
+
     def encode(
         self,
         s: str,
@@ -90,3 +98,21 @@ def bos_id(self) -> int:

     def eos_id(self) -> int:
         return self._eos_id
+
+    ## Additional Public Methods ##
+
+    def has_chat_template(self) -> bool:
+        return bool(self._chat_template)
+
+    def apply_chat_template(
+        self,
+        dialog: List[Dict[str, str]],
+        add_generation_prompt: bool = False,
+    ) -> str:
+        """If configured with a chat template, apply it to the list of messages
+        """
+        if not self._chat_template:
+            raise ValueError("No chat template configured!")
+        return self._chat_template.render(
+            messages=dialog, add_generation_prompt=add_generation_prompt
+        )
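
The apply_chat_template method added above is a thin wrapper over jinja2: the template receives the dialog as `messages` plus the `add_generation_prompt` flag. A standalone sketch of the same render call, using a toy template as a stand-in (real templates, such as Granite Code's, are longer and ship in the model's tokenizer_config.json):

```python
import jinja2

# Toy chat template in the style of an HF chat_template string; not a
# template from this repo or any real model.
chat_template_str = (
    "{% for message in messages %}"
    "<|{{ message['role'] }}|>\n{{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|>\n{% endif %}"
)
template = jinja2.Template(chat_template_str)

dialog = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write hello world in C."},
]
# Same render call apply_chat_template makes internally.
print(template.render(messages=dialog, add_generation_prompt=True))
```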