Tokenizers cpp 1251 #1379

Status: Closed. Wants to merge 35 commits.
Commits (35):

9a18111  feat(models): Add models.json blocks for Granite Code 3b and 8b (gabe-l-hart, Sep 23, 2024)
35da73e  feat: Initial model params for granite code 3b (gabe-l-hart, Sep 25, 2024)
8b3a541  fix(model config): Fix model configs for Granite Code (gabe-l-hart, Oct 31, 2024)
1b5a069  feat(granite): Add model params for granite-code-8b (gabe-l-hart, Oct 31, 2024)
5c5b6f3  fix(deps): Add tokenizers to the deps explicitly (gabe-l-hart, Nov 7, 2024)
798f985  feat(tokenizer): Add basic support for jinja2 template rendering for … (gabe-l-hart, Nov 7, 2024)
9bc90b0  fix(chat): Add HFTokenizerChatFormatter and use it for HF tokenizers (gabe-l-hart, Nov 7, 2024)
1b6d63e  fix(deps): Add jinja2 as an explicit dep (gabe-l-hart, Nov 7, 2024)
1d2c9fe  refactor(tokenizer): Add const qualifiers to const methods in tokeniz… (gabe-l-hart, Oct 4, 2024)
82ede30  feat(tokenizers): Add submodule for nlohmann/json (gabe-l-hart, Oct 4, 2024)
2e5b95a  feat(tokenizer): Add stub of TokenizersTokenizer (gabe-l-hart, Oct 4, 2024)
6b5a48a  feat(tokenize_tool): Add a simple tool to round-trip tokenize some st… (gabe-l-hart, Oct 4, 2024)
c1a0ba9  feat(tokenizers): Mostly complete impl of loading tokenizer.json/toke… (gabe-l-hart, Oct 4, 2024)
b022b02  fix(tokenize_tool): Fix logic for decoding if encoded is empty (gabe-l-hart, Oct 4, 2024)
10cae38  feat(tokenize)!: Add gtest submodule (gabe-l-hart, Oct 17, 2024)
6396b4e  feat(tokenize): Initial work for pre-tokenizer class hierarchy with t… (gabe-l-hart, Oct 17, 2024)
a7871c2  chore(gitignore): Ignore the build dir (standard CMake build dir name) (gabe-l-hart, Oct 24, 2024)
d25d3e6  test: Extract testing into a helper function (gabe-l-hart, Oct 24, 2024)
62bcbc5  WIP!: Initial work towards making the tiktoken implementation a gener… (gabe-l-hart, Oct 24, 2024)
7fbfb6b  feat(tokenizer): Add Digit pre tokenizer (gabe-l-hart, Oct 25, 2024)
0f1ba98  feat(tokenizers): Add a vendored copy of the unicode support from lla… (gabe-l-hart, Nov 11, 2024)
fd1a7cb  fix(tokenizers): Change the return type of pre_tokenize to allow owne… (gabe-l-hart, Nov 11, 2024)
59a2bdb  feat(tokenizers): Add byte-level pre-tokenizer (gabe-l-hart, Nov 11, 2024)
34bea83  fix(tokenizers): Use pass-by-value for string views (gabe-l-hart, Nov 11, 2024)
f1663d5  feat(tokenizers): add SequencePreTokenizer (gabe-l-hart, Nov 11, 2024)
941de94  feat(tokenizers): Add the PreTokenizerConfig factory class (gabe-l-hart, Nov 12, 2024)
6d3f47d  fix(tokenizers): Fix naming for inheriting from Tiktoken (gabe-l-hart, Nov 12, 2024)
927f226  fix(tokenizer): TokenizersTokenizer -> HFTokenizer (gabe-l-hart, Nov 12, 2024)
599e0c6  fix(tokenizer): Remove unnecessary templates on Tiktoken impl methods (gabe-l-hart, Nov 12, 2024)
01bacb4  feat(tokenizer): Split Tiktoken out into BPETokenizerBase and Tiktoken (gabe-l-hart, Nov 13, 2024)
221811b  feat(tokenizers): Add json parsing to PreTokenizerConfig factory (gabe-l-hart, Nov 15, 2024)
9f898fe  feat(tokenizers): Use BPETokenizerBase and PreTokenizers in HFTokenizer (gabe-l-hart, Nov 15, 2024)
a29a0fb  feat(tokenizers): Add a minimal implementation of the HF decoder suite (gabe-l-hart, Nov 15, 2024)
7084831  fix(comment): Better comment in pre_tokenizer.h (gabe-l-hart, Nov 15, 2024)
b8c7941  feat(tokenizers): Add decoding support to HFTokenizer (gabe-l-hart, Nov 15, 2024)
Files changed (5):

.gitignore (1 addition, 0 deletions)

@@ -18,6 +18,7 @@ torchao-build/*
 runner-et/cmake-out/*
 runner-aoti/cmake-out/*
 cmake-out/
+build/

 # pte files
 *.pte
.gitmodules (6 additions, 0 deletions)

@@ -7,3 +7,9 @@
 [submodule "tokenizer/third-party/sentencepiece"]
   path = tokenizer/third-party/sentencepiece
   url = https://github.com/google/sentencepiece.git
+[submodule "tokenizer/third-party/json"]
+  path = tokenizer/third-party/json
+  url = https://github.com/nlohmann/json.git
+[submodule "tokenizer/third-party/googletest"]
+  path = tokenizer/third-party/googletest
+  url = https://github.com/google/googletest.git
install/requirements.txt (4 additions, 0 deletions)

@@ -9,6 +9,10 @@ gguf
 # Tiktoken tokenizer for Llama 3 and other advanced models
 tiktoken

+# Tokenizers and jinja2 for other non-llama models that use HF tokenizers
+tokenizers
+jinja2
+
 # Miscellaneous
 snakeviz
 sentencepiece
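
The two new Python deps mirror what this PR implements in C++: `tokenizers` loads a Hugging Face tokenizer.json, and `jinja2` renders the chat template string that ships in tokenizer_config.json. A minimal sketch of how they combine; the file paths and message content are placeholders, not files from this repo:

```python
import json

import jinja2
from tokenizers import Tokenizer

# Placeholder paths: point these at a real HF model directory.
tok = Tokenizer.from_file("model_dir/tokenizer.json")
with open("model_dir/tokenizer_config.json") as f:
    config = json.load(f)

# HF models carry their chat template as a jinja2 string in the config.
template = jinja2.Template(config["chat_template"])
prompt = template.render(
    messages=[{"role": "user", "content": "Hello!"}],
    add_generation_prompt=True,
)
ids = tok.encode(prompt).ids  # token ids for the rendered prompt
```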
tokenizer/CMakeLists.txt (41 additions, 2 deletions)

@@ -11,9 +11,18 @@ add_library(
   tokenizer
   tokenizer.h
   sentencepiece.cpp
-  tiktoken.cpp)
+  tiktoken.cpp
+  hf_tokenizers.cpp
+  pre_tokenizer.cpp
+  token_decoder.cpp
+  # llama.cpp unicode
+  unicode-data.cpp
+  unicode.cpp)

-target_include_directories(tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} third-party/sentencepiece/src)
+target_include_directories(
+  tokenizer PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
+                   third-party/sentencepiece/src
+                   third-party/json/single_include)

 # add RE2 as subdirectory
 set(ABSL_ENABLE_INSTALL ON)
@@ -27,3 +36,33 @@ add_subdirectory(third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

 target_link_libraries(tokenizer PUBLIC re2::re2 sentencepiece-static)
+
+# Add tokenizer_tool binary for testing
+set(binary_dir "${CMAKE_BINARY_DIR}/bin")
+OPTION(BUILD_TOKENIZE_TOOL "Build the tokenize_tool binary" OFF)
+if (BUILD_TOKENIZE_TOOL)
+  add_executable(tokenize_tool tokenize_tool.cpp)
+  set_target_properties(tokenize_tool PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${binary_dir})
+  target_include_directories(tokenize_tool PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+  target_link_libraries(tokenize_tool tokenizer)
+endif()
+
+# Add unit tests
+OPTION(BUILD_UNIT_TESTS "Enable unit testing" OFF)
+if (BUILD_UNIT_TESTS)
+  add_subdirectory(third-party/googletest)
+  enable_testing()
+  set(test_binary_dir "${binary_dir}/test")
+
+  file(GLOB test_source_files tests/*_test.cpp)
+  foreach(test_source_file ${test_source_files})
+    get_filename_component(test_name ${test_source_file} NAME_WE)
+    message(STATUS "Configuring unit test ${test_name}")
+    add_executable(${test_name} tests/main.cpp ${test_source_file})
+    set_target_properties(${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${test_binary_dir})
+    target_include_directories(${test_name} PRIVATE ${CMAKE_SOURCE_DIR})
+    target_include_directories(${test_name} PRIVATE third-party/googletest/googletest/include)
+    target_link_libraries(${test_name} gtest tokenizer)
+    add_test(${test_name} "${test_binary_dir}/${test_name}")
+  endforeach()
+endif()
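
For context on the tokenize_tool target: commit 6b5a48a describes it as a simple tool to round-trip tokenize strings. The tool itself is C++, but a rough Python analogue of a round-trip check, using the `tokenizers` package added to requirements.txt above, might look like this (the path and sample text are placeholders):

```python
# Rough Python analogue of the C++ tokenize_tool round-trip check.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("model_dir/tokenizer.json")  # placeholder path

text = "The quick brown fox"
ids = tok.encode(text).ids
decoded = tok.decode(ids)

print(ids)      # token ids produced by encode
print(decoded)  # should round-trip back to (a normalization of) the input
```

On the test side, the glob-driven loop gives each tests/*_test.cpp its own gtest binary and registers it with add_test, so a single ctest invocation runs the whole suite when BUILD_UNIT_TESTS is ON.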
tokenizer/hf_tokenizer.py (27 additions, 1 deletion)

@@ -5,11 +5,12 @@
 # LICENSE file in the root directory of this source tree.

 # Standard
-from typing import List, Optional
+from typing import Dict, List, Optional
 import json
 import os

 # Third Party
+import jinja2
 from tokenizers import Tokenizer

 # Local
@@ -37,6 +38,9 @@ def __init__(self, file_path: str):
         # Load the tokenizer itself
         self._tokenizer = Tokenizer.from_file(tokenizer_path)

+        # Load the chat template if we have a config path
+        self._chat_template: Optional[jinja2.Template] = None
+
         # If available, parse bos/eos tokens from the tokenizer config
         self._bos_id, self._eos_id = None, None
         if tokenizer_config_path is not None:
@@ -48,6 +52,8 @@
                 self._bos_id = self._tokenizer.token_to_id(bos_token)
             if eos_token is not None:
                 self._eos_id = self._tokenizer.token_to_id(eos_token)
+            if chat_template_str := tok_config.get("chat_template"):
+                self._chat_template = jinja2.Template(chat_template_str)

         # If no eos/bos tokens found, go looking for them!
         if None in [self._bos_id, self._eos_id]:
@@ -70,6 +76,8 @@ def _look_for_special_token(added_tokens: dict, search_strs: List[str]) -> Optio
         if len(candidate_toks) == 1:
             return candidate_toks[0]["id"]

+    ## Interface ##
+
     def encode(
         self,
         s: str,
@@ -90,3 +98,21 @@ def bos_id(self) -> int:

     def eos_id(self) -> int:
         return self._eos_id
+
+    ## Additional Public Methods ##
+
+    def has_chat_template(self) -> bool:
+        return bool(self._chat_template)
+
+    def apply_chat_template(
+        self,
+        dialog: List[Dict[str, str]],
+        add_generation_prompt: bool = False,
+    ) -> str:
+        """If configured with a chat template, apply it to the list of messages
+        """
+        if not self._chat_template:
+            raise ValueError("No chat template configured!")
+        return self._chat_template.render(
+            messages=dialog, add_generation_prompt=add_generation_prompt
+        )
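
The apply_chat_template method added above is a thin wrapper over jinja2: the template receives the dialog as `messages` plus the `add_generation_prompt` flag. A standalone sketch of the same render call, using a toy template as a stand-in (real templates, such as Granite Code's, are longer and ship in the model's tokenizer_config.json):

```python
import jinja2

# Toy chat template in the style of an HF chat_template string; not a
# template from this repo or any real model.
chat_template_str = (
    "{% for message in messages %}"
    "<|{{ message['role'] }}|>\n{{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}<|assistant|>\n{% endif %}"
)
template = jinja2.Template(chat_template_str)

dialog = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write hello world in C."},
]
# Same render call apply_chat_template makes internally.
print(template.render(messages=dialog, add_generation_prompt=True))
```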