HF Tokenizers #11

Merged · 16 commits · Dec 21, 2024

7 changes: 1 addition & 6 deletions .github/workflows/pull.yml
@@ -25,9 +25,4 @@ jobs:
       script: |
         cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
         cmake --build build -j9 --config Debug
-
-        # Run unit tests
-        export RESOURCES_PATH=test/resources
-
-        build/sentencepiece_test
-        build/tiktoken_test
+        ctest
7 changes: 1 addition & 6 deletions .github/workflows/trunk.yml
@@ -28,9 +28,4 @@ jobs:
       script: |
         cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
         cmake --build build -j9 --config Debug
-
-        # Run unit tests
-        export RESOURCES_PATH=test/resources
-
-        build/sentencepiece_test
-        build/tiktoken_test
+        ctest
3 changes: 3 additions & 0 deletions .gitmodules
@@ -7,3 +7,6 @@
 [submodule "third-party/abseil-cpp"]
 	path = third-party/abseil-cpp
 	url = https://github.com/abseil/abseil-cpp.git
+[submodule "third-party/json"]
+	path = third-party/json
+	url = https://github.com/nlohmann/json.git
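
The new submodule is nlohmann/json, vendored (judging by the single_include path added to CMakeLists.txt below) so the C++ code can parse Hugging Face tokenizer.json files. A minimal sketch of that single-include API, assuming the usual tokenizer.json layout with the vocab under model.vocab (the file name and keys are illustrative, not taken from this PR):

#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_map>

#include <nlohmann/json.hpp>

int main() {
  // Parse a tokenizer.json straight from a stream.
  std::ifstream file("tokenizer.json");
  nlohmann::json parsed = nlohmann::json::parse(file);

  // tokenizer.json keeps the vocab as {token: id} under model.vocab
  // (an assumption about the file layout, not code from this PR).
  std::unordered_map<std::string, uint64_t> vocab;
  for (const auto &[token, id] : parsed["model"]["vocab"].items()) {
    vocab[token] = id.get<uint64_t>();
  }
  std::cout << "loaded " << vocab.size() << " vocab entries\n";
  return 0;
}
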
56 changes: 36 additions & 20 deletions CMakeLists.txt
@@ -20,6 +20,7 @@ set(CMAKE_CXX_STANDARD 17)
 project(Tokenizers)

 option(TOKENIZERS_BUILD_TEST "Build tests" OFF)
+option(TOKENIZERS_BUILD_TOOLS "Build tools" OFF)

 # Ignore weak attribute warning
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-attributes")
@@ -28,23 +29,31 @@ set(ABSL_ENABLE_INSTALL ON)
 set(ABSL_PROPAGATE_CXX_STD ON)
 set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-add_subdirectory(third-party/abseil-cpp)
-add_subdirectory(third-party/re2)
-add_subdirectory(third-party/sentencepiece)
+add_subdirectory(${CMAKE_SOURCE_DIR}/third-party/abseil-cpp)
+add_subdirectory(${CMAKE_SOURCE_DIR}/third-party/re2)
+add_subdirectory(${CMAKE_SOURCE_DIR}/third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

-add_library(tokenizers STATIC src/sentencepiece.cpp src/tiktoken.cpp)
+file(GLOB tokenizers_source_files ${CMAKE_SOURCE_DIR}/src/*.cpp)
+file(GLOB unicode_source_files ${CMAKE_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
+add_library(tokenizers STATIC ${tokenizers_source_files} ${unicode_source_files})

 # Using abseil from sentencepiece/third_party
 target_include_directories(
-  tokenizers PUBLIC third-party/sentencepiece/src third-party/sentencepiece
-  include third-party/re2)
+  tokenizers PUBLIC
+  ${CMAKE_SOURCE_DIR}/include
+  ${CMAKE_SOURCE_DIR}/third-party/sentencepiece
+  ${CMAKE_SOURCE_DIR}/third-party/sentencepiece/src
+  ${CMAKE_SOURCE_DIR}/third-party/re2
+  ${CMAKE_SOURCE_DIR}/third-party/json/single_include
+  ${CMAKE_SOURCE_DIR}/third-party/llama.cpp-unicode/include)

 target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)

 # Build test
 if(TOKENIZERS_BUILD_TEST)
-  include(FetchContent)
+  enable_testing()
+  include(FetchContent)
   # CMAKE
   FetchContent_Declare(
     googletest
@@ -56,18 +65,25 @@ if(TOKENIZERS_BUILD_TEST)
       CACHE BOOL "" FORCE)
   FetchContent_MakeAvailable(googletest)

-  set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/test/resources)
-  add_executable(sentencepiece_test test/test_sentencepiece.cpp)
-  target_include_directories(
-    sentencepiece_test
-    PUBLIC third-party/sentencepiece/src third-party/sentencepiece include
-    GTEST_INCLUDE_PATH)
-  target_link_libraries(sentencepiece_test PUBLIC tokenizers gtest_main)
+  file(GLOB test_source_files ${CMAKE_SOURCE_DIR}/test/test_*.cpp)
+  foreach(test_source_file ${test_source_files})
+    get_filename_component(test_name ${test_source_file} NAME_WE)
+    message(STATUS "Configuring unit test ${test_name}")
+    add_executable(${test_name} ${test_source_file})
+    target_include_directories(${test_name} PRIVATE
+      GTEST_INCLUDE_PATH
+      ${CMAKE_SOURCE_DIR}/include
+      ${CMAKE_SOURCE_DIR}/third-party/sentencepiece
+      ${CMAKE_SOURCE_DIR}/third-party/re2
+      ${CMAKE_SOURCE_DIR}/third-party/json/single_include
+    )
+    target_link_libraries(${test_name} gtest_main tokenizers)
+    target_compile_definitions(${test_name} PRIVATE RESOURCES_PATH="${CMAKE_SOURCE_DIR}/test/resources")
+    add_test(${test_name} "${test_name}")
+  endforeach()
 endif()

-# tiktoken tests
-add_executable(tiktoken_test test/test_base64.cpp test/test_tiktoken.cpp)
-target_include_directories(
-  tiktoken_test PUBLIC third-party/re2 third-party/abseil-cpp include
-  GTEST_INCLUDE_PATH)
-target_link_libraries(tiktoken_test PUBLIC tokenizers gtest_main)
+# Build tools
+if(TOKENIZERS_BUILD_TOOLS)
+  add_subdirectory(tools/tokenize_tool)
+endif()
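
Two behavioral notes on this hunk: tests are now discovered by globbing test/test_*.cpp and registered with add_test, which is what lets the workflows above run a bare ctest; and RESOURCES_PATH moves from an environment variable to a compile-time definition. A sketch of how a test can consume that macro (the test body and file name are illustrative, not from the PR):

#include <string>

#include <gtest/gtest.h>

// RESOURCES_PATH is injected by target_compile_definitions above and
// expands to a string literal, so it can seed a path at compile time.
TEST(ResourcesPathSketch, BuildsAResourcePath) {
  const std::string resources = RESOURCES_PATH;
  // Illustrative file name; real tests would open an actual resource.
  const std::string model_path = resources + "/example_tokenizer.model";
  EXPECT_NE(model_path.find("test/resources"), std::string::npos);
}
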
68 changes: 68 additions & 0 deletions include/detail/bpe_tokenizer_base.h
@@ -0,0 +1,68 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

// Base class for all BPE tokenizer implementations
#pragma once

// Standard
#include <unordered_map>
#include <vector>

// Third Party
#include <re2/re2.h>

// Local
#include "result.h"
#include "tokenizer.h"

namespace tokenizers {
namespace detail {

using Encoder = std::unordered_map<std::string, uint64_t>;
using Decoder = std::unordered_map<uint64_t, std::string>;
using Re2UPtr = std::unique_ptr<re2::RE2>;

class BPETokenizerBase : public Tokenizer {
public:
Result<std::vector<uint64_t>> encode(const std::string &input, int8_t bos,
int8_t eos) const override;

Result<std::string> decode(uint64_t prev_token,
uint64_t token) const override;

protected:
explicit BPETokenizerBase() {}
virtual ~BPETokenizerBase() {}

std::pair<std::optional<std::string>, re2::StringPiece>
split_with_allowed_special_token_(re2::StringPiece &input,
const Encoder &allowed_special) const;

Result<std::pair<std::vector<uint64_t>, uint64_t>>
encode_with_special_token_(const std::string &text,
const Encoder &allowed_special) const;

Result<std::vector<uint64_t>> byte_pair_encode_(const std::string &piece,
const Encoder &encoder) const;

// Protected members that can be overloaded by other BPE tokenizers
Re2UPtr special_token_regex_;
Encoder encoder_;
Encoder special_token_encoder_;
Decoder decoder_;
Decoder special_token_decoder_;

private:
virtual Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
uint64_t &last_piece_token_len) const = 0;

virtual void _decode(re2::StringPiece input, std::string &ret) const = 0;
};

} // namespace detail
} // namespace tokenizers
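
For orientation, a concrete tokenizer derives from this base, fills the protected tables, and implements only the two private hooks; the base class supplies the public encode/decode and the special-token handling. A hypothetical minimal subclass (not part of the PR), with the caveats that Error::Ok is an assumption about the success value declared in error.h and that _decode is presumed to receive one token's text from the base class:

#include <string>
#include <vector>

#include "detail/bpe_tokenizer_base.h"

namespace tokenizers {

// Hypothetical subclass sketching the override surface; a real BPE
// tokenizer would pre-tokenize and byte-pair-merge inside _encode.
class TrivialBPETokenizer : public detail::BPETokenizerBase {
 private:
  Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
                uint64_t &last_piece_token_len) const override {
    // Degenerate strategy: look up each byte as its own token.
    for (char c : input) {
      auto it = encoder_.find(std::string(1, c));
      if (it != encoder_.end()) {
        ret.push_back(it->second);
      }
    }
    last_piece_token_len = 0;
    return Error::Ok; // assumed success value from error.h
  }

  void _decode(re2::StringPiece input, std::string &ret) const override {
    // Presumably called with the decoded text of one token; append as-is.
    ret.append(input.data(), input.size());
  }
};

} // namespace tokenizers
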
54 changes: 54 additions & 0 deletions include/hf_tokenizer.h
@@ -0,0 +1,54 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

// Used by many Huggingface models. Adapted from a combination of the original
// rust implementation (https://github.com/huggingface/tokenizers/tree/main)
// and the corresponding support in llama.cpp
// (https://github.com/ggerganov/llama.cpp)
#pragma once

// Standard
#include <string>

// Third Party
#include <re2/re2.h>

// Local
#include "detail/bpe_tokenizer_base.h"
#include "error.h"
#include "pre_tokenizer.h"
#include "result.h"
#include "token_decoder.h"

namespace tokenizers {
class HFTokenizer : public detail::BPETokenizerBase {
public:
/*-- Public Interface --*/

/**
* Default initialize with no loaded data
*/
explicit HFTokenizer() {}
~HFTokenizer() {}

/**
* Load the model data into the tokenizer from the file at tokenizer_path
*/
Error load(const std::string &tokenizer_path) override;

private:
Error _encode(re2::StringPiece &input, std::vector<uint64_t> &ret,
uint64_t &last_piece_token_len) const override;

void _decode(re2::StringPiece input, std::string &ret) const override;

PreTokenizer::Ptr _pretokenizer;
TokenDecoder::Ptr _decoder;
};

} // namespace tokenizers
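
A hypothetical caller, to show how the pieces fit together: load() reads a tokenizer.json produced by Hugging Face tokenizers, and the encode()/decode() inherited from BPETokenizerBase do the rest. Error::Ok and the Result accessors ok()/get() are assumptions about error.h and result.h, and the bos/eos arguments are presumed to count how many BOS/EOS tokens to add:

#include <cstdint>
#include <iostream>
#include <string>

#include "hf_tokenizer.h"

int main() {
  tokenizers::HFTokenizer tok;
  // Illustrative path; load() takes the tokenizer.json location.
  if (tok.load("tokenizer.json") != tokenizers::Error::Ok) { // assumed Ok value
    std::cerr << "failed to load tokenizer\n";
    return 1;
  }

  // bos=1, eos=0: presumably prepend one BOS token, append no EOS.
  auto ids = tok.encode("Hello world", /*bos=*/1, /*eos=*/0);
  if (ids.ok()) { // ok()/get() assumed from result.h
    for (uint64_t id : ids.get()) {
      std::cout << id << ' ';
    }
    std::cout << '\n';
  }
  return 0;
}
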