From 589226ea79a23f949308f74df50837c70a79c023 Mon Sep 17 00:00:00 2001 From: pkoukk Date: Tue, 27 Jun 2023 16:27:33 +0800 Subject: [PATCH 1/3] add bpe interface --- encoding.go | 8 ++++---- go.mod | 2 +- go.sum | 4 ++-- load.go | 14 ++++++++++++++ tiktoken.go | 6 ++++++ 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/encoding.go b/encoding.go index d1bd208..fc6c820 100644 --- a/encoding.go +++ b/encoding.go @@ -116,7 +116,7 @@ func initEncoding(encodingName string) (*Encoding, error) { } func cl100k_base() (*Encoding, error) { - ranks, err := loadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken") + ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken") if err != nil { return nil, err } @@ -136,7 +136,7 @@ func cl100k_base() (*Encoding, error) { } func p50k_edit() (*Encoding, error) { - ranks, err := loadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken") + ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken") if err != nil { return nil, err } @@ -150,7 +150,7 @@ func p50k_edit() (*Encoding, error) { } func p50k_base() (*Encoding, error) { - ranks, err := loadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken") + ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken") if err != nil { return nil, err } @@ -173,7 +173,7 @@ func p50k_base() (*Encoding, error) { } func r50k_base() (*Encoding, error) { - ranks, err := loadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken") + ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken") if err != nil { return nil, err } diff --git a/go.mod b/go.mod index 8fba3d1..105f638 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module 
github.com/pkoukk/tiktoken-go go 1.19 require ( - github.com/dlclark/regexp2 v1.8.1 + github.com/dlclark/regexp2 v1.10.0 github.com/google/uuid v1.3.0 github.com/stretchr/testify v1.8.2 ) diff --git a/go.sum b/go.sum index d220fd2..6d00ac7 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/dlclark/regexp2 v1.8.1 h1:6Lcdwya6GjPUNsBct8Lg/yRPwMhABj269AAzdGSiR+0= -github.com/dlclark/regexp2 v1.8.1/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0= +github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/load.go b/load.go index 095aa8c..0611b0e 100644 --- a/load.go +++ b/load.go @@ -14,6 +14,10 @@ import ( "github.com/google/uuid" ) +type BpeLoader interface { + LoadTiktokenBpe(tiktokenBpeFile string) (map[string]int, error) +} + func readFile(blobpath string) ([]byte, error) { if !strings.HasPrefix(blobpath, "http://") && !strings.HasPrefix(blobpath, "https://") { file, err := os.Open(blobpath) @@ -91,3 +95,13 @@ func loadTiktokenBpe(tiktokenBpeFile string) (map[string]int, error) { } return bpeRanks, nil } + +type defaultBpeLoader struct{} + +func (l *defaultBpeLoader) LoadTiktokenBpe(tiktokenBpeFile string) (map[string]int, error) { + return loadTiktokenBpe(tiktokenBpeFile) +} + +func NewDefaultBpeLoader() BpeLoader { + return &defaultBpeLoader{} +} diff --git a/tiktoken.go b/tiktoken.go index 215fd47..82e8565 100644 --- 
a/tiktoken.go +++ b/tiktoken.go @@ -8,6 +8,12 @@ import ( "github.com/dlclark/regexp2" ) +var bpeLoader BpeLoader = NewDefaultBpeLoader() + +func SetBpeLoader(loader BpeLoader) { + bpeLoader = loader +} + func GetEncoding(encodingName string) (*Tiktoken, error) { enc, err := getEncoding(encodingName) if err != nil { From 47c012b72ff5e680e17bb756b3c31be086d8ad24 Mon Sep 17 00:00:00 2001 From: pkoukk Date: Tue, 27 Jun 2023 17:24:45 +0800 Subject: [PATCH 2/3] update readme --- README.md | 38 +++++++++++++++++++++++++------------- README_zh-hans.md | 17 +++++++++++++---- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 7dc0b78..48ec1af 100644 --- a/README.md +++ b/README.md @@ -8,14 +8,10 @@ Tiktoken is a fast BPE tokeniser for use with OpenAI's models. This is a port of the original [tiktoken](https://github.com/openai/tiktoken). # Usage - ## Install ```bash go get github.com/pkoukk/tiktoken-go -# default tiktoken need download token dictionary from openai website, -# if you want use this lib offline, use embed branch instead -go get github.com/pkoukk/tiktoken-go@embed ``` ## Cache Tiktoken-go has the same cache mechanism as the original Tiktoken library. @@ -26,9 +22,22 @@ Once this variable is set, tiktoken-go will use this directory to cache the toke If you don't set this environment variable, tiktoken-go will download the dictionary each time you initialize an encoding for the first time. -## Example +## Alternative BPE loaders +If you don't want to use the cache or download the dictionary each time, you can use an alternative BPE loader. + +Just call `tiktoken.SetBpeLoader` before calling `tiktoken.GetEncoding` or `tiktoken.EncodingForModel`. + +`BpeLoader` is an interface; you can provide your own BPE loader by implementing it. + +### Offline BPE loader +The offline BPE loader loads the BPE dictionary from embedded files, which helps if you don't want to download the dictionary at runtime. 
+ +Due to the size of the BPE dictionary, this loader lives in a separate project. + +Add it as a dependency if you need this loader: [tiktoken_loader](https://github.com/pkoukk/tiktoken-go-loader) -### get token by encoding +## Examples +### Get Token By Encoding ```go package main @@ -42,6 +51,8 @@ func main() { text := "Hello, world!" encoding := "cl100k_base" + // if you don't want to download the dictionary at runtime, you can use the offline loader + // tiktoken.SetBpeLoader(tiktoken_loader.NewOfflineLoader()) tke, err := tiktoken.GetEncoding(encoding) if err != nil { err = fmt.Errorf("getEncoding: %v", err) @@ -58,7 +69,7 @@ func main() { } ``` -### get token by Model +### Get Token By Model ```go package main @@ -87,7 +98,8 @@ func main() { fmt.Println(len(token)) } ``` -### counting tokens for chat API calls + +### Counting Tokens For Chat API Calls Below is an example function for counting tokens for messages passed to gpt-3.5-turbo-0301 or gpt-4-0314. The following code was written by @nasa1024 based on [openai-cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) examples. 
@@ -159,7 +171,7 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string ``` -# available encodings +# Available Encodings | Encoding name | OpenAI models | | ----------------------- | ---------------------------------------------------- | | `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` | @@ -168,13 +180,13 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string -# available models +# Available Models | Model name | OpenAI models | | ---------------------------- | ------------- | -| gpt-4 | cl100k_base | | gpt-4-* | cl100k_base | -| gpt-3.5-turbo | cl100k_base | | gpt-3.5-turbo-* | cl100k_base | +| gpt-4 | cl100k_base | +| gpt-3.5-turbo | cl100k_base | | text-davinci-003 | p50k_base | | text-davinci-002 | p50k_base | | text-davinci-001 | r50k_base | @@ -211,7 +223,7 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string # Test > you can run test in [test](./test) folder -# compare with original [tiktoken](https://github.com/openai/tiktoken) +## compare with original [tiktoken](https://github.com/openai/tiktoken) ## get token by encoding [result](./doc/test_result.md#encoding-test-result) diff --git a/README_zh-hans.md b/README_zh-hans.md index 090bb51..31e7a0f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -7,11 +7,8 @@ tiktoken的原项目地址[tiktoken](https://github.com/openai/tiktoken). 
## 安装 - ```bash go get github.com/pkoukk/tiktoken-go -# 默认的tiktoken需要从openai下载token字典,如果想要离线使用,可以使用以下分支 -go get github.com/pkoukk/tiktoken-go@embed ``` ## 缓存 Tiktoken-go 和原始的 Tiktoken 库一样,具有相同的缓存机制。 @@ -19,9 +16,19 @@ Tiktoken-go 和原始的 Tiktoken 库一样,具有相同的缓存机制。 一旦设置了该变量,tiktoken-go 将使用该目录来缓存令牌字典。 如果您未设置此环境变量,则 tiktoken-go 将在每次首次初始化编码时下载字典。 +## 替代 BPE 加载器 +默认情况下,tiktoken-go 会在运行时下载字典,如果您不想使用缓存或每次下载字典,您可以使用替代 BPE 加载器。 +只需在调用 `tiktoken.GetEncoding` 或 `tiktoken.EncodingForModel` 之前调用 `tiktoken.SetBpeLoader`。 +`BpeLoader` 是一个接口,您可以通过实现此接口来实现自己的 BPE 加载器。 + +### 离线 BPE 加载器 +离线 BPE 加载器从嵌入文件加载 BPE 字典。 +由于 BPE 字典的大小,此加载器在其他项目中。 +如果需要使用,请引用:[tiktoken_loader](https://github.com/pkoukk/tiktoken-go-loader) + ## 例子 -### get token by encoding +### Get Token By Encoding ```go package main @@ -35,6 +42,8 @@ func main() { text := "Hello, world!" encoding := "cl100k_base" + // 如果你不想在运行时下载字典,你可以使用离线加载器 + // tiktoken.SetBpeLoader(tiktoken_loader.NewOfflineLoader()) tke, err := tiktoken.GetEncoding(encoding) if err != nil { err = fmt.Errorf("getEncoding: %v", err) From 7e22393b92df1847414809619f60bacdb53877cd Mon Sep 17 00:00:00 2001 From: pkoukk Date: Tue, 27 Jun 2023 17:28:38 +0800 Subject: [PATCH 3/3] update doc format --- README_zh-hans.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README_zh-hans.md b/README_zh-hans.md index 31e7a0f..7912750 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -12,18 +12,26 @@ go get github.com/pkoukk/tiktoken-go ``` ## 缓存 Tiktoken-go 和原始的 Tiktoken 库一样,具有相同的缓存机制。 + 您可以使用环境变量 TIKTOKEN_CACHE_DIR 来设置缓存目录。 + 一旦设置了该变量,tiktoken-go 将使用该目录来缓存令牌字典。 + 如果您未设置此环境变量,则 tiktoken-go 将在每次首次初始化编码时下载字典。 + ## 替代 BPE 加载器 默认情况下,tiktoken-go 会在运行时下载字典,如果您不想使用缓存或每次下载字典,您可以使用替代 BPE 加载器。 + 只需在调用 `tiktoken.GetEncoding` 或 `tiktoken.EncodingForModel` 之前调用 `tiktoken.SetBpeLoader`。 + `BpeLoader` 是一个接口,您可以通过实现此接口来实现自己的 BPE 加载器。 ### 离线 BPE 加载器 离线 BPE 加载器从嵌入文件加载 BPE 字典。 -由于 BPE 字典的大小,此加载器在其他项目中。 + +由于 BPE 字典的文件较大,不适合包含在本项目中,故此加载器在其他项目中。 + 
如果需要使用,请引用:[tiktoken_loader](https://github.com/pkoukk/tiktoken-go-loader) ## 例子