Skip to content

Commit b28b6d4

Browse files
tarekgh and ericstj authored
BpeTokenizer Cleanup (#7514)
* BpeTokenizer Cleanup * Apply suggestions from code review Co-authored-by: Eric StJohn <ericstj@microsoft.com> --------- Co-authored-by: Eric StJohn <ericstj@microsoft.com>
1 parent 37601f3 commit b28b6d4

File tree

2 files changed

+77
-18
lines changed

2 files changed

+77
-18
lines changed

src/Microsoft.ML.Tokenizers/Model/BPETokenizer.cs

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ private BpeTokenizer(
320320

321321
if (beginningOfSentenceToken is not null)
322322
{
323-
if (!_vocab.TryGetValue(beginningOfSentenceToken, out int aId))
323+
if (_vocab.TryGetValue(beginningOfSentenceToken, out int aId) is false && specialTokens?.TryGetValue(beginningOfSentenceToken, out aId) is false)
324324
{
325325
throw new InvalidOperationException($"The beginning of sentence token '{beginningOfSentenceToken}' was not present in the vocabulary.");
326326
}
@@ -331,7 +331,7 @@ private BpeTokenizer(
331331

332332
if (endOfSentenceToken is not null)
333333
{
334-
if (!_vocab.TryGetValue(endOfSentenceToken, out int aId))
334+
if (_vocab.TryGetValue(endOfSentenceToken, out int aId) is false && specialTokens?.TryGetValue(endOfSentenceToken, out aId) is false)
335335
{
336336
throw new InvalidOperationException($"The end of sentence token '{endOfSentenceToken}' was not present in the vocabulary.");
337337
}
@@ -792,31 +792,30 @@ public string Decode(IEnumerable<int> ids, bool considerSpecialTokens)
792792

793793
ValueStringBuilder sb = new ValueStringBuilder();
794794

795-
bool decodeUnknownToken = _unknownTokenId.HasValue && considerSpecialTokens;
796-
797-
if (decodeUnknownToken)
795+
foreach (int id in ids)
798796
{
799-
foreach (int id in ids)
797+
if (_specialTokensReverse?.TryGetValue(id, out string? token) is true)
800798
{
801-
if (MapIdToToken(id) is string s)
799+
if (considerSpecialTokens)
802800
{
803-
sb.Append(s);
801+
sb.Append(token);
804802
}
803+
continue;
805804
}
806-
}
807-
else
808-
{
809-
foreach (int id in ids)
805+
806+
if (id == _unknownTokenId)
810807
{
811-
if (id == _unknownTokenId)
808+
if (considerSpecialTokens)
812809
{
813-
continue;
810+
Debug.Assert(UnknownToken is not null);
811+
sb.Append(UnknownToken);
814812
}
813+
continue;
814+
}
815815

816-
if (MapIdToToken(id) is string s)
817-
{
818-
sb.Append(s);
819-
}
816+
if (MapIdToToken(id) is string s)
817+
{
818+
sb.Append(s);
820819
}
821820
}
822821

test/Microsoft.ML.Tokenizers.Tests/BpeTests.cs

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,66 @@ public void TestDeepSeekR1Tokenizer(string text, int[] ids, string[] tokens, (in
885885
Assert.Equal(text, tokenizer.Decode(ids, considerSpecialTokens: false));
886886
}
887887

888+
[Fact]
889+
public void TestTokenizerWithSpecialTokens()
890+
{
891+
// "https://huggingface.co/openai-community/gpt2/raw/main/vocab.json";
892+
// "https://huggingface.co/openai-community/gpt2/raw/main/merges.txt";
893+
894+
BpeOptions options = new BpeOptions(Path.Combine(@"Gpt-2", "vocab.json"), Path.Combine(@"Gpt-2", "merges.txt"))
895+
{
896+
UnknownToken = "unk",
897+
898+
SpecialTokens = new Dictionary<string, int> // SpecialTokens not part of the original vocab.json
899+
{
900+
{ "<|sos|>", 50257 },
901+
{ "<|eos|>", 50258 }
902+
},
903+
BeginningOfSentenceToken = "<|sos|>",
904+
EndOfSentenceToken = "<|eos|>"
905+
};
906+
907+
BpeTokenizer bpeTokenizer = BpeTokenizer.Create(options);
908+
Assert.True(bpeTokenizer.Vocabulary.TryGetValue(options.UnknownToken, out int unkId));
909+
910+
string text = "Hello world!\uD800";
911+
912+
var ids = bpeTokenizer.EncodeToIds(text, considerPreTokenization: false);
913+
Assert.Equal([50257, 15496, 2954, 6894, 0, 2954, 50258], ids); // space and u+D800 couldn't be encoded and produced unk tokens
914+
Assert.Equal(unkId, ids[ids.Count - 2]);
915+
Assert.Equal(options.SpecialTokens["<|sos|>"], ids[0]);
916+
Assert.Equal(options.SpecialTokens["<|eos|>"], ids[^1]);
917+
918+
var tokens = bpeTokenizer.EncodeToTokens(text, out _, considerPreTokenization: false).Select(t => t.Value).ToArray();
919+
Assert.Equal(["<|sos|>", "Hello", "unk", "world", "!", "unk", "<|eos|>"], tokens);
920+
921+
Assert.Equal("<|sos|>Hellounkworld!unk<|eos|>", bpeTokenizer.Decode(ids));
922+
Assert.Equal("Helloworld!", bpeTokenizer.Decode(ids, considerSpecialTokens: false));
923+
924+
BpeOptions options1 = new BpeOptions(options.Vocabulary)
925+
{
926+
// Null UnknownToken means no unknown token support
927+
Merges = options.Merges,
928+
SpecialTokens = options.SpecialTokens,
929+
BeginningOfSentenceToken = options.BeginningOfSentenceToken,
930+
EndOfSentenceToken = options.EndOfSentenceToken
931+
};
932+
933+
bpeTokenizer = BpeTokenizer.Create(options1);
934+
ids = bpeTokenizer.EncodeToIds(text, considerPreTokenization: false);
935+
936+
// Because Unknown is not supported in this encoding, the encoding will produce different encoding results
937+
Assert.Equal([50257, 39, 5037, 1764, 0, 50258], ids);
938+
Assert.Equal(options.SpecialTokens["<|sos|>"], ids[0]);
939+
Assert.Equal(options.SpecialTokens["<|eos|>"], ids[^1]);
940+
941+
tokens = bpeTokenizer.EncodeToTokens(text, out _, considerPreTokenization: false).Select(t => t.Value).ToArray();
942+
Assert.Equal(["<|sos|>", "H", "ellow", "orld", "!", "<|eos|>"], tokens);
943+
944+
Assert.Equal("<|sos|>Helloworld!<|eos|>", bpeTokenizer.Decode(ids));
945+
Assert.Equal("Helloworld!", bpeTokenizer.Decode(ids, considerSpecialTokens: false));
946+
}
947+
888948
private static BpeTokenizer CreateBpeTokenizerFromJson()
889949
{
890950
// @"https://huggingface.co/deepseek-ai/DeepSeek-R1/resolve/main/tokenizer.json?download=true"

0 commit comments

Comments
 (0)