@@ -885,6 +885,66 @@ public void TestDeepSeekR1Tokenizer(string text, int[] ids, string[] tokens, (in
         Assert.Equal(text, tokenizer.Decode(ids, considerSpecialTokens: false));
     }
 
+    [Fact]
+    public void TestTokenizerWithSpecialTokens()
+    {
+        // "https://huggingface.co/openai-community/gpt2/raw/main/vocab.json";
+        // "https://huggingface.co/openai-community/gpt2/raw/main/merges.txt";
+
+        BpeOptions options = new BpeOptions(Path.Combine(@"Gpt-2", "vocab.json"), Path.Combine(@"Gpt-2", "merges.txt"))
+        {
+            UnknownToken = "unk",
+
+            SpecialTokens = new Dictionary<string, int> // special tokens that are not part of the original vocab.json
+            {
+                { "<|sos|>", 50257 },
+                { "<|eos|>", 50258 }
+            },
+            BeginningOfSentenceToken = "<|sos|>",
+            EndOfSentenceToken = "<|eos|>"
+        };
+
+        BpeTokenizer bpeTokenizer = BpeTokenizer.Create(options);
+        Assert.True(bpeTokenizer.Vocabulary.TryGetValue(options.UnknownToken, out int unkId));
+
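+        // Note: U+D800 is an unpaired surrogate and the raw space character is not a regular GPT-2
+        // vocabulary entry, so the assertions below expect both to encode as the unknown token.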
+        string text = "Hello world!\uD800 ";
+
+        var ids = bpeTokenizer.EncodeToIds(text, considerPreTokenization: false);
+        Assert.Equal([50257, 15496, 2954, 6894, 0, 2954, 50258], ids); // the space and U+D800 couldn't be encoded and produced unk tokens
+        Assert.Equal(unkId, ids[ids.Count - 2]);
+        Assert.Equal(options.SpecialTokens["<|sos|>"], ids[0]);
+        Assert.Equal(options.SpecialTokens["<|eos|>"], ids[^1]);
+
+        var tokens = bpeTokenizer.EncodeToTokens(text, out _, considerPreTokenization: false).Select(t => t.Value).ToArray();
+        Assert.Equal(["<|sos|>", "Hello", "unk", "world", "!", "unk", "<|eos|>"], tokens);
+
+        Assert.Equal("<|sos|>Hellounkworld!unk<|eos|>", bpeTokenizer.Decode(ids));
+        Assert.Equal("Helloworld!", bpeTokenizer.Decode(ids, considerSpecialTokens: false));
+
+        BpeOptions options1 = new BpeOptions(options.Vocabulary)
+        {
+            // Leaving UnknownToken null means the tokenizer has no unknown-token support
+            Merges = options.Merges,
+            SpecialTokens = options.SpecialTokens,
+            BeginningOfSentenceToken = options.BeginningOfSentenceToken,
+            EndOfSentenceToken = options.EndOfSentenceToken
+        };
+
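+        // Recreate the tokenizer with the same vocabulary, merges, and special tokens, but without an unknown token.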
+        bpeTokenizer = BpeTokenizer.Create(options1);
+        ids = bpeTokenizer.EncodeToIds(text, considerPreTokenization: false);
+
+        // Because no unknown token is configured, the characters that cannot be encoded are dropped,
+        // and the remaining text merges differently, producing a different id sequence
+        Assert.Equal([50257, 39, 5037, 1764, 0, 50258], ids);
+        Assert.Equal(options.SpecialTokens["<|sos|>"], ids[0]);
+        Assert.Equal(options.SpecialTokens["<|eos|>"], ids[^1]);
+
+        tokens = bpeTokenizer.EncodeToTokens(text, out _, considerPreTokenization: false).Select(t => t.Value).ToArray();
+        Assert.Equal(["<|sos|>", "H", "ellow", "orld", "!", "<|eos|>"], tokens);
+
+        Assert.Equal("<|sos|>Helloworld!<|eos|>", bpeTokenizer.Decode(ids));
+        Assert.Equal("Helloworld!", bpeTokenizer.Decode(ids, considerSpecialTokens: false));
+    }
+
     private static BpeTokenizer CreateBpeTokenizerFromJson()
     {
         // @"https://huggingface.co/deepseek-ai/DeepSeek-R1/resolve/main/tokenizer.json?download=true"