@@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
3636        { "    Hello"                , {     258 ,  23090 , }, },
3737        { "     Hello"               , {     466 ,  23090 , }, },
3838        { "     Hello\n     Hello"    , {     466 ,  23090 ,    742 ,  23090 , }, },
39+         { " \n  ="                    , {    1212 ,     40 , }, },
40+         { " ' era"                   , {      18 ,   4932 , }, },
3941    };
4042
4143    return  _k_tests;
@@ -155,7 +157,7 @@ int main(int argc, char **argv) {
155157
156158        fprintf (stderr, " %s : text size: %zu\n "  , __func__, text.size ());
157159
158-         const  std::vector<llama_token> res = llama_tokenize (ctx, text, true );
160+         const  std::vector<llama_token> res = llama_tokenize (ctx, text, false );
159161
160162        fprintf (stderr, " %s : tokens: %zu\n "  , __func__, res.size ());
161163
@@ -169,10 +171,8 @@ int main(int argc, char **argv) {
169171            }
170172
171173            for  (const  auto  & tok : res) {
172-                 ofs << tok << "  "  ;
174+                 ofs << tok << "  ' "  <<  llama_detokenize_bpe (ctx, std::vector< int >{tok}) <<  " ' "  << std::endl ;
173175            }
174- 
175-             ofs << " \n "  ;
176176        }
177177
178178        fprintf (stderr, " %s : tokens written to '%s'\n "  , __func__, (fname_text + " .tokcpp"  ).c_str ());
0 commit comments