Skip to content

Commit

Permalink
examples : add tokenize (#4039)
Browse files Browse the repository at this point in the history
  • Loading branch information
zakkor authored Nov 17, 2023
1 parent 2ab0707 commit 2fa02b4
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 0 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ else()
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(main)
add_subdirectory(tokenize)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
Expand Down
5 changes: 5 additions & 0 deletions examples/tokenize/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Build script for the `tokenize` example executable.
set(TARGET tokenize)

# The example consists of a single translation unit.
add_executable(${TARGET} tokenize.cpp)

# Require at least C++11 when compiling this target.
target_compile_features(${TARGET} PRIVATE cxx_std_11)

# Link against the `common` and `llama` targets, plus whatever
# CMAKE_THREAD_LIBS_INIT expands to (set by CMake's FindThreads module).
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

# Install the resulting binary as a runtime artifact.
install(TARGETS ${TARGET} RUNTIME)
44 changes: 44 additions & 0 deletions examples/tokenize/tokenize.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include "common.h"
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

int main(int argc, char ** argv) {
if (argc < 3 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
return 1;
}

auto model_path = argv[1];
auto prompt = argv[2];

const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";

llama_backend_init(false);

llama_model_params model_params = llama_model_default_params();
model_params.vocab_only = true;
llama_model * model = llama_load_model_from_file(model_path, model_params);

llama_context_params ctx_params = llama_context_default_params();
llama_context * ctx = llama_new_context_with_model(model, ctx_params);

const bool add_bos = true;

std::vector<llama_token> tokens;

tokens = ::llama_tokenize(model, prompt, add_bos, true);

for (int i = 0; i < (int) tokens.size(); i++) {
if (printing_ids) {
printf("%d\n", tokens[i]);
} else {
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
}
}

return 0;
}

0 comments on commit 2fa02b4

Please sign in to comment.