Skip to content

Commit

Permalink
feat: improve Tokenizers (#45)
Browse files Browse the repository at this point in the history
* feat: improve tokenizers

* feat: id to token & token to id
  • Loading branch information
load1n9 authored Nov 18, 2023
1 parent f607e46 commit 1ea74e3
Show file tree
Hide file tree
Showing 7 changed files with 328 additions and 34 deletions.
12 changes: 12 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ndarray = "0.15.6"
ndarray-rand = "0.14.0"
serde = {version = "1.0", features = ["derive"]}
serde_json = "1.0"
serde-wasm-bindgen = "0.6.0"
tokenizers = { version="0.14.1", default-features=false, features = ["unstable_wasm"]}
wasm-bindgen = "=0.2.84"
getrandom = { version = "0.2", features = ["js"] }
Expand Down
54 changes: 52 additions & 2 deletions crates/tokenizers/src/wasm.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::RESOURCES;
use std::str::FromStr;
use std::{collections::HashMap, str::FromStr};
use tokenizers::{models::bpe::BPE, tokenizer::Tokenizer};
use wasm_bindgen::prelude::*;

Expand Down Expand Up @@ -39,7 +39,7 @@ pub fn wasm_bpe_default() -> usize {
}

#[wasm_bindgen]
pub fn wasm_tokenizer_tokenize(id: usize, string: String) -> Vec<u32> {
pub fn wasm_tokenizer_encode(id: usize, string: String) -> Vec<u32> {
let mut data: Vec<u32> = Vec::new();
RESOURCES.with(|cell| {
let tokenizers = cell.tokenizer.borrow_mut();
Expand All @@ -53,3 +53,53 @@ pub fn wasm_tokenizer_tokenize(id: usize, string: String) -> Vec<u32> {
});
data
}

#[wasm_bindgen]
/// Returns the vocabulary (token -> id map) of the tokenizer stored at `id`
/// as a JS object.
///
/// `with_added_tokens` controls whether tokens added after training are
/// included in the map.
///
/// # Panics
/// Panics if `id` is not a valid tokenizer index, or if the vocabulary
/// cannot be serialized to a `JsValue`.
pub fn wasm_tokenizer_get_vocab(id: usize, with_added_tokens: bool) -> JsValue {
    RESOURCES.with(|cell| {
        let tokenizers = cell.tokenizer.borrow_mut();
        // NOTE(review): assumes `id` was previously returned by a tokenizer
        // constructor in this module — indexing panics otherwise.
        let vocab: HashMap<String, u32> = tokenizers[id].get_vocab(with_added_tokens);
        serde_wasm_bindgen::to_value(&vocab)
            .expect("vocab map of String -> u32 is always serializable to JsValue")
    })
}

#[wasm_bindgen]
/// Returns the vocabulary size of the tokenizer stored at `id`.
///
/// `with_added_tokens` controls whether tokens added after training are
/// counted.
///
/// # Panics
/// Panics if `id` is not a valid tokenizer index.
pub fn wasm_tokenizer_get_vocab_size(id: usize, with_added_tokens: bool) -> usize {
    // `RESOURCES.with` returns the closure's value, so the size can be
    // produced directly without an outer mutable accumulator.
    RESOURCES.with(|cell| {
        let tokenizers = cell.tokenizer.borrow_mut();
        tokenizers[id].get_vocab_size(with_added_tokens)
    })
}

#[wasm_bindgen]
/// Decodes a sequence of token ids back into a string using the tokenizer
/// stored at `id`.
///
/// `skip_special_tokens` controls whether special tokens (padding, BOS/EOS,
/// etc.) are omitted from the output.
///
/// # Panics
/// Panics if `id` is not a valid tokenizer index or if decoding fails;
/// the panic surfaces as a JS exception on the wasm boundary.
pub fn wasm_tokenizer_decode(id: usize, ids: &[u32], skip_special_tokens: bool) -> String {
    RESOURCES.with(|cell| {
        let tokenizers = cell.tokenizer.borrow_mut();
        // `expect` instead of bare `unwrap` so a decode failure reports a
        // meaningful message rather than an opaque panic location.
        tokenizers[id]
            .decode(ids, skip_special_tokens)
            .expect("failed to decode token ids")
    })
}

#[wasm_bindgen]
/// Looks up the numeric id of `token` in the vocabulary of the tokenizer
/// stored at `id`.
///
/// # Panics
/// Panics if `id` is not a valid tokenizer index or if `token` is not in
/// the vocabulary; the panic surfaces as a JS exception on the wasm
/// boundary.
pub fn wasm_tokenizer_token_to_id(id: usize, token: String) -> u32 {
    RESOURCES.with(|cell| {
        let tokenizers = cell.tokenizer.borrow_mut();
        // `token_to_id` returns Option; an unknown token previously hit a
        // bare `unwrap` with no message. `expect` documents the failure.
        tokenizers[id]
            .token_to_id(token.as_str())
            .expect("token not found in vocabulary")
    })
}

#[wasm_bindgen]
/// Looks up the string token for the numeric `token_id` in the vocabulary
/// of the tokenizer stored at `id`.
///
/// # Panics
/// Panics if `id` is not a valid tokenizer index or if `token_id` does not
/// map to any token; the panic surfaces as a JS exception on the wasm
/// boundary.
pub fn wasm_tokenizer_id_to_token(id: usize, token_id: u32) -> String {
    RESOURCES.with(|cell| {
        let tokenizers = cell.tokenizer.borrow_mut();
        // `id_to_token` returns Option; an out-of-range id previously hit a
        // bare `unwrap` with no message. `expect` documents the failure.
        tokenizers[id]
            .id_to_token(token_id)
            .expect("token id not found in vocabulary")
    })
}
7 changes: 5 additions & 2 deletions examples/tokenizers/basic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@ import { init, Tokenizer } from "../../tokenizers/mod.ts";

await init();

const tokenizer = Tokenizer.fromJson(
const tokenizer = Tokenizer.fromJSON(
await (await fetch(
`https://huggingface.co/satvikag/chatbot/resolve/main/tokenizer.json`,
)).text(),
);

console.log(tokenizer.tokenize("Hello World!"));
const encoded = tokenizer.encode("Hello World!");
console.log(encoded);
const decoded = tokenizer.decode(encoded);
console.log(decoded);
Loading

0 comments on commit 1ea74e3

Please sign in to comment.