Commit 84aa1c9

ls
1 parent 61796c1 commit 84aa1c9

File tree

1 file changed: +105 -0 lines changed

visual_tokenizer.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""Gradio demo that visualizes how different tokenizers split input text."""

import os
from functools import lru_cache
from typing import Dict

import gradio as gr
from loguru import logger
from sentencepiece import SentencePieceProcessor
from transformers import AutoTokenizer

CANDIDATES = [  # model names, sorted alphabetically
    "baichuan-inc/Baichuan2-13B-Chat",
    "bigcode/starcoder2-15b",
    "deepseek-ai/deepseek-coder-33b-instruct",
    # "google/gemma-7b",
    "gpt2",
    # "meta-llama/Llama-2-7b-chat-hf",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "THUDM/chatglm3-6b",
]
SENTENCE_PIECE_MAPPING = {}
SP_PREFIX = "SentencePiece/"


def add_sp_tokenizer(name: str, tokenizer_path: str):
    """Add a local SentencePiece tokenizer to the list of available tokenizers."""
    model_key = SP_PREFIX + name
    if not os.path.exists(tokenizer_path):
        raise FileNotFoundError(f"Tokenizer file not found: {tokenizer_path}")
    SENTENCE_PIECE_MAPPING[model_key] = tokenizer_path
    CANDIDATES.append(model_key)


# add_sp_tokenizer("LLaMa", "llama_tokenizer.model")
logger.info(f"SentencePiece tokenizers: {list(SENTENCE_PIECE_MAPPING.keys())}")


@lru_cache
def get_tokenizer_and_vocab(name):
    """Load a tokenizer by name and build its reverse vocabulary (token id -> piece)."""
    if name.startswith(SP_PREFIX):
        local_file_path = SENTENCE_PIECE_MAPPING[name]
        tokenizer = SentencePieceProcessor(local_file_path)
        rev_vocab = {id_: tokenizer.id_to_piece(id_) for id_ in range(tokenizer.get_piece_size())}  # noqa
    else:
        tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
        rev_vocab = {v: k for k, v in tokenizer.get_vocab().items()}
    return tokenizer, rev_vocab


def tokenize(name, text) -> Dict:
    """Encode `text` and return the {"text", "entities"} dict expected by gr.HighlightedText."""
    tokenizer, rev_vocab = get_tokenizer_and_vocab(name)

    ids = tokenizer.encode(text)
    s, entities = '', []
    for i in ids:
        # Label each token's character span with its id so it can be highlighted.
        entity = str(i)
        start = len(s)
        s += rev_vocab[i]
        end = len(s)
        entities.append({"entity": entity, "start": start, "end": end})

    return {
        "text": s + f"\n({len(ids)} tokens / {len(text)} characters)",
        "entities": entities
    }


@logger.catch(reraise=True)
def make_demo():
    logger.info("Creating Interface...")

    DEFAULT_TOKENIZER = CANDIDATES[0]
    DEFAULT_INPUTTEXT = "Hello world."

    demo = gr.Interface(
        fn=tokenize,
        inputs=[
            gr.Dropdown(
                CANDIDATES, value=DEFAULT_TOKENIZER,
                label="Tokenizer", allow_custom_value=False
            ),
            gr.TextArea(value=DEFAULT_INPUTTEXT, label="Input text"),
        ],
        outputs=[
            gr.HighlightedText(
                value=tokenize(DEFAULT_TOKENIZER, DEFAULT_INPUTTEXT),
                label="Tokenized results"
            )
        ],
        title="Tokenizer Visualizer",
        description="If you want to try more tokenizers, please contact the author @wangfeng",  # noqa
        examples=[
            # Classic Chinese segmentation-ambiguity sentences.
            [DEFAULT_TOKENIZER, "乒乓球拍卖完了,无线电法国别研究,我一把把把把住了"],
            ["bigcode/starcoder2-15b", "def print():\n print('Hello')"],
        ],
        cache_examples=True,
        live=True,
    )
    return demo


if __name__ == "__main__":
    demo = make_demo()
    demo.launch(server_name="0.0.0.0")
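
For reference, the dict returned by tokenize is exactly the value format gr.HighlightedText consumes: the reconstructed piece string plus a list of character spans labelled with token ids. A minimal usage sketch, assuming visual_tokenizer.py is importable from the working directory; the exact pieces and ids depend on the chosen tokenizer, and byte-level vocabularies such as GPT-2's render whitespace markers like "Ġ" in the reconstructed text:

from visual_tokenizer import tokenize

result = tokenize("gpt2", "Hello world.")
print(result["text"])  # reconstructed pieces followed by "\n(N tokens / 12 characters)"
for span in result["entities"]:
    # Each span is {"entity": "<token id as a string>", "start": ..., "end": ...},
    # covering the characters that token contributed to the string above.
    print(span["entity"], span["start"], span["end"])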
