forked from PaddlePaddle/PaddleNLP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_tokenizer.py
106 lines (78 loc) · 3.8 KB
/
test_tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import numpy as np
import paddle
# import psutil
import pytest
from tokenizers import Tokenizer as HFTokenizer
from paddlenlp.transformers import AutoTokenizer
# Model name handed to `AutoTokenizer.from_pretrained` by the tokenizer fixtures below.
MODEL_NAME = "THUDM/chatglm3-6b"
def measure_time_and_memory(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and report how long the call took.

    Args:
        func: Callable to run.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, execution_time, memory_usage)`` tuple. ``result`` is
        whatever ``func`` returned and ``execution_time`` is the elapsed time
        in seconds. ``memory_usage`` is always ``None``: the psutil-based RSS
        measurement is disabled, but the slot is kept so callers can keep
        unpacking three values.
    """
    # perf_counter() is monotonic and high resolution; time.time() follows the
    # wall clock and can jump (even backwards) if the system clock is adjusted.
    start_time = time.perf_counter()
    result = func(*args, **kwargs)
    execution_time = time.perf_counter() - start_time
    memory_usage = None
    return result, execution_time, memory_usage
@pytest.fixture
def setup_inputs():
    """Provide the Chinese sample text that the tokenizer tests encode."""
    # English alternative, kept here for manual experiments:
    # (
    #     "In the intricate tapestry of linguistic expression, the amalgamation of diverse syntactic structures, nuanced vocabulary,"
    #     "and convoluted clauses not only challenges the adeptness of tokenization algorithms but also underscores the formidable complexity inherent in natural language processing tasks."
    # )
    return "自然语言处理(NLP)是一种人工智能技术,致力于使计算机能够理解、解释和生成人类语言。通过NLP,计算机可以处理和分析大量的自然语言数据,实现自动翻译、情感分析、语言生成等任务,为各种应用场景提供智能化解决方案,如智能客服、信息抽取和文本分类等。"
@pytest.fixture
def tokenizer_fast_hf():
    """Load the tokenizer for ``MODEL_NAME`` through ``transformers`` with ``use_fast=True``."""
    # Imported locally under an alias so it cannot be mistaken for the
    # module-level paddlenlp AutoTokenizer used by the other fixtures.
    from transformers import AutoTokenizer as HFAutoTokenizer

    return HFAutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True, trust_remote_code=True)
@pytest.fixture
def tokenizer_fast():
    """Load the paddlenlp tokenizer for ``MODEL_NAME`` with ``use_fast=True`` and ``from_slow=True``."""
    load_options = {"use_fast": True, "from_slow": True}
    return AutoTokenizer.from_pretrained(MODEL_NAME, **load_options)
@pytest.fixture
def tokenizer_base():
    """Load the paddlenlp tokenizer for ``MODEL_NAME`` with default loading options."""
    return AutoTokenizer.from_pretrained(MODEL_NAME)
def test_tokenizer_type(tokenizer_fast_hf, tokenizer_fast, tokenizer_base):
    """Check the backend object and hub origin of the two paddlenlp tokenizers.

    ``tokenizer_fast_hf`` is requested but not inspected; the checks on its
    type and ``from_hub`` value are currently disabled.
    """
    # The fast tokenizer must wrap a `tokenizers.Tokenizer` backend ...
    backend = tokenizer_fast._tokenizer
    assert isinstance(backend, HFTokenizer)

    # ... while the base tokenizer must not carry such a backend at all.
    base_has_backend = hasattr(tokenizer_base, "_tokenizer")
    assert not base_has_backend

    # Both paddlenlp tokenizers must report the same hub origin.
    fast_hub = tokenizer_fast.from_hub
    base_hub = tokenizer_base.from_hub
    assert fast_hub == base_hub
def test_tokenizer_cost(tokenizer_fast_hf, tokenizer_fast, tokenizer_base, setup_inputs):
    """Time each tokenizer on a batch of 20000 copies of the sample text.

    This is a benchmark rather than a check: it makes no assertions and only
    prints a list of ``{fixture_name: (memory, time)}`` entries, one per
    tokenizer, in the order hf-fast, paddlenlp-fast, paddlenlp-base.
    """
    # Explicit name -> object mapping; avoids eval() on the fixture names.
    tokenizers = {
        "tokenizer_fast_hf": tokenizer_fast_hf,
        "tokenizer_fast": tokenizer_fast,
        "tokenizer_base": tokenizer_base,
    }
    # The batch is identical for every tokenizer, so build it once.
    batch = [setup_inputs] * 20000

    costs = []
    for name, tokenizer in tokenizers.items():
        _, _time, _memory = measure_time_and_memory(tokenizer, batch)
        costs.append({name: (_memory, _time)})
    print(costs)
def test_tokenizer_decode(tokenizer_fast_hf, tokenizer_fast, tokenizer_base, setup_inputs):
    """Check that all three tokenizers produce identical ``input_ids`` for the sample text."""
    token_hf = tokenizer_fast_hf(setup_inputs)
    token_fast = tokenizer_fast(setup_inputs)
    token_base = tokenizer_base(setup_inputs)
    # No debugger stop here: a breakpoint() would halt or abort unattended test runs.
    assert token_hf["input_ids"] == token_fast["input_ids"] == token_base["input_ids"]
def test_output_type(tokenizer_fast, setup_inputs):
    """Check that ``return_tensors`` selects the container type of ``input_ids``.

    ``"pd"`` must yield a ``paddle.Tensor`` and ``"np"`` a ``numpy.ndarray``.
    """
    # The isinstance() results must be asserted; as bare expressions they are
    # discarded and the test can never fail.
    paddle_ids = tokenizer_fast.encode(setup_inputs, return_tensors="pd")["input_ids"]
    assert isinstance(paddle_ids, paddle.Tensor)

    numpy_ids = tokenizer_fast.encode(setup_inputs, return_tensors="np")["input_ids"]
    assert isinstance(numpy_ids, np.ndarray)