Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Bug]: AssertionError: All tokenizer files should be in the same directory #9156

Open
1 task done
wangzy0327 opened this issue Sep 20, 2024 · 1 comment
Open
1 task done
Assignees
Labels
bug Something isn't working

Comments

@wangzy0327
Copy link

wangzy0327 commented Sep 20, 2024

软件环境

Markdown

  • paddlepaddle:2.6.0
  • paddlepaddle-gpu: 2.6.0
  • paddlenlp: 2.8.1
  • nvidia cuda 11.2

尝试使用paddlenlp执行gpt模型的推理。选用模型gpt-cpm-small-cn-distill
下面是执行脚本

benchmark_gpt.py

import os
import time
import paddle  
from paddle import nn  
from paddlenlp.transformers import BertModel, BertTokenizer, ErnieModel, ErnieTokenizer
from paddlenlp.transformers import GPTTokenizer, GPTLMHeadModel
from paddlenlp.transformers import Llama3Tokenizer, LlamaModel
import numpy as np

def benchmark(net, input_ids, token_type_ids, repeat=5, warmup=3):
    # warm up
    for _ in range(warmup):
        net(input_ids, token_type_ids)
        paddle.device.synchronize()
    # time
    t = []
    for _ in range(repeat):
        t1 = time.time()
        net(input_ids, token_type_ids)
        paddle.device.synchronize()
        t2 = time.time()
        t.append((t2 - t1)*1000)
    print("--[benchmark] Run for %d times, the average latency is: %f ms" % (repeat, np.mean(t)))    


class TestBase:
    def __init__(self):
        device_info = paddle.get_device()
        print("Current Paddle device : %s"%(device_info))
        self.net = None
        self.input = None
        self.cinn_net = None

    def to_eval(self, use_cinn):
        set_flags(use_cinn)
        if use_cinn:
            if not self.cinn_net:
                self.cinn_net = to_cinn_net(self.net)
            net = self.cinn_net
        else:
            net = self.net
        net.eval()
        return net

    def eval(self, use_cinn):
        net = self.to_eval(use_cinn)
        out = net(self.input)
        return out

    def check_cinn_output(self):
        pd_out = self.eval(use_cinn=False)
        cinn_out = self.eval(use_cinn=True)
        np.testing.assert_allclose(
            cinn_out.numpy(), pd_out.numpy(), atol=1e-3, rtol=1e-3
        )
        print("--[check_cinn_output] cinn result right.")

    def benchmark(self, use_cinn):
        print("--[benchmark] benchmark %s" % ("cinn" if use_cinn else "nocinn"))
        net = self.to_eval(use_cinn)
        benchmark(net, self.input)

class TestGPT(TestBase):
    def __init__(self, batch_size=1):
        super().__init__()
        max_seq_length = 1024  # 最大序列长度
        model_name = 'gpt-cpm-small-cn-distill'
        self.net = GPTLMHeadModel.from_pretrained(model_name)
        self.tokenizer = GPTTokenizer.from_pretrained(model_name)

        # 随机生成输入数据
        encoded_text = self.tokenizer(text="请输入测试样例")
        self.input_ids = paddle.to_tensor([encoded_text['input_ids']])
        self.token_type_ids = paddle.to_tensor([encoded_text['token_type_ids']])        

    def to_eval(self, use_cinn):
        set_flags(use_cinn)
        if use_cinn:
            if not self.cinn_net:
                self.cinn_net = to_cinn_net(self.net)
            net = self.cinn_net
        else:
            net = self.net
        net.eval()
        return net

    def eval(self, use_cinn):
        net = self.to_eval(use_cinn)
        out = net(self.input_ids, self.token_type_ids)
        return out

    def check_cinn_output(self):
        pd_out = self.eval(use_cinn=False)
        cinn_out = self.eval(use_cinn=True)
        np.testing.assert_allclose(
            cinn_out.last_hidden_state.numpy(), pd_out.last_hidden_state.numpy(), atol=1e-3, rtol=1e-3
        )
        print("--[check_cinn_output] cinn result right.")

    def benchmark(self, use_cinn):
        print("--[benchmark] benchmark %s" % ("cinn" if use_cinn else "nocinn"))
        net = self.to_eval(use_cinn)
        benchmark(net, self.input_ids, self.token_type_ids)

if __name__ == "__main__":
    print("Test GPT Model gpt-cpm-small ........")
    model = TestGPT()       
    model.benchmark(use_cinn=False)

执行后输出结果为:

/home/wzy/.local/lib/python3.8/site-packages/_distutils_hack/__init__.py:26: UserWarning: Setuptools is replacing distutils.
  warnings.warn("Setuptools is replacing distutils.")
Test GPT Model gpt-cpm-small ........
Current Paddle device : gpu:0
[2024-09-20 01:37:50,268] [    INFO] - Loading weights file from cache at /home/wzy/.paddlenlp/models/gpt-cpm-large-cn/model_state.pdparams
[2024-09-20 01:38:03,948] [    INFO] - Loaded weights file from disk, setting weights to model.
W0920 01:38:03.950428 12651 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0920 01:38:03.953485 12651 gpu_resources.cc:164] device: 0, cuDNN Version: 8.1.
[2024-09-20 01:38:30,694] [    INFO] - All model checkpoint weights were used when initializing GPTForCausalLM.

[2024-09-20 01:38:30,695] [ WARNING] - Some weights of GPTForCausalLM were not initialized from the model checkpoint at gpt-cpm-large-cn and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2024-09-20 01:38:30,995] [    INFO] - Generation config file not found, using a generation config created from the model config.
Traceback (most recent call last):
  File "benchmark_lstm.py", line 297, in <module>
    model = TestGPT()
  File "benchmark_lstm.py", line 241, in __init__
    self.tokenizer = GPTTokenizer.from_pretrained(model_name)
  File "/home/wzy/.local/lib/python3.8/site-packages/paddlenlp/transformers/tokenizer_utils.py", line 709, in from_pretrained
    tokenizer, tokenizer_config_file_dir = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
  File "/home/wzy/.local/lib/python3.8/site-packages/paddlenlp/transformers/tokenizer_utils_base.py", line 1515, in from_pretrained
    assert len(tokenizer_config_file_dir_list) > 0, "All tokenizer files should be in the same directory."
AssertionError: All tokenizer files should be in the same directory.

重复问题

  • I have searched the existing issues

错误描述

/home/wzy/.local/lib/python3.8/site-packages/_distutils_hack/__init__.py:26: UserWarning: Setuptools is replacing distutils.
  warnings.warn("Setuptools is replacing distutils.")
Test GPT Model gpt-cpm-small ........
Current Paddle device : gpu:0
[2024-09-20 01:37:50,268] [    INFO] - Loading weights file from cache at /home/wzy/.paddlenlp/models/gpt-cpm-large-cn/model_state.pdparams
[2024-09-20 01:38:03,948] [    INFO] - Loaded weights file from disk, setting weights to model.
W0920 01:38:03.950428 12651 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0920 01:38:03.953485 12651 gpu_resources.cc:164] device: 0, cuDNN Version: 8.1.
[2024-09-20 01:38:30,694] [    INFO] - All model checkpoint weights were used when initializing GPTForCausalLM.

[2024-09-20 01:38:30,695] [ WARNING] - Some weights of GPTForCausalLM were not initialized from the model checkpoint at gpt-cpm-large-cn and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2024-09-20 01:38:30,995] [    INFO] - Generation config file not found, using a generation config created from the model config.
Traceback (most recent call last):
  File "benchmark_lstm.py", line 297, in <module>
    model = TestGPT()
  File "benchmark_lstm.py", line 241, in __init__
    self.tokenizer = GPTTokenizer.from_pretrained(model_name)
  File "/home/wzy/.local/lib/python3.8/site-packages/paddlenlp/transformers/tokenizer_utils.py", line 709, in from_pretrained
    tokenizer, tokenizer_config_file_dir = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
  File "/home/wzy/.local/lib/python3.8/site-packages/paddlenlp/transformers/tokenizer_utils_base.py", line 1515, in from_pretrained
    assert len(tokenizer_config_file_dir_list) > 0, "All tokenizer files should be in the same directory."
AssertionError: All tokenizer files should be in the same directory.

稳定复现步骤 & 代码

import os
import time
import paddle  
from paddle import nn  
from paddlenlp.transformers import BertModel, BertTokenizer, ErnieModel, ErnieTokenizer
from paddlenlp.transformers import GPTTokenizer, GPTLMHeadModel
from paddlenlp.transformers import Llama3Tokenizer, LlamaModel
import numpy as np

def benchmark(net, input_ids, token_type_ids, repeat=5, warmup=3):
    # warm up
    for _ in range(warmup):
        net(input_ids, token_type_ids)
        paddle.device.synchronize()
    # time
    t = []
    for _ in range(repeat):
        t1 = time.time()
        net(input_ids, token_type_ids)
        paddle.device.synchronize()
        t2 = time.time()
        t.append((t2 - t1)*1000)
    print("--[benchmark] Run for %d times, the average latency is: %f ms" % (repeat, np.mean(t)))    


class TestBase:
    def __init__(self):
        device_info = paddle.get_device()
        print("Current Paddle device : %s"%(device_info))
        self.net = None
        self.input = None
        self.cinn_net = None

    def to_eval(self, use_cinn):
        set_flags(use_cinn)
        if use_cinn:
            if not self.cinn_net:
                self.cinn_net = to_cinn_net(self.net)
            net = self.cinn_net
        else:
            net = self.net
        net.eval()
        return net

    def eval(self, use_cinn):
        net = self.to_eval(use_cinn)
        out = net(self.input)
        return out

    def check_cinn_output(self):
        pd_out = self.eval(use_cinn=False)
        cinn_out = self.eval(use_cinn=True)
        np.testing.assert_allclose(
            cinn_out.numpy(), pd_out.numpy(), atol=1e-3, rtol=1e-3
        )
        print("--[check_cinn_output] cinn result right.")

    def benchmark(self, use_cinn):
        print("--[benchmark] benchmark %s" % ("cinn" if use_cinn else "nocinn"))
        net = self.to_eval(use_cinn)
        benchmark(net, self.input)

class TestGPT(TestBase):
    def __init__(self, batch_size=1):
        super().__init__()
        max_seq_length = 1024  # 最大序列长度
        model_name = 'gpt-cpm-small-cn-distill'
        self.net = GPTLMHeadModel.from_pretrained(model_name)
        self.tokenizer = GPTTokenizer.from_pretrained(model_name)

        # 随机生成输入数据
        encoded_text = self.tokenizer(text="请输入测试样例")
        self.input_ids = paddle.to_tensor([encoded_text['input_ids']])
        self.token_type_ids = paddle.to_tensor([encoded_text['token_type_ids']])        

    def to_eval(self, use_cinn):
        set_flags(use_cinn)
        if use_cinn:
            if not self.cinn_net:
                self.cinn_net = to_cinn_net(self.net)
            net = self.cinn_net
        else:
            net = self.net
        net.eval()
        return net

    def eval(self, use_cinn):
        net = self.to_eval(use_cinn)
        out = net(self.input_ids, self.token_type_ids)
        return out

    def check_cinn_output(self):
        pd_out = self.eval(use_cinn=False)
        cinn_out = self.eval(use_cinn=True)
        np.testing.assert_allclose(
            cinn_out.last_hidden_state.numpy(), pd_out.last_hidden_state.numpy(), atol=1e-3, rtol=1e-3
        )
        print("--[check_cinn_output] cinn result right.")

    def benchmark(self, use_cinn):
        print("--[benchmark] benchmark %s" % ("cinn" if use_cinn else "nocinn"))
        net = self.to_eval(use_cinn)
        benchmark(net, self.input_ids, self.token_type_ids)

if __name__ == "__main__":
    print("Test GPT Model gpt-cpm-small ........")
    model = TestGPT()       
    model.benchmark(use_cinn=False)
@wangzy0327 wangzy0327 added the bug Something isn't working label Sep 20, 2024
@DrownFish19
Copy link
Collaborator

gpt-cpm-small-cngpt-cpm-small-cn-distill需要使用GPTChineseTokenizer。

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

3 participants