`python scripts/vllm_infer.py` 执行模型批量推理报错:Caught signal 8 (Floating point exception: integer divide by zero) #6741
Open
Description
Reminder
- I have read the above rules and searched the existing issues.
System Info
尝试过 https://github.com/hiyouga/LLaMA-Factory/issues/2750#issuecomment-2376066672 的方式,但是没用。
执行方式:
python scripts/vllm_infer.py \
    --model_name_or_path /app/basemodel \
    --adapter_name_or_path /app/model \
    --dataset voc_triples_train \
    --dataset_dir /app/data \
    --template qwen \
    --max_samples 100
[template-name1-wztx7:2630 :0:2630] Caught signal 8 (Floating point exception: integer divide by zero)
==== backtrace (tid: 2630) ====
0 0x0000000000043090 killpg() ???:0
1 0x0000000000aef033 cublasLt_for_cublas_ZZZ() ???:0
2 0x0000000000837ee3 cublasLt_for_cublas_ZZZ() ???:0
3 0x00000000006b5132 cublasLtLegacyGemmUtilizationZZZ() ???:0
4 0x000000000078d897 cublasLtMatmulAlgoCheck() ???:0
5 0x000000000078e8c5 cublasLtMatmulAlgoCheck() ???:0
6 0x000000000078f59e cublasLtMatmulAlgoCheck() ???:0
7 0x00000000007b13e4 cublasLtMatmulAlgoGetHeuristic() ???:0
8 0x00000000036a2121 at::cuda::blas::gemm_and_bias<c10::BFloat16>() :0
9 0x00000000036f4255 at::native::(anonymous namespace)::addmm_out_cuda_impl() Blas.cpp:0
10 0x00000000034425dc at::(anonymous namespace)::wrapper_CUDA_addmm() RegisterCUDA.cpp:0
11 0x00000000034426ad c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::Scalar const&, c10::Scalar const&), &at::(anonymous namespace)::wrapper_CUDA_addmm>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::Scalar const&, c10::Scalar const&> >, at::Tensor (at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::Scalar const&, c10::Scalar const&)>::call() RegisterCUDA.cpp:0
12 0x00000000025a4e0e at::_ops::addmm::call() ???:0
13 0x0000000001d8e4f4 at::native::linear() ???:0
14 0x000000000301bca3 c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeImplicitAutograd__linear>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&> >, at::Tensor (at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&)>::call() RegisterCompositeImplicitAutograd.cpp:0
15 0x000000000258cf6c at::_ops::linear::call() ???:0
16 0x0000000000733bb5 torch::autograd::THPVariable_linear() python_nn_functions.cpp:0
17 0x00000000005f58f9 PyCFunction_Call() ???:0
18 0x00000000005f64c6 _PyObject_MakeTpCall() ???:0
19 0x0000000000571162 _PyEval_EvalFrameDefault() ???:0
20 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
21 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
22 0x000000000056bbda _PyEval_EvalFrameDefault() ???:0
23 0x000000000050af2e PyMethod_New() ???:0
24 0x0000000000570b26 _PyEval_EvalFrameDefault() ???:0
25 0x00000000005f5ca6 _PyFunction_Vectorcall() ???:0
26 0x000000000050b02c PyMethod_New() ???:0
27 0x00000000005f5207 PyObject_Call() ???:0
28 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
29 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
30 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
31 0x000000000050b02c PyMethod_New() ???:0
32 0x00000000005f5207 PyObject_Call() ???:0
33 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
34 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
35 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
36 0x000000000059ca8e PyUnicode_New() ???:0
37 0x00000000005f64c6 _PyObject_MakeTpCall() ???:0
38 0x0000000000571162 _PyEval_EvalFrameDefault() ???:0
39 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
40 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
41 0x000000000050b02c PyMethod_New() ???:0
42 0x00000000005f5207 PyObject_Call() ???:0
43 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
44 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
45 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
46 0x000000000050b02c PyMethod_New() ???:0
47 0x00000000005f5207 PyObject_Call() ???:0
48 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
49 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
50 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
51 0x000000000059cb0f PyUnicode_New() ???:0
52 0x00000000005f64c6 _PyObject_MakeTpCall() ???:0
53 0x00000000005715fd _PyEval_EvalFrameDefault() ???:0
54 0x00000000005f5ca6 _PyFunction_Vectorcall() ???:0
55 0x000000000050b549 PyMethod_New() ???:0
56 0x00000000005f5207 PyObject_Call() ???:0
57 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
ERROR 01-23 11:14:19 multiproc_worker_utils.py:117] Worker VllmWorkerProcess pid 2630 died, exit code: -8
INFO 01-23 11:14:19 multiproc_worker_utils.py:121] Killing local vLLM worker processes
[template-name1-wztx7:2363 :0:2363] Caught signal 8 (Floating point exception: integer divide by zero)
==== backtrace (tid: 2363) ====
0 0x0000000000043090 killpg() ???:0
1 0x0000000000aef033 cublasLt_for_cublas_ZZZ() ???:0
2 0x0000000000837ee3 cublasLt_for_cublas_ZZZ() ???:0
3 0x00000000006b5132 cublasLtLegacyGemmUtilizationZZZ() ???:0
4 0x000000000078d897 cublasLtMatmulAlgoCheck() ???:0
5 0x000000000078e8c5 cublasLtMatmulAlgoCheck() ???:0
6 0x000000000078f59e cublasLtMatmulAlgoCheck() ???:0
7 0x00000000007b13e4 cublasLtMatmulAlgoGetHeuristic() ???:0
8 0x00000000036a2121 at::cuda::blas::gemm_and_bias<c10::BFloat16>() :0
9 0x00000000036f4255 at::native::(anonymous namespace)::addmm_out_cuda_impl() Blas.cpp:0
10 0x00000000034425dc at::(anonymous namespace)::wrapper_CUDA_addmm() RegisterCUDA.cpp:0
11 0x00000000034426ad c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::Scalar const&, c10::Scalar const&), &at::(anonymous namespace)::wrapper_CUDA_addmm>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::Scalar const&, c10::Scalar const&> >, at::Tensor (at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::Scalar const&, c10::Scalar const&)>::call() RegisterCUDA.cpp:0
12 0x00000000025a4e0e at::_ops::addmm::call() ???:0
13 0x0000000001d8e4f4 at::native::linear() ???:0
14 0x000000000301bca3 c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeImplicitAutograd__linear>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&> >, at::Tensor (at::Tensor const&, at::Tensor const&, std::optional<at::Tensor> const&)>::call() RegisterCompositeImplicitAutograd.cpp:0
15 0x000000000258cf6c at::_ops::linear::call() ???:0
16 0x0000000000733bb5 torch::autograd::THPVariable_linear() python_nn_functions.cpp:0
17 0x00000000005f58f9 PyCFunction_Call() ???:0
18 0x00000000005f64c6 _PyObject_MakeTpCall() ???:0
19 0x0000000000571162 _PyEval_EvalFrameDefault() ???:0
20 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
21 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
22 0x000000000056bbda _PyEval_EvalFrameDefault() ???:0
23 0x000000000050af2e PyMethod_New() ???:0
24 0x0000000000570b26 _PyEval_EvalFrameDefault() ???:0
25 0x00000000005f5ca6 _PyFunction_Vectorcall() ???:0
26 0x000000000050b02c PyMethod_New() ???:0
27 0x00000000005f5207 PyObject_Call() ???:0
28 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
29 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
30 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
31 0x000000000050b02c PyMethod_New() ???:0
32 0x00000000005f5207 PyObject_Call() ???:0
33 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
34 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
35 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
36 0x000000000059ca8e PyUnicode_New() ???:0
37 0x00000000005f64c6 _PyObject_MakeTpCall() ???:0
38 0x0000000000571162 _PyEval_EvalFrameDefault() ???:0
39 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
40 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
41 0x000000000050b02c PyMethod_New() ???:0
42 0x00000000005f5207 PyObject_Call() ???:0
43 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
44 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
45 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
46 0x000000000050b02c PyMethod_New() ???:0
47 0x00000000005f5207 PyObject_Call() ???:0
48 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
49 0x0000000000569d8a _PyEval_EvalCodeWithName() ???:0
50 0x00000000005f5e83 _PyFunction_Vectorcall() ???:0
51 0x000000000059cb0f PyUnicode_New() ???:0
52 0x00000000005f64c6 _PyObject_MakeTpCall() ???:0
53 0x00000000005715fd _PyEval_EvalFrameDefault() ???:0
54 0x00000000005f5ca6 _PyFunction_Vectorcall() ???:0
55 0x000000000050b549 PyMethod_New() ???:0
56 0x00000000005f5207 PyObject_Call() ???:0
57 0x000000000056d2cd _PyEval_EvalFrameDefault() ???:0
/usr/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 7 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
/usr/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
Reproduction
Put your message here.
Others
No response