Skip to content

[BUG] pt: C++ interface throws errors when the number of ranks is larger than the number of GPUs #3578

@github-actions

Description

@github-actions

terminate called after throwing an instance of 'c10::Error'
what(): CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
Exception raised from c10_cuda_check_implementation at ../c10/cuda/CUDAException.cpp:44 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x6c (0x7f55c1b9fa0c in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0xfa (0x7f55c1b498bc in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libc10.so)
frame #2: c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) + 0x3cc (0x7f55c173201c in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libc10_cuda.so)
frame #3: c10::cuda::ExchangeDevice(int) + 0x62 (0x7f55c1732542 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libc10_cuda.so)
frame #4: + 0x2935c (0x7f55c16fe35c in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libc10_cuda.so)
frame #5: + 0x12fc71d (0x7f5522c1771d in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cuda.so)
frame #6: + 0x34ccdf5 (0x7f5524de7df5 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cuda.so)
frame #7: + 0x34ccf84 (0x7f5524de7f84 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cuda.so)
frame #8: at::_ops::empty_strided::redispatch(c10::DispatchKeySet, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::ScalarType>, std::optional<c10::Layout>, std::optional<c10::Device>, std::optional<bool>) + 0x107 (0x7f55779aefb7 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #9: + 0x2d23a0b (0x7f5577da3a0b in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #10: at::_ops::empty_strided::call(c10::ArrayRefc10::SymInt, c10::ArrayRefc10::SymInt, std::optionalc10::ScalarType, std::optionalc10::Layout, std::optionalc10::Device, std::optional) + 0x1b9 (0x7f55779ff349 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #11: + 0x1c64e49 (0x7f5576ce4e49 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #12: at::native::_to_copy(at::Tensor const&, std::optionalc10::ScalarType, std::optionalc10::Layout, std::optionalc10::Device, std::optional, bool, std::optionalc10::MemoryFormat) + 0x1af0 (0x7f55770962d0 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #13: + 0x2f5545f (0x7f5577fd545f in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #14: at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optionalc10::ScalarType, std::optionalc10::Layout, std::optionalc10::Device, std::optional, bool, std::optionalc10::MemoryFormat) + 0x109 (0x7f55775f74b9 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #15: + 0x2d271fa (0x7f5577da71fa in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #16: at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, std::optionalc10::ScalarType, std::optionalc10::Layout, std::optionalc10::Device, std::optional, bool, std::optionalc10::MemoryFormat) + 0x109 (0x7f55775f74b9 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #17: + 0x46f3a45 (0x7f5579773a45 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #18: + 0x46f3f12 (0x7f5579773f12 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #19: at::_ops::_to_copy::call(at::Tensor const&, std::optionalc10::ScalarType, std::optionalc10::Layout, std::optionalc10::Device, std::optional, bool, std::optionalc10::MemoryFormat) + 0x1fe (0x7f557769565e in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #20: at::native::to(at::Tensor const&, c10::Device, c10::ScalarType, bool, bool, std::optionalc10::MemoryFormat) + 0xf7 (0x7f557708dcd7 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #21: + 0x319275d (0x7f557821275d in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #22: at::_ops::to_device::call(at::Tensor const&, c10::Device, c10::ScalarType, bool, bool, std::optionalc10::MemoryFormat) + 0x1ce (0x7f557785899e in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #23: torch::jit::Unpickler::readInstruction() + 0x1d5a (0x7f557aa190ca in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #24: torch::jit::Unpickler::run() + 0xa8 (0x7f557aa1a418 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #25: torch::jit::Unpickler::parse_ivalue() + 0x32 (0x7f557aa1bf92 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #26: torch::jit::readArchiveAndTensors(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, std::optional<std::function<c10::StrongTypePtr (c10::QualifiedName const&)> >, std::optional<std::function<c10::intrusive_ptr<c10::ivalue::Object, c10::detail::intrusive_target_default_null_typec10::ivalue::Object > (c10::StrongTypePtr const&, c10::IValue)> >, std::optionalc10::Device, caffe2::serialize::PyTorchStreamReader&, c10::Type::SingletonOrSharedTypePtrc10::Type ()(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&), std::shared_ptrtorch::jit::DeserializationStorageContext) + 0x569 (0x7f557a9d5629 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #27: + 0x594a178 (0x7f557a9ca178 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #28: + 0x594cfc3 (0x7f557a9ccfc3 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #29: torch::jit::import_ir_module(std::shared_ptrtorch::jit::CompilationUnit, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, std::optionalc10::Device, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits, std::allocator >, std::__cxx11::basic_string<char, std::char_traits, std::allocator >, std::hash<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits, std::allocator > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const, std::__cxx11::basic_string<char, std::char_traits, std::allocator > > > >&, bool, bool) + 0x3df (0x7f557a9d2a1f in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #30: torch::jit::import_ir_module(std::shared_ptrtorch::jit::CompilationUnit, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, std::optionalc10::Device, bool) + 0x92 (0x7f557a9d2cd2 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #31: torch::jit::load(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, std::optionalc10::Device, bool) + 0xc0 (0x7f557a9d2de0 in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libtorch_cpu.so)
frame #32: deepmd::DeepPotPT::init(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0x3d2 (0x7f55bf64f21a in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #33: deepmd::DeepPotPT::DeepPotPT(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0xba (0x7f55bf64ed74 in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #34: void __gnu_cxx::new_allocator<deepmd::DeepPotPT>::construct<deepmd::DeepPotPT, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(deepmd::DeepPotPT*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, int const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xa8 (0x7f55bf64d508 in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #35: void std::allocator_traits<std::allocatordeepmd::DeepPotPT >::construct<deepmd::DeepPotPT, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&>(std::allocatordeepmd::DeepPotPT&, deepmd::DeepPotPT*, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0x8a (0x7f55bf64cb12 in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #36: std::_Sp_counted_ptr_inplace<deepmd::DeepPotPT, std::allocatordeepmd::DeepPotPT, (__gnu_cxx::_Lock_policy)2>::_Sp_counted_ptr_inplace<std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&>(std::allocatordeepmd::DeepPotPT, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0x12a (0x7f55bf64bc52 in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #37: std::__shared_count<(__gnu_cxx::_Lock_policy)2>::__shared_count<deepmd::DeepPotPT, std::allocatordeepmd::DeepPotPT, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&>(deepmd::DeepPotPT*&, std::_Sp_alloc_shared_tag<std::allocatordeepmd::DeepPotPT >, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0x155 (0x7f55bf649e39 in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #38: std::__shared_ptr<deepmd::DeepPotPT, (__gnu_cxx::_Lock_policy)2>::__shared_ptr<std::allocatordeepmd::DeepPotPT, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&>(std::_Sp_alloc_shared_tag<std::allocatordeepmd::DeepPotPT >, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0xa2 (0x7f55bf647eac in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #39: std::shared_ptrdeepmd::DeepPotPT::shared_ptr<std::allocatordeepmd::DeepPotPT, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&>(std::_Sp_alloc_shared_tag<std::allocatordeepmd::DeepPotPT >, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0x8f (0x7f55bf645eab in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #40: std::shared_ptrdeepmd::DeepPotPT std::allocate_shared<deepmd::DeepPotPT, std::allocatordeepmd::DeepPotPT, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&>(std::allocatordeepmd::DeepPotPT const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0x8a (0x7f55bf643abb in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #41: std::shared_ptrdeepmd::DeepPotPT std::make_shared<deepmd::DeepPotPT, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&>(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0xaf (0x7f55bf641402 in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #42: deepmd::DeepPot::init(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0x384 (0x7f55bf636a7e in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #43: deepmd::DeepPot::DeepPot(std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&, int const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator > const&) + 0x5e (0x7f55bf63667e in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_cc.so)
frame #44: DP_NewDeepPotWithParam2 + 0x12d (0x7f55c229bf4b in /__w/deepmd-kit/deepmd-kit/dp_test/lib/libdeepmd_c.so)
frame #45: deepmd::hpp::DeepPot::init(std::string const&, int const&, std::string const&) + 0xeb (0x7f55c1f8730b in /__w/deepmd-kit/deepmd-kit/dp_test/lib/deepmd_lmp/dpplugin.so)
frame #46: LAMMPS_NS::PairDeepMD::settings(int, char**) + 0x6b8 (0x7f55c1f7e170 in /__w/deepmd-kit/deepmd-kit/dp_test/lib/deepmd_lmp/dpplugin.so)
frame #47: LAMMPS_NS::Input::execute_command() + 0x741 (0x7f55c2bb79f1 in /__w/_tool/Python/3.11.8/x64/lib/python3.11/site-packages/lammps/liblammps.so)
frame #48: LAMMPS_NS::Input::one(std::string const&) + 0x89 (0x7f55c2bb8919 in /__w/_tool/Python/3.11.8/x64/lib/python3.11/site-packages/lammps/liblammps.so)
frame #49: lammps_command + 0x91 (0x7f55c2c09631 in /__w/_tool/Python/3.11.8/x64/lib/python3.11/site-packages/lammps/liblammps.so)
frame #50: + 0x7e2e (0x7f55ca69be2e in /lib/x86_64-linux-gnu/libffi.so.8)
frame #51: + 0x4493 (0x7f55ca698493 in /lib/x86_64-linux-gnu/libffi.so.8)
frame #52: + 0xe6d0 (0x7f55ca0ec6d0 in /__w/_tool/Python/3.11.8/x64/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so)
frame #53: + 0x14249 (0x7f55ca0f2249 in /__w/_tool/Python/3.11.8/x64/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so)

Line: 685

("balance_args",),
[(["--balance"],), ([],)],
)
# TODO: [BUG] pt: C++ interface throws errors when the number of ranks is larger than the number of GPUs
# terminate called after throwing an instance of 'c10::Error'
# what(): CUDA error: invalid device ordinal
# CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
# For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
# Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
# Exception raised from c10_cuda_check_implementation at ../c10/cuda/CUDAException.cpp:44 (most recent call first):
# frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) + 0x6c (0x7f55c1b9fa0c in /__w/deepmd-kit/deepmd-kit/libtorch/lib/libc10.so)

Metadata

Metadata

Assignees

Type

Projects

Status

Done

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions