Skip to content

[BUG] Dask PCA failing w/ cudaErrorContextIsDestroyed #4183

Open
@cjnolet

Description

@cjnolet
import dask
import cupy as cp
import dask_cuda
import distributed
from cuml.dask.decomposition import PCA
# Minimal reproduction: create a dask_cuda LocalCUDACluster and attach a
# distributed client to it.
cluster = dask_cuda.LocalCUDACluster()
client = distributed.Client(cluster)
# Scatter a single (100000, 100) float32 CuPy array of ones with broadcast=True.
futures = client.scatter(cp.ones((100000, 100), dtype='float32'), broadcast=True) 
# NOTE(review): the shape declared here is (700000, 100), but the array
# scattered above is (100000, 100) -- confirm whether this mismatch is
# intentional and whether it contributes to the failure reported below.
arr = dask.array.from_delayed(futures, shape=(700000, 100), dtype='float32')


# Fit a 2-component cuml.dask PCA on the array; per the traceback that follows,
# pca.fit(arr) is the call that raises the RuntimeError.
pca = PCA(n_components=2)
pca.fit(arr)
Run out-of-band function '_func_set_scheduler_as_nccl_root'
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_1336/3741049561.py in <module>
      2 
      3 pca = PCA(n_components=2)
----> 4 pca.fit(arr)

~/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/dask/decomposition/pca.py in fit(self, X)
    173         """
    174 
--> 175         self._fit(X)
    176         return self
    177 

~/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/dask/decomposition/base.py in _fit(self, X, _transform)
    103 
    104         wait(list(pca_fit.values()))
--> 105         raise_exception_from_futures(list(pca_fit.values()))
    106 
    107         comms.destroy()

~/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/dask/common/utils.py in raise_exception_from_futures(futures)
    150     if errs:
    151         raise RuntimeError("%d of %d worker jobs failed: %s" % (
--> 152             len(errs), len(futures), ", ".join(map(str, errs))
    153             ))
    154 

RuntimeError: 1 of 1 worker jobs failed: CUDA error encountered at: file=/datasets/cnolet/workspace/cuml/cpp/build/_deps/raft-src/cpp/include/raft/linalg/matrix_vector_op.cuh line=71: call='cudaPeekAtLastError()', Reason=cudaErrorContextIsDestroyed:context is destroyed
Obtained 46 stack frames
#0 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft9exception18collect_call_stackEv+0x46) [0x7fbc54b219e6]
#1 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft10cuda_errorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE+0x69) [0x7fbc54b22459]
#2 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN4raft6linalg18matrixVectorOpImplIfLi4E17__nv_dl_wrapper_tI11__nv_dl_tagIPFvPfPKfS6_iibbP11CUstream_stEXadL_ZNS_5stats10meanCenterIfiLi256EEEvPT_PKSD_SG_T0_SH_bbS8_EELj1EEJEEiLi256EEEvSE_SG_SG_T2_SK_bbT1_S8_+0x2fe) [0x7fbc54c607ae]
#3 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/common/../../../../libcumlprims.so(_ZN8MLCommon5Stats3opg16mean_center_implIfLi256EEEvRKSt6vectorIPNS_6Matrix4DataIT_EESaIS8_EERKNS4_14PartDescriptorERKS7_RKN4raft5comms7comms_tEPP11CUstream_sti+0xdf) [0x7fbca423505f]
#4 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/common/../../../../libcumlprims.so(_ZN8MLCommon5Stats3opg8cov_implIfLi256EEEvRKN4raft8handle_tERNS_6Matrix4DataIT_EERKSt6vectorIPSA_SaISD_EERKNS7_14PartDescriptorERKSA_bPP11CUstream_sti+0x6c) [0x7fbca423b16c]
#5 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3PCA3opg8fit_implIfEEvRN4raft8handle_tERSt6vectorIPN8MLCommon6Matrix4DataIT_EESaISC_EERNS8_14PartDescriptorEPSA_SI_SI_SI_SI_SI_NS_17paramsPCATemplateINS_9mg_solverEEEPP11CUstream_stib+0x18c) [0x7fbc550bf20c]
#6 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3PCA3opg8fit_implIfEEvRN4raft8handle_tERSt6vectorIPN8MLCommon6Matrix4DataIT_EESaISC_EERNS8_14PartDescriptorEPSA_SI_SI_SI_SI_SI_NS_17paramsPCATemplateINS_9mg_solverEEEb+0x230) [0x7fbc550bf620]
#7 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3PCA3opg3fitERN4raft8handle_tERSt6vectorIPN8MLCommon6Matrix4DataIfEESaISA_EERNS7_14PartDescriptorEPfSG_SG_SG_SG_SG_NS_17paramsPCATemplateINS_9mg_solverEEEb+0x4c) [0x7fbc550b929c]
#8 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/decomposition/pca_mg.cpython-37m-x86_64-linux-gnu.so(+0x26012) [0x7fba78168012]
#9 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(PyObject_Call+0x66) [0x56487ae247b6]
#10 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x1d0d) [0x56487aecea6d]
#11 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalCodeWithName+0xc5c) [0x56487ae2359c]
#12 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(PyEval_EvalCodeEx+0x3c) [0x56487ad795fc]
#13 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/decomposition/base_mg.cpython-37m-x86_64-linux-gnu.so(+0x23994) [0x7fba780e9994]
#14 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/decomposition/base_mg.cpython-37m-x86_64-linux-gnu.so(+0x2a4b9) [0x7fba780f04b9]
#15 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/lib/python3.7/site-packages/cuml/decomposition/base_mg.cpython-37m-x86_64-linux-gnu.so(+0x2b8f6) [0x7fba780f18f6]
#16 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(PyObject_Call+0x66) [0x56487ae247b6]
#17 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x1d0d) [0x56487aecea6d]
#18 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalCodeWithName+0xc5c) [0x56487ae2359c]
#19 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyFunction_FastCallKeywords+0x693) [0x56487ae43223]
#20 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(+0x1800c5) [0x56487ae8a0c5]
#21 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x48a2) [0x56487aed1602]
#22 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyFunction_FastCallDict+0x118) [0x56487ae42138]
#23 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x1d0d) [0x56487aecea6d]
#24 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyFunction_FastCallKeywords+0x187) [0x56487ae42d17]
#25 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x3f5) [0x56487aecd155]
#26 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyFunction_FastCallDict+0x118) [0x56487ae42138]
#27 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x1d0d) [0x56487aecea6d]
#28 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyFunction_FastCallKeywords+0x187) [0x56487ae42d17]
#29 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(+0x1800c5) [0x56487ae8a0c5]
#30 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x621) [0x56487aecd381]
#31 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyFunction_FastCallDict+0x118) [0x56487ae42138]
#32 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x1d0d) [0x56487aecea6d]
#33 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyFunction_FastCallKeywords+0x187) [0x56487ae42d17]
#34 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(+0x1800c5) [0x56487ae8a0c5]
#35 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x621) [0x56487aecd381]
#36 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyFunction_FastCallKeywords+0x187) [0x56487ae42d17]
#37 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(+0x1800c5) [0x56487ae8a0c5]
#38 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyEval_EvalFrameDefault+0x621) [0x56487aecd381]
#39 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(_PyObject_FastCallDict+0x1b6) [0x56487ae240a6]
#40 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(+0x12f041) [0x56487ae39041]
#41 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(PyObject_Call+0x66) [0x56487ae247b6]
#42 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(+0x221a23) [0x56487af2ba23]
#43 in /home/nfs/cnolet/miniconda3_2/envs/cuml_2108_082421/bin/python(+0x1daa77) [0x56487aee4a77]
#44 in /lib/x86_64-linux-gnu/libpthread.so.0(+0x76db) [0x7fbf7b4226db]
#45 in /lib/x86_64-linux-gnu/libc.so.6(clone+0x3f) [0x7fbf7b14b71f]

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions