Recently I have been running some large image reconstruction computations on a GPU using the dask local scheduler and CuPy. I tried switching to the distributed scheduler to take advantage of its nice visualization tools, but I am running into problems. In particular, when I switch schedulers I start seeing CUDA runtime errors. However, I don’t think this is really a CuPy/CUDA problem, because the same script runs without error using the local scheduler.
I am looking for some insight into what is going wrong here. Am I misusing the distributed scheduler? Is there some reason I should not expect it to play nicely with CuPy?
Here’s a simple script showing this problem.
I am running:
Python 3.9.7
dask 2022.05.2
CuPy (cupy-cuda112) 10.1.0
import dask.array as da
import cupy as cp
from dask.distributed import Client
import numpy as np

if __name__ == "__main__":
    # script generates an error if using the distributed scheduler
    client = Client()

    # generate random numbers on the CPU
    m = da.random.normal(size=(10, 1024, 1024), chunks=(1, 1024, 1024))

    # move to the GPU
    n = da.map_blocks(lambda a: cp.asarray(a),
                      m,
                      dtype=float,
                      meta=cp.array([]),
                      )

    # take the average
    c = n.mean(axis=0).compute()
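
For comparison, here is a minimal sketch of the local-scheduler variant of the same computation, which is what runs without error for me. The only change I am assuming here is that the Client() line is dropped and the local threaded scheduler is requested explicitly via scheduler="threads"; everything else is the same pipeline:

import dask.array as da
import cupy as cp

if __name__ == "__main__":
    # same pipeline, but no Client(), so dask uses the local scheduler
    m = da.random.normal(size=(10, 1024, 1024), chunks=(1, 1024, 1024))

    # move each block to the GPU
    n = m.map_blocks(cp.asarray, dtype=float, meta=cp.array([]))

    # take the average, forcing the local threaded scheduler explicitly
    c = n.mean(axis=0).compute(scheduler="threads")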
Running this script gives errors like:
2023-03-12 14:13:14,812 - distributed.worker - WARNING - Compute Failed
Key: ('normal-mean_chunk-a63538303ea5d3aa30770a1cffa52d23', 6, 0, 1)
Function: execute_task
args: ((subgraph_callable-ca6bd2bd-67ff-4833-9c0b-0e3dbe99880f, (<function _apply_random at 0x00000257FFB01D30>, None, 'normal', array([2091449326, 1212383101, 2368016930, 142118476, 2123126501,
34296365, 3071889805, 2443673445, 2637676760, 3752601443,
3968457419, 1266953699, 2717250215, 4281392320, 1417859837,
3808676731, 3925839290, 4281767247, 3410509063, 3707335665,
4000099702, 2596823318, 1645030008, 3144614706, 414108185,
3256532987, 1704711130, 1732176453, 4122088066, 625945448,
1607808019, 487631450, 2896540170, 1707515452, 2988695013,
3716837561, 1280465119, 1538272760, 339428695, 1872723080,
2481544243, 2765609795, 3179652502, 811024844, 3684187116,
2925976497, 4111897842, 1401907683, 3793865124, 507433036,
3854105569, 2275119304, 855828169, 1664696094, 2237304806,
2629391234, 4272235888, 3733506322, 115218582, 3026614138,
538769257, 3339688838, 932858209, 2914506355, 3986535113,
kwargs: {}
Exception: "CUDARuntimeError('cudaErrorInvalidValue: invalid argument')"
Traceback (most recent call last):
File "C:\Users\q2ilab\Documents\mcsim_private\misc_scripts\2023_03_12_dask_distributed_test.py", line 31, in <module>
c = n.mean(axis=0).compute()
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\base.py", line 312, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\base.py", line 600, in compute
results = schedule(dsk, keys, **kwargs)
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\distributed\client.py", line 3014, in get
results = self.gather(packed, asynchronous=asynchronous, direct=direct)
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\distributed\client.py", line 2188, in gather
return self.sync(
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\distributed\utils.py", line 320, in sync
return sync(
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\distributed\utils.py", line 387, in sync
raise exc.with_traceback(tb)
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\distributed\utils.py", line 360, in f
result = yield future
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\tornado\gen.py", line 769, in run
value = future.result()
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\distributed\client.py", line 2051, in _gather
raise exception.with_traceback(traceback)
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\optimization.py", line 990, in __call__
return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\core.py", line 149, in get
result = _execute_task(task, cache)
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\core.py", line 119, in <genexpr>
return func(*(_execute_task(a, cache) for a in args))
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\core.py", line 113, in _execute_task
return [_execute_task(a, cache) for a in arg]
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\core.py", line 113, in <listcomp>
return [_execute_task(a, cache) for a in arg]
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\dask\core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "C:\Users\q2ilab\Documents\mcsim_private\misc_scripts\2023_03_12_dask_distributed_test.py", line 24, in <lambda>
n = da.map_blocks(lambda a: xp.asarray(a),
File "C:\Users\q2ilab\miniconda3\envs\mcsim39\lib\site-packages\cupy\_creation\from_data.py", line 76, in asarray
return _core.array(a, dtype, False, order)
File "cupy\_core\core.pyx", line 2249, in cupy._core.core.array
File "cupy\_core\core.pyx", line 2270, in cupy._core.core.array
File "cupy\_core\core.pyx", line 2418, in cupy._core.core._array_default
File "cupy\cuda\memory.pyx", line 470, in cupy.cuda.memory.MemoryPointer.copy_from_host_async
File "cupy_backends\cuda\api\runtime.pyx", line 560, in cupy_backends.cuda.api.runtime.memcpyAsync
File "cupy_backends\cuda\api\runtime.pyx", line 132, in cupy_backends.cuda.api.runtime.check_status