Summing multiple non-contiguous subsets of an array

Hi @mewmew_laser_kittens,

Here’s another dask.array-ish solution that ran (on my laptop) 10 times faster than the original one with list-comprehension. Feel free to ask for clarification if you need it.

import numpy as np
import dask.array as da


def your_expression(arr, arr_idx, rng):
    res = np.concatenate(
        [arr[np.where(arr_idx == i)].sum(axis=0)[None, :] for i in rng],
        axis=0
    )
    # res.ndim = 2; insert new axis to get 3, as expected by blockwise()
    return res[np.newaxis, ...]


n = 256
darr = da.random.randint(n, size = (10000, 10000))
darr_idx = da.random.randint(n, size = 10000)
range_ = da.arange(n)
res = da.core.blockwise(
    *(your_expression, 'ikj'),
    *(darr, 'ij'),
    *(darr_idx, 'i'),
    *(range_, 'k'),
    adjust_chunks={'i': 1},
    dtype=np.int_,
    meta=np.array([[[]]])
)

identity = lambda x, axis, keepdims: x
meta = np.array([[]])
res = da.reduction(
    res, identity, np.sum, axis=0, dtype=np.int_, meta=meta
)
res.compute()
2 Likes