Dask delayed isn't any quicker than dask.array

Dear Dask community,

I am trying to optimize a function that reads data from a large HDF5 file and performs dask.array.einsum operations on that data.

To illustrate my point, I have generated some random data. The code I want to optimize using dask.array.einsum is as follows:

import numpy as np 
import h5py
import dask.array as da
import time
def generate_and_save_random_matrix(file_name, key, shape):
    random_matrix = np.random.random(shape)
    
    with h5py.File(file_name, 'w') as f:
        f.create_dataset(key, data=random_matrix)

file_name = 'random_matrix.h5'
key = 'data'               
nmo = 100
shape = (nmo**2, nmo**2)
generate_and_save_random_matrix(file_name, key, shape)

time0 = time.time()
nocc = 17
nvir = nmo - nocc
cte = -np.sqrt(3)/np.sqrt(2)
mask_bg = np.eye(nocc)
mask_ag = mask_bg
mask_np = np.eye(nvir)
mask_mp = mask_np
delta_vir = 1 - mask_np
delta_occ = 1 - mask_bg
d = np.random.random((nvir, nocc, nvir, nocc))
with h5py.File(str(file_name), "r") as f:
    eri_mo = da.from_array(f["data"], chunks=(100, 100))
    eri_mo = eri_mo.reshape(nmo, nmo, nmo, nmo)
    int1_ = eri_mo[nocc:, :nocc, nocc:, nocc:]
    int2_ = eri_mo[nocc:, nocc:, nocc:, :nocc]
    int3_ = eri_mo[nocc:, :nocc, :nocc, :nocc]
    int4_ = eri_mo[:nocc, :nocc, nocc:, :nocc]
    int_mbnp = int1_.transpose(2,1,0,3)
    int_mpnb = int2_.transpose(2,3,0,1)
    int_magb = int3_.transpose(3,0,1,2)
    int_mbga = int3_.transpose(1,0,3,2)
    int_manp = int1_.transpose(2,0,1,3)
    int_mpna = int2_.transpose(2,0,3,1)
    int_gbna = int4_.transpose(2,1,3,0)
    int_ganb = int4_.transpose(2,3,1,0)

    deltas = np.einsum('nm,ab->nbma', delta_vir, delta_occ)
    c2_1 = int_mbnp-int_mpnb
    s_2 = da.einsum('ag,nbmp->nbmapg', mask_ag, c2_1)
    c2_2 =  int_mpna-int_manp
    s_2 +=  da.einsum('bg,nmap->nbmapg', mask_bg, c2_2)
    c2_3 = int_magb-int_mbga
    s_2 += da.einsum('np,bmag->nbmapg', mask_np, c2_3)
    s_2 = da.einsum('nbma,nbmapg->nbmapg', deltas, cte*s_2)
    s_2_t = da.einsum('nbmapg->pgnbma', s_2)
    
    c1_1 = int_mbnp + int_mpnb
    s_1 = da.einsum('ag,nbmp->nbmapg', -mask_ag, c1_1)
    c1_2 = int_manp + int_mpna
    s_1 += da.einsum('bg,nmap->nbmapg', -mask_bg, c1_2)
    c1_3 = int_magb+int_mbga
    s_1 += da.einsum('np,bmag->nbmapg', mask_np, c1_3)
    s_1_t = da.einsum('nbmapg->pgnbma', s_1)
    da0 = da.einsum('pgnbma,nbma,nbmaqd->pgqd', s_1_t, d, s_1)
    da0 += da.einsum('pgnbma,nbma,nbmaqd->pgqd', s_2_t, d, s_2)
    da0 = da0.sum().compute()
    print(da0)

print('Time', time.time()-time0)

I’ve tried to optimize it with dask.delayed, but I haven’t seen any improvements in memory usage or execution time.

import numpy as np
import h5py
import dask.array as da
import time
from dask import delayed

def generate_and_save_random_matrix(file_name, key, shape):
    random_matrix = np.random.random(shape)
    with h5py.File(file_name, 'w') as f:
        f.create_dataset(key, data=random_matrix)

file_name = 'random_matrix.h5'  
key = 'data'               
nmo = 100
shape = (nmo**2, nmo**2)
generate_and_save_random_matrix(file_name, key, shape)

time0 = time.time()
nocc = 17
nvir = nmo - nocc
d = np.random.random((nvir, nocc, nvir, nocc))

@delayed
def slice_array(eri_mo, nocc):
    int1_ = eri_mo[nocc:, :nocc, nocc:, nocc:]
    int2_ = eri_mo[nocc:, nocc:, nocc:, :nocc]
    int3_ = eri_mo[nocc:, :nocc, :nocc, :nocc]
    int4_ = eri_mo[:nocc, :nocc, nocc:, :nocc]
    return (int1_, int2_, int3_, int4_)

@delayed
def transpose_delayed(array, axes):
    return np.transpose(array, axes=axes)

@delayed 
def extract_file(chunk, nmo):
    eri_mo = da.from_array(f["data"], chunks=chunk)
    return eri_mo.reshape(nmo,nmo,nmo,nmo)

cte = -np.sqrt(3)/np.sqrt(2)
chunk = (100,100)
mask_bg = np.eye(nocc)
mask_ag = mask_bg
mask_np = np.eye(nvir)
mask_mp = mask_np
delta_vir = 1 - mask_np
delta_occ = 1 - mask_bg
deltas = np.einsum('nm,ab->nbma', delta_vir, delta_occ)
with h5py.File(str(file_name), "r") as f:
    eri_mo = extract_file(chunk, nmo)
    int_ = slice_array(eri_mo, nocc)
    int_mbnp = transpose_delayed(int_[0],(2,1,0,3))
    int_mpnb = transpose_delayed(int_[1],(2,3,0,1))
    int_magb = transpose_delayed(int_[2],(3,0,1,2))
    int_mbga = transpose_delayed(int_[2],(1,0,3,2))
    int_manp = transpose_delayed(int_[0],(2,0,1,3))
    int_mpna = transpose_delayed(int_[1],(2,0,3,1))
    int_gbna = transpose_delayed(int_[3],(2,1,3,0))
    int_ganb = transpose_delayed(int_[3],(2,3,1,0))
    c2_1 = delayed(np.subtract)(int_mbnp, int_mpnb)
    s_2 = delayed(np.einsum)('ag,nbmp->nbmapg', mask_ag, c2_1)
    c2_2 =  delayed(np.subtract)(int_mpna,int_manp)
    s_2 = s_2 + delayed(np.einsum)('bg,nmap->nbmapg', mask_bg, c2_2)
    c2_3 = delayed(np.subtract)(int_magb,int_mbga)
    s_2 = s_2 + delayed(np.einsum)('np,bmag->nbmapg', mask_np, c2_3)    
    s_2 = delayed(np.einsum)('nbma,nbmapg->nbmapg', deltas, cte*s_2)
    s_2_t = transpose_delayed(s_2, (4,5,0,1,2,3))
    da0 = delayed(np.einsum)('pgnbma,nbma,nbmaqd->pgqd', s_2_t, d, s_2)
    c1_1 = delayed(np.add)(int_mbnp,int_mpnb)
    s_1 = delayed(np.einsum)('ag,nbmp->nbmapg', -mask_ag, c1_1)
    c1_2 = delayed(np.add)(int_manp,int_mpna)
    s_1 = s_1 + delayed(np.einsum)('bg,nmap->nbmapg', -mask_bg, c1_2)
    c1_3 = delayed(np.add)(int_magb,int_mbga)
    s_1 = s_1 + delayed(np.einsum)('np,bmag->nbmapg', mask_np, c1_3)
    s_1_t = transpose_delayed(s_1, (4,5,0,1,2,3))
    da0 = da0 + delayed(np.einsum)('pgnbma,nbma,nbmaqd->pgqd', s_1_t, d, s_1)
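    # extract_file returns a dask array wrapped in a Delayed, so the first compute()
    # resolves the Delayed graph and the second one the underlying array graph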
    print(da0.sum().compute().compute())

print('time ', time.time()-time0)

Is there anything I'm doing wrong? Should I expect the same performance from dask.array and dask.delayed?

In the short term, I plan to perform the same calculations using CuPy, for which I believe dask.delayed would be more appropriate.

Thanks in advance for any suggestions you can give me.

Best

Daniel

Hi @DanielBajac,

If you mimic the chunk parallelisation of Dask Array with Delayed, I'm not sure why you would expect Delayed to be faster? You're basically computing the same task graph. In general, if you are performing array computations, I would almost always recommend Dask Arrays, since some operations can be better optimized there than in anything you would implement by hand with Delayed.
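
To make this concrete, here is a minimal sketch with toy shapes (not your actual computation): da.einsum on a chunked array already generates one task per chunk and benefits from blockwise fusion, while wrapping the equivalent np.einsum call in delayed just rebuilds a similar graph by hand, without those optimizations.

import numpy as np
import dask
import dask.array as da

x = da.random.random((1000, 1000), chunks=(250, 250))

# Dask Array: the einsum is split into per-chunk tasks automatically,
# and consecutive blockwise operations can be fused before execution.
y_array = da.einsum('ij,jk->ik', x, x)
print(len(dict(y_array.__dask_graph__())))

# Hand-rolled "equivalent" with Delayed: one big np.einsum task on top of
# the tasks needed to materialise x, with no blockwise optimization.
y_delayed = dask.delayed(np.einsum)('ij,jk->ik', x, x)
print(len(dict(y_delayed.__dask_graph__())))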

You can perfectly well use CuPy as a backend for Dask Arrays!
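
For example, something along these lines (an untested sketch, assuming CuPy is installed and a GPU is available):

import cupy
import dask.array as da

x = da.random.random((10000, 10000), chunks=(1000, 1000))

# Move each chunk to the GPU; subsequent da.einsum / reshape / transpose
# calls then dispatch to CuPy kernels instead of NumPy ones.
x_gpu = x.map_blocks(cupy.asarray)

result = da.einsum('ij,ji->', x_gpu, x_gpu).compute()
print(result)  # a zero-dimensional CuPy array living on the GPU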

Thank you very much, @guillaumeeb! I successfully used CuPy as the backend, just as you recommended.
