Dear Dask community,
I am trying to optimize a function that reads data from a large HDF5 file and performs dask.array.einsum operations on that data.
To illustrate the problem, I have generated some random data. The version based on dask.array.einsum that I want to optimize is as follows:
```python
import numpy as np
import h5py
import dask.array as da
import time


def generate_and_save_random_matrix(file_name, key, shape):
    random_matrix = np.random.random(shape)
    with h5py.File(file_name, 'w') as f:
        f.create_dataset(key, data=random_matrix)


file_name = 'random_matrix.h5'
key = 'data'
nmo = 100
shape = (nmo**2, nmo**2)
generate_and_save_random_matrix(file_name, key, shape)

time0 = time.time()
nocc = 17
nvir = nmo - nocc
cte = -np.sqrt(3) / np.sqrt(2)

# Identity masks over the occupied/virtual spaces and their complements
mask_bg = np.eye(nocc)
mask_ag = mask_bg
mask_np = np.eye(nvir)
mask_mp = mask_np
delta_vir = 1 - mask_np
delta_occ = 1 - mask_bg

d = np.random.random((nvir, nocc, nvir, nocc))

# The file must stay open until .compute(), because the dask array reads lazily
with h5py.File(str(file_name), "r") as f:
    eri_mo = da.from_array(f["data"], chunks=(100, 100))
    eri_mo = eri_mo.reshape(nmo, nmo, nmo, nmo)

    # Occupied/virtual slices of the integrals
    int1_ = eri_mo[nocc:, :nocc, nocc:, nocc:]
    int2_ = eri_mo[nocc:, nocc:, nocc:, :nocc]
    int3_ = eri_mo[nocc:, :nocc, :nocc, :nocc]
    int4_ = eri_mo[:nocc, :nocc, nocc:, :nocc]

    int_mbnp = int1_.transpose(2, 1, 0, 3)
    int_mpnb = int2_.transpose(2, 3, 0, 1)
    int_magb = int3_.transpose(3, 0, 1, 2)
    int_mbga = int3_.transpose(1, 0, 3, 2)
    int_manp = int1_.transpose(2, 0, 1, 3)
    int_mpna = int2_.transpose(2, 0, 3, 1)
    int_gbna = int4_.transpose(2, 1, 3, 0)
    int_ganb = int4_.transpose(2, 3, 1, 0)

    deltas = np.einsum('nm,ab->nbma', delta_vir, delta_occ)

    c2_1 = int_mbnp - int_mpnb
    s_2 = da.einsum('ag,nbmp->nbmapg', mask_ag, c2_1)
    c2_2 = int_mpna - int_manp
    s_2 += da.einsum('bg,nmap->nbmapg', mask_bg, c2_2)
    c2_3 = int_magb - int_mbga
    s_2 += da.einsum('np,bmag->nbmapg', mask_np, c2_3)
    s_2 = da.einsum('nbma,nbmapg->nbmapg', deltas, cte * s_2)
    s_2_t = da.einsum('nbmapg->pgnbma', s_2)

    c1_1 = int_mbnp + int_mpnb
    s_1 = da.einsum('ag,nbmp->nbmapg', -mask_ag, c1_1)
    c1_2 = int_manp + int_mpna
    s_1 += da.einsum('bg,nmap->nbmapg', -mask_bg, c1_2)
    c1_3 = int_magb + int_mbga
    s_1 += da.einsum('np,bmag->nbmapg', mask_np, c1_3)
    s_1_t = da.einsum('nbmapg->pgnbma', s_1)

    da0 = da.einsum('pgnbma,nbma,nbmaqd->pgqd', s_1_t, d, s_1)
    da0 += da.einsum('pgnbma,nbma,nbmaqd->pgqd', s_2_t, d, s_2)
    da0 = da0.sum().compute()

print(da0)
print('Time', time.time() - time0)
```
I’ve tried to optimize it with dask.delayed, but I haven’t seen any improvements in memory usage or execution time.
```python
import numpy as np
import h5py
import dask.array as da
from dask import delayed
import time


def generate_and_save_random_matrix(file_name, key, shape):
    random_matrix = np.random.random(shape)
    with h5py.File(file_name, 'w') as f:
        f.create_dataset(key, data=random_matrix)


file_name = 'random_matrix.h5'
key = 'data'
nmo = 100
shape = (nmo**2, nmo**2)
generate_and_save_random_matrix(file_name, key, shape)

time0 = time.time()
nocc = 17
nvir = nmo - nocc
d = np.random.random((nvir, nocc, nvir, nocc))


@delayed
def slice_array(eri_mo, nocc):
    int1_ = eri_mo[nocc:, :nocc, nocc:, nocc:]
    int2_ = eri_mo[nocc:, nocc:, nocc:, :nocc]
    int3_ = eri_mo[nocc:, :nocc, :nocc, :nocc]
    int4_ = eri_mo[:nocc, :nocc, nocc:, :nocc]
    return (int1_, int2_, int3_, int4_)


@delayed
def transpose_delayed(array, axes):
    return np.transpose(array, axes=axes)


@delayed
def extract_file(f, chunk, nmo):
    # The open file handle is passed in explicitly instead of relying on a global
    eri_mo = da.from_array(f["data"], chunks=chunk)
    return eri_mo.reshape(nmo, nmo, nmo, nmo)


cte = -np.sqrt(3) / np.sqrt(2)
chunk = (100, 100)
mask_bg = np.eye(nocc)
mask_ag = mask_bg
mask_np = np.eye(nvir)
mask_mp = mask_np
delta_vir = 1 - mask_np
delta_occ = 1 - mask_bg
deltas = np.einsum('nm,ab->nbma', delta_vir, delta_occ)

# The file must stay open until the final compute, since the tasks read lazily
with h5py.File(str(file_name), "r") as f:
    eri_mo = extract_file(f, chunk, nmo)
    int_ = slice_array(eri_mo, nocc)

    int_mbnp = transpose_delayed(int_[0], (2, 1, 0, 3))
    int_mpnb = transpose_delayed(int_[1], (2, 3, 0, 1))
    int_magb = transpose_delayed(int_[2], (3, 0, 1, 2))
    int_mbga = transpose_delayed(int_[2], (1, 0, 3, 2))
    int_manp = transpose_delayed(int_[0], (2, 0, 1, 3))
    int_mpna = transpose_delayed(int_[1], (2, 0, 3, 1))
    int_gbna = transpose_delayed(int_[3], (2, 1, 3, 0))
    int_ganb = transpose_delayed(int_[3], (2, 3, 1, 0))  # int_[4] was out of range; int4_ is int_[3]

    c2_1 = delayed(np.subtract)(int_mbnp, int_mpnb)
    s_2 = delayed(np.einsum)('ag,nbmp->nbmapg', mask_ag, c2_1)
    c2_2 = delayed(np.subtract)(int_mpna, int_manp)
    s_2 = s_2 + delayed(np.einsum)('bg,nmap->nbmapg', mask_bg, c2_2)
    c2_3 = delayed(np.subtract)(int_magb, int_mbga)
    s_2 = s_2 + delayed(np.einsum)('np,bmag->nbmapg', mask_np, c2_3)
    s_2 = delayed(np.einsum)('nbma,nbmapg->nbmapg', deltas, cte * s_2)
    s_2_t = transpose_delayed(s_2, (4, 5, 0, 1, 2, 3))
    da0 = delayed(np.einsum)('pgnbma,nbma,nbmaqd->pgqd', s_2_t, d, s_2)

    c1_1 = delayed(np.add)(int_mbnp, int_mpnb)
    s_1 = delayed(np.einsum)('ag,nbmp->nbmapg', -mask_ag, c1_1)
    c1_2 = delayed(np.add)(int_manp, int_mpna)
    s_1 = s_1 + delayed(np.einsum)('bg,nmap->nbmapg', -mask_bg, c1_2)
    c1_3 = delayed(np.add)(int_magb, int_mbga)
    s_1 = s_1 + delayed(np.einsum)('np,bmag->nbmapg', mask_np, c1_3)
    s_1_t = transpose_delayed(s_1, (4, 5, 0, 1, 2, 3))
    da0 = da0 + delayed(np.einsum)('pgnbma,nbma,nbmaqd->pgqd', s_1_t, d, s_1)

    # Two computes: the first resolves the delayed graph, whose tasks return lazy
    # dask arrays (extract_file builds one), and the second evaluates that array
    print(da0.sum().compute().compute())

print('time ', time.time() - time0)
```
Is there anything I’m doing wrong? Is it expected that dask.array and dask.delayed give essentially the same performance here?
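To make the question concrete, here is a minimal sketch of the comparison I have in mind (toy sizes and a made-up contraction, not my real workload): the same einsum written once with dask.array, where the contraction itself is chunked, and once with dask.delayed, where the whole np.einsum call is a single task.

```python
import numpy as np
import dask.array as da
from dask import delayed

# Toy operands, just to compare the two graph structures
x = np.random.random((50, 50))
y = np.random.random((50, 50))

# dask.array: the contraction itself is split into chunked tasks
r1 = da.einsum('ij,jk->ik',
               da.from_array(x, chunks=25),
               da.from_array(y, chunks=25)).sum().compute()

# dask.delayed: the whole np.einsum call is one opaque task
r2 = delayed(np.einsum)('ij,jk->ik', x, y).sum().compute()

print(np.isclose(r1, r2))
```

In my case the operands come from the HDF5 file instead of memory, but the two graph shapes are essentially these.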
In the short term, I plan to perform the same calculations using CuPy, for which I believe dask.delayed would be more appropriate.
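For context, this is roughly what I have in mind for the GPU version (only a rough sketch, assuming CuPy and a CUDA device are available; einsum_gpu is just an illustrative helper name, not code I have benchmarked):

```python
import numpy as np
from dask import delayed
import cupy as cp  # requires a CUDA-capable GPU


@delayed
def einsum_gpu(subscripts, *arrays):
    # Move the operands to the GPU, contract there,
    # and bring the result back to host memory.
    gpu_arrays = [cp.asarray(a) for a in arrays]
    return cp.asnumpy(cp.einsum(subscripts, *gpu_arrays))


# Small example operands, just to show the intended call pattern
a = np.random.random((10, 10))
b = np.random.random((10, 10))
result = einsum_gpu('ij,jk->ik', a, b).compute()
print(result.shape)
```

If dask.array would be a better fit for the GPU case as well, I would be happy to hear that too.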
Thanks in advance for any suggestions you can give me.
Best
Daniel