Hello, as I need to handle very large matrices, there is a memory shortage error when using numpy. Therefore, I have chosen to use dask.array to store matrices, but I am very unfamiliar with dask.array. When processing data, I found that dask.array is very slow. Is there any way to improve the speed? Thank you very much. Below is my code,the picture is processing time
import BWStest as bws
import numpy as np
from skimage.measure import label
import dask.array
from tqdm import tqdm
def BWS(Xarrar, Yarray, threshold):
"""Simpler but accelerated version of BWStest
Compute the Baumgartner-weiB-Schindler test statistic for two-sample case.
Args:
Xarrar (np.ndarray): A height by width matrix, each column corresponds to a sample for each obs
Yarray (np.ndarray): A height by width matrix, each column corresponds to a sample for each obs
threshold (float): Significance level, only .05 (Default) and 0.01 are supported.
Returns:
H: 0 => Accept the null hypothesis at significance level ALPHA
1 => Reject the null hypothesis at significance level ALPHA
[1] A Nonparametric Test for the General Two-Sample Problem, W. Baumgartner, P. Weiß and H. Schindler
Biometrics Vol. 54, No. 3 (Sep., 1998), pp. 1129-1135
This toolbox can be used only for research purposes, you should cite
the aforementioned papers in any resulting publication.
"""
n, m = Xarrar.shape
rank = rankdata(np.vstack((Xarrar, Yarray)), axis=0)
xrank = np.sort(rank[0:n,:], axis=0)
yrank = np.sort(rank[n::,:], axis=0)
temp = np.arange(1, n+1)[:,np.newaxis]*np.ones((1,m))
tempx = (xrank - 2.0*temp) ** 2
tempy = (yrank - 2.0*temp) ** 2
temp = temp/(n+1) * (1-temp/(n+1)) * 2 * n
BX = 1/n * np.sum(tempx / temp, axis=0)
BY = 1/n * np.sum(tempy / temp, axis=0)
# test statistic
B = 1/2*(BX + BY)
if threshold ==0.05:
if n == 5:
b = 2.533
elif n == 6:
b = 2.552
elif n == 7:
b = 2.620
elif n == 8:
b = 2.564
elif n == 9:
b = 2.575
elif n == 10:
b = 2.583
else:
b = 2.493
else:
b = 3.880
H = (B >=b)
H = np.transpose(H)
return H
# stack = np.random.rand(1, 12000, 20000)
CalWin = [7,25]
stack = []
thershold = 0.05
for i in range(5):
image = np.random.rand(12000, 20000)
dask_image = dask.array.from_array(image, chunks=(2048, 20000*2048))
stack.append(dask_image)
stacks = dask.array.stack(stack, axis=0)
print(stacks)
npages,nlines,nwidths = np.shape(stacks)
RadiusRow=(CalWin[0] - 1) // 2
RadiusCol=(CalWin[1] -1 ) // 2
InitRow=(CalWin[0] + 1) // 2
InitCol=(CalWin[1] + 1) // 2
mlistack = np.pad(np.abs(stacks), ((0, 0), (RadiusRow, RadiusRow),
(RadiusCol, RadiusCol)), mode='symmetric').compute()
meanmli = np.mean(mlistack, axis=0)
# print(meanmli.compute())
nlines_EP,nwidths_EP = meanmli.shape
PixelInd = dask.array.zeros((CalWin[0] * CalWin[1],nlines*nwidths),
chunks=(CalWin[0] * CalWin[1],2048*nlines), dtype=np.bool)
num = 0
p = 0
all = nlines*nwidths
all_step = np.floor(all/10)
# compar_pixel = np.zeros((CalWin[0] * CalWin[1],nlines*nwidths))
compar_pixel = dask.array.zeros((CalWin[0] * CalWin[1],nlines*nwidths),
chunks=(CalWin[0] * CalWin[1],2048*nlines), dtype=np.bool)
for kk in tqdm(range(InitCol, nwidths_EP-RadiusCol), desc="Processing columns", ascii=' ='):
for ll in tqdm(range(InitRow, nlines_EP-RadiusRow), desc="Processing rows", leave=False,ascii=' ='):
Matrix = mlistack[:,ll-RadiusRow-1:ll+RadiusRow,kk-RadiusCol-1:kk+RadiusCol]
# print(Matrix)
Ref = Matrix[:,InitRow-1,InitCol-1]
Xarray = np.tile(Ref[:, np.newaxis], (1, CalWin[0] * CalWin[1]))
Matrix_T = np.transpose(Matrix, (0, 2, 1))
Yarray = (np.reshape(Matrix_T, (Matrix_T.shape[0], CalWin[0] * CalWin[1])))
# bws is my own module for BWS algorithm
T = bws.BWS(Xarray, Yarray, thershold)
# print(~T)
SeedPoint = np.transpose(np.reshape(~T, (CalWin[1], CalWin[0])))
ll = np.transpose(SeedPoint)
l = label(ll,2)
l_flat = l.flatten()
LL = label(SeedPoint, 2)
LL_flat = np.transpose(LL).flatten()
# compar_pixel.compute()
compar_pixel[:,num] = (LL_flat == LL[InitRow - 1, InitCol - 1])
PixelInd[:,num] = (LL_flat == LL[InitRow - 1, InitCol - 1])
num = num + 1
BroNum = np.sum(PixelInd, 0)
BroNum = np.reshape(BroNum[::], [nlines, nwidths], order='F')
BroNum = np.float32((BroNum-1))
CalWin = CalWin