Dear Representative,
Is dask_ml.model_selection.GridSearchCV supported on GPUs when using LocalCUDACluster()? I am trying to train a sklearn.neural_network.MLPRegressor with dask_ml.model_selection.GridSearchCV, and I don't know whether the approach below is the correct way to run this task on local GPUs. If it is not, what is the correct way to do it? Thank you so much.
The code I am using is below:
#===#
import os
import time
import tracemalloc
import joblib
import pandas as pd
import dask.array as da
# Load the pickled train/test splits from the current working directory.
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# files that come from a trusted source.
X_train = joblib.load(os.path.join(os.getcwd(), 'X_train.pkl'))
X_test = joblib.load(os.path.join(os.getcwd(), 'X_test.pkl'))
y_train = joblib.load(os.path.join(os.getcwd(), 'y_train.pkl'))
y_test = joblib.load(os.path.join(os.getcwd(), 'y_test.pkl'))
#==#
#y_train=pd.Series(y_train.iloc[:,0]).ravel()
#y_test=pd.Series(y_test.iloc[:,0]).ravel()
#==#
from dask import dataframe as dd

# Wrap the training data in dask DataFrames split into 3 partitions; these
# module-level objects are what ST() passes to GridSearchCV.fit().
X_trainO = dd.from_pandas(X_train, npartitions=3)
#X_test = dd.from_pandas(X_test, npartitions=3)
#X_test = dd.from_array(X_test)
y_trainO = dd.from_pandas(pd.DataFrame(y_train), npartitions=3)
#y_test = dd.from_pandas(pd.DataFrame(y_test), npartitions=3)
#y_test = dd.from_array(y_test)

# Hyper-parameter grid handed to GridSearchCV, and the requested n_jobs value.
param_list = joblib.load(os.path.join(os.getcwd(), 'param_list.pkl'))
njobs = 3
def ST(X_train, X_test, y_train, y_test, param_list, njobs):
    """Grid-search an MLPRegressor on a LocalCUDACluster and write result files.

    Wraps ``sklearn.neural_network.MLPRegressor`` in
    ``dask_ml.model_selection.GridSearchCV`` (5-fold CV), fits it, and writes:

    * ``Model-Result.pkl`` - the fitted search object (joblib dump),
    * ``Model-Performance.txt`` - CV score, test R2/MSE, sizes, time, memory,
    * ``Model-Comparison.csv`` - selected columns of ``cv_results_``.

    Args:
        X_train: Training features; only its companion ``y_train`` length is
            reported here (see the NOTE below about what is actually fitted).
        X_test: Test features used for scoring and prediction.
        y_train: Training targets; used for the reported training row count.
        y_test: Test targets used for scoring and the MSE.
        param_list: Parameter grid passed to ``GridSearchCV``.
        njobs: Value forwarded as ``n_jobs`` to ``GridSearchCV``.

    Returns:
        None. All results are written to files.

    NOTE(review): the fit uses the module-level dask DataFrames ``X_trainO``
    and ``y_trainO`` rather than the ``X_train`` / ``y_train`` arguments.

    NOTE(review): scikit-learn's MLPRegressor appears to be a CPU-only
    implementation, so LocalCUDACluster presumably only distributes the
    search across workers and does not move training onto the GPU. Confirm
    against the scikit-learn FAQ; a GPU-native estimator would be needed for
    actual GPU training.
    """
    import os
    import time
    import tracemalloc

    import joblib
    import numpy as np
    import pandas as pd
    import sklearn.neural_network as snn
    import dask_ml.model_selection as dcv
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    # Context managers shut the client and cluster down even if fitting fails.
    # Creating the Client registers it as the default Dask scheduler, which
    # GridSearchCV picks up when no ``scheduler`` argument is given, so there
    # is no need to attach the client to the estimator class.
    with LocalCUDACluster() as cluster, Client(cluster):
        start_time = time.time()
        # NOTE(review): tracemalloc only traces allocations made by this
        # Python process; memory used inside cluster workers is presumably
        # not included in the numbers reported below.
        tracemalloc.start()
        try:
            regressor = dcv.GridSearchCV(
                snn.MLPRegressor(verbose=True),
                param_list,
                cv=5,
                n_jobs=njobs,
            )
            regressor.fit(X_trainO, y_trainO)
            print(regressor)
            current, peak = tracemalloc.get_traced_memory()
            elapsed_time = time.time() - start_time
        finally:
            tracemalloc.stop()

        joblib.dump(regressor, 'Model-Result.pkl')

        # Flatten both sides so a one-column DataFrame target does not get
        # broadcast/aligned against the 1-D prediction array.
        residuals = (
            np.asarray(regressor.predict(X_test)).ravel()
            - np.asarray(y_test).ravel()
        )
        mse = float(np.mean(residuals ** 2))

        bytes_per_mb = 10 ** 6
        report_path = os.path.join(os.getcwd(), 'Model-Performance.txt')
        with open(report_path, 'w', encoding='utf-8') as g:
            g.write("Mean cross-validated score of the best_estimator: {:.2f}".format(regressor.best_score_) + '\n')
            g.write("Test dataset R2 score: {:.2f}".format(regressor.score(X_test, y_test)) + '\n')
            g.write("Test dataset MSError: {:.2f}".format(mse) + '\n')
            g.write("Test dataset Case Number: {:.2f}".format(len(y_test)) + '\n')
            g.write("Train dataset Case Number: {:.2f}".format(len(y_train)) + '\n')
            g.write('Model Fitting Computational Time: ' + str(float(elapsed_time) / 3600) + 'hours\n')
            g.write(
                "Model Fitting of memory usage is {" + str(int(current) / bytes_per_mb)
                + "}MB; Peak was {" + str(int(peak) / bytes_per_mb) + "}MB" + '\n'
            )

        # NOTE(review): only split0..split2 are exported although cv=5;
        # columns missing from cv_results_ come out as NaN.
        df = pd.DataFrame(
            regressor.cv_results_,
            columns=[
                'params', 'rank_test_score', 'std_fit_time', 'mean_score_time',
                'std_score_time', 'split0_test_score', 'split1_test_score',
                'split2_test_score', 'mean_test_score', 'std_test_score',
                'param_activation', 'param_alpha', 'param_hidden_layer_sizes',
                'param_solver',
            ],
        )
        df.to_csv(os.path.join(os.getcwd(), 'Model-Comparison.csv'), index=False, header=True)
# Run the grid search only when this file is executed as a script.
if __name__ == '__main__':
    ST(X_train, X_test, y_train, y_test, param_list, njobs)
#=====#