GPU Memory Monitoring During Distributed XGBoost Training
Hello,
I was wondering if anyone has advice, or information they could point me to, on ways to monitor GPU memory usage during distributed training and report it via logger.info().
I’ve been trying to implement GPU memory monitoring with PyNVML and an XGBoost callback during training, but I’m running into an issue: the reported memory values remain exactly the same, down to the byte, throughout the entire training process. For example, it logs something like 7000 MB / 8000 MB available for all four GPUs every 10 iterations, without a single byte of change.
Any advice on why the GPU memory values stay static during distributed training would be greatly appreciated. Is there a better approach to monitoring GPU memory usage in a distributed XGBoost/Dask setup?
Simplified Training Setup
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask.array as da
import xgboost as xgb

with LocalCUDACluster(n_workers=n_gpus, device_memory_limit="8GB") as cluster:
    with Client(cluster) as client:
        # Convert to Dask arrays
        X_train_da = da.from_array(X_train, chunks=(chunk_size, -1))
        y_train_da = da.from_array(y_train, chunks=chunk_size)
        X_val_da = da.from_array(X_val, chunks=(chunk_size, -1))
        y_val_da = da.from_array(y_val, chunks=chunk_size)

        # Create distributed DMatrix objects
        dtrain = xgb.dask.DaskDMatrix(client, X_train_da, y_train_da)
        dval = xgb.dask.DaskDMatrix(client, X_val_da, y_val_da)

        # Train with XGBoost
        output = xgb.dask.train(
            client,
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, "train"), (dval, "val")],
            verbose_eval=True,
        )
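For reference, params is a GPU histogram configuration roughly like the following (purely illustrative, not my exact settings):

# Illustrative only; actual values differ
params = {
    "tree_method": "hist",
    "device": "cuda",            # older XGBoost versions use tree_method="gpu_hist" instead
    "objective": "reg:squarederror",
    "max_depth": 8,
    "learning_rate": 0.1,
}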
XGBoost Callback Implementation
class GPUMemoryCallback(xgb.callback.TrainingCallback):
    def __init__(self, logger, log_interval=10):
        super().__init__()
        self.logger = logger
        self.log_interval = log_interval

    def after_iteration(self, model, epoch, evals_log):
        if (epoch + 1) % self.log_interval == 0:
            # This doesn't work well in distributed setting
            log_gpu_memory_usage(self.logger, stage=f"training_iteration_{epoch + 1}")
        # Returning False tells XGBoost to continue training
        return False
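For reference, one alternative I've been considering is to skip the callback and probe each Dask worker directly. This is only a rough sketch (probe_gpu_memory is a name I made up, and I'm not certain client.run is the intended tool for this):

from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetCount,
                    nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo)

def probe_gpu_memory():
    # Runs inside each Dask worker process
    nvmlInit()
    try:
        readings = {}
        for i in range(nvmlDeviceGetCount()):
            info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
            readings[i] = {"used_mb": info.used / 1024**2, "free_mb": info.free / 1024**2}
        return readings
    finally:
        nvmlShutdown()

# client.run() executes the function on every worker and returns {worker_address: result}
per_worker = client.run(probe_gpu_memory)
logger.info("Per-worker GPU memory: %s", per_worker)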
Full Training Function with Callback
def fit(self, train_df, val_df, compute_resources):
    # ... preprocessing code ...
    try:
        with LocalCUDACluster(n_workers=n_gpus, device_memory_limit="8GB") as cluster:
            with Client(cluster) as client:
                # Create Dask arrays and DMatrix objects
                dtrain = xgb.dask.DaskDMatrix(client, X_train_da, y_train_da)
                dval = xgb.dask.DaskDMatrix(client, X_val_da, y_val_da)

                # Prepare callbacks - THIS IS THE KEY PART
                train_callbacks = []
                if self._is_using_gpu():
                    gpu_callback = GPUMemoryCallback(logger, log_interval=10)
                    train_callbacks.append(gpu_callback)

                # Train with XGBoost - callback gets serialized and sent to workers
                output = xgb.dask.train(
                    client,
                    self.params,
                    dtrain,
                    num_boost_round=1000,
                    early_stopping_rounds=50,
                    evals=[(dtrain, "train"), (dval, "val")],
                    callbacks=train_callbacks,  # <-- Callback used here
                    verbose_eval=True,
                )
                # ... rest of fit() (result handling, except/finally) omitted ...
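To sanity-check the "callback gets serialized and sent to workers" assumption, I've been thinking about logging the host and process id from inside after_iteration. A quick sketch (the extra host/pid line is mine):

import os
import socket

import xgboost as xgb

class GPUMemoryCallback(xgb.callback.TrainingCallback):
    def __init__(self, logger, log_interval=10):
        super().__init__()
        self.logger = logger
        self.log_interval = log_interval

    def after_iteration(self, model, epoch, evals_log):
        if (epoch + 1) % self.log_interval == 0:
            # Record which host/process the callback actually runs in
            self.logger.info("callback host=%s pid=%d", socket.gethostname(), os.getpid())
            log_gpu_memory_usage(self.logger, stage=f"training_iteration_{epoch + 1}")
        return False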
GPU Memory Logging Function
from pynvml import (NVMLError, nvmlInit, nvmlDeviceGetCount,
                    nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
                    nvmlDeviceGetName)

def log_gpu_memory_usage(logger, stage):
    gpu_info = {}
    try:
        nvmlInit()  # make sure NVML is initialized in this process
        device_count = nvmlDeviceGetCount()
        gpu_info["gpu_count"] = device_count
        for i in range(device_count):
            handle = nvmlDeviceGetHandleByIndex(i)
            memory_info = nvmlDeviceGetMemoryInfo(handle)
            name = nvmlDeviceGetName(handle)
            if isinstance(name, bytes):  # older pynvml versions return bytes
                name = name.decode("utf-8")
            gpu_info[f"gpu_{i}"] = {
                "name": name,
                "memory_total_mb": memory_info.total / (1024 * 1024),
                "memory_used_mb": memory_info.used / (1024 * 1024),
                "memory_free_mb": memory_info.free / (1024 * 1024),
                "memory_usage_percent": (memory_info.used / memory_info.total) * 100,
            }
        logger.info("GPU memory usage (%s): %s", stage, gpu_info)
    except NVMLError as err:
        logger.warning("Failed to read GPU memory (%s): %s", stage, err)
    return gpu_info
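For completeness, this is roughly how I call it from the driver process around training (the stage labels are just mine):

log_gpu_memory_usage(logger, stage="before_training")
# ... xgb.dask.train(...) ...
log_gpu_memory_usage(logger, stage="after_training")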