GPU Memory Monitoring During Distributed XGBoost Training
Hello,
I'm looking for advice, or pointers to any documentation, on how to monitor GPU memory usage during distributed training and report it via logger.info().
I've been trying to implement GPU memory monitoring with PyNVML and XGBoost callbacks during the training loop, but I'm hitting an issue: the reported memory values stay exactly the same throughout the entire training run.
For example, every 10 iterations it reports roughly 7000 MB free out of 8000 MB on all four GPUs, and the numbers never change, not even at the byte level.
Any advice on why the GPU memory values remain static during distributed training would be greatly appreciated. Is there a better approach to monitoring GPU memory usage in a distributed XGBoost/Dask setup?
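One alternative I've been considering, but haven't fully verified, is to bypass the callback entirely and ask each Dask worker to report its own GPU memory via client.run. Here is a rough sketch of what I have in mind (the helper name and the exact pynvml calls are my own assumptions):

from pynvml import (
    nvmlInit, nvmlShutdown, nvmlDeviceGetCount,
    nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo,
)

def report_gpu_memory():
    # Runs inside each worker process, so it reports that worker's view of the GPUs
    nvmlInit()
    try:
        usage = {}
        for i in range(nvmlDeviceGetCount()):
            info = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(i))
            usage[i] = {"used_mb": info.used / 1024**2, "free_mb": info.free / 1024**2}
        return usage
    finally:
        nvmlShutdown()

# Returns {worker_address: usage_dict} for every worker in the cluster
per_worker_usage = client.run(report_gpu_memory)
logger.info("Per-worker GPU memory: %s", per_worker_usage)

I'm not sure whether this is the recommended pattern, or whether it would interfere with training while it runs.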
Simplified Training Setup
import dask.array as da
import xgboost as xgb
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

with LocalCUDACluster(n_workers=n_gpus, device_memory_limit="8GB") as cluster:
    with Client(cluster) as client:
        # Convert to Dask arrays
        X_train_da = da.from_array(X_train, chunks=(chunk_size, -1))
        y_train_da = da.from_array(y_train, chunks=chunk_size)
        # Validation arrays are converted the same way
        X_val_da = da.from_array(X_val, chunks=(chunk_size, -1))
        y_val_da = da.from_array(y_val, chunks=chunk_size)

        # Create DMatrix objects
        dtrain = xgb.dask.DaskDMatrix(client, X_train_da, y_train_da)
        dval = xgb.dask.DaskDMatrix(client, X_val_da, y_val_da)

        # Train with XGBoost
        output = xgb.dask.train(
            client,
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, "train"), (dval, "val")],
            verbose_eval=True,
        )
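As a sanity check on the cluster itself, I've also been checking which GPU each worker process is pinned to. This assumes the usual dask_cuda behavior of setting CUDA_VISIBLE_DEVICES per worker; the helper name is mine:

import os

def visible_devices():
    # dask_cuda puts each worker's own GPU first in CUDA_VISIBLE_DEVICES
    return os.environ.get("CUDA_VISIBLE_DEVICES")

# Dict keyed by worker address, e.g. {"tcp://127.0.0.1:34567": "0,1,2,3", ...}
logger.info("Worker device mapping: %s", client.run(visible_devices))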
XGBoost Callback Implementation
class GPUMemoryCallback(xgb.callback.TrainingCallback):
    def __init__(self, logger, log_interval=10):
        self.logger = logger
        self.log_interval = log_interval

    def after_iteration(self, model, epoch, evals_log):
        if (epoch + 1) % self.log_interval == 0:
            # This doesn't seem to work well in a distributed setting
            log_gpu_memory_usage(self.logger, stage=f"training_iteration_{epoch + 1}")
        # Returning False tells XGBoost to continue training
        return False
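To figure out where after_iteration actually executes (on the workers, or on the client/scheduler side), I've been thinking of logging something like this from inside the callback. This is just a diagnostic sketch I haven't fully validated; get_worker is from dask.distributed:

from dask.distributed import get_worker

def describe_execution_context():
    # get_worker() only succeeds inside a Dask worker process, so calling this
    # from after_iteration reveals which process the PyNVML queries run in
    try:
        return f"worker {get_worker().address}"
    except ValueError:
        return "not inside a Dask worker process"

Logging describe_execution_context() every few iterations should at least tell me which process the memory readings are coming from.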
Full Training Function with Callback
def fit(self, train_df, val_df, compute_resources):
    # ... preprocessing code ...
    try:
        with LocalCUDACluster(n_workers=n_gpus, device_memory_limit="8GB") as cluster:
            with Client(cluster) as client:
                # Create Dask arrays and DMatrix objects
                dtrain = xgb.dask.DaskDMatrix(client, X_train_da, y_train_da)
                dval = xgb.dask.DaskDMatrix(client, X_val_da, y_val_da)

                # Prepare callbacks - THIS IS THE KEY PART
                train_callbacks = []
                if self._is_using_gpu():
                    gpu_callback = GPUMemoryCallback(logger, log_interval=10)
                    train_callbacks.append(gpu_callback)

                # Train with XGBoost - the callback gets serialized and sent to the workers
                output = xgb.dask.train(
                    client,
                    self.params,
                    dtrain,
                    num_boost_round=1000,
                    early_stopping_rounds=50,
                    evals=[(dtrain, "train"), (dval, "val")],
                    callbacks=train_callbacks,  # <-- Callback used here
                    verbose_eval=True,
                )
GPU Memory Logging Function
from pynvml import (
    NVMLError, nvmlInit, nvmlDeviceGetCount,
    nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlDeviceGetName,
)

def log_gpu_memory_usage(logger, stage):
    gpu_info = {}
    try:
        nvmlInit()
        device_count = nvmlDeviceGetCount()
        gpu_info["gpu_count"] = device_count
        for i in range(device_count):
            handle = nvmlDeviceGetHandleByIndex(i)
            memory_info = nvmlDeviceGetMemoryInfo(handle)
            # .decode() is needed on older pynvml versions that return bytes
            name = nvmlDeviceGetName(handle).decode("utf-8")
            gpu_info[f"gpu_{i}"] = {
                "name": name,
                "memory_total_mb": memory_info.total / (1024 * 1024),
                "memory_used_mb": memory_info.used / (1024 * 1024),
                "memory_free_mb": memory_info.free / (1024 * 1024),
                "memory_usage_percent": (memory_info.used / memory_info.total) * 100,
            }
        logger.info("GPU memory usage at %s: %s", stage, gpu_info)
    except NVMLError as err:
        logger.warning("Could not read GPU memory info at %s: %s", stage, err)