I'm trying to start multi-GPU training on an 8xH100 server using PyTorch DDP with the NCCL backend. When I run my training code, it hangs while creating the process groups. The code is battle-tested on many platforms and works just fine, so I'm confident it's not the code.
I created the following test script to see what's wrong:
import socket
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
import argparse
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(10, 1)

    def forward(self, x):
        return self.linear(x)


class DummyDataset(Dataset):
    def __init__(self, size=1000):
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        x = torch.randn(10)
        y = torch.sum(x) + torch.randn(1) * 0.1  # Create noisy target
        return x, y


def setup(rank, world_size):
    # Print some debug information
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Hostname: {socket.gethostname()}")

    # Set environment variables
    os.environ['MASTER_ADDR'] = '127.0.0.1'  # Changed from 'localhost' to explicit IP
    os.environ['MASTER_PORT'] = '29500'      # Changed port

    # Optional: Set NCCL timeout
    os.environ['NCCL_TIMEOUT'] = '20'  # 20 minutes timeout

    # Initialize the process group
    print("Creating dist group")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    print("Setting the device")
    torch.cuda.set_device(rank)


def cleanup():
    dist.destroy_process_group()


def train(rank, world_size):
    print(f"Running on rank {rank}")

    # Initialize distributed training
    setup(rank, world_size)

    # Create model and move it to GPU
    print("Creating the model")
    model = SimpleModel().to(rank)
    print("Creating the DDP model")
    ddp_model = DDP(model, device_ids=[rank])

    # Create dataset and dataloader
    dataset = DummyDataset()
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(
        dataset,
        batch_size=32,
        shuffle=False,
        sampler=sampler
    )

    # Setup optimizer and loss function
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.01)
    criterion = nn.MSELoss()

    # Training loop
    for epoch in range(2):
        sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(rank), target.to(rank)

            optimizer.zero_grad()
            output = ddp_model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0 and rank == 0:
                print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')

    cleanup()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local-rank", type=int, default=0)
    args = parser.parse_args()

    # Determine world size (number of GPUs)
    world_size = torch.cuda.device_count()
    rank = int(os.environ.get('RANK', 0))
    print(world_size)
    print(rank)

    if world_size < 2:
        print("Need at least 2 GPUs for this test!")
        exit(1)

    train(rank, world_size)
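I launch it with torchrun (the script name and the two worker processes are visible in the log below; the exact flags here are from memory, so treat the command as approximate):

torchrun --nproc_per_node=2 nccl_test.py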
This code gives the following output:
W0203 13:43:58.907000 10022 site-packages/torch/distributed/run.py:793]
W0203 13:43:58.907000 10022 site-packages/torch/distributed/run.py:793] *****************************************
W0203 13:43:58.907000 10022 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0203 13:43:58.907000 10022 site-packages/torch/distributed/run.py:793] *****************************************
2
1
Running on rank 1
2
0
Running on rank 0
CUDA available: True
CUDA device count: 2
Hostname: 192-222-53-248
Creating dist group
Setting the device
CUDA available: True
CUDA device count: 2
Hostname: 192-222-53-248
Creating dist group
Setting the device
Creating the model
Creating the model
Creating the DDP model
Creating the DDP model
192-222-53-248:10403:10403 [0] NCCL INFO Bootstrap : Using eno1:172.27.124.181<0>
192-222-53-248:10403:10403 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
192-222-53-248:10403:10403 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
192-222-53-248:10403:10403 [0] NCCL INFO NET/Plugin: Using internal network plugin.
192-222-53-248:10403:10403 [0] NCCL INFO cudaDriverVersion 12040
NCCL version 2.21.5+cuda12.1
192-222-53-248:10404:10404 [1] NCCL INFO cudaDriverVersion 12040
192-222-53-248:10404:10404 [1] NCCL INFO Bootstrap : Using eno1:172.27.124.181<0>
192-222-53-248:10404:10404 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
192-222-53-248:10404:10404 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
192-222-53-248:10404:10404 [1] NCCL INFO NET/Plugin: Using internal network plugin.
192-222-53-248:10403:11069 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
192-222-53-248:10403:11069 [0] NCCL INFO NET/Socket : Using [0]eno1:172.27.124.181<0>
192-222-53-248:10403:11069 [0] NCCL INFO Using non-device net plugin version 0
192-222-53-248:10403:11069 [0] NCCL INFO Using network Socket
192-222-53-248:10404:11070 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.
192-222-53-248:10404:11070 [1] NCCL INFO NET/Socket : Using [0]eno1:172.27.124.181<0>
192-222-53-248:10404:11070 [1] NCCL INFO Using non-device net plugin version 0
192-222-53-248:10404:11070 [1] NCCL INFO Using network Socket
192-222-53-248:10404:11070 [1] NCCL INFO ncclCommInitRank comm 0x466a1ef0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 62000 commId 0x55abc613c631f216 - Init START
192-222-53-248:10403:11069 [0] NCCL INFO ncclCommInitRank comm 0x185c0070 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 61000 commId 0x55abc613c631f216 - Init START
192-222-53-248:10404:11070 [1] NCCL INFO MNNVL busId 0x62000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
192-222-53-248:10403:11069 [0] NCCL INFO MNNVL busId 0x61000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
W0203 13:44:02.974000 10022 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 10403 closing signal SIGTERM
E0203 13:44:03.439000 10022 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -11) local_rank: 1 (pid: 10404) of binary: /home/ubuntu/miniconda3/envs/tts3/bin/python
Traceback (most recent call last):
File "/home/ubuntu/miniconda3/envs/tts3/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.5.0', 'console_scripts', 'torchrun')())
File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
return f(*args, **kwargs)
File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
run(args)
File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
elastic_launch(
File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
=======================================================
nccl_test.py FAILED
-------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
-------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2025-02-03_13:44:02
host : 192-222-53-248
rank : 1 (local_rank: 1)
exitcode : -11 (pid: 10404)
error_file: <N/A>
traceback : Signal 11 (SIGSEGV) received by PID 10404
=======================================================
I've tried many things so far and nothing seems to work. I was also getting a segmentation fault on a previous H100 instance. I feel like something is wrong with the H100 instances, but does anyone have any recommendations?
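In case it helps to isolate the issue, this is a sketch of the kind of even smaller check I mean: no model, no DDP, just init_process_group plus a single all_reduce to force NCCL communicator setup between the GPUs (script name and launch command are my own choice here):

# minimal_nccl_check.py - run with: torchrun --nproc_per_node=2 minimal_nccl_check.py
import os
import torch
import torch.distributed as dist

def main():
    # torchrun sets RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT,
    # so the default env:// initialization needs no extra arguments
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    dist.init_process_group(backend="nccl")

    # A single all_reduce is enough to trigger NCCL communicator creation,
    # which is where my real code hangs / segfaults
    t = torch.ones(1, device=f"cuda:{local_rank}")
    dist.all_reduce(t)
    print(f"rank {rank}: all_reduce result = {t.item()}")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()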