PyTorch DDP with NCCL hangs on H100 server

I'm trying to start multi-GPU training on an 8xH100 server using PyTorch DDP with the NCCL backend. When I run my training code, it hangs while creating the process groups. The code is battle-tested on many platforms and works just fine there, so I'm confident the problem is not in the code.

I created the following test to see what's wrong:

import socket
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
import argparse

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(10, 1)

    def forward(self, x):
        return self.linear(x)

class DummyDataset(Dataset):
    def __init__(self, size=1000):
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        x = torch.randn(10)
        y = torch.sum(x) + torch.randn(1) * 0.1  # Create noisy target
        return x, y

def setup(rank, world_size):
    # Print some debug information
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Hostname: {socket.gethostname()}")

    # Set environment variables
    os.environ['MASTER_ADDR'] = '127.0.0.1'  # Changed from 'localhost' to explicit IP
    os.environ['MASTER_PORT'] = '29500'  # Changed port

    # Optional: Set NCCL timeout
    os.environ['NCCL_TIMEOUT'] = '20'  # 20 minutes timeout

    # Initialize the process group
    print("Creating dist group")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    print("Setting the device")
    torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

def train(rank, world_size):
    print(f"Running on rank {rank}")

    # Initialize distributed training
    setup(rank, world_size)

    # Create model and move it to GPU
    print("Creating the model")
    model = SimpleModel().to(rank)
    print("Creating the DDP model")
    ddp_model = DDP(model, device_ids=[rank])

    # Create dataset and dataloader
    dataset = DummyDataset()
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(
        dataset,
        batch_size=32,
        shuffle=False,
        sampler=sampler
    )

    # Setup optimizer and loss function
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.01)
    criterion = nn.MSELoss()

    # Training loop
    for epoch in range(2):
        sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(rank), target.to(rank)

            optimizer.zero_grad()
            output = ddp_model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0 and rank == 0:
                print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')

    cleanup()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local-rank", type=int, default=0)
    args = parser.parse_args()

    # Determine world size (number of GPUs)
    world_size = torch.cuda.device_count()
    rank = int(os.environ.get('RANK', 0))

    print(world_size)
    print(rank)

    if world_size < 2:
        print("Need at least 2 GPUs for this test!")
        exit(1)

    train(rank, world_size)
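
For reference, this is launched with torchrun (one process per visible GPU, so something along the lines of torchrun --nproc_per_node=2 nccl_test.py), which is why the script reads RANK from the environment.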

This code gives the following output

W0203 13:43:58.907000 10022 site-packages/torch/distributed/run.py:793] 
W0203 13:43:58.907000 10022 site-packages/torch/distributed/run.py:793] *****************************************
W0203 13:43:58.907000 10022 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0203 13:43:58.907000 10022 site-packages/torch/distributed/run.py:793] *****************************************
2                                                                                                                      
1                                                                                                                                                                                                                                              
Running on rank 1                                                                                                      
2                                                                                                                      
0                           
Running on rank 0                                                                                                      
CUDA available: True                                                                                                   
CUDA device count: 2                                                                                                   
Hostname: 192-222-53-248                                                                                                                                                                                                                       
Creating dist group                                                                                                    
Setting the device                                                                                                     
CUDA available: True                                                                                                   
CUDA device count: 2                                                                                                   
Hostname: 192-222-53-248                                                                                               
Creating dist group                                                                                                    
Setting the device                                                                                                     
Creating the model                                                                                                     
Creating the model                                                                                                     
Creating the DDP model                                                                                                                                                                                                                         
Creating the DDP model                                                                                                                                                                                                                         
192-222-53-248:10403:10403 [0] NCCL INFO Bootstrap : Using eno1:172.27.124.181<0>                               
192-222-53-248:10403:10403 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)                           
192-222-53-248:10403:10403 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
192-222-53-248:10403:10403 [0] NCCL INFO NET/Plugin: Using internal network plugin.                                                                                                                                                            
192-222-53-248:10403:10403 [0] NCCL INFO cudaDriverVersion 12040
NCCL version 2.21.5+cuda12.1                                                                                           
192-222-53-248:10404:10404 [1] NCCL INFO cudaDriverVersion 12040               
192-222-53-248:10404:10404 [1] NCCL INFO Bootstrap : Using eno1:172.27.124.181<0>                                                                                                                                                              
192-222-53-248:10404:10404 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
192-222-53-248:10404:10404 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
192-222-53-248:10404:10404 [1] NCCL INFO NET/Plugin: Using internal network plugin.
192-222-53-248:10403:11069 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 1.                                 
192-222-53-248:10403:11069 [0] NCCL INFO NET/Socket : Using [0]eno1:172.27.124.181<0>
192-222-53-248:10403:11069 [0] NCCL INFO Using non-device net plugin version 0                                                                                                                                                                 
192-222-53-248:10403:11069 [0] NCCL INFO Using network Socket      
192-222-53-248:10404:11070 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 1.                                                                                                                                                              
192-222-53-248:10404:11070 [1] NCCL INFO NET/Socket : Using [0]eno1:172.27.124.181<0>
192-222-53-248:10404:11070 [1] NCCL INFO Using non-device net plugin version 0
192-222-53-248:10404:11070 [1] NCCL INFO Using network Socket
192-222-53-248:10404:11070 [1] NCCL INFO ncclCommInitRank comm 0x466a1ef0 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 62000 commId 0x55abc613c631f216 - Init START
192-222-53-248:10403:11069 [0] NCCL INFO ncclCommInitRank comm 0x185c0070 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 61000 commId 0x55abc613c631f216 - Init START
192-222-53-248:10404:11070 [1] NCCL INFO MNNVL busId 0x62000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
192-222-53-248:10403:11069 [0] NCCL INFO MNNVL busId 0x61000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0
W0203 13:44:02.974000 10022 site-packages/torch/distributed/elastic/multiprocessing/api.py:897] Sending process 10403 closing signal SIGTERM
E0203 13:44:03.439000 10022 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: -11) local_rank: 1 (pid: 10404) of binary: /home/ubuntu/miniconda3/envs/tts3/bin/python
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/envs/tts3/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.5.0', 'console_scripts', 'torchrun')())
  File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
    return f(*args, **kwargs) 
  File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/run.py", line 919, in main
    run(args)                                          
  File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/run.py", line 910, in run
    elastic_launch(                                                                                                                                                                                                                            
  File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 138, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/ubuntu/miniconda3/envs/tts3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
=======================================================
nccl_test.py FAILED
-------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
-------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-02-03_13:44:02
  host      : 192-222-53-248
  rank      : 1 (local_rank: 1)
  exitcode  : -11 (pid: 10404)
  error_file: <N/A>
  traceback : Signal 11 (SIGSEGV) received by PID 10404
=======================================================

I've tried many things so far and nothing seems to work. I was also getting a segmentation fault on a previous H100 instance. I feel like something is wrong with the H100 instances, but maybe someone has recommendations?
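
In case it helps with reproducing this, below is a stripped-down sketch that exercises only the NCCL process-group creation plus a single all_reduce, with the model and dataloader removed (the file name nccl_smoke.py used below is just a placeholder):

import os
import torch
import torch.distributed as dist

def main():
    # torchrun sets RANK, WORLD_SIZE, LOCAL_RANK, MASTER_ADDR and MASTER_PORT
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    # Create the NCCL process group; rank and world size come from the environment
    dist.init_process_group(backend="nccl")

    # One all_reduce: every rank should print the world size
    t = torch.ones(1, device=f"cuda:{local_rank}")
    dist.all_reduce(t)
    print(f"rank {dist.get_rank()}: all_reduce -> {t.item()}")

    dist.destroy_process_group()

if __name__ == "__main__":
    main()

Run it with torchrun --nproc_per_node=2 nccl_smoke.py. If even this dies with SIGSEGV during init, that would point at the NCCL / driver / network stack on the instance rather than at the training code.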

I wasn’t able to replicate the issue.

Can you provide step-by-step instructions to replicate the issue?