Initial release: NCCL Mesh Plugin for direct-connect RDMA topologies

- Enables NCCL over multi-subnet mesh topologies
- 8+ GB/s bandwidth over 100Gbps RDMA
- Successfully tested with distributed LLM inference (Mistral-7B)
- Custom subnet-aware NIC selection
- Background handshake thread for deadlock-free connection setup
2026-01-11 11:34:06 +00:00 · 2026-01-09 14:09:33 -05:00 · 2026-01-09 14:09:33 -05:00 · 031bc48953
commit 031bc48953
13 changed files with 3074 additions and 0 deletions
--- a/examples/benchmark_bandwidth.py
+++ b/examples/benchmark_bandwidth.py
@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""
+Bandwidth benchmark for NCCL Mesh Plugin
+
+Usage:
+    # On each node (adjust --rank):
+    python benchmark_bandwidth.py --rank 0 --world-size 3 --master-ip 10.0.0.170
+"""
+
+import argparse
+import time
+import torch
+import torch.distributed as dist
+
+
+def benchmark_allreduce(size_mb: int, iterations: int, warmup: int = 5):
+    """Benchmark all-reduce bandwidth"""
+    
+    # Create tensor
+    num_elements = (size_mb * 1024 * 1024) // 4  # float32 = 4 bytes
+    tensor = torch.ones(num_elements, device='cuda', dtype=torch.float32)
+    
+    # Warmup
+    for _ in range(warmup):
+        dist.all_reduce(tensor)
+    torch.cuda.synchronize()
+    
+    # Benchmark
+    start = time.perf_counter()
+    for _ in range(iterations):
+        dist.all_reduce(tensor)
+    torch.cuda.synchronize()
+    elapsed = time.perf_counter() - start
+    
+    # Calculate bandwidth
+    # All-reduce transfers 2*(N-1)/N * size data in ring algorithm
+    total_data_gb = (size_mb * iterations) / 1024
+    bandwidth_gbs = total_data_gb / elapsed
+    
+    return bandwidth_gbs, elapsed
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Benchmark NCCL bandwidth')
+    parser.add_argument('--rank', type=int, required=True)
+    parser.add_argument('--world-size', type=int, default=3)
+    parser.add_argument('--master-ip', type=str, default='10.0.0.170')
+    parser.add_argument('--master-port', type=int, default=29500)
+    parser.add_argument('--iterations', type=int, default=20)
+    args = parser.parse_args()
+
+    # Initialize
+    init_method = f'tcp://{args.master_ip}:{args.master_port}'
+    dist.init_process_group('nccl', rank=args.rank, world_size=args.world_size,
+                           init_method=init_method)
+    
+    if args.rank == 0:
+        print(f'\n{"="*60}')
+        print(f'NCCL Mesh Plugin Bandwidth Benchmark')
+        print(f'World size: {args.world_size}')
+        print(f'Iterations per size: {args.iterations}')
+        print(f'{"="*60}\n')
+        print(f'{"Size":<12} {"Bandwidth":<15} {"Time":<12}')
+        print(f'{"-"*12} {"-"*15} {"-"*12}')
+
+    # Test different sizes
+    sizes_mb = [1, 4, 16, 64, 128, 256, 512]
+    
+    for size_mb in sizes_mb:
+        bandwidth, elapsed = benchmark_allreduce(size_mb, args.iterations)
+        
+        if args.rank == 0:
+            print(f'{size_mb:>6} MB    {bandwidth:>8.2f} GB/s    {elapsed:>6.3f} s')
+        
+        # Sync between sizes
+        dist.barrier()
+
+    if args.rank == 0:
+        print(f'\n{"="*60}')
+        print('Benchmark complete!')
+        print(f'{"="*60}\n')
+
+    dist.destroy_process_group()
+
+
+# Run the benchmark only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/examples/distributed_llm.py
+++ b/examples/distributed_llm.py
@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""
+Distributed LLM Inference with NCCL Mesh Plugin
+
+This example demonstrates loading and running inference on a large language
+model distributed across multiple GPUs using the NCCL Mesh Plugin.
+
+Usage:
+    # On each node (adjust --rank):
+    python distributed_llm.py --rank 0 --world-size 3 --master-ip 10.0.0.170
+
+Environment setup (run on each node):
+    cd ~/nccl-mesh-plugin
+    export LD_LIBRARY_PATH=$(pwd):$LD_LIBRARY_PATH
+    export NCCL_NET_PLUGIN=mesh
+    export NCCL_DEBUG=WARN
+"""
+
+import argparse
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from accelerate import Accelerator
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Distributed LLM Inference')
+    parser.add_argument('--rank', type=int, required=True)
+    parser.add_argument('--world-size', type=int, default=3)
+    parser.add_argument('--master-ip', type=str, default='10.0.0.170')
+    parser.add_argument('--master-port', type=int, default=29500)
+    parser.add_argument('--model', type=str, default='mistralai/Mistral-7B-Instruct-v0.2',
+                       help='Model to load (default: Mistral-7B)')
+    parser.add_argument('--prompt', type=str, 
+                       default='The future of distributed AI computing is',
+                       help='Prompt for generation')
+    parser.add_argument('--max-tokens', type=int, default=100,
+                       help='Maximum tokens to generate')
+    args = parser.parse_args()
+
+    # Initialize accelerator
+    accelerator = Accelerator()
+    
+    print(f'Rank {accelerator.process_index}: Loading tokenizer...')
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    
+    print(f'Rank {accelerator.process_index}: Loading model...')
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torch_dtype=torch.bfloat16,
+        device_map='auto',
+    )
+    
+    print(f'Rank {accelerator.process_index}: Model loaded!')
+
+    # Only rank 0 generates
+    if accelerator.is_main_process:
+        print(f'\nGenerating text...')
+        print(f'Prompt: "{args.prompt}"\n')
+        
+        inputs = tokenizer(args.prompt, return_tensors='pt').to('cuda')
+        
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=args.max_tokens,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+        )
+        
+        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        
+        print('=' * 60)
+        print('Generated Text:')
+        print('=' * 60)
+        print(result)
+        print('=' * 60)
+
+    # Wait for all ranks
+    accelerator.wait_for_everyone()
+    print(f'Rank {accelerator.process_index}: Done!')
+
+
+# Run the inference example only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/examples/test_allreduce.py
+++ b/examples/test_allreduce.py
@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""
+Basic all-reduce test for NCCL Mesh Plugin
+
+Usage:
+    # On rank 0:
+    python test_allreduce.py --rank 0 --world-size 3 --master-ip 10.0.0.170
+    
+    # On rank 1:
+    python test_allreduce.py --rank 1 --world-size 3 --master-ip 10.0.0.170
+    
+    # On rank 2:
+    python test_allreduce.py --rank 2 --world-size 3 --master-ip 10.0.0.170
+"""
+
+import argparse
+import torch
+import torch.distributed as dist
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Test NCCL all-reduce')
+    parser.add_argument('--rank', type=int, required=True, help='Rank of this process')
+    parser.add_argument('--world-size', type=int, default=3, help='Total number of processes')
+    parser.add_argument('--master-ip', type=str, default='10.0.0.170', help='Master node IP')
+    parser.add_argument('--master-port', type=int, default=29500, help='Master node port')
+    args = parser.parse_args()
+
+    # Initialize process group
+    init_method = f'tcp://{args.master_ip}:{args.master_port}'
+    print(f'Rank {args.rank}: Initializing with {init_method}')
+    
+    dist.init_process_group(
+        backend='nccl',
+        rank=args.rank,
+        world_size=args.world_size,
+        init_method=init_method
+    )
+    
+    print(f'Rank {args.rank}: Process group initialized')
+
+    # Create tensor on GPU
+    tensor = torch.ones(1000, device='cuda')
+    print(f'Rank {args.rank}: Created tensor with sum = {tensor.sum().item()}')
+
+    # All-reduce (sum)
+    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+    
+    result = tensor[0].item()
+    expected = float(args.world_size)
+    
+    print(f'Rank {args.rank}: After all-reduce, tensor[0] = {result}')
+    
+    if abs(result - expected) < 0.001:
+        print(f'Rank {args.rank}: ✓ SUCCESS! Result matches expected value {expected}')
+    else:
+        print(f'Rank {args.rank}: ✗ FAILED! Expected {expected}, got {result}')
+
+    # Cleanup
+    dist.destroy_process_group()
+    print(f'Rank {args.rank}: Done')
+
+
+# Run the all-reduce test only when this file is executed as a script.
+if __name__ == '__main__':
+    main()