Mirror of https://github.com/autoscriptlabs/nccl-mesh-plugin.git (synced 2026-01-11 11:34:06 +00:00)
Initial release: NCCL Mesh Plugin for direct-connect RDMA topologies
- Enables NCCL over multi-subnet mesh topologies
- 8+ GB/s bandwidth over 100 Gbps RDMA
- Successfully tested with distributed LLM inference (Mistral-7B)
- Custom subnet-aware NIC selection
- Background handshake thread for deadlock-free connection setup
This commit is contained in:
commit 031bc48953
13 changed files with 3074 additions and 0 deletions
examples/benchmark_bandwidth.py (new file, 87 lines)
@@ -0,0 +1,87 @@

#!/usr/bin/env python3
"""
Bandwidth benchmark for NCCL Mesh Plugin

Usage:
    # On each node (adjust --rank):
    python benchmark_bandwidth.py --rank 0 --world-size 3 --master-ip 10.0.0.170
"""

import argparse
import time

import torch
import torch.distributed as dist


def benchmark_allreduce(size_mb: int, iterations: int, warmup: int = 5):
    """Benchmark all-reduce bandwidth."""
    # Create tensor
    num_elements = (size_mb * 1024 * 1024) // 4  # float32 = 4 bytes
    tensor = torch.ones(num_elements, device='cuda', dtype=torch.float32)

    # Warmup
    for _ in range(warmup):
        dist.all_reduce(tensor)
    torch.cuda.synchronize()

    # Benchmark
    start = time.perf_counter()
    for _ in range(iterations):
        dist.all_reduce(tensor)
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    # Calculate bandwidth
    # Reported as algorithm bandwidth: payload per all-reduce / elapsed time.
    # (A ring all-reduce moves 2*(N-1)/N * size over each link, so wire-level
    # bus bandwidth is that factor times this value.)
    total_data_gb = (size_mb * iterations) / 1024
    bandwidth_gbs = total_data_gb / elapsed

    return bandwidth_gbs, elapsed


def main():
    parser = argparse.ArgumentParser(description='Benchmark NCCL bandwidth')
    parser.add_argument('--rank', type=int, required=True)
    parser.add_argument('--world-size', type=int, default=3)
    parser.add_argument('--master-ip', type=str, default='10.0.0.170')
    parser.add_argument('--master-port', type=int, default=29500)
    parser.add_argument('--iterations', type=int, default=20)
    args = parser.parse_args()

    # Initialize
    init_method = f'tcp://{args.master_ip}:{args.master_port}'
    dist.init_process_group('nccl', rank=args.rank, world_size=args.world_size,
                            init_method=init_method)

    if args.rank == 0:
        print(f'\n{"="*60}')
        print('NCCL Mesh Plugin Bandwidth Benchmark')
        print(f'World size: {args.world_size}')
        print(f'Iterations per size: {args.iterations}')
        print(f'{"="*60}\n')
        print(f'{"Size":<12} {"Bandwidth":<15} {"Time":<12}')
        print(f'{"-"*12} {"-"*15} {"-"*12}')

    # Test different sizes
    sizes_mb = [1, 4, 16, 64, 128, 256, 512]

    for size_mb in sizes_mb:
        bandwidth, elapsed = benchmark_allreduce(size_mb, args.iterations)

        if args.rank == 0:
            print(f'{size_mb:>6} MB {bandwidth:>8.2f} GB/s {elapsed:>6.3f} s')

        # Sync between sizes
        dist.barrier()

    if args.rank == 0:
        print(f'\n{"="*60}')
        print('Benchmark complete!')
        print(f'{"="*60}\n')

    dist.destroy_process_group()


if __name__ == '__main__':
    main()
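Note: the value printed above is algorithm bandwidth (payload per all-reduce divided by elapsed time). To compare against the 100 Gbps link rate it is common to quote ring bus bandwidth instead, scaled by the 2*(N-1)/N factor mentioned in the comment. A minimal helper (the function name is ours, not part of the plugin):

def ring_bus_bandwidth(algbw_gbs: float, world_size: int) -> float:
    """Scale algorithm bandwidth to ring all-reduce bus bandwidth."""
    # A ring all-reduce sends 2*(N-1)/N times the payload over each link.
    return algbw_gbs * 2 * (world_size - 1) / world_size

# Example: 8 GB/s algorithm bandwidth on the 3-node mesh corresponds to
# roughly 8 * 2*(3-1)/3 = 10.7 GB/s on the wire per link.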
examples/distributed_llm.py (new file, 84 lines)
@@ -0,0 +1,84 @@

#!/usr/bin/env python3
"""
Distributed LLM Inference with NCCL Mesh Plugin

This example demonstrates loading and running inference on a large language
model distributed across multiple GPUs using the NCCL Mesh Plugin.

Usage:
    # On each node (adjust --rank):
    python distributed_llm.py --rank 0 --world-size 3 --master-ip 10.0.0.170

Environment setup (run on each node):
    cd ~/nccl-mesh-plugin
    export LD_LIBRARY_PATH=$(pwd):$LD_LIBRARY_PATH
    export NCCL_NET_PLUGIN=mesh
    export NCCL_DEBUG=WARN
"""

import argparse

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import Accelerator


def main():
    parser = argparse.ArgumentParser(description='Distributed LLM Inference')
    parser.add_argument('--rank', type=int, required=True)
    parser.add_argument('--world-size', type=int, default=3)
    parser.add_argument('--master-ip', type=str, default='10.0.0.170')
    parser.add_argument('--master-port', type=int, default=29500)
    parser.add_argument('--model', type=str, default='mistralai/Mistral-7B-Instruct-v0.2',
                        help='Model to load (default: Mistral-7B)')
    parser.add_argument('--prompt', type=str,
                        default='The future of distributed AI computing is',
                        help='Prompt for generation')
    parser.add_argument('--max-tokens', type=int, default=100,
                        help='Maximum tokens to generate')
    args = parser.parse_args()

    # Initialize accelerator
    accelerator = Accelerator()

    print(f'Rank {accelerator.process_index}: Loading tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    print(f'Rank {accelerator.process_index}: Loading model...')
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        torch_dtype=torch.bfloat16,
        device_map='auto',
    )

    print(f'Rank {accelerator.process_index}: Model loaded!')

    # Only rank 0 generates
    if accelerator.is_main_process:
        print('\nGenerating text...')
        print(f'Prompt: "{args.prompt}"\n')

        inputs = tokenizer(args.prompt, return_tensors='pt').to('cuda')

        outputs = model.generate(
            **inputs,
            max_new_tokens=args.max_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

        result = tokenizer.decode(outputs[0], skip_special_tokens=True)

        print('=' * 60)
        print('Generated Text:')
        print('=' * 60)
        print(result)
        print('=' * 60)

    # Wait for all ranks
    accelerator.wait_for_everyone()
    print(f'Rank {accelerator.process_index}: Done!')


if __name__ == '__main__':
    main()
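Note that Accelerator() picks up its distributed configuration from the environment (the variables that accelerate launch or torchrun normally export), so the --rank/--world-size/--master-ip flags above take effect only if they are bridged into the environment. A minimal sketch of that bridge, assuming the standard torch.distributed variable names and one GPU per node, placed just before accelerator = Accelerator():

import os

# Hypothetical bridge: expose the CLI arguments the way a launcher would.
os.environ.setdefault('RANK', str(args.rank))
os.environ.setdefault('WORLD_SIZE', str(args.world_size))
os.environ.setdefault('LOCAL_RANK', '0')          # one GPU per node assumed
os.environ.setdefault('MASTER_ADDR', args.master_ip)
os.environ.setdefault('MASTER_PORT', str(args.master_port))

Alternatively, launching each node with torchrun or accelerate launch sets these variables automatically.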
examples/test_allreduce.py (new file, 65 lines)
@@ -0,0 +1,65 @@

#!/usr/bin/env python3
"""
Basic all-reduce test for NCCL Mesh Plugin

Usage:
    # On rank 0:
    python test_allreduce.py --rank 0 --world-size 3 --master-ip 10.0.0.170

    # On rank 1:
    python test_allreduce.py --rank 1 --world-size 3 --master-ip 10.0.0.170

    # On rank 2:
    python test_allreduce.py --rank 2 --world-size 3 --master-ip 10.0.0.170
"""

import argparse

import torch
import torch.distributed as dist


def main():
    parser = argparse.ArgumentParser(description='Test NCCL all-reduce')
    parser.add_argument('--rank', type=int, required=True, help='Rank of this process')
    parser.add_argument('--world-size', type=int, default=3, help='Total number of processes')
    parser.add_argument('--master-ip', type=str, default='10.0.0.170', help='Master node IP')
    parser.add_argument('--master-port', type=int, default=29500, help='Master node port')
    args = parser.parse_args()

    # Initialize process group
    init_method = f'tcp://{args.master_ip}:{args.master_port}'
    print(f'Rank {args.rank}: Initializing with {init_method}')

    dist.init_process_group(
        backend='nccl',
        rank=args.rank,
        world_size=args.world_size,
        init_method=init_method
    )

    print(f'Rank {args.rank}: Process group initialized')

    # Create tensor on GPU
    tensor = torch.ones(1000, device='cuda')
    print(f'Rank {args.rank}: Created tensor with sum = {tensor.sum().item()}')

    # All-reduce (sum)
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)

    result = tensor[0].item()
    expected = float(args.world_size)

    print(f'Rank {args.rank}: After all-reduce, tensor[0] = {result}')

    if abs(result - expected) < 0.001:
        print(f'Rank {args.rank}: ✓ SUCCESS! Result matches expected value {expected}')
    else:
        print(f'Rank {args.rank}: ✗ FAILED! Expected {expected}, got {result}')

    # Cleanup
    dist.destroy_process_group()
    print(f'Rank {args.rank}: Done')


if __name__ == '__main__':
    main()
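The same check can also be driven by torchrun instead of hand-passed rank flags: torchrun exports RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT, and init_process_group() falls back to env:// initialization when no init_method is given. A minimal variant (file name and launch line are illustrative, not part of this commit):

# test_allreduce_env.py (hypothetical torchrun-friendly variant)
import os

import torch
import torch.distributed as dist

dist.init_process_group(backend='nccl')  # env:// init from the launcher's variables
torch.cuda.set_device(int(os.environ.get('LOCAL_RANK', '0')))

tensor = torch.ones(1000, device='cuda')
dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
print(f'Rank {dist.get_rank()}: tensor[0] = {tensor[0].item()} '
      f'(expected {float(dist.get_world_size())})')

dist.destroy_process_group()

Launched on each node with something like: torchrun --nnodes 3 --nproc-per-node 1 --node-rank <0|1|2> --master-addr 10.0.0.170 --master-port 29500 test_allreduce_env.py, with the same NCCL_NET_PLUGIN=mesh environment as in the distributed_llm.py docstring.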