observability.nccl_benchmark_torch.run
byte_value
def byte_value(byte: str) -> int
Converts the given size to bytes if it is in MB or GB format.
factor_range
def factor_range(minbytes: str, maxbytes: str, stepfactor: float) -> List[int]
Creates the range of values for which NCCL bandwidth needs to be tested.
run_allgather
def run_allgather(group: torch.distributed.ProcessGroup, bytes: int,
buffers: torch.Tensor, world_size: int,
dtype: torch.dtype) -> Tuple[float, float, float]
group: distributed group; bytes: total data that needs to be transferred; buffers: torch tensor that can fit the maximum number of elements specified in the -e argument
returns: (inplace_time, inplace_algbw, inplace_busbw)
run_allredeuce
def run_allredeuce(group: torch.distributed.ProcessGroup, bytes: int,
buffers: torch.Tensor, world_size: int,
dtype: torch.dtype) -> Tuple[float, float, float]
group: distributed group; bytes: total data that needs to be transferred; buffers: torch tensor that can fit the maximum number of elements specified in the -e argument
returns: (inplace_time, inplace_algbw, inplace_busbw)
calculate_bw
def calculate_bw(time: float, bytes: int, world_size: int,
routine: str) -> Tuple[float, float, float]
Takes the execution time and calculates algbw and busbw. Returns (algbw, busbw).
get_routine
def get_routine(routine: str) -> callable
All routines have the same calling interface. Returns the routine to use for the test.