forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Summary: Pull Request resolved: pytorch#57454 DDP with NCCL AllReduce for the entire model experiment from Quip https://fb.quip.com/iQUtAeKIxWpF I have been testing this on the AI cluster. There seem to be some connection problems with RPC when using multiple trainers or parameter servers. ``` Namespace(bconfig_id='3', dconfig_id='DummyData', mconfig_id='DummyModel', pconfig_id='None', tconfig_id='DdpNcclTrainer') benchmark warmup done metrics for trainer=0 +-----------------------------------+----------+---------+----------+------------+-----------+ | name | min | max | mean | variance | stdev | +===================================+==========+=========+==========+============+===========+ | backward_metric,backward | 2.45248 | 4.18304 | 3.972 | 0.097122 | 0.311644 | +-----------------------------------+----------+---------+----------+------------+-----------+ | batch_level_metric,batch_all | 4.11955 | 4.58138 | 4.31439 | 0.00229848 | 0.0479424 | +-----------------------------------+----------+---------+----------+------------+-----------+ | foward_metric,forward_pass | 0.141312 | 1.4807 | 0.222566 | 0.0555432 | 0.235676 | +-----------------------------------+----------+---------+----------+------------+-----------+ | hook_future_metric,nccl_allreduce | 0.191488 | 3.54099 | 3.11694 | 0.557106 | 0.746395 | +-----------------------------------+----------+---------+----------+------------+-----------+ metrics for trainer=1 +-----------------------------------+----------+---------+----------+-------------+------------+ | name | min | max | mean | variance | stdev | +===================================+==========+=========+==========+=============+============+ | backward_metric,backward | 2.4617 | 2.59174 | 2.51196 | 0.000938276 | 0.0306313 | +-----------------------------------+----------+---------+----------+-------------+------------+ | batch_level_metric,batch_all | 4.22605 | 4.71757 | 4.27921 | 0.00468424 | 0.0684415 | 
+-----------------------------------+----------+---------+----------+-------------+------------+ | foward_metric,forward_pass | 0.807936 | 1.50118 | 0.846008 | 0.00601693 | 0.0775688 | +-----------------------------------+----------+---------+----------+-------------+------------+ | hook_future_metric,nccl_allreduce | 0.108544 | 0.1536 | 0.11222 | 2.16726e-05 | 0.00465538 | +-----------------------------------+----------+---------+----------+-------------+------------+ metrics for all trainer +-----------------------------------+----------+---------+----------+------------+-----------+ | name | min | max | mean | variance | stdev | +===================================+==========+=========+==========+============+===========+ | backward_metric,backward | 2.45248 | 4.18304 | 3.24198 | 0.584391 | 0.764455 | +-----------------------------------+----------+---------+----------+------------+-----------+ | batch_level_metric,batch_all | 4.11955 | 4.71757 | 4.2968 | 0.00378467 | 0.0615197 | +-----------------------------------+----------+---------+----------+------------+-----------+ | foward_metric,forward_pass | 0.141312 | 1.50118 | 0.534287 | 0.128284 | 0.358167 | +-----------------------------------+----------+---------+----------+------------+-----------+ | hook_future_metric,nccl_allreduce | 0.108544 | 3.54099 | 1.61458 | 2.5456 | 1.59549 | +-----------------------------------+----------+---------+----------+------------+-----------+ ``` Test Plan: Imported from OSS Reviewed By: H-Huang, ngimel Differential Revision: D28296175 Pulled By: gcramer23 fbshipit-source-id: 5dd208fc86f8b5558d7c8860d685bb25c2e09fe7
- Loading branch information
1 parent
94080f4
commit bc2540f
Showing
21 changed files
with
1,071 additions
and
0 deletions.
There are no files selected for viewing
15 changes: 15 additions & 0 deletions
15
benchmarks/distributed/rpc/parameter_server/BenchmarkConfigurations.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from dataclasses import dataclass | ||
|
||
|
||
@dataclass
class BenchmarkConfigurations:
    """Container for the settings of one benchmark run.

    Field names match the keys used in
    configurations/benchmark_configurations.json (for example
    ``trainer_count``, ``ps_count``, ``rpc_async_timeout`` and
    ``batch_size``). Every field has a default, so the class can be
    instantiated without arguments.
    """

    # Number of trainers / parameter servers taking part in the run.
    trainer_count: int = 1
    ps_count: int = 0
    batch_size: int = 1
    print_metrics_to_dir: bool = False
    # Address and port are kept as strings; note the port is "29500", not 29500.
    master_addr: str = "localhost"
    master_port: str = "29500"
    # NOTE(review): unit is not visible here (presumably seconds) - confirm.
    rpc_async_timeout: int = 5
    rpc_init_method: str = "tcp://localhost:29501"
    # Both stay None until a caller assigns a dict; no default dict is created.
    trainer_config: dict = None
    ps_config: dict = None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
# RPC PS Benchmark | ||
|
||
## How to add your experiment | ||
|
||
1. Data | ||
- Create a data class and add it to the data directory | ||
- Update benchmark_class_helper.py to include your data class in the data_map | ||
- Add configurations to data_configurations.json in the configurations directory | ||
2. Model | ||
- Create a model class and add it to the models directory | ||
- Update benchmark_class_helper.py to include your model class in the model_map | ||
- Add configurations to model_configurations.json in the configurations directory | ||
3. Trainer | ||
- Create a trainer class and add it to the trainers directory | ||
- Update benchmark_class_helper.py to include your trainer class in the trainer_map | ||
- Add configurations to trainer_configurations.json in the configurations directory | ||
4. Parameter Server | ||
- Create a parameter server class and add it to the parameter_servers directory | ||
- Update benchmark_class_helper.py to include your parameter_server class in the ps_map | ||
- Add configurations to parameter_server_configurations.json in the configurations directory | ||
5. Script | ||
- Create a bash script for your experiment and add it to the bash_experiment_scripts directory | ||
|
||
## Trainer class | ||
|
||
The trainers directory contains base classes to provide a starting point for implementing a trainer. | ||
Inherit from a base class and implement your trainer. The benchmark has two requirements for trainers. | ||
|
||
1. It must implement an `__init__` method that takes rank, trainer_count, ps_rref, backend, and use_cuda_rpc as arguments | ||
|
||
```python | ||
def __init__(self, rank, trainer_count, ps_rref, backend, use_cuda_rpc): | ||
``` | ||
|
||
2. It must implement a train method that takes model and data as arguments. | ||
|
||
```python | ||
def train(self, model, data): | ||
``` | ||
|
||
## Parameter Server class | ||
|
||
The parameter_servers directory contains base classes to provide a starting point for implementing a parameter server. | ||
Inherit from a base class and implement your parameter server. The benchmark has two requirements for parameter servers. | ||
|
||
1. It must implement an `__init__` method that takes rank, ps_trainer_count, backend, and use_cuda_rpc as arguments | ||
|
||
```python | ||
def __init__(self, rank, ps_trainer_count, backend, use_cuda_rpc): | ||
``` | ||
|
||
2. It must implement a reset_state method | ||
|
||
```python | ||
def reset_state(ps_rref): | ||
``` |
13 changes: 13 additions & 0 deletions
13
benchmarks/distributed/rpc/parameter_server/bash_experiment_scripts/ddp_nccl_allreduce.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/sh

# Runs the benchmark with the DdpNcclTrainer configuration selected below.
# requires slurm

# configuration ids
benchmark=3
data="DummyData"
model="DummyModel"
trainer="DdpNcclTrainer"
server="None"

# Move to the parent of the directory holding this script, so the relative
# paths used below resolve. "$(dirname "$0")/.." also works when the script is
# started from inside its own directory, where a doubled dirname yields ".".
# Stop if the directory change fails rather than running from the wrong place.
cd "$(dirname "$0")/.." || exit 1

# "." is the POSIX spelling of "source"; a plain /bin/sh may not provide "source".
. ./bash_experiment_scripts/helper_functions.sh
run_benchmark_basic "$benchmark" "$data" "$model" "$trainer" "$server"
7 changes: 7 additions & 0 deletions
7
benchmarks/distributed/rpc/parameter_server/bash_experiment_scripts/helper_functions.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/bin/sh | ||
|
||
run_benchmark_basic() {
    # Launches launcher.py through slurm's srun.
    # Arguments, in order: benchmark, data, model, trainer and server
    # configuration ids. Each is quoted so a value containing whitespace or
    # glob characters is passed through as a single argument.
    # requires slurm
    srun -p q2 --cpus-per-task=16 -t 5:00:00 --gpus-per-node=4 \
        python launcher.py \
        --benchmark="$1" --data="$2" --model="$3" --trainer="$4" --server="$5"
}
33 changes: 33 additions & 0 deletions
33
benchmarks/distributed/rpc/parameter_server/benchmark_class_helper.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
from data.DummyData import DummyData | ||
from models.DummyModel import DummyModel | ||
from trainers.DdpNcclTrainer import DdpNcclTrainer | ||
|
||
# Registries from a class name to the class object. The keys use the same
# spelling as the "*_class" values in the JSON files of the configurations
# directory; presumably the launcher looks classes up by those names - confirm.
trainer_map = {
    "DdpNcclTrainer": DdpNcclTrainer,
}

# No parameter server class is registered here.
ps_map = {}

model_map = {
    "DummyModel": DummyModel,
}

data_map = {
    "DummyData": DummyData,
}
|
||
|
||
def get_benchmark_trainer_map():
    """Return the module-level ``trainer_map`` dict (the same object, not a copy)."""
    return trainer_map
|
||
|
||
def get_benchmark_ps_map():
    """Return the module-level ``ps_map`` dict (the same object, not a copy)."""
    return ps_map
|
||
|
||
def get_benchmark_model_map():
    """Return the module-level ``model_map`` dict (the same object, not a copy)."""
    return model_map
|
||
|
||
def get_benchmark_data_map():
    """Return the module-level ``data_map`` dict (the same object, not a copy)."""
    return data_map
8 changes: 8 additions & 0 deletions
8
benchmarks/distributed/rpc/parameter_server/configurations/benchmark_configurations.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"3": { | ||
"trainer_count": 2, | ||
"ps_count": 0, | ||
"rpc_async_timeout": 15, | ||
"batch_size": 5 | ||
} | ||
} |
20 changes: 20 additions & 0 deletions
20
benchmarks/distributed/rpc/parameter_server/configurations/data_configurations.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
{ | ||
"DummyData": { | ||
"data_class": "DummyData", | ||
"configurations": { | ||
"max_val": 100, | ||
"input_samples": 100, | ||
"input_dim": 100, | ||
"sparsity_percentage": 20 | ||
} | ||
}, | ||
"DummyData2": { | ||
"data_class": "DummyData", | ||
"configurations": { | ||
"max_val": 100, | ||
"input_samples": 100, | ||
"input_dim": 100, | ||
"sparsity_percentage": 80 | ||
} | ||
} | ||
} |
22 changes: 22 additions & 0 deletions
22
benchmarks/distributed/rpc/parameter_server/configurations/model_configurations.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
{ | ||
"DummyModel": { | ||
"model_class": "DummyModel", | ||
"configurations": { | ||
"num_embeddings": 100, | ||
"embedding_dim": 100, | ||
"dense_input_size": 100, | ||
"dense_output_size": 100, | ||
"sparse": false | ||
} | ||
}, | ||
"DummyModelSparse": { | ||
"model_class": "DummyModel", | ||
"configurations": { | ||
"num_embeddings": 100, | ||
"embedding_dim": 100, | ||
"dense_input_size": 100, | ||
"dense_output_size": 100, | ||
"sparse": true | ||
} | ||
} | ||
} |
1 change: 1 addition & 0 deletions
1
...arks/distributed/rpc/parameter_server/configurations/parameter_server_configurations.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{} |
8 changes: 8 additions & 0 deletions
8
benchmarks/distributed/rpc/parameter_server/configurations/trainer_configurations.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"DdpNcclTrainer": { | ||
"trainer_class": "DdpNcclTrainer", | ||
"configurations": { | ||
"epochs": 10 | ||
} | ||
} | ||
} |
46 changes: 46 additions & 0 deletions
46
benchmarks/distributed/rpc/parameter_server/data/DummyData.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import random | ||
|
||
import numpy as np | ||
import torch | ||
from torch.utils.data import Dataset | ||
|
||
|
||
class DummyData(Dataset):
    """Randomly generated integer dataset.

    ``input`` is an ``input_samples`` x ``input_dim`` tensor whose entries are
    drawn from a random subset of ``range(max_val)``; ``target`` holds one
    random integer in ``[0, max_val)`` per sample.
    """

    def __init__(
        self,
        max_val: int,
        input_samples: int,
        input_dim: int,
        sparsity_percentage: int,
    ):
        """Generate the input and target tensors.

        Args:
            max_val: Exclusive upper bound of the generated values.
            input_samples: Number of rows in ``input`` and entries in ``target``.
            input_dim: Number of values per input row.
            sparsity_percentage: Percentage of ``range(max_val)`` that is
                excluded from the values the inputs are drawn from.

        Raises:
            ValueError: If the sparsity leaves no value to draw from while
                there is at least one input element to generate.
        """
        self.max_val = max_val
        self.input_samples = input_samples
        self.input_dim = input_dim
        self.sparsity_percentage = sparsity_percentage

        self.input = self._generate_input()
        self.target = torch.randint(0, max_val, [input_samples])
        self.start = 0
        self.end = max_val

    def _generate_input(self):
        """Build the input tensor from a random subset of ``range(max_val)``."""
        fraction_kept = (100 - self.sparsity_percentage) / float(100)
        index_count = int(self.max_val * fraction_kept)
        # With nothing left to sample, random.randint below would fail with an
        # opaque "empty range" ValueError; raise the same type with context.
        if index_count < 1 and self.input_samples > 0 and self.input_dim > 0:
            raise ValueError(
                "sparsity_percentage={} leaves no values to sample from "
                "range({})".format(self.sparsity_percentage, self.max_val)
            )
        elements = list(range(self.max_val))
        random.shuffle(elements)
        elements = elements[:index_count]
        data = [
            [
                elements[random.randint(0, index_count - 1)]
                for _ in range(self.input_dim)
            ]
            for _ in range(self.input_samples)
        ]
        # Pin the dtype so the result does not depend on the platform's
        # default integer width (or fall back to float for empty data).
        return torch.from_numpy(np.array(data, dtype=np.int64))

    def __len__(self):
        """Return the number of input samples."""
        return len(self.input)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        return self.input[index], self.target[index]
Oops, something went wrong.