benchmark rpc ps (pytorch#57454)
Summary:
Pull Request resolved: pytorch#57454

This implements the "DDP with NCCL AllReduce for the entire model" experiment from the Quip note: https://fb.quip.com/iQUtAeKIxWpF

I have been testing this on the AI cluster. There seem to be some connection problems with RPC when using multiple trainers or parameter servers.

```
Namespace(bconfig_id='3', dconfig_id='DummyData', mconfig_id='DummyModel', pconfig_id='None', tconfig_id='DdpNcclTrainer')

benchmark warmup done

metrics for trainer=0
+-----------------------------------+----------+---------+----------+------------+-----------+
| name                              |      min |     max |     mean |   variance |     stdev |
+===================================+==========+=========+==========+============+===========+
| backward_metric,backward          | 2.45248  | 4.18304 | 3.972    | 0.097122   | 0.311644  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| batch_level_metric,batch_all      | 4.11955  | 4.58138 | 4.31439  | 0.00229848 | 0.0479424 |
+-----------------------------------+----------+---------+----------+------------+-----------+
| foward_metric,forward_pass        | 0.141312 | 1.4807  | 0.222566 | 0.0555432  | 0.235676  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| hook_future_metric,nccl_allreduce | 0.191488 | 3.54099 | 3.11694  | 0.557106   | 0.746395  |
+-----------------------------------+----------+---------+----------+------------+-----------+
metrics for trainer=1
+-----------------------------------+----------+---------+----------+-------------+------------+
| name                              |      min |     max |     mean |    variance |      stdev |
+===================================+==========+=========+==========+=============+============+
| backward_metric,backward          | 2.4617   | 2.59174 | 2.51196  | 0.000938276 | 0.0306313  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| batch_level_metric,batch_all      | 4.22605  | 4.71757 | 4.27921  | 0.00468424  | 0.0684415  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| foward_metric,forward_pass        | 0.807936 | 1.50118 | 0.846008 | 0.00601693  | 0.0775688  |
+-----------------------------------+----------+---------+----------+-------------+------------+
| hook_future_metric,nccl_allreduce | 0.108544 | 0.1536  | 0.11222  | 2.16726e-05 | 0.00465538 |
+-----------------------------------+----------+---------+----------+-------------+------------+
metrics for all trainer
+-----------------------------------+----------+---------+----------+------------+-----------+
| name                              |      min |     max |     mean |   variance |     stdev |
+===================================+==========+=========+==========+============+===========+
| backward_metric,backward          | 2.45248  | 4.18304 | 3.24198  | 0.584391   | 0.764455  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| batch_level_metric,batch_all      | 4.11955  | 4.71757 | 4.2968   | 0.00378467 | 0.0615197 |
+-----------------------------------+----------+---------+----------+------------+-----------+
| foward_metric,forward_pass        | 0.141312 | 1.50118 | 0.534287 | 0.128284   | 0.358167  |
+-----------------------------------+----------+---------+----------+------------+-----------+
| hook_future_metric,nccl_allreduce | 0.108544 | 3.54099 | 1.61458  | 2.5456     | 1.59549   |
+-----------------------------------+----------+---------+----------+------------+-----------+
```

Test Plan: Imported from OSS

Reviewed By: H-Huang, ngimel

Differential Revision: D28296175

Pulled By: gcramer23

fbshipit-source-id: 5dd208fc86f8b5558d7c8860d685bb25c2e09fe7
gcramer23 authored and facebook-github-bot committed May 8, 2021
1 parent 94080f4 commit bc2540f
Showing 21 changed files with 1,071 additions and 0 deletions.
@@ -0,0 +1,15 @@
from dataclasses import dataclass


@dataclass
class BenchmarkConfigurations:
    trainer_count: int = 1
    ps_count: int = 0
    batch_size: int = 1
    print_metrics_to_dir: bool = False
    master_addr: str = "localhost"
    master_port: str = "29500"
    rpc_async_timeout: int = 5
    rpc_init_method: str = "tcp://localhost:29501"
    trainer_config: dict = None
    ps_config: dict = None
56 changes: 56 additions & 0 deletions benchmarks/distributed/rpc/parameter_server/README.md
@@ -0,0 +1,56 @@
# RPC PS Benchmark

## How to add your experiment

1. Data
- Create a data class and add it to the data directory
- Update benchmark_class_helper.py to include your data class in the data_map
- Add configurations to data_configurations.json in the configurations directory
2. Model
- Create a model class and add it to the model directory
- Update benchmark_class_helper.py to include your model class in the model_map
- Add configurations to model_configurations.json in the configurations directory
3. Trainer
- Create a trainer class and add it to the trainer directory
- Update benchmark_class_helper.py to include your trainer class in the trainer_map
- Add configurations to trainer_configurations.json in the configurations directory
4. Parameter Server
- Create a parameter server class and add it to the parameter_servers directory
- Update benchmark_class_helper.py to include your parameter_server class in the ps_map
- Add configurations to parameter_server_configurations.json in the configurations directory
5. Script
- Create a bash script for your experiment and add it to the bash_experiment_scripts directory

## Trainer class

The trainer directory contains base classes to provide a starting point for implementing a trainer.
Inherit from a base class and implement your trainer. The benchmark has two requirements for trainers.

1. It must implement an `__init__` method that takes `rank`, `trainer_count`, and `ps_rref` as arguments.

```python
def __init__(self, rank, trainer_count, ps_rref, backend, use_cuda_rpc):
```

2. It must implement a `train` method that takes `model` and `data` as arguments.

```python
def train(self, model, data):
```
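
A minimal trainer satisfying both requirements could look like the sketch below. `EchoTrainer` is a hypothetical class written for illustration (it is not one of the base classes in the trainer directory); a real trainer would run forward/backward passes and record metrics in `train`.

```python
class EchoTrainer:
    """Hypothetical minimal trainer; the benchmark only requires these two methods."""

    def __init__(self, rank, trainer_count, ps_rref, backend, use_cuda_rpc):
        self.rank = rank
        self.trainer_count = trainer_count
        self.ps_rref = ps_rref
        self.backend = backend
        self.use_cuda_rpc = use_cuda_rpc
        self.batches_seen = 0

    def train(self, model, data):
        # a real trainer would run forward/backward passes and record
        # metrics here; this one just walks the (input, target) pairs
        for inputs, targets in data:
            self.batches_seen += 1


trainer = EchoTrainer(rank=0, trainer_count=2, ps_rref=None,
                      backend="nccl", use_cuda_rpc=False)
trainer.train(model=None, data=[(1, 2), (3, 4), (5, 6)])
```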

## Parameter Server class

The parameter_server directory contains base classes to provide a starting point for implementing a parameter server.
Inherit from a base class and implement your parameter server. The benchmark has two requirements for parameter servers.

1. It must implement an `__init__` method that takes `rank` and `ps_trainer_count` as arguments.

```python
def __init__(self, rank, ps_trainer_count, backend, use_cuda_rpc):
```

2. It must implement a `reset_state` method.

```python
def reset_state(ps_rref):
```
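
A minimal parameter server meeting both requirements might look like this sketch. `NoOpParameterServer` and `_FakeRRef` are hypothetical names invented here; `_FakeRRef` stands in for `torch.distributed.rpc.RRef` so the example runs without an RPC setup.

```python
class NoOpParameterServer:
    """Hypothetical minimal parameter server; only the two required methods."""

    def __init__(self, rank, ps_trainer_count, backend, use_cuda_rpc):
        self.rank = rank
        self.ps_trainer_count = ps_trainer_count
        self.backend = backend
        self.use_cuda_rpc = use_cuda_rpc
        self.gradients = {}

    @staticmethod
    def reset_state(ps_rref):
        # the signature above receives the server's RRef; fetch the
        # local instance and clear any accumulated state
        ps_rref.local_value().gradients.clear()


class _FakeRRef:
    # stand-in for an RPC RRef, so the sketch runs without RPC
    def __init__(self, value):
        self._value = value

    def local_value(self):
        return self._value


ps = NoOpParameterServer(rank=0, ps_trainer_count=2,
                         backend="nccl", use_cuda_rpc=False)
ps.gradients["layer0"] = [0.1, 0.2]
NoOpParameterServer.reset_state(_FakeRRef(ps))
```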
@@ -0,0 +1,13 @@
#!/bin/sh

# requires slurm
# configuration ids
benchmark=3
data="DummyData"
model="DummyModel"
trainer="DdpNcclTrainer"
server="None"
# moves to directory and runs the benchmark with the configurations selected
cd "$(dirname "$(dirname "$0")")"
source ./bash_experiment_scripts/helper_functions.sh
run_benchmark_basic "$benchmark" "$data" "$model" "$trainer" "$server"
@@ -0,0 +1,7 @@
#!/bin/sh

run_benchmark_basic() {
    # requires slurm
    gpurun='srun -p q2 --cpus-per-task=16 -t 5:00:00 --gpus-per-node=4'
    $gpurun python launcher.py --benchmark=$1 --data=$2 --model=$3 --trainer=$4 --server=$5
}
@@ -0,0 +1,33 @@
from data.DummyData import DummyData
from models.DummyModel import DummyModel
from trainers.DdpNcclTrainer import DdpNcclTrainer

trainer_map = {
    "DdpNcclTrainer": DdpNcclTrainer
}

ps_map = {}

model_map = {
    "DummyModel": DummyModel
}

data_map = {
    "DummyData": DummyData
}


def get_benchmark_trainer_map():
    return trainer_map


def get_benchmark_ps_map():
    return ps_map


def get_benchmark_model_map():
    return model_map


def get_benchmark_data_map():
    return data_map
@@ -0,0 +1,8 @@
{
    "3": {
        "trainer_count": 2,
        "ps_count": 0,
        "rpc_async_timeout": 15,
        "batch_size": 5
    }
}
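
An entry such as `"3"` maps directly onto fields of the `BenchmarkConfigurations` dataclass. The sketch below shows how such an entry could be loaded and splatted into the dataclass; the actual wiring in launcher.py may differ, and the dataclass here is a trimmed copy kept self-contained for the demo.

```python
import json
from dataclasses import dataclass


@dataclass
class BenchmarkConfigurations:
    # trimmed copy of the benchmark's dataclass, for a runnable demo
    trainer_count: int = 1
    ps_count: int = 0
    batch_size: int = 1
    rpc_async_timeout: int = 5


BENCHMARK_JSON = """
{
    "3": {
        "trainer_count": 2,
        "ps_count": 0,
        "rpc_async_timeout": 15,
        "batch_size": 5
    }
}
"""

configs = json.loads(BENCHMARK_JSON)
# each JSON key is a keyword argument of the dataclass
config = BenchmarkConfigurations(**configs["3"])
```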
@@ -0,0 +1,20 @@
{
    "DummyData": {
        "data_class": "DummyData",
        "configurations": {
            "max_val": 100,
            "input_samples": 100,
            "input_dim": 100,
            "sparsity_percentage": 20
        }
    },
    "DummyData2": {
        "data_class": "DummyData",
        "configurations": {
            "max_val": 100,
            "input_samples": 100,
            "input_dim": 100,
            "sparsity_percentage": 80
        }
    }
}
@@ -0,0 +1,22 @@
{
    "DummyModel": {
        "model_class": "DummyModel",
        "configurations": {
            "num_embeddings": 100,
            "embedding_dim": 100,
            "dense_input_size": 100,
            "dense_output_size": 100,
            "sparse": false
        }
    },
    "DummyModelSparse": {
        "model_class": "DummyModel",
        "configurations": {
            "num_embeddings": 100,
            "embedding_dim": 100,
            "dense_input_size": 100,
            "dense_output_size": 100,
            "sparse": true
        }
    }
}
@@ -0,0 +1 @@
{}
@@ -0,0 +1,8 @@
{
    "DdpNcclTrainer": {
        "trainer_class": "DdpNcclTrainer",
        "configurations": {
            "epochs": 10
        }
    }
}
46 changes: 46 additions & 0 deletions benchmarks/distributed/rpc/parameter_server/data/DummyData.py
@@ -0,0 +1,46 @@
import random

import numpy as np
import torch
from torch.utils.data import Dataset


class DummyData(Dataset):

    def __init__(
        self,
        max_val: int,
        input_samples: int,
        input_dim: int,
        sparsity_percentage: int
    ):
        self.max_val = max_val
        self.input_samples = input_samples
        self.input_dim = input_dim
        self.sparsity_percentage = sparsity_percentage

        def generate_input():
            percentage_of_elements = (100 - self.sparsity_percentage) / float(100)
            index_count = int(self.max_val * percentage_of_elements)
            elements = list(range(self.max_val))
            random.shuffle(elements)
            elements = elements[:index_count]
            data = [
                [
                    elements[random.randint(0, index_count - 1)]
                    for _ in range(self.input_dim)
                ]
                for _ in range(self.input_samples)
            ]
            return torch.from_numpy(np.array(data))

        self.input = generate_input()
        self.target = torch.randint(0, max_val, [input_samples])
        self.start = 0
        self.end = max_val

    def __len__(self):
        return len(self.input)

    def __getitem__(self, index):
        return self.input[index], self.target[index]
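
The sparsity logic above can be sketched without torch: keep `(100 - sparsity_percentage)` percent of the possible embedding indices and sample every input element from that reduced pool. `generate_sparse_indices` below is an illustrative stdlib-only reimplementation, not the benchmark's code.

```python
import random


def generate_sparse_indices(max_val, input_samples, input_dim, sparsity_percentage):
    # keep only (100 - sparsity_percentage)% of the possible indices
    fraction_kept = (100 - sparsity_percentage) / 100.0
    index_count = int(max_val * fraction_kept)
    pool = random.sample(range(max_val), index_count)
    # every element of every sample is drawn from the reduced pool,
    # so a higher sparsity_percentage means fewer distinct indices
    return [
        [random.choice(pool) for _ in range(input_dim)]
        for _ in range(input_samples)
    ]


data = generate_sparse_indices(max_val=100, input_samples=4,
                               input_dim=8, sparsity_percentage=20)
```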
