[CPU] FusedAdam and CPU training support (microsoft#3991)
* fused adam can build

* use cpu adam to implement fused adam

* enable ZeRO stage 1 and 2 for synchronized accelerator (a.k.a. CPU)

* remove unused parameters

* fix format error

* Remove adam class

* fix format

* support ZeRO stage 3

* reuse simd.h

* fix format

* make memory_stats return a meaningful dict

* fix format

* add cpu_adam

* reuse cpu_adam

* header cleanup

* fix cpu_adam

* fix format, add missing file

---------

Co-authored-by: Olatunji Ruwase <[email protected]>
delock and tjruwase authored Jul 25, 2023
1 parent 1cc9caa commit 0f54063
Showing 16 changed files with 510 additions and 328 deletions.
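In effect, this commit lets a standard DeepSpeed training loop run entirely on CPU, with ZeRO stages 1-3 available. A minimal sketch of such a run, assuming a CPU build of DeepSpeed; the model, batch size, and config values are placeholders:

```python
import torch
import deepspeed

# Placeholder model; any torch.nn.Module works.
model = torch.nn.Linear(1024, 1024)

# Standard DeepSpeed config; ZeRO stages 1-3 on CPU are what this commit enables.
ds_config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "zero_optimization": {"stage": 1},
}

engine, optimizer, _, _ = deepspeed.initialize(model=model,
                                               model_parameters=model.parameters(),
                                               config=ds_config)

x = torch.randn(8, 1024)
loss = engine(x).pow(2).mean()  # dummy loss for illustration
engine.backward(loss)
engine.step()
```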
18 changes: 13 additions & 5 deletions accelerator/cpu_accelerator.py
```diff
@@ -88,8 +88,8 @@ def Stream(self):
         return None
 
     def stream(self, stream):
-        from deepspeed.runtime.utils import noop_decorator
-        return noop_decorator
+        from deepspeed.runtime.utils import noop_context
+        return noop_context()
 
     def current_stream(self, device_index=None):
         return None
```
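`accelerator.stream(...)` is used as a `with`-statement context, so returning the bare `noop_decorator` function was incorrect; the fix returns an instantiated no-op context instead. `noop_context` itself is not part of this diff; a minimal sketch of what such a helper plausibly looks like:

```python
from contextlib import contextmanager

@contextmanager
def noop_context():
    # No-op stand-in for a stream context on accelerators without streams.
    yield
```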
```diff
@@ -139,7 +139,11 @@ def reset_max_memory_cached(self, device_index=None):
         return
 
     def memory_stats(self, device_index=None):
-        return self.get_rss()
+        mem = self.get_rss()
+        mem_stat = {}
+        mem_stat['allocated_bytes.all.current'] = mem
+        mem_stat['allocated_bytes.all.peak'] = self.max_mem
+        return mem_stat
 
     def reset_peak_memory_stats(self, device_index=None):
         self.reset_rss()
```
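The returned dict mirrors the key names used by `torch.cuda.memory_stats`, so accelerator-agnostic code can query the same keys on CPU, where the values are derived from process RSS. A usage sketch:

```python
from deepspeed.accelerator import get_accelerator

stats = get_accelerator().memory_stats()
print(stats["allocated_bytes.all.current"])  # current RSS, in bytes
print(stats["allocated_bytes.all.peak"])     # peak RSS since the last reset
```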
```diff
@@ -250,12 +254,16 @@ def get_op_builder(self, class_name):
             # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
             # if successful this also means we're doing a local install and not JIT compile path
             from op_builder import __deepspeed__  # noqa: F401
-            from op_builder.cpu import CCLCommBuilder, NotImplementedBuilder
+            from op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder
         except ImportError:
-            from deepspeed.ops.op_builder.cpu import CCLCommBuilder, NotImplementedBuilder
+            from deepspeed.ops.op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder
 
         if class_name == "CCLCommBuilder":
             return CCLCommBuilder
+        elif class_name == "FusedAdamBuilder":
+            return FusedAdamBuilder
+        elif class_name == "CPUAdamBuilder":
+            return CPUAdamBuilder
         else:
             # return a NotImplementedBuilder to avoid getting NoneType[Name] in unit tests
             return NotImplementedBuilder
```
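With the two builders registered, callers can load the CPU kernels by name through the accelerator abstraction. A sketch, assuming the usual op-builder `load()` flow (which JIT-compiles the extension on first use):

```python
from deepspeed.accelerator import get_accelerator

builder_cls = get_accelerator().get_op_builder("FusedAdamBuilder")
fused_adam_module = builder_cls().load()  # compiles the CPU kernel if needed
```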
293 changes: 0 additions & 293 deletions csrc/adam/cpu_adam.cpp
@@ -4,299 +4,6 @@

The entire implementation below was removed:

```cpp
// DeepSpeed Team

#include "cpu_adam.h"
#include <torch/extension.h>
#include <cassert>
#include <iostream>
#include <memory>
#include <type_traits>
#include <unordered_map>

#if defined(__ENABLE_CUDA__)
#include <cuda_runtime_api.h>
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#include "custom_cuda_layers.h"
#endif

static std::unordered_map<int, std::shared_ptr<void>> s_optimizers;

// C++ interface

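// The scalar path in Step_1 below implements Adam/AdamW per element k:
//   m_t = betta1 * m_{t-1} + (1 - betta1) * g_t
//   v_t = betta2 * v_{t-1} + (1 - betta2) * g_t^2
//   p_t = p_{t-1} - (alpha / bc1) * m_t / (sqrt(v_t) * bc2 + eps)
// where bc1/bc2 are the precomputed bias-correction terms. In Adam mode,
// L2 weight decay is folded into the gradient; in AdamW mode, decoupled
// decay is applied directly to the parameter.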
void Adam_Optimizer::Step_1(float* _params,
                            float* grads,
                            float* _exp_avg,
                            float* _exp_avg_sq,
                            size_t _param_size,
                            ds_half_precision_t* dev_params,
                            bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<1>(&rounded_size,
                _params,
                grads,
                _exp_avg,
                _exp_avg_sq,
                _param_size,
                dev_params,
                half_precision);
#endif
    if (_param_size > rounded_size) {
        float betta1_minus1 = 1 - _betta1;
        float betta2_minus1 = 1 - _betta2;

        float step_size = -1 * _alpha / _bias_correction1;
        float w_decay = -1 * _alpha * _weight_decay;
        ds_half_precision_t* grads_cast_h;
        ds_half_precision_t* params_cast_h;
        if (half_precision) {
            grads_cast_h = reinterpret_cast<ds_half_precision_t*>(grads);
            params_cast_h = reinterpret_cast<ds_half_precision_t*>(_params);
        }

        for (size_t t = rounded_size; t < _param_size; t += TILE) {
            size_t copy_size = TILE;
            if ((t + TILE) > _param_size) copy_size = _param_size - t;
            size_t offset = copy_size + t;
#if defined(__ENABLE_CUDA__)
            if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#endif
#pragma omp parallel for
            for (size_t k = t; k < offset; k++) {
                float grad = half_precision ? (float)grads_cast_h[k] : grads[k];
                float param = half_precision ? (float)params_cast_h[k] : _params[k];
                float momentum = _exp_avg[k];
                float variance = _exp_avg_sq[k];
                if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
                momentum = momentum * _betta1;
                momentum = grad * betta1_minus1 + momentum;

                variance = variance * _betta2;
                grad = grad * grad;
                variance = grad * betta2_minus1 + variance;

                grad = sqrt(variance);
                grad = grad * _bias_correction2 + _eps;
                grad = momentum / grad;
                if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
                param = grad * step_size + param;
#if defined(__ENABLE_CUDA__)
                if (dev_params) _doubled_buffer[_buf_index][k - t] = param;
#endif
                if (half_precision)
                    params_cast_h[k] = (ds_half_precision_t)param;
                else
                    _params[k] = param;
                _exp_avg[k] = momentum;
                _exp_avg_sq[k] = variance;
            }
#if defined(__ENABLE_CUDA__)
            if (dev_params) {
                launch_param_update(
                    _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);

                _buf_index = !_buf_index;
            }
#endif
        }
    }
}

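// Step_8 -> Step_4 -> Step_1 form a fallback chain: each step tries the
// wider AVX unroll first, then hands the unprocessed tail to the
// next-narrower step, ending in the scalar loop above.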
void Adam_Optimizer::Step_4(float* _params,
                            float* grads,
                            float* _exp_avg,
                            float* _exp_avg_sq,
                            size_t _param_size,
                            ds_half_precision_t* dev_params,
                            bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<4>(&rounded_size,
                _params,
                grads,
                _exp_avg,
                _exp_avg_sq,
                _param_size,
                dev_params,
                half_precision);
#endif
    if (_param_size > rounded_size)
        Step_1((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

int create_adam_optimizer(int optimizer_id,
                          float alpha = 1e-3,
                          float betta1 = 0.9,
                          float betta2 = 0.999,
                          float eps = 1e-8,
                          float weight_decay = 0,
                          bool adamw_mode = true,
                          bool should_log = false)
{
    auto opt =
        std::make_shared<Adam_Optimizer>(alpha, betta1, betta2, eps, weight_decay, adamw_mode);

    s_optimizers[optimizer_id] = opt;

    if (should_log) {
        std::string avx_type = "";
#if defined(__AVX512__)
        avx_type = "AVX512";
#else
#if defined(__AVX256__)
        avx_type = "AVX2";
#else
        avx_type = "scalar";
#endif
#endif

        printf("Adam Optimizer #%d is created with %s arithmetic capability.\n",
               optimizer_id,
               avx_type.c_str());
        printf("Config: alpha=%f, betas=(%f, %f), weight_decay=%f, adam_w=%d\n",
               alpha,
               betta1,
               betta2,
               weight_decay,
               (int)adamw_mode);
    }

    return 0;
}

void Adam_Optimizer::Step_8(float* _params,
                            float* grads,
                            float* _exp_avg,
                            float* _exp_avg_sq,
                            size_t _param_size,
                            ds_half_precision_t* dev_params,
                            bool half_precision)
{
    size_t rounded_size = 0;
#if defined(__AVX512__) or defined(__AVX256__)
    Step_AVX<8>(&rounded_size,
                _params,
                grads,
                _exp_avg,
                _exp_avg_sq,
                _param_size,
                dev_params,
                half_precision);
#endif
    if (_param_size > rounded_size)
        Step_4((_params + rounded_size),
               (grads + rounded_size),
               (_exp_avg + rounded_size),
               (_exp_avg_sq + rounded_size),
               (_param_size - rounded_size),
               (dev_params != nullptr ? (dev_params + rounded_size) : dev_params),
               half_precision);
}

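// Pybind entry point: looks up the optimizer instance created by
// create_adam_optimizer, advances its step/bias-correction state, then
// updates params, exp_avg and exp_avg_sq in place.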
int ds_adam_step(int optimizer_id,
                 size_t step,
                 float lr,
                 float beta1,
                 float beta2,
                 float epsilon,
                 float weight_decay,
                 bool bias_correction,
                 torch::Tensor& params,
                 torch::Tensor& grads,
                 torch::Tensor& exp_avg,
                 torch::Tensor& exp_avg_sq)
{
    auto params_c = params.contiguous();
    auto grads_c = grads.contiguous();
    auto exp_avg_c = exp_avg.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();

    // assert(params.options().dtype() == grads.options().dtype());

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adam_Optimizer> opt =
        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step, beta1, beta2);
    opt->update_state(lr, epsilon, weight_decay, bias_correction);

    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_ptr,
                exp_avg_sq_ptr,
                params_c.numel(),
                nullptr,
                (params.options().dtype() == at::kHalf));

#if defined(__ENABLE_CUDA__)
    opt->SynchronizeStreams();
#endif
    return 0;
}

int ds_adam_step_plus_copy(int optimizer_id,
                           size_t step,
                           float lr,
                           float beta1,
                           float beta2,
                           float epsilon,
                           float weight_decay,
                           bool bias_correction,
                           torch::Tensor& params,
                           torch::Tensor& grads,
                           torch::Tensor& exp_avg,
                           torch::Tensor& exp_avg_sq,
                           torch::Tensor& gpu_params)
{
#if defined(__ENABLE_CUDA__)
    auto params_c = params.contiguous();
    auto gpu_params_c = gpu_params.contiguous();
    auto exp_avg_c = exp_avg.contiguous();
    auto exp_avg_sq_c = exp_avg_sq.contiguous();
    auto grads_c = grads.contiguous();

    float* params_ptr = (float*)params_c.data_ptr();
    float* grads_ptr = (float*)grads_c.data_ptr();
    ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr();
    float* exp_avg_ptr = (float*)exp_avg_c.data_ptr();
    float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr();

    std::shared_ptr<Adam_Optimizer> opt =
        std::static_pointer_cast<Adam_Optimizer>(s_optimizers[optimizer_id]);
    opt->IncrementStep(step, beta1, beta2);
    opt->update_state(lr, epsilon, weight_decay, bias_correction);
    opt->Step_8(params_ptr,
                grads_ptr,
                exp_avg_ptr,
                exp_avg_sq_ptr,
                params_c.numel(),
                gpu_params_ptr,
                (params.options().dtype() == at::kHalf));

    opt->SynchronizeStreams();
#else
    assert(false);
#endif
    return 0;
}

int destroy_adam_optimizer(int optimizer_id)
{
    s_optimizers.erase(optimizer_id);

    return 0;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
    // ... (module bindings truncated in this view)
```
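With the removal above, FusedAdam on CPU is backed by the shared CPU Adam kernel rather than a CUDA build (per the commit messages, "use cpu adam to implement fused adam"). A usage sketch, assuming the public `deepspeed.ops.adam.FusedAdam` API and a CPU-resident model:

```python
import torch
from deepspeed.ops.adam import FusedAdam

model = torch.nn.Linear(64, 64)  # placeholder CPU model
opt = FusedAdam(model.parameters(), lr=1e-3, weight_decay=0.01)

loss = model(torch.randn(8, 64)).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()
```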
(The remaining 14 changed files in this commit are not shown.)
