Local execution e2e training #1472

Open · wants to merge 23 commits into base: master
Changes from 9 commits
2 changes: 2 additions & 0 deletions lib/kernels/CMakeLists.txt
@@ -8,6 +8,8 @@ file(GLOB_RECURSE SRC
LIST_DIRECTORIES False
src/*.cc
src/cuda/cuda_helper.cu
src/cuda/loss_function_kernels.cu
src/cuda/optimizer_kernels.cu
src/cuda/ops/*.cu
)

13 changes: 10 additions & 3 deletions lib/kernels/include/kernels/array_shape.h
@@ -42,9 +42,16 @@ struct ArrayShape {
std::optional<std::size_t> at_maybe(legion_dim_t) const;
std::optional<std::size_t> at_maybe(ff_dim_t) const;

ArrayShape
sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const;
ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const;

ArrayShape sub_shape(std::optional<ff_dim_t> start,
std::optional<ff_dim_t> end) const;

ArrayShape sub_shape(std::optional<legion_dim_t> start,
std::optional<legion_dim_t> end) const;

bool operator==(ArrayShape const &) const;
bool operator!=(ArrayShape const &) const;

public:
LegionTensorDims dims;
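
For reference, a minimal usage sketch of the new sub_shape overloads; the shape value and variable names below are hypothetical and not taken from this PR. Empty optionals mean "from the first dim" / "through the last dim".

// Hypothetical example; sub_shape keeps the dimensions in [start, end).
ArrayShape shape = ArrayShape{std::vector<std::size_t>{2, 3, 4}};
ArrayShape tail = shape.sub_shape(ff_dim_t{1}, std::optional<ff_dim_t>{});  // drops ff dim 0
ArrayShape whole = shape.sub_shape(std::optional<legion_dim_t>{},
                                   std::optional<legion_dim_t>{});          // full copy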
9 changes: 7 additions & 2 deletions lib/kernels/include/kernels/optimizer_kernels.h
@@ -1,7 +1,8 @@
#ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H
#define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H

#include "device.h"
#include "kernels/device.h"
#include "kernels/ff_handle.h"

namespace FlexFlow {

@@ -20,7 +21,8 @@ void sgd_nccl_update_task_gpu(ffStream_t,
float lr,
float momentum,
bool nesterov,
float weight_decay PerDeviceFFHandle const &,
float weight_decay,
PerDeviceFFHandle const &,
float const *weight_grad_ptr,
size_t size,
float *weight_ptr,
@@ -32,6 +34,8 @@ void adam_ps_update_task_gpu(ffStream_t,
float beta2,
float weight_decay,
float epsilon,
size_t size,
int num_replicas,
float const *weight_grad_ptr,
float *adam_m_ptr,
float *adam_v_ptr,
@@ -43,6 +47,7 @@ void adam_nccl_update_task_gpu(ffStream_t,
float beta2,
float weight_decay,
float epsilon,
size_t size,
PerDeviceFFHandle const &,
float const *weight_grad_ptr,
float *adam_m_ptr,
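
For reference, a hedged sketch of what a call site for the reworked free-function NCCL SGD task might look like; the hyperparameter values, handle, and pointer names are placeholders, not taken from this PR.

// Hypothetical call site; all values and device pointers are placeholders.
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
sgd_nccl_update_task_gpu(stream,
                         /*lr=*/0.01f,
                         /*momentum=*/0.9f,
                         /*nesterov=*/false,
                         /*weight_decay=*/1e-4f,
                         handle,           // PerDeviceFFHandle with an initialized ncclComm
                         weight_grad_ptr,  // device gradient buffer
                         num_elements,     // size
                         weight_ptr,       // device weight buffer
                         sgd_v_ptr);       // device momentum buffer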
38 changes: 35 additions & 3 deletions lib/kernels/src/array_shape.cc
@@ -50,12 +50,36 @@
return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
}

ArrayShape ArrayShape::sub_shape(
std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
  NOT_IMPLEMENTED();
}

ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                 std::optional<ff_dim_t> end) const {
  std::vector<size_t> new_shape;
  ff_dim_t start_idx = start.value_or(ff_dim_t{0});
  ff_dim_t end_idx = end.value_or(ff_dim_t{this->num_dims()});

  while (start_idx < end_idx) {
    new_shape.push_back(this->at(start_idx));
    start_idx = ff_dim_t{start_idx.value + 1};
  }
  return ArrayShape{new_shape};
}

ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
                                 std::optional<legion_dim_t> end) const {
  std::vector<size_t> new_shape;
  legion_dim_t start_idx = start.value_or(legion_dim_t{0});
  legion_dim_t end_idx = end.value_or(legion_dim_t{this->num_dims()});

  while (start_idx < end_idx) {
    new_shape.push_back(this->at(start_idx));
    start_idx = add_to_legion_dim(start_idx, 1);
  }
  return ArrayShape{new_shape};
}

std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
if (index.value < dims.size()) {
return dims.at(index);
@@ -77,6 +101,14 @@
dtype};
}

bool ArrayShape::operator==(ArrayShape const &other) const {
  return this->dims == other.dims;
}

bool ArrayShape::operator!=(ArrayShape const &other) const {
  return this->dims != other.dims;
}

std::string format_as(ArrayShape const &x) {
std::ostringstream oss;
oss << "<ArrayShape";
6 changes: 6 additions & 0 deletions lib/kernels/src/cuda/cuda_helper.cu
@@ -35,6 +35,12 @@ __global__ void scale_kernel(float *ptr, coord_t size, float a, float b) {
}
}

__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b) {
CUDA_KERNEL_LOOP(i, size) {
ptr[i] = (b - a) * ptr[i] + a;
}
}

__global__ void ones_kernel(float *ptr, coord_t size) {
CUDA_KERNEL_LOOP(i, size) {
ptr[i] = 1.0f;
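
For context, scale_kernel remaps each element, assumed to lie in [0, 1], into the range [a, b]. A hedged launch sketch of the new unsigned long overload; the buffer, size, and stream names are placeholders.

// Hypothetical launch; d_ptr holds n floats in [0, 1].
scale_kernel<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(
    d_ptr, (unsigned long)n, /*a=*/-1.0f, /*b=*/1.0f);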
lib/kernels/src/cuda/optimizer_kernels.cu
@@ -13,6 +13,8 @@
* limitations under the License.
*/

#include "device.h"
#include "kernels/nccl.h"
#include "kernels/optimizer_kernels.h"

namespace FlexFlow {
@@ -40,66 +42,70 @@ __global__ void sgd_update(size_t count,
}
}

__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op,
float const *w_grad_ptr,
size_t size,
int num_replicas,
float *w_ptr,
float *v_ptr) {
cudaStream_t stream;
void sgd_ps_update_task_gpu(cudaStream_t stream,
float lr,
float momentum,
bool nesterov,
float weight_decay,
float const *weight_grad_ptr,
size_t size,
int num_replicas,
float *weight_ptr,
float *sgd_v_ptr) {
checkCUDA(get_legion_stream(&stream));
// Step 1: Gather gradients in the first replica
for (int i = 1; i < num_replicas; i++) {
float const *src = w_grad_ptr + i * size;
float const *src = weight_grad_ptr + i * size;
apply_add_with_scale<float>
<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
(float *)w_grad_ptr, src, size, 1.0f);
(float *)weight_grad_ptr, src, size, 1.0f);
}
// checkCUDA(cudaDeviceSynchronize());
// Step 2: SGD update
sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size,
op->lr,
op->weight_decay,
op->momentum,
op->nesterov,
w_grad_ptr,
v_ptr,
w_ptr);
sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
lr,
weight_decay,
momentum,
nesterov,
weight_grad_ptr,
sgd_v_ptr,
weight_ptr);
// checkCUDA(cudaDeviceSynchronize());
}

#ifdef FF_USE_NCCL
__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
PerDeviceOpState const *meta,
float const *w_grad_ptr,
size_t size,
float *w_ptr,
float *v_ptr) {
void sgd_nccl_update_task_gpu(cudaStream_t stream,
float lr,
float momentum,
bool nesterov,
float weight_decay,
PerDeviceFFHandle const &handle,
float const *weight_grad_ptr,
size_t size,
float *weight_ptr,
float *sgd_v_ptr) {
// Use NCCL to sync gradients
// fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr);
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
checkNCCL(ncclAllReduce(w_grad_ptr,
(float *)w_grad_ptr,
checkNCCL(ncclAllReduce(weight_grad_ptr,
(float *)weight_grad_ptr,
size,
ncclFloat,
ncclSum,
meta->handle.ncclComm,
ncclDataType_t::ncclFloat,
ncclRedOp_t::ncclSum,
handle.ncclComm,
stream));
// fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
// print_tensor<float>((float*)w_grad_ptr, 16, "[After ncclAllReduce]");

// Step 2: SGD update
sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size,
op->lr,
op->weight_decay,
op->momentum,
op->nesterov,
w_grad_ptr,
v_ptr,
w_ptr);
sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
lr,
weight_decay,
momentum,
nesterov,
weight_grad_ptr,
sgd_v_ptr,
weight_ptr);
// checkCUDA(cudaDeviceSynchronize());
}
#endif
@@ -144,71 +150,79 @@ __global__ void adam_update(int count,
}
}

__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
float const *w_grad_ptr,
size_t size,
int num_replicas,
float *w_ptr,
float *v_ptr,
float *m_ptr) {
cudaStream_t stream;
void adam_ps_update_task_gpu(cudaStream_t stream,
float alpha_t,
float beta1,
float beta2,
float weight_decay,
float epsilon,
size_t size,
int num_replicas,
float const *weight_grad_ptr,
float *adam_m_ptr,
float *adam_v_ptr,
float *weight_ptr) {
checkCUDA(get_legion_stream(&stream));
// Step 1: Gather gradients in the first replica
for (int i = 1; i < num_replicas; i++) {
float const *src = w_grad_ptr + i * size;
float const *src = weight_grad_ptr + i * size;
add_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size, 1.0f, src, (float *)w_grad_ptr);
size, 1.0f, src, (float *)weight_grad_ptr);
}
// checkCUDA(cudaDeviceSynchronize());
// fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
// op->alpha, op->alpha_t, op->weight_decay);
// Step 2: Adam update
adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size,
op->alpha_t,
op->beta1,
op->beta2,
op->weight_decay,
op->epsilon,
w_grad_ptr,
m_ptr,
v_ptr,
w_ptr);
alpha_t,
beta1,
beta2,
weight_decay,
epsilon,
weight_grad_ptr,
adam_m_ptr,
adam_v_ptr,
weight_ptr);
// checkCUDA(cudaDeviceSynchronize());
}

#ifdef FF_USE_NCCL
__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
PerDeviceOpState const *meta,
float const *w_grad_ptr,
size_t size,
float *w_ptr,
float *v_ptr,
float *m_ptr) {
void adam_nccl_update_task_gpu(cudaStream_t stream,
float alpha_t,
float beta1,
float beta2,
float weight_decay,
float epsilon,
size_t size,
PerDeviceFFHandle const &handle,
float const *weight_grad_ptr,
float *adam_m_ptr,
float *adam_v_ptr,
float *weight_ptr) {
// Use NCCL to sync gradients
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
checkNCCL(ncclAllReduce(w_grad_ptr,
(float *)w_grad_ptr,
checkNCCL(ncclAllReduce(weight_grad_ptr,
(float *)weight_grad_ptr,
size,
ncclFloat,
ncclSum,
meta->handle.ncclComm,
ncclDataType_t::ncclFloat,
ncclRedOp_t::ncclSum,
handle.ncclComm,
stream));
// fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
// op->alpha, op->alpha_t, op->weight_decay);
// Step 2: Adam update
adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size,
op->alpha_t,
op->beta1,
op->beta2,
op->weight_decay,
op->epsilon,
w_grad_ptr,
m_ptr,
v_ptr,
w_ptr);
alpha_t,
beta1,
beta2,
weight_decay,
epsilon,
weight_grad_ptr,
adam_m_ptr,
adam_v_ptr,
weight_ptr);
// checkCUDA(cudaDeviceSynchronize());
}
#endif
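
Since alpha_t is now passed in rather than read from an AdamOptimizer member, the caller is expected to supply the bias-corrected step size. A hedged sketch of that computation, using the standard Adam bias correction; alpha, beta1, beta2, and the step count t are assumed caller-side state.

#include <math.h>

// Standard Adam bias correction, computed on the host before each update step;
// alpha is the base learning rate, t the 1-indexed step count.
static float adam_alpha_t(float alpha, float beta1, float beta2, int t) {
  return alpha * sqrtf(1.0f - powf(beta2, (float)t)) /
         (1.0f - powf(beta1, (float)t));
}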
1 change: 1 addition & 0 deletions lib/kernels/src/device.h
@@ -71,6 +71,7 @@ inline int GET_BLOCKS(int const N) {
}

__global__ void scale_kernel(float *ptr, size_t size, float a, float b);
__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b);

__global__ void ones_kernel(float *ptr, size_t size);
