Local execution e2e training #1472

Open · wants to merge 23 commits into base: master
Changes from 9 commits
2 changes: 2 additions & 0 deletions lib/kernels/CMakeLists.txt
@@ -8,6 +8,8 @@ file(GLOB_RECURSE SRC
LIST_DIRECTORIES False
src/*.cc
src/cuda/cuda_helper.cu
src/cuda/loss_function_kernels.cu
src/cuda/optimizer_kernels.cu
src/cuda/ops/*.cu
)

13 changes: 10 additions & 3 deletions lib/kernels/include/kernels/array_shape.h
@@ -42,9 +42,16 @@ struct ArrayShape {
std::optional<std::size_t> at_maybe(legion_dim_t) const;
std::optional<std::size_t> at_maybe(ff_dim_t) const;

ArrayShape
sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const;
ArrayShape sub_shape(legion_dim_t start, ff_dim_t end) const;

ArrayShape sub_shape(std::optional<ff_dim_t> start,
std::optional<ff_dim_t> end) const;

ArrayShape sub_shape(std::optional<legion_dim_t> start,
std::optional<legion_dim_t> end) const;

bool operator==(ArrayShape const &) const;
bool operator!=(ArrayShape const &) const;

public:
LegionTensorDims dims;
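
For reference, a minimal usage sketch of the new sub_shape overloads; the shape value and variable names below are hypothetical and not taken from this PR. Empty optionals mean "from the first dim" / "through the last dim".

// Hypothetical example; sub_shape keeps the dimensions in [start, end).
ArrayShape shape = ArrayShape{std::vector<std::size_t>{2, 3, 4}};
ArrayShape tail = shape.sub_shape(ff_dim_t{1}, std::optional<ff_dim_t>{});  // drops ff dim 0
ArrayShape whole = shape.sub_shape(std::optional<legion_dim_t>{},
                                   std::optional<legion_dim_t>{});          // full copy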
9 changes: 7 additions & 2 deletions lib/kernels/include/kernels/optimizer_kernels.h
@@ -1,7 +1,8 @@
#ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H
#define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_OPTIMIZER_KERNELS_H

#include "device.h"
#include "kernels/device.h"
#include "kernels/ff_handle.h"

namespace FlexFlow {

@@ -20,7 +21,8 @@ void sgd_nccl_update_task_gpu(ffStream_t,
float lr,
float momentum,
bool nesterov,
float weight_decay PerDeviceFFHandle const &,
float weight_decay,
PerDeviceFFHandle const &,
float const *weight_grad_ptr,
size_t size,
float *weight_ptr,
@@ -32,6 +34,8 @@ void adam_ps_update_task_gpu(ffStream_t,
float beta2,
float weight_decay,
float epsilon,
size_t size,
int num_replicas,
float const *weight_grad_ptr,
float *adam_m_ptr,
float *adam_v_ptr,
@@ -43,6 +47,7 @@ void adam_nccl_update_task_gpu(ffStream_t,
float beta2,
float weight_decay,
float epsilon,
size_t size,
PerDeviceFFHandle const &,
float const *weight_grad_ptr,
float *adam_m_ptr,
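
For reference, a hedged sketch of what a call site for the reworked free-function NCCL SGD task might look like; the hyperparameter values, handle, and pointer names are placeholders, not taken from this PR.

// Hypothetical call site; all values and device pointers are placeholders.
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
sgd_nccl_update_task_gpu(stream,
                         /*lr=*/0.01f,
                         /*momentum=*/0.9f,
                         /*nesterov=*/false,
                         /*weight_decay=*/1e-4f,
                         handle,           // PerDeviceFFHandle with an initialized ncclComm
                         weight_grad_ptr,  // device gradient buffer
                         num_elements,     // size
                         weight_ptr,       // device weight buffer
                         sgd_v_ptr);       // device momentum buffer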
38 changes: 35 additions & 3 deletions lib/kernels/src/array_shape.cc
@@ -50,12 +50,36 @@
return dims.at(legion_dim_from_ff_dim(idx, this->num_dims()));
}

ArrayShape ArrayShape::sub_shape(
std::optional<std::variant<ff_dim_t, legion_dim_t>> start,
std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const {
ArrayShape ArrayShape::sub_shape(legion_dim_t start, ff_dim_t end) const {
  NOT_IMPLEMENTED();
}

ArrayShape ArrayShape::sub_shape(std::optional<ff_dim_t> start,
                                 std::optional<ff_dim_t> end) const {
  std::vector<size_t> new_shape;
  ff_dim_t start_idx = start.value_or(ff_dim_t{0});
  ff_dim_t end_idx = end.value_or(ff_dim_t{this->num_dims()});

  while (start_idx < end_idx) {
    new_shape.push_back(this->at(start_idx));
    start_idx = ff_dim_t{start_idx.value + 1};
  }
  return ArrayShape{new_shape};
}

ArrayShape ArrayShape::sub_shape(std::optional<legion_dim_t> start,
                                 std::optional<legion_dim_t> end) const {
  std::vector<size_t> new_shape;
  legion_dim_t start_idx = start.value_or(legion_dim_t{0});
  legion_dim_t end_idx = end.value_or(legion_dim_t{this->num_dims()});

  while (start_idx < end_idx) {
    new_shape.push_back(this->at(start_idx));
    start_idx = add_to_legion_dim(start_idx, 1);
  }
  return ArrayShape{new_shape};
}

std::optional<std::size_t> ArrayShape::at_maybe(legion_dim_t index) const {
if (index.value < dims.size()) {
return dims.at(index);
@@ -77,6 +101,14 @@
dtype};
}

bool ArrayShape::operator==(ArrayShape const &other) const {
  return this->dims == other.dims;
}

bool ArrayShape::operator!=(ArrayShape const &other) const {
  return this->dims != other.dims;
}

std::string format_as(ArrayShape const &x) {
std::ostringstream oss;
oss << "<ArrayShape";
6 changes: 6 additions & 0 deletions lib/kernels/src/cuda/cuda_helper.cu
@@ -35,6 +35,12 @@ __global__ void scale_kernel(float *ptr, coord_t size, float a, float b) {
}
}

__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b) {
CUDA_KERNEL_LOOP(i, size) {
ptr[i] = (b - a) * ptr[i] + a;
}
}

__global__ void ones_kernel(float *ptr, coord_t size) {
CUDA_KERNEL_LOOP(i, size) {
ptr[i] = 1.0f;
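
For context, scale_kernel remaps each element, assumed to lie in [0, 1], into the range [a, b]. A hedged launch sketch of the new unsigned long overload; the buffer, size, and stream names are placeholders.

// Hypothetical launch; d_ptr holds n floats in [0, 1].
scale_kernel<<<GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(
    d_ptr, (unsigned long)n, /*a=*/-1.0f, /*b=*/1.0f);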
lib/kernels/src/cuda/optimizer_kernels.cu
@@ -13,6 +13,8 @@
* limitations under the License.
*/

#include "device.h"
#include "kernels/nccl.h"
#include "kernels/optimizer_kernels.h"

namespace FlexFlow {
@@ -40,66 +42,70 @@ __global__ void sgd_update(size_t count,
}
}

__host__ void SGDOptimizer::ps_update_task_gpu(SGDOptimizer const *op,
float const *w_grad_ptr,
size_t size,
int num_replicas,
float *w_ptr,
float *v_ptr) {
cudaStream_t stream;
void sgd_ps_update_task_gpu(cudaStream_t stream,
float lr,
float momentum,
bool nesterov,
float weight_decay,
float const *weight_grad_ptr,
size_t size,
int num_replicas,
float *weight_ptr,
float *sgd_v_ptr) {
checkCUDA(get_legion_stream(&stream));
// Step 1: Gather gradients in the first replica
for (int i = 1; i < num_replicas; i++) {
float const *src = w_grad_ptr + i * size;
float const *src = weight_grad_ptr + i * size;
apply_add_with_scale<float>
<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
(float *)w_grad_ptr, src, size, 1.0f);
(float *)weight_grad_ptr, src, size, 1.0f);
}
// checkCUDA(cudaDeviceSynchronize());
// Step 2: SGD update
sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size,
op->lr,
op->weight_decay,
op->momentum,
op->nesterov,
w_grad_ptr,
v_ptr,
w_ptr);
sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
lr,
weight_decay,
momentum,
nesterov,
weight_grad_ptr,
sgd_v_ptr,
weight_ptr);
// checkCUDA(cudaDeviceSynchronize());
}

#ifdef FF_USE_NCCL
__host__ void SGDOptimizer::nccl_update_task_gpu(SGDOptimizer const *op,
PerDeviceOpState const *meta,
float const *w_grad_ptr,
size_t size,
float *w_ptr,
float *v_ptr) {
void sgd_nccl_update_task_gpu(cudaStream_t stream,
float lr,
float momentum,
bool nesterov,
float weight_decay,
PerDeviceFFHandle const &handle,
float const *weight_grad_ptr,
size_t size,
float *weight_ptr,
float *sgd_v_ptr) {
// Use NCCL to sync gradients
// fprintf(stderr, "weight(%p) Before ncclAllReduce...\n", w_grad_ptr);
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
checkNCCL(ncclAllReduce(w_grad_ptr,
(float *)w_grad_ptr,
checkNCCL(ncclAllReduce(weight_grad_ptr,
(float *)weight_grad_ptr,
size,
ncclFloat,
ncclSum,
meta->handle.ncclComm,
ncclDataType_t::ncclFloat,
ncclRedOp_t::ncclSum,
handle.ncclComm,
stream));
// fprintf(stderr, "weight(%p) After ncclAllReduce...\n", w_grad_ptr);
// print_tensor<float>((float*)w_grad_ptr, 16, "[After ncclAllReduce]");

// Step 2: SGD update
sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size,
op->lr,
op->weight_decay,
op->momentum,
op->nesterov,
w_grad_ptr,
v_ptr,
w_ptr);
sgd_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(size,
lr,
weight_decay,
momentum,
nesterov,
weight_grad_ptr,
sgd_v_ptr,
weight_ptr);
// checkCUDA(cudaDeviceSynchronize());
}
#endif
@@ -144,71 +150,79 @@ __global__ void adam_update(int count,
}
}

__host__ void AdamOptimizer::ps_update_task_gpu(AdamOptimizer const *op,
float const *w_grad_ptr,
size_t size,
int num_replicas,
float *w_ptr,
float *v_ptr,
float *m_ptr) {
cudaStream_t stream;
void adam_ps_update_task_gpu(cudaStream_t stream,
float alpha_t,
float beta1,
float beta2,
float weight_decay,
float epsilon,
size_t size,
int num_replicas,
float const *weight_grad_ptr,
float *adam_m_ptr,
float *adam_v_ptr,
float *weight_ptr) {
checkCUDA(get_legion_stream(&stream));
// Step 1: Gather gradients in the first replica
for (int i = 1; i < num_replicas; i++) {
float const *src = w_grad_ptr + i * size;
float const *src = weight_grad_ptr + i * size;
add_kernel<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size, 1.0f, src, (float *)w_grad_ptr);
size, 1.0f, src, (float *)weight_grad_ptr);
}
// checkCUDA(cudaDeviceSynchronize());
// fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
// op->alpha, op->alpha_t, op->weight_decay);
// Step 2: Adam update
adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size,
op->alpha_t,
op->beta1,
op->beta2,
op->weight_decay,
op->epsilon,
w_grad_ptr,
m_ptr,
v_ptr,
w_ptr);
alpha_t,
beta1,
beta2,
weight_decay,
epsilon,
weight_grad_ptr,
adam_m_ptr,
adam_v_ptr,
weight_ptr);
// checkCUDA(cudaDeviceSynchronize());
}

#ifdef FF_USE_NCCL
__host__ void AdamOptimizer::nccl_update_task_gpu(AdamOptimizer const *op,
PerDeviceOpState const *meta,
float const *w_grad_ptr,
size_t size,
float *w_ptr,
float *v_ptr,
float *m_ptr) {
void adam_nccl_update_task_gpu(cudaStream_t stream,
float alpha_t,
float beta1,
float beta2,
float weight_decay,
float epsilon,
size_t size,
PerDeviceFFHandle const &handle,
float const *weight_grad_ptr,
float *adam_m_ptr,
float *adam_v_ptr,
float *weight_ptr) {
// Use NCCL to sync gradients
cudaStream_t stream;
checkCUDA(get_legion_stream(&stream));
checkNCCL(ncclAllReduce(w_grad_ptr,
(float *)w_grad_ptr,
checkNCCL(ncclAllReduce(weight_grad_ptr,
(float *)weight_grad_ptr,
size,
ncclFloat,
ncclSum,
meta->handle.ncclComm,
ncclDataType_t::ncclFloat,
ncclRedOp_t::ncclSum,
handle.ncclComm,
stream));
// fprintf(stderr, "alpha = %.8lf alpha_t = %.8lf decay = %.8lf\n",
// op->alpha, op->alpha_t, op->weight_decay);
// Step 2: Adam update
adam_update<<<GET_BLOCKS(size), CUDA_NUM_THREADS, 0, stream>>>(
size,
op->alpha_t,
op->beta1,
op->beta2,
op->weight_decay,
op->epsilon,
w_grad_ptr,
m_ptr,
v_ptr,
w_ptr);
alpha_t,
beta1,
beta2,
weight_decay,
epsilon,
weight_grad_ptr,
adam_m_ptr,
adam_v_ptr,
weight_ptr);
// checkCUDA(cudaDeviceSynchronize());
}
#endif
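
Since alpha_t is now passed in rather than read from an AdamOptimizer member, the caller is expected to supply the bias-corrected step size. A hedged sketch of that computation, using the standard Adam bias correction; alpha, beta1, beta2, and the step count t are assumed caller-side state.

#include <math.h>

// Standard Adam bias correction, computed on the host before each update step;
// alpha is the base learning rate, t the 1-indexed step count.
static float adam_alpha_t(float alpha, float beta1, float beta2, int t) {
  return alpha * sqrtf(1.0f - powf(beta2, (float)t)) /
         (1.0f - powf(beta1, (float)t));
}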
1 change: 1 addition & 0 deletions lib/kernels/src/device.h
@@ -71,6 +71,7 @@ inline int GET_BLOCKS(int const N) {
}

__global__ void scale_kernel(float *ptr, size_t size, float a, float b);
__global__ void scale_kernel(float *ptr, unsigned long size, float a, float b);

__global__ void ones_kernel(float *ptr, size_t size);
