Skip to content

Commit

Permalink
timer
Browse files: browse the repository at this point in the history
  • Loading branch information
xinhaoc committed Feb 15, 2024
1 parent 9141c46 commit 8185289
Showing 1 changed file with 30 additions and 0 deletions.
30 changes: 30 additions & 0 deletions src/runtime/optimizer_kernel.cu
Original file line number | Diff line number | Diff line change
Expand Up @@ -230,6 +230,15 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(
checkCUDA(get_legion_stream(&stream));
// assert(op->reservedWorkSpaceSize < meta->handle.workSpaceSize);

cudaEvent_t t_start, t_start1, t_start2, t_end;
cudaEventCreate(&t_start);
cudaEventCreate(&t_start1);
cudaEventCreate(&t_start2);
cudaEventCreate(&t_end);
cudaEventRecord(t_start, stream);
cudaEventRecord(t_start1, stream);
cudaEventRecord(t_start2, stream);

void *workSpace_ptr = meta->handle.workSpace;

for (int i = 0; i < op->parameters_num; i++) {
Expand All @@ -242,6 +251,13 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(
static_cast<char *>(workSpace_ptr) + size[i] * sizeof(float);
}

cudaEventRecord(t_end, stream);
checkCUDA(cudaEventSynchronize(t_end));
float elapsed = 0;
checkCUDA(cudaEventElapsedTime(&elapsed, t_start1, t_end));
cudaEventDestroy(t_start1);
printf("[optimizer] data copy time = %.2lfms\n", elapsed);

// do allreduce once
checkNCCL(ncclAllReduce(meta->handle.workSpace,
(float *)meta->handle.workSpace,
Expand All @@ -250,6 +266,12 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(
ncclSum,
meta->handle.ncclComm,
stream));
cudaEventRecord(t_end, stream);
checkCUDA(cudaEventSynchronize(t_end));
elapsed = 0;
checkCUDA(cudaEventElapsedTime(&elapsed, t_start2, t_end));
cudaEventDestroy(t_start2);
printf("[optimizer] allreduce time = %.2lfms\n", elapsed);

workSpace_ptr = static_cast<char *>(meta->handle.workSpace);
float alpha_t = op->alpha_t;
Expand Down Expand Up @@ -277,6 +299,14 @@ __host__ void AdamOptimizer::nccl_unified_update_task_gpu(
beta2_t *= op->beta2;
alpha_t = op->alpha * sqrt(1 - beta2_t) / (1 - beta1_t);
}
cudaEventRecord(t_end, stream);
checkCUDA(cudaEventSynchronize(t_end));
elapsed = 0;
checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end));
cudaEventDestroy(t_start);
cudaEventDestroy(t_end);
printf("[optimizer] total time = %.2lfms\n", elapsed);
assert(false);
}
#endif

Expand Down

0 comments on commit 8185289

Please sign in to comment.