diff --git a/source/lib/src/gpu/tabulate.cu b/source/lib/src/gpu/tabulate.cu index b758980b21..9f924efd9b 100644 --- a/source/lib/src/gpu/tabulate.cu +++ b/source/lib/src/gpu/tabulate.cu @@ -792,9 +792,9 @@ void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, DPErrcheck(gpuMemset(dz_dy, 0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); tabulate_fusion_se_a_grad_grad_fifth_order_polynomial <<>>( - dz_dy, table, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, table_info[0], - table_info[1], table_info[2], table_info[3], table_info[4], nnei, - last_layer_size, is_sorted); + dz_dy, table, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, + table_info[0], table_info[1], table_info[2], table_info[3], + table_info[4], nnei, last_layer_size, is_sorted); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); } diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index d2e4dd5f2f..488a99bd7d 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -335,8 +335,8 @@ class TabulateFusionSeAGradGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_grad_gpu( - dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, nloc, - nnei, last_layer_size, is_sorted); + dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, + nloc, nnei, last_layer_size, is_sorted); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( @@ -344,8 +344,8 @@ class TabulateFusionSeAGradGradOp : public OpKernel { "last layer of embedding net must be less than 1024!")); } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_grad_cpu( - dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, nloc, - nnei, last_layer_size, is_sorted); + dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, + nloc, nnei, last_layer_size, is_sorted); } } @@ -549,8 +549,8 @@ class TabulateFusionSeAttenGradGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_grad_gpu( - dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, nloc, - nnei, last_layer_size, is_sorted); + dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, + nloc, nnei, last_layer_size, is_sorted); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( @@ -558,8 +558,8 @@ class TabulateFusionSeAttenGradGradOp : public OpKernel { "last layer of embedding net must be less than 1024!")); } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_grad_cpu( - dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, nloc, - nnei, last_layer_size, is_sorted); + dz_dy, table, table_info, em_x, em, two_embed, dz_dy_dem_x, dz_dy_dem, + nloc, nnei, last_layer_size, is_sorted); } } @@ -975,81 +975,81 @@ REGISTER_CPU(float); REGISTER_CPU(double); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#define REGISTER_GPU(T) \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusion") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeAOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeAGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionGradGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeAGradGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeA") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeAOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeAGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAGradGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeAGradGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAtten") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeAttenOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAttenGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeAttenGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAttenGradGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusion") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeAOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeAGradOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionGradGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeAGradGradOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeA") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeAOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeAGradOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAGradGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeAGradGradOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAtten") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeAttenOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAttenGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeAttenGradOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeAttenGradGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ TabulateFusionSeAttenGradGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeT") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeTOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeTGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeTGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeTGradGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeTGradGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeR") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeROp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeRGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ - TabulateFusionSeRGradOp); \ - REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeRGradGrad") \ - .Device(DEVICE_GPU) \ - .TypeConstraint("T") \ - .HostMemory("table_info"), \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeT") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeTOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeTGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeTGradOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeTGradGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeTGradGradOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeR") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeROp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeRGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ + TabulateFusionSeRGradOp); \ + REGISTER_KERNEL_BUILDER(Name("TabulateFusionSeRGradGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("table_info"), \ TabulateFusionSeRGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double);