[Windows][PT2.6] Split larger Other Kernels lib (#1096)

The torch_xpu_ops_sycl_kernels library is around 1.83 GB in size on Windows, so this PR splits it to reduce the library size. New libs introduced in this PR: torch_xpu_ops_sycl_tensor_kernels, torch_xpu_ops_sycl_norm_loss_kernels, torch_xpu_ops_sycl_poly_kernels, torch_xpu_ops_sycl_dist_kernels. --------- Co-authored-by: Feng Yuan <[email protected]>
intel · Nov 22, 2024 · bfdbaf4 · bfdbaf4
1 parent 0d189dd
commit bfdbaf4
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 9 deletions.
diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake
@@ -122,7 +122,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "MSVC"
   set(SYCL_OFFLINE_COMPILER_CG_OPTIONS "-options '${SYCL_OFFLINE_COMPILER_CG_OPTIONS}'")
 
   if(WIN32)
-    set(AOT_TARGETS "ats-m150,lnl-m,mtl-u,mtl-h")
+    set(AOT_TARGETS "ats-m150,mtl-u,mtl-h,xe2-lpg,xe2-hpg")
   else()
     set(AOT_TARGETS "pvc,xe-lpg,ats-m150")
   endif()

diff --git a/src/BuildOnWindows.cmake b/src/BuildOnWindows.cmake
@@ -3,11 +3,6 @@
 set(TORCH_XPU_OPS_LIBRARIES)
 set(SYCL_LINK_LIBRARIES_KEYWORD PRIVATE)
 
-# Walk around cyclic dependence
-# libtorch_xpu.so links to libtorch_xpu_ops.a
-# Load libtorch_xpu_ops_aten.so explicitly by torch/__init__.py:_load_dll_libraries (Break cycle)
-# libtorch_xpu_ops_aten.so links to libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so
-# libtorch_xpu_ops_sycl_unary_binary_kernels.so and libtorch_xpu_ops_sycl_kernels.so links to libtorch_xpu.so
 add_library(
   torch_xpu_ops
   STATIC
@@ -21,7 +16,6 @@ add_library(
   ${ATen_XPU_NATIVE_CPP_SRCS}
   ${ATen_XPU_GEN_SRCS})
 install(TARGETS torch_xpu_ops_aten DESTINATION "${TORCH_INSTALL_LIB_DIR}")
-# target_compile_definitions(torch_xpu_ops_aten PRIVATE CAFFE2_BUILD_MAIN_LIB)
 target_compile_definitions(torch_xpu_ops_aten PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
 target_link_libraries(torch_xpu_ops_aten PUBLIC torch_xpu)
 target_link_libraries(torch_xpu_ops_aten PUBLIC torch_cpu)
@@ -48,8 +42,11 @@ else()
   set(ATen_XPU_SYCL_REDUCE_SRCS)
   set(ATen_XPU_SYCL_ACTIVATION_SRCS)
   set(ATen_XPU_SYCL_FOREACH_SRCS)
+  set(ATen_XPU_SYCL_TENSOR_SRCS)
+  set(ATen_XPU_SYCL_NORM_LOSS_SRCS)
+  set(ATen_XPU_SYCL_POLY_SRCS)
+  set(ATen_XPU_SYCL_DISTRIBUTION_SRCS)
   set(ATen_XPU_SYCL_OTHERS_SRCS)
-
   foreach(sycl_src ${ATen_XPU_SYCL_SRCS})
     string(REGEX MATCH "Binary" IS_BINARY ${sycl_src})
     string(REGEX MATCH "Unary" IS_UNARY ${sycl_src})
@@ -63,6 +60,13 @@ else()
     string(REGEX MATCH "Activation" IS_ACTIVATION ${sycl_src})
     string(REGEX MATCH "Foreach" IS_FOREACH ${sycl_src})
     string(REGEX MATCH "Reduce" IS_REDUCE ${sycl_src})
+	string(REGEX MATCH "Tensor" IS_TENSOR ${sycl_src})
+	string(REGEX MATCH "Norm" IS_NORM ${sycl_src})
+	string(REGEX MATCH "Loss" IS_LOSS ${sycl_src})
+	string(REGEX MATCH "Polynomial" IS_POLY ${sycl_src})
+	# Move the Resize kernels into the Norm and Loss lib to resolve symbols.
+	string(REGEX MATCH "Resize" IS_RESIZE ${sycl_src})
+	string(REGEX MATCH "Distribution" IS_DISTRIBUTION ${sycl_src})
 
     if(NOT IS_FOREACH STREQUAL "")
       list(APPEND ATen_XPU_SYCL_FOREACH_SRCS ${sycl_src})
@@ -74,11 +78,18 @@ else()
       list(APPEND ATen_XPU_SYCL_REDUCE_SRCS ${sycl_src})
     elseif(NOT IS_ACTIVATION STREQUAL "")
       list(APPEND ATen_XPU_SYCL_ACTIVATION_SRCS ${sycl_src})
+	elseif(NOT IS_TENSOR STREQUAL "")
+	  list(APPEND ATen_XPU_SYCL_TENSOR_SRCS ${sycl_src})
+	elseif(NOT IS_DISTRIBUTION STREQUAL "")
+	  list(APPEND ATen_XPU_SYCL_DISTRIBUTION_SRCS ${sycl_src})
+	elseif(NOT IS_NORM STREQUAL "" OR NOT IS_LOSS STREQUAL "" OR NOT IS_RESIZE STREQUAL "")
+	  list(APPEND ATen_XPU_SYCL_NORM_LOSS_SRCS ${sycl_src})
+	elseif(NOT IS_POLY STREQUAL "")
+	  list(APPEND ATen_XPU_SYCL_POLY_SRCS ${sycl_src})
     else()
       list(APPEND ATen_XPU_SYCL_OTHERS_SRCS ${sycl_src})
     endif()
   endforeach()
-
   # Binary kernel lib
   set(sycl_binary_lib torch_xpu_ops_sycl_binary_kernels)
   sycl_add_library(
@@ -148,7 +159,63 @@ else()
 
   # Decouple with PyTorch cmake definition.
   install(TARGETS ${sycl_foreach_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+
+  # Tensor kernel lib
+  set(sycl_tensor_lib torch_xpu_ops_sycl_tensor_kernels)
+  sycl_add_library(
+    ${sycl_tensor_lib}
+    SHARED
+    SYCL_SOURCES ${ATen_XPU_SYCL_TENSOR_SRCS})
+  target_compile_definitions(${sycl_tensor_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_tensor_lib})
+  target_link_libraries(${sycl_tensor_lib} PUBLIC torch_xpu)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_tensor_lib})
 
+  # Decouple with PyTorch cmake definition.
+  install(TARGETS ${sycl_tensor_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+
+  # Norm and Loss kernel lib
+  set(sycl_norm_loss_lib torch_xpu_ops_sycl_norm_loss_kernels)
+  sycl_add_library(
+    ${sycl_norm_loss_lib}
+    SHARED
+    SYCL_SOURCES ${ATen_XPU_SYCL_NORM_LOSS_SRCS})
+  target_compile_definitions(${sycl_norm_loss_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_norm_loss_lib})
+  target_link_libraries(${sycl_norm_loss_lib} PUBLIC torch_xpu)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_norm_loss_lib})
+
+  # Decouple with PyTorch cmake definition.
+  install(TARGETS ${sycl_norm_loss_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+
+  # Polynomial kernel lib
+  set(sycl_poly_lib torch_xpu_ops_sycl_poly_kernels)
+  sycl_add_library(
+    ${sycl_poly_lib}
+    SHARED
+    SYCL_SOURCES ${ATen_XPU_SYCL_POLY_SRCS})
+  target_compile_definitions(${sycl_poly_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_poly_lib})
+  target_link_libraries(${sycl_poly_lib} PUBLIC torch_xpu)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_poly_lib})
+
+  # Decouple with PyTorch cmake definition.
+  install(TARGETS ${sycl_poly_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+
+  # Distribution kernel lib
+  set(sycl_dist_lib torch_xpu_ops_sycl_dist_kernels)
+  sycl_add_library(
+    ${sycl_dist_lib}
+    SHARED
+    SYCL_SOURCES ${ATen_XPU_SYCL_DISTRIBUTION_SRCS})
+  target_compile_definitions(${sycl_dist_lib} PRIVATE TORCH_XPU_BUILD_MAIN_LIB)
+  target_link_libraries(torch_xpu_ops_aten PUBLIC ${sycl_dist_lib})
+  target_link_libraries(${sycl_dist_lib} PUBLIC torch_xpu)
+  list(APPEND TORCH_XPU_OPS_LIBRARIES ${sycl_dist_lib})
+
+  # Decouple with PyTorch cmake definition.
+  install(TARGETS ${sycl_dist_lib} DESTINATION "${TORCH_INSTALL_LIB_DIR}")
+
   # Other kernel lib
   set(sycl_lib torch_xpu_ops_sycl_kernels)
   sycl_add_library(