From 434a56b73e7264107754963777b15f42459205b9 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 10:06:26 +0000 Subject: [PATCH 01/46] update --- .github/workflows/install.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index 1c9478ab0..70231ef4b 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -9,7 +9,7 @@ on: # yamllint disable-line rule:truthy jobs: import: - runs-on: ubuntu-latest + runs-on: [windows-latest] strategy: matrix: From 2d0d74772fa894321f986aa208d3bc16d3de13e2 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 10:12:11 +0000 Subject: [PATCH 02/46] update --- .github/workflows/install.yml | 4 +++- CHANGELOG.md | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index 70231ef4b..950739419 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -9,10 +9,11 @@ on: # yamllint disable-line rule:truthy jobs: import: - runs-on: [windows-latest] + runs-on: ${{ matrix.os }} strategy: matrix: + os: [windows-latest] cuda-version: ['cpu', 'cu121'] steps: @@ -30,6 +31,7 @@ jobs: run: | source ./.github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} pip install --verbose -e . + shell: bash - name: Test imports run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c8a5e97c..c069e6c2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [0.5.0] - 2023-MM-DD ### Added +- Added Windows support ([#315](https://github.com/pyg-team/pyg-lib/pull/315)) - Added macOS Apple Silicon support ([#310](https://github.com/pyg-team/pyg-lib/pull/310)) ### Changed ### Removed From ef9ef00ca5c3f159351d6c17741db5420aedfe41 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 10:23:51 +0000 Subject: [PATCH 03/46] update --- .github/workflows/building.yml | 1 + .github/workflows/cuda/Windows.sh | 10 +++++----- .github/workflows/install.yml | 3 +-- .github/workflows/nightly.yml | 1 + 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/building.yml b/.github/workflows/building.yml index 65ea31698..d0398ac7f 100644 --- a/.github/workflows/building.yml +++ b/.github/workflows/building.yml @@ -121,6 +121,7 @@ jobs: python -c "import pyg_lib; print('pyg-lib:', pyg_lib.__version__)" python -c "import pyg_lib; print('CUDA:', pyg_lib.cuda_version())" cd .. 
+ shell: bash - name: Configure AWS uses: aws-actions/configure-aws-credentials@v1 diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 0bee4bf53..8f275405d 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -1,10 +1,5 @@ #!/bin/bash -# Install NVIDIA drivers, see: -# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 -curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" -7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" - case ${1} in cu121) CUDA_SHORT=12.1 @@ -42,6 +37,11 @@ case ${1} in ;; esac +# Install NVIDIA drivers, see: +# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" echo "" echo "Installing from ${CUDA_FILE}..." diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index 950739419..d0c5d4104 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -29,9 +29,8 @@ jobs: - name: Install package run: | - source ./.github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} + # source ./.github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} pip install --verbose -e . - shell: bash - name: Test imports run: | diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 1eef0b3b8..968350427 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -127,6 +127,7 @@ jobs: python -c "import pyg_lib; print('pyg-lib:', pyg_lib.__version__)" python -c "import pyg_lib; print('CUDA:', pyg_lib.cuda_version())" cd .. 
+ shell: bash - name: Configure AWS uses: aws-actions/configure-aws-credentials@v1 From 4fe3b6a0ad7244ffe144d67d18356fc8e3e914cd Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 11:05:28 +0000 Subject: [PATCH 04/46] update --- .github/actions/setup/action.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml index af42016a8..3388149b0 100644 --- a/.github/actions/setup/action.yml +++ b/.github/actions/setup/action.yml @@ -26,6 +26,9 @@ runs: sudo rm -rf /usr/share/dotnet shell: bash + - name: Set up Windows developer command prompt + uses: ilammy/msvc-dev-cmd@v1 + - name: Install CUDA ${{ inputs.cuda-version }} if: ${{ inputs.cuda-version != 'cpu' }} run: | From 0732f2232441d17f8901acdcf137ebc04a7acd19 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 19:12:23 +0000 Subject: [PATCH 05/46] update --- CMakeLists.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d66365542..6e98ad35d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,16 +71,16 @@ else() target_include_directories(${PROJECT_NAME} PRIVATE ${PHMAP_DIR}) endif() -set(METIS_DIR third_party/METIS) -target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_DIR}/include) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") -set(GKLIB_PATH "${METIS_DIR}/GKlib") -include(${GKLIB_PATH}/GKlibSystem.cmake) -include_directories(${GKLIB_PATH}) -include_directories("${METIS_DIR}/include") -add_subdirectory("${METIS_DIR}/libmetis") -target_link_libraries(${PROJECT_NAME} PRIVATE metis) +# set(METIS_DIR third_party/METIS) +# target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_DIR}/include) +# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") +# set(GKLIB_PATH "${METIS_DIR}/GKlib") +# include(${GKLIB_PATH}/GKlibSystem.cmake) +# include_directories(${GKLIB_PATH}) +# include_directories("${METIS_DIR}/include") +# add_subdirectory("${METIS_DIR}/libmetis") +# target_link_libraries(${PROJECT_NAME} PRIVATE metis) find_package(Torch REQUIRED) target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES}) From 257635e037baca736a4cede61f7ca5e88a8099e9 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 19:32:34 +0000 Subject: [PATCH 06/46] update --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e98ad35d..565cd2597 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.15) project(pyg) set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) set(PYG_VERSION 0.4.0) option(BUILD_TEST "Enable testing" OFF) From e7f5dd41de1bc069965fb50512201a890871a77b Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 19:41:24 +0000 Subject: [PATCH 07/46] update --- pyg_lib/csrc/ops/cpu/matmul_kernel.cpp | 301 +++++++++++++------------ 1 file changed, 156 insertions(+), 145 deletions(-) diff --git a/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp b/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp index 06a020086..73f8631d3 100644 --- a/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp +++ b/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp @@ -206,76 +206,82 @@ void grouped_matmul_out_kernel_mkl_impl(const std::vector input, const std::vector other, std::vector out) { // matrix_params - using 
matrix_params = std::tuple; - phmap::flat_hash_map> groups; - for (size_t i = 0; i < input.size(); ++i) { - const matrix_params mp = {input[i].size(0), other[i].size(-1), - input[i].size(-1)}; - if (groups.count(mp)) { - groups[mp].push_back(i); - } else { - groups.insert({mp, {i}}); - } - } - - AT_DISPATCH_FLOATING_TYPES( - input.front().scalar_type(), "grouped_matmul_out_kernel_mkl_impl", [&] { - const auto group_count = static_cast(groups.size()); - std::vector alpha(group_count, 1); - std::vector beta(group_count, 0); - - std::vector ms(group_count); - std::vector ns(group_count); - std::vector ks(group_count); - std::vector ld_src0(group_count); - std::vector ld_src1(group_count); - std::vector ld_dst(group_count); - std::vector group_sizes(group_count); - std::vector src0; - std::vector src1; - std::vector dst; - - size_t group_idx = 0; - for (const auto& group_kv : groups) { - int m; - int n; - int k; - std::tie(m, n, k) = group_kv.first; - const auto& indices = group_kv.second; - - ms[group_idx] = m; - ns[group_idx] = n; - ks[group_idx] = k; - ld_src0[group_idx] = k; - ld_src1[group_idx] = n; - ld_dst[group_idx] = n; - group_sizes[group_idx] = indices.size(); - ++group_idx; - - for (const auto tensor_idx : indices) { - src0.push_back(input[tensor_idx].data_ptr()); - src1.push_back(other[tensor_idx].data_ptr()); - dst.push_back(out[tensor_idx].data_ptr()); - } - } - - auto src0_ptrs = const_cast(src0.data()); - auto src1_ptrs = const_cast(src1.data()); - auto dst_ptrs = dst.data(); - -#if AT_MKL_SEQUENTIAL() - // unlikely to happen - requires Torch to be built from source with - // explicit flag denoting MKL sequential version - parallel_mkl_blas_gemm_batched(ms, ns, ks, alpha, src0_ptrs, ld_src0, - src1_ptrs, ld_src1, beta, dst_ptrs, - ld_dst, group_count, group_sizes); -#else - mkl_blas_gemm_batched(ms.data(), ns.data(), ks.data(), alpha.data(), - src0_ptrs, ld_src0.data(), src1_ptrs, ld_src1.data(), - beta.data(), dst_ptrs, ld_dst.data(), group_count, - group_sizes.data()); -#endif - }); + /* using matrix_params = std::tuple; */ + /* phmap::flat_hash_map> groups; */ + /* for (size_t i = 0; i < input.size(); ++i) { */ + /* const matrix_params mp = {input[i].size(0), other[i].size(-1), */ + /* input[i].size(-1)}; */ + /* if (groups.count(mp)) { */ + /* groups[mp].push_back(i); */ + /* } else { */ + /* groups.insert({mp, {i}}); */ + /* } */ + /* } */ + + /* AT_DISPATCH_FLOATING_TYPES( */ + /* input.front().scalar_type(), "grouped_matmul_out_kernel_mkl_impl", [&] + * { */ + /* const auto group_count = static_cast(groups.size()); */ + /* std::vector alpha(group_count, 1); */ + /* std::vector beta(group_count, 0); */ + + /* std::vector ms(group_count); */ + /* std::vector ns(group_count); */ + /* std::vector ks(group_count); */ + /* std::vector ld_src0(group_count); */ + /* std::vector ld_src1(group_count); */ + /* std::vector ld_dst(group_count); */ + /* std::vector group_sizes(group_count); */ + /* std::vector src0; */ + /* std::vector src1; */ + /* std::vector dst; */ + + /* size_t group_idx = 0; */ + /* for (const auto& group_kv : groups) { */ + /* int m; */ + /* int n; */ + /* int k; */ + /* std::tie(m, n, k) = group_kv.first; */ + /* const auto& indices = group_kv.second; */ + + /* ms[group_idx] = m; */ + /* ns[group_idx] = n; */ + /* ks[group_idx] = k; */ + /* ld_src0[group_idx] = k; */ + /* ld_src1[group_idx] = n; */ + /* ld_dst[group_idx] = n; */ + /* group_sizes[group_idx] = indices.size(); */ + /* ++group_idx; */ + + /* for (const auto tensor_idx : indices) { */ + /* 
src0.push_back(input[tensor_idx].data_ptr()); */ + /* src1.push_back(other[tensor_idx].data_ptr()); */ + /* dst.push_back(out[tensor_idx].data_ptr()); */ + /* } */ + /* } */ + + /* auto src0_ptrs = const_cast(src0.data()); */ + /* auto src1_ptrs = const_cast(src1.data()); */ + /* auto dst_ptrs = dst.data(); */ + + /* #if AT_MKL_SEQUENTIAL() */ + /* // unlikely to happen - requires Torch to be built from source with + */ + /* // explicit flag denoting MKL sequential version */ + /* parallel_mkl_blas_gemm_batched(ms, ns, ks, alpha, src0_ptrs, ld_src0, + */ + /* src1_ptrs, ld_src1, beta, dst_ptrs, */ + /* ld_dst, group_count, group_sizes); */ + /* #else */ + /* mkl_blas_gemm_batched(ms.data(), ns.data(), ks.data(), alpha.data(), + */ + /* src0_ptrs, ld_src0.data(), src1_ptrs, + * ld_src1.data(), */ + /* beta.data(), dst_ptrs, ld_dst.data(), + * group_count, */ + /* group_sizes.data()); */ + /* #endif */ + /* }); */ } std::vector grouped_matmul_kernel(const at::TensorList input, @@ -328,81 +334,86 @@ void segment_matmul_out_kernel_mkl_impl(const at::Tensor& input, const at::Tensor& other, at::Tensor& out, const at::IntArrayRef& sizes) { - const int n = other.size(-1); - const int k = input.size(-1); - const int nk = n * k; - phmap::flat_hash_map> groups; - std::vector offsets = {{0, 0, 0}}; - offsets.reserve(sizes.size() + 1); - for (size_t i = 0; i < sizes.size(); ++i) { - const int m = sizes[i]; - if (groups.count(m)) { - groups[m].push_back(i); - } else { - groups.insert({m, {i}}); - } - - offset_params offset = {m * k, nk, m * n}; - offset += offsets.back(); - offsets.push_back(offset); - } - offsets.pop_back(); - - AT_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "segment_matmul_out_kernel_mkl_impl", [&] { - const auto group_count = static_cast(groups.size()); - std::vector alpha(group_count, 1); - std::vector beta(group_count, 0); - std::vector ns(group_count, n); - std::vector ks(group_count, k); - std::vector ld_src0(group_count, k); - std::vector ld_src1(group_count, n); - std::vector ld_dst(group_count, n); - - std::vector ms(group_count); - std::vector group_sizes(group_count); - std::vector src0; - std::vector src1; - std::vector dst; - - const auto src0_base_ptr = input.data_ptr(); - const auto src1_base_ptr = other.data_ptr(); - const auto dst_base_ptr = out.data_ptr(); - - size_t group_idx = 0; - for (const auto& group_kv : groups) { - int m = group_kv.first; - const auto& indices = group_kv.second; - - ms[group_idx] = m; - group_sizes[group_idx] = indices.size(); - ++group_idx; - - for (const auto offset_idx : indices) { - const auto offset = offsets[offset_idx]; - src0.push_back(src0_base_ptr + offset.src0_offset); - src1.push_back(src1_base_ptr + offset.src1_offset); - dst.push_back(dst_base_ptr + offset.dst_offset); - } - } - - auto src0_ptrs = const_cast(src0.data()); - auto src1_ptrs = const_cast(src1.data()); - auto dst_ptrs = dst.data(); - -#if AT_MKL_SEQUENTIAL() - // unlikely to happen - requires Torch to be built from source with - // explicit flag denoting MKL sequential version - parallel_mkl_blas_gemm_batched(ms, ns, ks, alpha, src0_ptrs, ld_src0, - src1_ptrs, ld_src1, beta, dst_ptrs, - ld_dst, group_count, group_sizes); -#else - mkl_blas_gemm_batched(ms.data(), ns.data(), ks.data(), alpha.data(), - src0_ptrs, ld_src0.data(), src1_ptrs, ld_src1.data(), - beta.data(), dst_ptrs, ld_dst.data(), group_count, - group_sizes.data()); -#endif - }); + /* const int n = other.size(-1); */ + /* const int k = input.size(-1); */ + /* const int nk = n * k; */ + /* 
phmap::flat_hash_map> groups; */ + /* std::vector offsets = {{0, 0, 0}}; */ + /* offsets.reserve(sizes.size() + 1); */ + /* for (size_t i = 0; i < sizes.size(); ++i) { */ + /* const int m = sizes[i]; */ + /* if (groups.count(m)) { */ + /* groups[m].push_back(i); */ + /* } else { */ + /* groups.insert({m, {i}}); */ + /* } */ + + /* offset_params offset = {m * k, nk, m * n}; */ + /* offset += offsets.back(); */ + /* offsets.push_back(offset); */ + /* } */ + /* offsets.pop_back(); */ + + /* AT_DISPATCH_FLOATING_TYPES( */ + /* input.scalar_type(), "segment_matmul_out_kernel_mkl_impl", [&] { */ + /* const auto group_count = static_cast(groups.size()); */ + /* std::vector alpha(group_count, 1); */ + /* std::vector beta(group_count, 0); */ + /* std::vector ns(group_count, n); */ + /* std::vector ks(group_count, k); */ + /* std::vector ld_src0(group_count, k); */ + /* std::vector ld_src1(group_count, n); */ + /* std::vector ld_dst(group_count, n); */ + + /* std::vector ms(group_count); */ + /* std::vector group_sizes(group_count); */ + /* std::vector src0; */ + /* std::vector src1; */ + /* std::vector dst; */ + + /* const auto src0_base_ptr = input.data_ptr(); */ + /* const auto src1_base_ptr = other.data_ptr(); */ + /* const auto dst_base_ptr = out.data_ptr(); */ + + /* size_t group_idx = 0; */ + /* for (const auto& group_kv : groups) { */ + /* int m = group_kv.first; */ + /* const auto& indices = group_kv.second; */ + + /* ms[group_idx] = m; */ + /* group_sizes[group_idx] = indices.size(); */ + /* ++group_idx; */ + + /* for (const auto offset_idx : indices) { */ + /* const auto offset = offsets[offset_idx]; */ + /* src0.push_back(src0_base_ptr + offset.src0_offset); */ + /* src1.push_back(src1_base_ptr + offset.src1_offset); */ + /* dst.push_back(dst_base_ptr + offset.dst_offset); */ + /* } */ + /* } */ + + /* auto src0_ptrs = const_cast(src0.data()); */ + /* auto src1_ptrs = const_cast(src1.data()); */ + /* auto dst_ptrs = dst.data(); */ + + /* #if AT_MKL_SEQUENTIAL() */ + /* // unlikely to happen - requires Torch to be built from source with + */ + /* // explicit flag denoting MKL sequential version */ + /* parallel_mkl_blas_gemm_batched(ms, ns, ks, alpha, src0_ptrs, ld_src0, + */ + /* src1_ptrs, ld_src1, beta, dst_ptrs, */ + /* ld_dst, group_count, group_sizes); */ + /* #else */ + /* mkl_blas_gemm_batched(ms.data(), ns.data(), ks.data(), alpha.data(), + */ + /* src0_ptrs, ld_src0.data(), src1_ptrs, + * ld_src1.data(), */ + /* beta.data(), dst_ptrs, ld_dst.data(), + * group_count, */ + /* group_sizes.data()); */ + /* #endif */ + /* }); */ } at::Tensor segment_matmul_kernel(const at::Tensor& input, From fb770cbe13de11f24a37c6861412582ac211071f Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 19:46:47 +0000 Subject: [PATCH 08/46] update --- pyg_lib/csrc/partition/cpu/metis_kernel.cpp | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/pyg_lib/csrc/partition/cpu/metis_kernel.cpp b/pyg_lib/csrc/partition/cpu/metis_kernel.cpp index df516224f..7430574f2 100644 --- a/pyg_lib/csrc/partition/cpu/metis_kernel.cpp +++ b/pyg_lib/csrc/partition/cpu/metis_kernel.cpp @@ -1,7 +1,7 @@ #include #include -#include +/* #include */ namespace pyg { namespace partition { @@ -31,14 +31,16 @@ at::Tensor metis_kernel(const at::Tensor& rowptr, auto part = at::empty({nvtxs}, rowptr.options()); auto part_data = part.data_ptr(); - if (recursive) { - METIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt, - &num_partitions, NULL, NULL, NULL, &objval, - 
part_data); - } else { - METIS_PartGraphKway(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt, - &num_partitions, NULL, NULL, NULL, &objval, part_data); - } + /* if (recursive) { */ + /* METIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt, + */ + /* &num_partitions, NULL, NULL, NULL, &objval, */ + /* part_data); */ + /* } else { */ + /* METIS_PartGraphKway(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt, */ + /* &num_partitions, NULL, NULL, NULL, &objval, + * part_data); */ + /* } */ return part; } From 221ccb8281d9bd8a34b28bec4fa5dccf8e49be5b Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 20:04:40 +0000 Subject: [PATCH 09/46] update --- setup.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4ea063ff2..67993157b 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ class CMakeExtension(Extension): def __init__(self, name, sourcedir=''): Extension.__init__(self, name, sources=[]) - self.sourcedir = os.path.abspath(sourcedir) + self.sourcedir = osp.abspath(sourcedir) class CMakeBuild(build_ext): @@ -29,6 +29,7 @@ def check_env_flag(name: str, default: str = "") -> bool: return value in ["1", "ON", "YES", "TRUE", "Y"] def get_ext_filename(self, ext_name): + print("GET EXT FILENAME") # Remove Python ABI suffix: ext_filename = super().get_ext_filename(ext_name) ext_filename_parts = ext_filename.split('.') @@ -40,7 +41,7 @@ def build_extension(self, ext): import torch - extdir = os.path.abspath(osp.dirname(self.get_ext_fullpath(ext.name))) + extdir = osp.abspath(osp.dirname(self.get_ext_fullpath(ext.name))) self.build_type = "DEBUG" if self.debug else "RELEASE" if self.debug is None: if CMakeBuild.check_env_flag("DEBUG"): @@ -79,10 +80,13 @@ def build_extension(self, ext): build_args = [] + print("1111 ---------------") subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp) + print("2222 ---------------") subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) + print("3333 ---------------") def maybe_append_with_mkl(dependencies): From 5ea50352dd149570b7ce367ac46199730b085d61 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Thu, 4 Apr 2024 21:38:00 +0000 Subject: [PATCH 10/46] update --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 565cd2597..b1834a847 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,4 +121,6 @@ set_target_properties(${PROJECT_NAME} PROPERTIES # Cmake creates *.dylib by default, but python expects *.so by default if (APPLE) set_property(TARGET ${PROJECT_NAME} PROPERTY SUFFIX .so) +elseif (MSVC) + set_property(TARGET ${PROJECT_NAME} PROPERTY SUFFIX .pyd) endif() From 255960a53640d0baf44234b68f9c56d53d79cbe9 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 08:39:09 +0000 Subject: [PATCH 11/46] update --- CMakeLists.txt | 2 +- README.md | 10 +++++----- setup.py | 36 ++++++++++++++++++++---------------- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b1834a847..1a70643b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,6 +121,6 @@ set_target_properties(${PROJECT_NAME} PROPERTIES # Cmake creates *.dylib by default, but python expects *.so by default if (APPLE) set_property(TARGET ${PROJECT_NAME} PROPERTY SUFFIX .so) -elseif (MSVC) +elseif (MSVC AND USE_PYTHON) set_property(TARGET ${PROJECT_NAME} PROPERTY SUFFIX .pyd) endif() diff --git a/README.md b/README.md index 5834e651b..81ec7fb0d 100644 --- 
a/README.md +++ b/README.md @@ -37,31 +37,31 @@ The following combinations are supported: | PyTorch 2.2 | `cpu` | `cu102` | `cu113` | `cu116` | `cu117` | `cu118` | `cu121` | |--------------|-------|---------|---------|---------|---------|---------|---------| | **Linux** | ✅ | | | | | ✅ | ✅ | -| **Windows** | | | | | | | | +| **Windows** | ✅ | | | | | ✅ | ✅ | | **macOS** | ✅ | | | | | | | | PyTorch 2.1 | `cpu` | `cu102` | `cu113` | `cu116` | `cu117` | `cu118` | `cu121` | |--------------|-------|---------|---------|---------|---------|---------|---------| | **Linux** | ✅ | | | | | ✅ | ✅ | -| **Windows** | | | | | | | | +| **Windows** | ✅ | | | | | ✅ | ✅ | | **macOS** | ✅ | | | | | | | | PyTorch 2.0 | `cpu` | `cu102` | `cu113` | `cu116` | `cu117` | `cu118` | `cu121` | |--------------|-------|---------|---------|---------|---------|---------|---------| | **Linux** | ✅ | | | | ✅ | ✅ | | -| **Windows** | | | | | | | | +| **Windows** | ✅ | | | | ✅ | ✅ | | | **macOS** | ✅ | | | | | | | | PyTorch 1.13 | `cpu` | `cu102` | `cu113` | `cu116` | `cu117` | `cu118` | `cu121` | |--------------|-------|---------|---------|---------|---------|---------|---------| | **Linux** | ✅ | | | ✅ | ✅ | | | -| **Windows** | | | | | | | | +| **Windows** | ✅ | | | ✅ | ✅ | | | | **macOS** | ✅ | | | | | | | | PyTorch 1.12 | `cpu` | `cu102` | `cu113` | `cu116` | `cu117` | `cu118` | `cu121` | |--------------|-------|---------|---------|---------|---------|---------| --------| | **Linux** | ✅ | ✅ | ✅ | ✅ | | | | -| **Windows** | | | | | | | | +| **Windows** | ✅ | ✅ | ✅ | ✅ | | | | | **macOS** | ✅ | | | | | | | ### Form nightly diff --git a/setup.py b/setup.py index 67993157b..b8782c473 100644 --- a/setup.py +++ b/setup.py @@ -6,6 +6,7 @@ import importlib import os import os.path as osp +import re import subprocess import warnings @@ -34,6 +35,7 @@ def get_ext_filename(self, ext_name): ext_filename = super().get_ext_filename(ext_name) ext_filename_parts = ext_filename.split('.') ext_filename_parts = ext_filename_parts[:-2] + ext_filename_parts[-1:] + print('.'.join(ext_filename_parts)) return '.'.join(ext_filename_parts) def build_extension(self, ext): @@ -89,26 +91,28 @@ def build_extension(self, ext): print("3333 ---------------") -def maybe_append_with_mkl(dependencies): - if CMakeBuild.check_env_flag('USE_MKL_BLAS'): - import re +def mkl_dependencies(): + if not CMakeBuild.check_env_flag('USE_MKL_BLAS'): + return [] - import torch - torch_config = torch.__config__.show() - with_mkl_blas = 'BLAS_INFO=mkl' in torch_config - if torch.backends.mkl.is_available() and with_mkl_blas: - product_version = '2023.1.0' - pattern = r'oneAPI Math Kernel Library Version [0-9]{4}\.[0-9]+' - match = re.search(pattern, torch_config) - if match: - product_version = match.group(0).split(' ')[-1] + import torch + + dependencies = [] + torch_config = torch.__config__.show() + with_mkl_blas = 'BLAS_INFO=mkl' in torch_config + if torch.backends.mkl.is_available() and with_mkl_blas: + product_version = '2023.1.0' + pattern = r'oneAPI Math Kernel Library Version [0-9]{4}\.[0-9]+' + match = re.search(pattern, torch_config) + if match: + product_version = match.group(0).split(' ')[-1] + dependencies.append(f'mkl-include=={product_version}') + dependencies.append(f'mkl-static=={product_version}') - dependencies.append(f'mkl-include=={product_version}') - dependencies.append(f'mkl-static=={product_version}') + return dependencies -install_requires = [] -maybe_append_with_mkl(install_requires) +install_requires = [] + mkl_dependencies() triton_requires 
= [ 'triton', From bd68fb3d621228f43e4b5f00916509e92d5171a9 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 09:04:27 +0000 Subject: [PATCH 12/46] update --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a70643b9..15973f5b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.15) project(pyg) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_SHARED_LIBRARY_PREFIX "lib") set(PYG_VERSION 0.4.0) option(BUILD_TEST "Enable testing" OFF) From fa9af6650e45c44e8a1dd06cfd0e422b152b4386 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 09:12:39 +0000 Subject: [PATCH 13/46] update --- setup.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/setup.py b/setup.py index b8782c473..d6890f50c 100644 --- a/setup.py +++ b/setup.py @@ -90,6 +90,13 @@ def build_extension(self, ext): cwd=self.build_temp) print("3333 ---------------") + print('. --------------') + print(os.listdir('.')) + print('build --------------') + print(os.listdir(osp.join('.', 'build'))) + print('build lib.win --------------') + print(os.listdir(osp.join('.', 'build', 'lib.win-amd64-3.8'))) + def mkl_dependencies(): if not CMakeBuild.check_env_flag('USE_MKL_BLAS'): From af49e9bd8230dece989f470a511e962571e7ac58 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 09:18:53 +0000 Subject: [PATCH 14/46] update --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d6890f50c..133965570 100644 --- a/setup.py +++ b/setup.py @@ -94,8 +94,8 @@ def build_extension(self, ext): print(os.listdir('.')) print('build --------------') print(os.listdir(osp.join('.', 'build'))) - print('build lib.win --------------') - print(os.listdir(osp.join('.', 'build', 'lib.win-amd64-3.8'))) + print('build temp.win --------------') + print(os.listdir(osp.join('.', 'build', 'temp.win-amd64-3.8'))) def mkl_dependencies(): From 0a6d0a9b684537ca9cd0d164ab40c8ad0f8ef7d7 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 09:29:24 +0000 Subject: [PATCH 15/46] update --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 133965570..d4532665b 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ # Environment flags to control different options # -# USE_MKL_BLAS=1 -# enables use of MKL BLAS (requires PyTorch to be built with MKL support) +# - USE_MKL_BLAS=1: +# Enables use of MKL BLAS (requires PyTorch to be built with MKL support) import importlib import os @@ -44,6 +44,8 @@ def build_extension(self, ext): import torch extdir = osp.abspath(osp.dirname(self.get_ext_fullpath(ext.name))) + print(ext) + print(extdir) self.build_type = "DEBUG" if self.debug else "RELEASE" if self.debug is None: if CMakeBuild.check_env_flag("DEBUG"): From bfe51a87c5c459954fb010ccbf0002b975d85a9b Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 09:36:32 +0000 Subject: [PATCH 16/46] update --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index d4532665b..e1040f692 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,7 @@ def build_extension(self, ext): '-DUSE_PYTHON=ON', f'-DWITH_CUDA={"ON" if WITH_CUDA else "OFF"}', f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}', + f'-DCMAKE_RUNTIME_OUTPUT_DIRECTORY={extdir}', f'-DCMAKE_BUILD_TYPE={self.build_type}', f'-DCMAKE_PREFIX_PATH={torch.utils.cmake_prefix_path}', ] From 2a112b3dd49a7362331852ee1babb23adfa2fdce Mon Sep 17 00:00:00 2001 
From: rusty1s Date: Sat, 6 Apr 2024 09:45:12 +0000 Subject: [PATCH 17/46] update --- .github/workflows/install.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index d0c5d4104..950739419 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -29,8 +29,9 @@ jobs: - name: Install package run: | - # source ./.github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} + source ./.github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} pip install --verbose -e . + shell: bash - name: Test imports run: | From 0012f5ac7fd0e29b9f3558ed5f96f0677aee74cd Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 09:55:51 +0000 Subject: [PATCH 18/46] update --- .github/workflows/install.yml | 2 +- setup.py | 14 -------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index 950739419..ee189d652 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: - os: [windows-latest] + os: [windows-2019] cuda-version: ['cpu', 'cu121'] steps: diff --git a/setup.py b/setup.py index e1040f692..cbb4ecddd 100644 --- a/setup.py +++ b/setup.py @@ -30,12 +30,10 @@ def check_env_flag(name: str, default: str = "") -> bool: return value in ["1", "ON", "YES", "TRUE", "Y"] def get_ext_filename(self, ext_name): - print("GET EXT FILENAME") # Remove Python ABI suffix: ext_filename = super().get_ext_filename(ext_name) ext_filename_parts = ext_filename.split('.') ext_filename_parts = ext_filename_parts[:-2] + ext_filename_parts[-1:] - print('.'.join(ext_filename_parts)) return '.'.join(ext_filename_parts) def build_extension(self, ext): @@ -44,8 +42,6 @@ def build_extension(self, ext): import torch extdir = osp.abspath(osp.dirname(self.get_ext_fullpath(ext.name))) - print(ext) - print(extdir) self.build_type = "DEBUG" if self.debug else "RELEASE" if self.debug is None: if CMakeBuild.check_env_flag("DEBUG"): @@ -85,20 +81,10 @@ def build_extension(self, ext): build_args = [] - print("1111 ---------------") subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp) - print("2222 ---------------") subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) - print("3333 ---------------") - - print('. 
--------------') - print(os.listdir('.')) - print('build --------------') - print(os.listdir(osp.join('.', 'build'))) - print('build temp.win --------------') - print(os.listdir(osp.join('.', 'build', 'temp.win-amd64-3.8'))) def mkl_dependencies(): From 77642ced464f3eac1863c267e57baa9a2e351f73 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 10:08:31 +0000 Subject: [PATCH 19/46] update --- .github/workflows/cuda/Windows.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 8f275405d..17789911e 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -39,7 +39,7 @@ esac # Install NVIDIA drivers, see: # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 -curl -k -L "https://drive.google.com/u/0/uc?id=1injUyo3lnarMgWyRcXqKg4UGnN0ysmuq&export=download" --output "/tmp/gpu_driver_dlls.zip" +curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "/tmp/gpu_driver_dlls.zip" 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" From e6ba17d90727f2944dac41cfca5538b49ad4d054 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 11:28:36 +0000 Subject: [PATCH 20/46] update --- .github/workflows/cuda/Windows.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 17789911e..307b77827 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -39,8 +39,8 @@ esac # Install NVIDIA drivers, see: # https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 -curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "/tmp/gpu_driver_dlls.zip" -7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" +# curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "/tmp/gpu_driver_dlls.zip" +# 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" echo "" From 936f4cf2396cd49e54cb78dbb57020b324eb86e8 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 12:46:22 +0000 Subject: [PATCH 21/46] update --- .github/workflows/cuda/Windows.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 307b77827..131be2b29 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -48,3 +48,14 @@ echo "Installing from ${CUDA_FILE}..." PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cuobjdump_${CUDA_SHORT} nvprune_${CUDA_SHORT} cupti_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cudart_${CUDA_SHORT} cufft_dev_${CUDA_SHORT} curand_dev_${CUDA_SHORT} cusolver_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT} thrust_${CUDA_SHORT} npp_dev_${CUDA_SHORT} nvrtc_dev_${CUDA_SHORT} nvml_dev_${CUDA_SHORT}\" -Wait -NoNewWindow" echo "Done!" rm -f "${CUDA_FILE}" + +echo Installing NvToolsExt... 
+curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "tmp/NvToolsExt.7z" +7z x tmp/NvToolsExt.7z -o"/tmp/NvToolsExt" +mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt\bin\x64" +mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt\include" +mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt\lib\x64" +xcopy /Y "/tmp/NvToolsExt/bin/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" +xcopy /Y "/tmp/NvToolsExt/include/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" +xcopy /Y "/tmp/NvToolsExt/lib/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" +export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" From 826c85fce2ed7ae06c6aa51987005a84bbd6cd2c Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 13:01:00 +0000 Subject: [PATCH 22/46] update --- .github/workflows/cuda/Windows.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 131be2b29..183d45856 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -50,11 +50,11 @@ echo "Done!" rm -f "${CUDA_FILE}" echo Installing NvToolsExt... -curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "tmp/NvToolsExt.7z" +curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "/tmp/NvToolsExt.7z" 7z x tmp/NvToolsExt.7z -o"/tmp/NvToolsExt" -mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt\bin\x64" -mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt\include" -mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt\lib\x64" +mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" +mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" +mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" xcopy /Y "/tmp/NvToolsExt/bin/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" xcopy /Y "/tmp/NvToolsExt/include/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" xcopy /Y "/tmp/NvToolsExt/lib/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" From 195a8d4e6a9d34040bd82825b1100c446090d1f2 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 13:14:01 +0000 Subject: [PATCH 23/46] update --- .github/workflows/cuda/Windows.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 183d45856..822a5e004 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -52,10 +52,10 @@ rm -f "${CUDA_FILE}" echo Installing NvToolsExt... 
curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "/tmp/NvToolsExt.7z" 7z x tmp/NvToolsExt.7z -o"/tmp/NvToolsExt" -mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" -mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" -mkdir "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" -xcopy /Y "/tmp/NvToolsExt/bin/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" -xcopy /Y "/tmp/NvToolsExt/include/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" -xcopy /Y "/tmp/NvToolsExt/lib/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" +mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" +mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" +mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" +cp "/tmp/NvToolsExt/bin/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" +cp "/tmp/NvToolsExt/include/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" +cp "/tmp/NvToolsExt/lib/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" From e9910ca55b767f83d46e9991f43539caa02e71b1 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 13:24:59 +0000 Subject: [PATCH 24/46] update --- .github/workflows/cuda/Windows.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 822a5e004..8d13ca35f 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -55,7 +55,19 @@ curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7 mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" -cp "/tmp/NvToolsExt/bin/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" -cp "/tmp/NvToolsExt/include/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" -cp "/tmp/NvToolsExt/lib/x64/*.*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" +echo "-------" +ls "/tmp/NvToolsExt" +echo "-------" +ls "/tmp/NvToolsExt/bin" +echo "-------" +ls "/tmp/NvToolsExt/bin/x64" +echo "-------" +ls "/tmp/NvToolsExt/include" +echo "-------" +ls "/tmp/NvToolsExt/lib" +echo "-------" +ls "/tmp/NvToolsExt/lib/x64" +cp "/tmp/NvToolsExt/bin/x64/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" +cp "/tmp/NvToolsExt/include/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" +cp "/tmp/NvToolsExt/lib/x64/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" From 505b1d2b76f4dd56381a2406ed2ef5959f4de299 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 13:29:56 +0000 Subject: [PATCH 25/46] update --- .github/workflows/cuda/Windows.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 8d13ca35f..59de34013 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -51,7 +51,7 @@ rm -f "${CUDA_FILE}" echo Installing NvToolsExt... 
curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "/tmp/NvToolsExt.7z" -7z x tmp/NvToolsExt.7z -o"/tmp/NvToolsExt" +7z x "/tmp/NvToolsExt.7z" -o"/tmp/NvToolsExt" mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" From be7397fc349ac6fd4de8f91309fceb0a41d0989e Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 13:37:28 +0000 Subject: [PATCH 26/46] update --- .github/workflows/cuda/Windows.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 59de34013..caef76086 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -67,7 +67,7 @@ echo "-------" ls "/tmp/NvToolsExt/lib" echo "-------" ls "/tmp/NvToolsExt/lib/x64" -cp "/tmp/NvToolsExt/bin/x64/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" -cp "/tmp/NvToolsExt/include/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" -cp "/tmp/NvToolsExt/lib/x64/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" +cp -r "/tmp/NvToolsExt/bin/x64/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" +cp -r "/tmp/NvToolsExt/include/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" +cp -r "/tmp/NvToolsExt/lib/x64/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" From aa5650e3deedb1667f3c9a3f71e4baea18da7867 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 15:07:32 +0000 Subject: [PATCH 27/46] update --- .github/workflows/cuda/Windows.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index caef76086..4529e6bb1 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -67,7 +67,7 @@ echo "-------" ls "/tmp/NvToolsExt/lib" echo "-------" ls "/tmp/NvToolsExt/lib/x64" -cp -r "/tmp/NvToolsExt/bin/x64/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" -cp -r "/tmp/NvToolsExt/include/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" -cp -r "/tmp/NvToolsExt/lib/x64/*" "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" +cp -r /tmp/NvToolsExt/bin/x64/* "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" +cp -r /tmp/NvToolsExt/include/* "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" +cp -r /tmp/NvToolsExt/lib/x64/* "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" From e6eaa1cb8a4d11fa4ccf8abe52b3eede5cba26e2 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 15:18:27 +0000 Subject: [PATCH 28/46] update --- .github/workflows/cuda/Windows.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 4529e6bb1..04ac24c2b 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -67,7 +67,7 @@ echo "-------" ls "/tmp/NvToolsExt/lib" echo "-------" ls "/tmp/NvToolsExt/lib/x64" -cp -r /tmp/NvToolsExt/bin/x64/* "/cProgram Files/NVIDIA Corporation/NvToolsExt/bin/x64" -cp -r /tmp/NvToolsExt/include/* "/cProgram Files/NVIDIA Corporation/NvToolsExt/include" -cp -r /tmp/NvToolsExt/lib/x64/* "/cProgram Files/NVIDIA Corporation/NvToolsExt/lib/x64" +cp 
-r /tmp/NvToolsExt/bin/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" +cp -r /tmp/NvToolsExt/include/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" +cp -r /tmp/NvToolsExt/lib/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" From 64583daaadfde105a0852a09586a728e5212db8c Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sat, 6 Apr 2024 17:28:17 +0000 Subject: [PATCH 29/46] update --- .github/workflows/cuda/Windows.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 04ac24c2b..805b21923 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -71,3 +71,5 @@ cp -r /tmp/NvToolsExt/bin/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/ cp -r /tmp/NvToolsExt/include/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" cp -r /tmp/NvToolsExt/lib/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" + +export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" From b95e7d06751aa99746d54eabf4a0176a12cb1d93 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sun, 7 Apr 2024 06:53:21 +0000 Subject: [PATCH 30/46] update --- .github/workflows/cuda/Windows.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 805b21923..d2a1c50e2 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -72,4 +72,5 @@ cp -r /tmp/NvToolsExt/include/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/ cp -r /tmp/NvToolsExt/lib/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" +export CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" From 332c064356ba3e9c75ce16d0248e7289860ab57f Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sun, 7 Apr 2024 07:07:31 +0000 Subject: [PATCH 31/46] update --- .github/workflows/cuda/Windows.sh | 1 - setup.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index d2a1c50e2..805b21923 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -72,5 +72,4 @@ cp -r /tmp/NvToolsExt/include/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/ cp -r /tmp/NvToolsExt/lib/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" -export CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" diff --git a/setup.py b/setup.py index cbb4ecddd..b4c9820c9 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,10 @@ def build_extension(self, ext): f'-DCMAKE_PREFIX_PATH={torch.utils.cmake_prefix_path}', ] + cuda_arch_list = os.getenv('TORCH_CUDA_ARCH_LIST') + if WITH_CUDA and cuda_arch_list is not None: + cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') + if CMakeBuild.check_env_flag('USE_MKL_BLAS'): include_dir = f"{sysconfig.get_path('data')}{os.sep}include" cmake_args.append(f'-DBLAS_INCLUDE_DIR={include_dir}') From 88a8a71cf66caa8904e06c419376dea3e0568510 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sun, 7 Apr 2024 07:20:06 +0000 Subject: [PATCH 32/46] update --- setup.py 
| 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index b4c9820c9..b9a78f029 100644 --- a/setup.py +++ b/setup.py @@ -69,6 +69,9 @@ def build_extension(self, ext): cuda_arch_list = os.getenv('TORCH_CUDA_ARCH_LIST') if WITH_CUDA and cuda_arch_list is not None: cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') + else: + cuda_arch_list = "3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" + cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') if CMakeBuild.check_env_flag('USE_MKL_BLAS'): include_dir = f"{sysconfig.get_path('data')}{os.sep}include" From 69a3eee089d4b5eb85e2d21baf124ade5a7f4b5b Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sun, 7 Apr 2024 07:33:11 +0000 Subject: [PATCH 33/46] update --- .github/workflows/cuda/Windows.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 805b21923..7cd46c282 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -72,4 +72,4 @@ cp -r /tmp/NvToolsExt/include/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/ cp -r /tmp/NvToolsExt/lib/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" -export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" +export TORCH_CUDA_ARCH_LIST="35;50;60;70;75;80;86" From 5452828a771ea53855726b9dac19b2f076197fb7 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sun, 7 Apr 2024 07:44:00 +0000 Subject: [PATCH 34/46] update --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b9a78f029..558e0354d 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def build_extension(self, ext): if WITH_CUDA and cuda_arch_list is not None: cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') else: - cuda_arch_list = "3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" + cuda_arch_list = "35;50;60;70;75;80;86" cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') if CMakeBuild.check_env_flag('USE_MKL_BLAS'): From bec70967dac332a07c0cae30a5fffd307910b30b Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sun, 7 Apr 2024 07:53:35 +0000 Subject: [PATCH 35/46] update --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 558e0354d..5ba6fdf78 100644 --- a/setup.py +++ b/setup.py @@ -67,10 +67,13 @@ def build_extension(self, ext): ] cuda_arch_list = os.getenv('TORCH_CUDA_ARCH_LIST') + print("ARCH LIST") + print("-----------") + print(cuda_arch_list) if WITH_CUDA and cuda_arch_list is not None: cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') else: - cuda_arch_list = "35;50;60;70;75;80;86" + cuda_arch_list = "50;60;70;75;80;86" cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') if CMakeBuild.check_env_flag('USE_MKL_BLAS'): From 4016fc315e53e41d0edfd3042fac0cb105104262 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sun, 7 Apr 2024 08:59:39 +0000 Subject: [PATCH 36/46] update --- CMakeLists.txt | 2 ++ setup.py | 11 ++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 15973f5b7..469a348fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,8 @@ endif() if(WITH_CUDA) enable_language(CUDA) add_definitions(-DWITH_CUDA) + message("CUDA FLAGS HEHEHEHEHEHE") + message("${CMAKE_CUDA_FLAGS}") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") if (NOT "$ENV{EXTERNAL_CUTLASS_INCLUDE_DIR}" 
STREQUAL "") diff --git a/setup.py b/setup.py index 5ba6fdf78..a5b8b5f6f 100644 --- a/setup.py +++ b/setup.py @@ -70,11 +70,12 @@ def build_extension(self, ext): print("ARCH LIST") print("-----------") print(cuda_arch_list) - if WITH_CUDA and cuda_arch_list is not None: - cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') - else: - cuda_arch_list = "50;60;70;75;80;86" - cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') + cmake_args.append('-DCUDA_ARCH_PTX=5.0+PTX') + # if WITH_CUDA and cuda_arch_list is not None: + # cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') + # else: + # cuda_arch_list = "50;60;70;75;80;86" + # cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') if CMakeBuild.check_env_flag('USE_MKL_BLAS'): include_dir = f"{sysconfig.get_path('data')}{os.sep}include" From c3009bfae054e36dbc7d2fdb1c54c260c8fcc72f Mon Sep 17 00:00:00 2001 From: rusty1s Date: Sun, 7 Apr 2024 09:38:28 +0000 Subject: [PATCH 37/46] update --- .github/workflows/cuda/Windows.sh | 3 ++- setup.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index 7cd46c282..c670755a7 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -72,4 +72,5 @@ cp -r /tmp/NvToolsExt/include/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/ cp -r /tmp/NvToolsExt/lib/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" -export TORCH_CUDA_ARCH_LIST="35;50;60;70;75;80;86" +export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" +export TORCH_CUDA_ARCH_LIST="35;50+PTX;6.0;7.0;7.5;8.0;8.6" diff --git a/setup.py b/setup.py index a5b8b5f6f..5b4cc0747 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,8 @@ def build_extension(self, ext): f'-DCMAKE_PREFIX_PATH={torch.utils.cmake_prefix_path}', ] + os.environ['TORCH_CUDA_ARCH_LIST'] = '8.0 8.6 9.0' + cuda_arch_list = os.getenv('TORCH_CUDA_ARCH_LIST') print("ARCH LIST") print("-----------") From e4cee393353170563f7fe85e61c3a6206332730b Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 05:51:51 +0000 Subject: [PATCH 38/46] update --- .github/workflows/cuda/Windows.sh | 30 +++++++----------------------- .github/workflows/install.yml | 2 ++ CMakeLists.txt | 2 -- setup.py | 12 ++++++------ 4 files changed, 15 insertions(+), 31 deletions(-) diff --git a/.github/workflows/cuda/Windows.sh b/.github/workflows/cuda/Windows.sh index c670755a7..582541da0 100644 --- a/.github/workflows/cuda/Windows.sh +++ b/.github/workflows/cuda/Windows.sh @@ -37,11 +37,6 @@ case ${1} in ;; esac -# Install NVIDIA drivers, see: -# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 -# curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "/tmp/gpu_driver_dlls.zip" -# 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" - curl -k -L "${CUDA_URL}/${CUDA_FILE}" --output "${CUDA_FILE}" echo "" echo "Installing from ${CUDA_FILE}..." @@ -49,28 +44,17 @@ PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s echo "Done!" rm -f "${CUDA_FILE}" +# echo Installing NVIDIA drivers... 
+# https://github.com/pytorch/vision/blob/master/packaging/windows/internal/cuda_install.bat#L99-L102 +# curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "/tmp/gpu_driver_dlls.zip" +# 7z x "/tmp/gpu_driver_dlls.zip" -o"/c/Windows/System32" + echo Installing NvToolsExt... -curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "/tmp/NvToolsExt.7z" -7z x "/tmp/NvToolsExt.7z" -o"/tmp/NvToolsExt" +curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output /tmp/NvToolsExt.7z +7z x /tmp/NvToolsExt.7z -o"/tmp/NvToolsExt" mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" mkdir -p "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" -echo "-------" -ls "/tmp/NvToolsExt" -echo "-------" -ls "/tmp/NvToolsExt/bin" -echo "-------" -ls "/tmp/NvToolsExt/bin/x64" -echo "-------" -ls "/tmp/NvToolsExt/include" -echo "-------" -ls "/tmp/NvToolsExt/lib" -echo "-------" -ls "/tmp/NvToolsExt/lib/x64" cp -r /tmp/NvToolsExt/bin/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" cp -r /tmp/NvToolsExt/include/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/include" cp -r /tmp/NvToolsExt/lib/x64/* "/c/Program Files/NVIDIA Corporation/NvToolsExt/lib/x64" -export NVTOOLSEXT_PATH="/c/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64" - -export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" -export TORCH_CUDA_ARCH_LIST="35;50+PTX;6.0;7.0;7.5;8.0;8.6" diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index ee189d652..6ce265025 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -32,6 +32,8 @@ jobs: source ./.github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} pip install --verbose -e . 
shell: bash + env: + TORCH_CUDA_ARCH_LIST: "3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - name: Test imports run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index 469a348fe..15973f5b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,8 +42,6 @@ endif() if(WITH_CUDA) enable_language(CUDA) add_definitions(-DWITH_CUDA) - message("CUDA FLAGS HEHEHEHEHEHE") - message("${CMAKE_CUDA_FLAGS}") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") if (NOT "$ENV{EXTERNAL_CUTLASS_INCLUDE_DIR}" STREQUAL "") diff --git a/setup.py b/setup.py index 5b4cc0747..7020c9b14 100644 --- a/setup.py +++ b/setup.py @@ -66,13 +66,13 @@ def build_extension(self, ext): f'-DCMAKE_PREFIX_PATH={torch.utils.cmake_prefix_path}', ] - os.environ['TORCH_CUDA_ARCH_LIST'] = '8.0 8.6 9.0' + # os.environ['TORCH_CUDA_ARCH_LIST'] = '8.0 8.6 9.0' - cuda_arch_list = os.getenv('TORCH_CUDA_ARCH_LIST') - print("ARCH LIST") - print("-----------") - print(cuda_arch_list) - cmake_args.append('-DCUDA_ARCH_PTX=5.0+PTX') + # cuda_arch_list = os.getenv('TORCH_CUDA_ARCH_LIST') + # print("ARCH LIST") + # print("-----------") + # print(cuda_arch_list) + # cmake_args.append('-DCUDA_ARCH_PTX=5.0+PTX') # if WITH_CUDA and cuda_arch_list is not None: # cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') # else: From 9db6ea5a19d19f79976d3c12768db13b8e1a990d Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 06:02:42 +0000 Subject: [PATCH 39/46] update --- .github/workflows/building.yml | 2 + .github/workflows/install.yml | 2 +- .github/workflows/nightly.yml | 2 + pyg_lib/csrc/ops/cpu/matmul_kernel.cpp | 307 ++++++++++++------------- setup.py | 13 -- 5 files changed, 152 insertions(+), 174 deletions(-) diff --git a/.github/workflows/building.yml b/.github/workflows/building.yml index d0398ac7f..1c19e9f21 100644 --- a/.github/workflows/building.yml +++ b/.github/workflows/building.yml @@ -112,6 +112,8 @@ jobs: source ./.github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} python setup.py bdist_wheel --dist-dir=dist shell: bash + env: + TORCH_CUDA_ARCH_LIST: "5.0+PTX;6.0;7.0;7.5;8.0;8.6" - name: Test wheel run: | diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index 6ce265025..5200ad263 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -33,7 +33,7 @@ jobs: pip install --verbose -e . 
shell: bash env: - TORCH_CUDA_ARCH_LIST: "3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" + TORCH_CUDA_ARCH_LIST: "5.0+PTX;6.0;7.0;7.5;8.0;8.6" - name: Test imports run: | diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 968350427..8aeeba1b1 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -118,6 +118,8 @@ jobs: source ./.github/workflows/cuda/${{ runner.os }}-env.sh ${{ matrix.cuda-version }} python setup.py bdist_wheel --dist-dir=dist shell: bash + env: + TORCH_CUDA_ARCH_LIST: "5.0+PTX;6.0;7.0;7.5;8.0;8.6" - name: Test wheel run: | diff --git a/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp b/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp index 73f8631d3..d75f7a2b1 100644 --- a/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp +++ b/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp @@ -86,8 +86,7 @@ void mkl_blas_gemm_batched(const int* m_array, const int* ldc_array, const int group_count, const int* group_size) { - TORCH_INTERNAL_ASSERT(false, - "mkl_blas_gemm_batched: MKL BLAS is not supported"); + TORCH_INTERNAL_ASSERT(false, "MKL BLAS is not supported"); } void mkl_blas_gemm_batched(const int* m_array, @@ -103,8 +102,7 @@ void mkl_blas_gemm_batched(const int* m_array, const int* ldc_array, const int group_count, const int* group_size) { - TORCH_INTERNAL_ASSERT(false, - "mkl_blas_gemm_batched: MKL BLAS is not supported"); + TORCH_INTERNAL_ASSERT(false, "MKL BLAS is not supported"); } #endif @@ -206,82 +204,76 @@ void grouped_matmul_out_kernel_mkl_impl(const std::vector input, const std::vector other, std::vector out) { // matrix_params - /* using matrix_params = std::tuple; */ - /* phmap::flat_hash_map> groups; */ - /* for (size_t i = 0; i < input.size(); ++i) { */ - /* const matrix_params mp = {input[i].size(0), other[i].size(-1), */ - /* input[i].size(-1)}; */ - /* if (groups.count(mp)) { */ - /* groups[mp].push_back(i); */ - /* } else { */ - /* groups.insert({mp, {i}}); */ - /* } */ - /* } */ - - /* AT_DISPATCH_FLOATING_TYPES( */ - /* input.front().scalar_type(), "grouped_matmul_out_kernel_mkl_impl", [&] - * { */ - /* const auto group_count = static_cast(groups.size()); */ - /* std::vector alpha(group_count, 1); */ - /* std::vector beta(group_count, 0); */ - - /* std::vector ms(group_count); */ - /* std::vector ns(group_count); */ - /* std::vector ks(group_count); */ - /* std::vector ld_src0(group_count); */ - /* std::vector ld_src1(group_count); */ - /* std::vector ld_dst(group_count); */ - /* std::vector group_sizes(group_count); */ - /* std::vector src0; */ - /* std::vector src1; */ - /* std::vector dst; */ - - /* size_t group_idx = 0; */ - /* for (const auto& group_kv : groups) { */ - /* int m; */ - /* int n; */ - /* int k; */ - /* std::tie(m, n, k) = group_kv.first; */ - /* const auto& indices = group_kv.second; */ - - /* ms[group_idx] = m; */ - /* ns[group_idx] = n; */ - /* ks[group_idx] = k; */ - /* ld_src0[group_idx] = k; */ - /* ld_src1[group_idx] = n; */ - /* ld_dst[group_idx] = n; */ - /* group_sizes[group_idx] = indices.size(); */ - /* ++group_idx; */ - - /* for (const auto tensor_idx : indices) { */ - /* src0.push_back(input[tensor_idx].data_ptr()); */ - /* src1.push_back(other[tensor_idx].data_ptr()); */ - /* dst.push_back(out[tensor_idx].data_ptr()); */ - /* } */ - /* } */ - - /* auto src0_ptrs = const_cast(src0.data()); */ - /* auto src1_ptrs = const_cast(src1.data()); */ - /* auto dst_ptrs = dst.data(); */ - - /* #if AT_MKL_SEQUENTIAL() */ - /* // unlikely to happen - requires Torch to be built from source with - */ - /* // explicit flag denoting MKL 
sequential version */ - /* parallel_mkl_blas_gemm_batched(ms, ns, ks, alpha, src0_ptrs, ld_src0, - */ - /* src1_ptrs, ld_src1, beta, dst_ptrs, */ - /* ld_dst, group_count, group_sizes); */ - /* #else */ - /* mkl_blas_gemm_batched(ms.data(), ns.data(), ks.data(), alpha.data(), - */ - /* src0_ptrs, ld_src0.data(), src1_ptrs, - * ld_src1.data(), */ - /* beta.data(), dst_ptrs, ld_dst.data(), - * group_count, */ - /* group_sizes.data()); */ - /* #endif */ - /* }); */ + using matrix_params = std::tuple; + phmap::flat_hash_map> groups; + for (size_t i = 0; i < input.size(); ++i) { + const matrix_params mp = {input[i].size(0), other[i].size(-1), + input[i].size(-1)}; + if (groups.count(mp)) { + groups[mp].push_back(i); + } else { + groups.insert({mp, {i}}); + } + } + + AT_DISPATCH_FLOATING_TYPES( + input.front().scalar_type(), "grouped_matmul_out_kernel_mkl_impl", [&] { + const auto group_count = static_cast(groups.size()); + std::vector alpha(group_count, 1); + std::vector beta(group_count, 0); + + std::vector ms(group_count); + std::vector ns(group_count); + std::vector ks(group_count); + std::vector ld_src0(group_count); + std::vector ld_src1(group_count); + std::vector ld_dst(group_count); + std::vector group_sizes(group_count); + std::vector src0; + std::vector src1; + std::vector dst; + + size_t group_idx = 0; + for (const auto& group_kv : groups) { + int m; + int n; + int k; + std::tie(m, n, k) = group_kv.first; + const auto& indices = group_kv.second; + + ms[group_idx] = m; + ns[group_idx] = n; + ks[group_idx] = k; + ld_src0[group_idx] = k; + ld_src1[group_idx] = n; + ld_dst[group_idx] = n; + group_sizes[group_idx] = indices.size(); + ++group_idx; + + for (const auto tensor_idx : indices) { + src0.push_back(input[tensor_idx].data_ptr()); + src1.push_back(other[tensor_idx].data_ptr()); + dst.push_back(out[tensor_idx].data_ptr()); + } + } + + auto src0_ptrs = const_cast(src0.data()); + auto src1_ptrs = const_cast(src1.data()); + auto dst_ptrs = dst.data(); + +#if AT_MKL_SEQUENTIAL() + // unlikely to happen - requires Torch to be built from source with + // explicit flag denoting MKL sequential version + parallel_mkl_blas_gemm_batched(ms, ns, ks, alpha, src0_ptrs, ld_src0, + src1_ptrs, ld_src1, beta, dst_ptrs, + ld_dst, group_count, group_sizes); +#else + mkl_blas_gemm_batched(ms.data(), ns.data(), ks.data(), alpha.data(), + src0_ptrs, ld_src0.data(), src1_ptrs, ld_src1.data(), + beta.data(), dst_ptrs, ld_dst.data(), group_count, + group_sizes.data()); +#endif + }); } std::vector grouped_matmul_kernel(const at::TensorList input, @@ -334,86 +326,81 @@ void segment_matmul_out_kernel_mkl_impl(const at::Tensor& input, const at::Tensor& other, at::Tensor& out, const at::IntArrayRef& sizes) { - /* const int n = other.size(-1); */ - /* const int k = input.size(-1); */ - /* const int nk = n * k; */ - /* phmap::flat_hash_map> groups; */ - /* std::vector offsets = {{0, 0, 0}}; */ - /* offsets.reserve(sizes.size() + 1); */ - /* for (size_t i = 0; i < sizes.size(); ++i) { */ - /* const int m = sizes[i]; */ - /* if (groups.count(m)) { */ - /* groups[m].push_back(i); */ - /* } else { */ - /* groups.insert({m, {i}}); */ - /* } */ - - /* offset_params offset = {m * k, nk, m * n}; */ - /* offset += offsets.back(); */ - /* offsets.push_back(offset); */ - /* } */ - /* offsets.pop_back(); */ - - /* AT_DISPATCH_FLOATING_TYPES( */ - /* input.scalar_type(), "segment_matmul_out_kernel_mkl_impl", [&] { */ - /* const auto group_count = static_cast(groups.size()); */ - /* std::vector alpha(group_count, 1); */ - /* 
std::vector beta(group_count, 0); */ - /* std::vector ns(group_count, n); */ - /* std::vector ks(group_count, k); */ - /* std::vector ld_src0(group_count, k); */ - /* std::vector ld_src1(group_count, n); */ - /* std::vector ld_dst(group_count, n); */ - - /* std::vector ms(group_count); */ - /* std::vector group_sizes(group_count); */ - /* std::vector src0; */ - /* std::vector src1; */ - /* std::vector dst; */ - - /* const auto src0_base_ptr = input.data_ptr(); */ - /* const auto src1_base_ptr = other.data_ptr(); */ - /* const auto dst_base_ptr = out.data_ptr(); */ - - /* size_t group_idx = 0; */ - /* for (const auto& group_kv : groups) { */ - /* int m = group_kv.first; */ - /* const auto& indices = group_kv.second; */ - - /* ms[group_idx] = m; */ - /* group_sizes[group_idx] = indices.size(); */ - /* ++group_idx; */ - - /* for (const auto offset_idx : indices) { */ - /* const auto offset = offsets[offset_idx]; */ - /* src0.push_back(src0_base_ptr + offset.src0_offset); */ - /* src1.push_back(src1_base_ptr + offset.src1_offset); */ - /* dst.push_back(dst_base_ptr + offset.dst_offset); */ - /* } */ - /* } */ - - /* auto src0_ptrs = const_cast(src0.data()); */ - /* auto src1_ptrs = const_cast(src1.data()); */ - /* auto dst_ptrs = dst.data(); */ - - /* #if AT_MKL_SEQUENTIAL() */ - /* // unlikely to happen - requires Torch to be built from source with - */ - /* // explicit flag denoting MKL sequential version */ - /* parallel_mkl_blas_gemm_batched(ms, ns, ks, alpha, src0_ptrs, ld_src0, - */ - /* src1_ptrs, ld_src1, beta, dst_ptrs, */ - /* ld_dst, group_count, group_sizes); */ - /* #else */ - /* mkl_blas_gemm_batched(ms.data(), ns.data(), ks.data(), alpha.data(), - */ - /* src0_ptrs, ld_src0.data(), src1_ptrs, - * ld_src1.data(), */ - /* beta.data(), dst_ptrs, ld_dst.data(), - * group_count, */ - /* group_sizes.data()); */ - /* #endif */ - /* }); */ + const int n = other.size(-1); + const int k = input.size(-1); + const int nk = n * k; + phmap::flat_hash_map> groups; + std::vector offsets = {{0, 0, 0}}; + offsets.reserve(sizes.size() + 1); + for (size_t i = 0; i < sizes.size(); ++i) { + const int m = sizes[i]; + if (groups.count(m)) { + groups[m].push_back(i); + } else { + groups.insert({m, {i}}); + } + + offset_params offset = {m * k, nk, m * n}; + offset += offsets.back(); + offsets.push_back(offset); + } + offsets.pop_back(); + + AT_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "segment_matmul_out_kernel_mkl_impl", [&] { + const auto group_count = static_cast(groups.size()); + std::vector alpha(group_count, 1); + std::vector beta(group_count, 0); + std::vector ns(group_count, n); + std::vector ks(group_count, k); + std::vector ld_src0(group_count, k); + std::vector ld_src1(group_count, n); + std::vector ld_dst(group_count, n); + + std::vector ms(group_count); + std::vector group_sizes(group_count); + std::vector src0; + std::vector src1; + std::vector dst; + + const auto src0_base_ptr = input.data_ptr(); + const auto src1_base_ptr = other.data_ptr(); + const auto dst_base_ptr = out.data_ptr(); + + size_t group_idx = 0; + for (const auto& group_kv : groups) { + int m = group_kv.first; + const auto& indices = group_kv.second; + + ms[group_idx] = m; + group_sizes[group_idx] = indices.size(); + ++group_idx; + + for (const auto offset_idx : indices) { + const auto offset = offsets[offset_idx]; + src0.push_back(src0_base_ptr + offset.src0_offset); + src1.push_back(src1_base_ptr + offset.src1_offset); + dst.push_back(dst_base_ptr + offset.dst_offset); + } + } + + auto src0_ptrs = 
const_cast(src0.data()); + auto src1_ptrs = const_cast(src1.data()); + auto dst_ptrs = dst.data(); + +#if AT_MKL_SEQUENTIAL() + // unlikely to happen - requires Torch to be built from source with + // explicit flag denoting MKL sequential version + parallel_mkl_blas_gemm_batched(ms, ns, ks, alpha, src0_ptrs, ld_src0, + src1_ptrs, ld_src1, beta, dst_ptrs, + ld_dst, group_count, group_sizes); +#else + mkl_blas_gemm_batched(ms.data(), ns.data(), ks.data(), alpha.data(), + src0_ptrs, ld_src0.data(), src1_ptrs, ld_src1.data(), + beta.data(), dst_ptrs, ld_dst.data(), group_count, + group_sizes.data()); +#endif + }); } at::Tensor segment_matmul_kernel(const at::Tensor& input, diff --git a/setup.py b/setup.py index 7020c9b14..cbb4ecddd 100644 --- a/setup.py +++ b/setup.py @@ -66,19 +66,6 @@ def build_extension(self, ext): f'-DCMAKE_PREFIX_PATH={torch.utils.cmake_prefix_path}', ] - # os.environ['TORCH_CUDA_ARCH_LIST'] = '8.0 8.6 9.0' - - # cuda_arch_list = os.getenv('TORCH_CUDA_ARCH_LIST') - # print("ARCH LIST") - # print("-----------") - # print(cuda_arch_list) - # cmake_args.append('-DCUDA_ARCH_PTX=5.0+PTX') - # if WITH_CUDA and cuda_arch_list is not None: - # cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') - # else: - # cuda_arch_list = "50;60;70;75;80;86" - # cmake_args.append(f'-DCMAKE_CUDA_ARCHITECTURES={cuda_arch_list}') - if CMakeBuild.check_env_flag('USE_MKL_BLAS'): include_dir = f"{sysconfig.get_path('data')}{os.sep}include" cmake_args.append(f'-DBLAS_INCLUDE_DIR={include_dir}') From b777858b4cc0caf6f9a76dbaa33a6b2b0d19255d Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 06:43:07 +0000 Subject: [PATCH 40/46] update --- pyg_lib/csrc/ops/cpu/matmul_kernel.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp b/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp index d75f7a2b1..20ee14f44 100644 --- a/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp +++ b/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp @@ -261,6 +261,7 @@ void grouped_matmul_out_kernel_mkl_impl(const std::vector input, auto src1_ptrs = const_cast(src1.data()); auto dst_ptrs = dst.data(); +#if WITH_MKL_BLAS() #if AT_MKL_SEQUENTIAL() // unlikely to happen - requires Torch to be built from source with // explicit flag denoting MKL sequential version @@ -272,6 +273,7 @@ void grouped_matmul_out_kernel_mkl_impl(const std::vector input, src0_ptrs, ld_src0.data(), src1_ptrs, ld_src1.data(), beta.data(), dst_ptrs, ld_dst.data(), group_count, group_sizes.data()); +#endif #endif }); } @@ -388,6 +390,7 @@ void segment_matmul_out_kernel_mkl_impl(const at::Tensor& input, auto src1_ptrs = const_cast(src1.data()); auto dst_ptrs = dst.data(); +#if WITH_MKL_BLAS() #if AT_MKL_SEQUENTIAL() // unlikely to happen - requires Torch to be built from source with // explicit flag denoting MKL sequential version @@ -399,6 +402,7 @@ void segment_matmul_out_kernel_mkl_impl(const at::Tensor& input, src0_ptrs, ld_src0.data(), src1_ptrs, ld_src1.data(), beta.data(), dst_ptrs, ld_dst.data(), group_count, group_sizes.data()); +#endif #endif }); } From c35f37baebaddb70751f3475ba9c052443d28266 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 06:52:32 +0000 Subject: [PATCH 41/46] update --- pyg_lib/csrc/ops/cpu/matmul_kernel.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp b/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp index 20ee14f44..21b41d843 100644 --- a/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp +++ b/pyg_lib/csrc/ops/cpu/matmul_kernel.cpp @@ 
-203,6 +203,7 @@ void grouped_matmul_out_kernel_at_impl(const std::vector input, void grouped_matmul_out_kernel_mkl_impl(const std::vector input, const std::vector other, std::vector out) { +#if WITH_MKL_BLAS() // matrix_params using matrix_params = std::tuple; phmap::flat_hash_map> groups; @@ -261,7 +262,6 @@ void grouped_matmul_out_kernel_mkl_impl(const std::vector input, auto src1_ptrs = const_cast(src1.data()); auto dst_ptrs = dst.data(); -#if WITH_MKL_BLAS() #if AT_MKL_SEQUENTIAL() // unlikely to happen - requires Torch to be built from source with // explicit flag denoting MKL sequential version @@ -273,9 +273,9 @@ void grouped_matmul_out_kernel_mkl_impl(const std::vector input, src0_ptrs, ld_src0.data(), src1_ptrs, ld_src1.data(), beta.data(), dst_ptrs, ld_dst.data(), group_count, group_sizes.data()); -#endif #endif }); +#endif } std::vector grouped_matmul_kernel(const at::TensorList input, @@ -328,6 +328,7 @@ void segment_matmul_out_kernel_mkl_impl(const at::Tensor& input, const at::Tensor& other, at::Tensor& out, const at::IntArrayRef& sizes) { +#if WITH_MKL_BLAS() const int n = other.size(-1); const int k = input.size(-1); const int nk = n * k; @@ -390,7 +391,6 @@ void segment_matmul_out_kernel_mkl_impl(const at::Tensor& input, auto src1_ptrs = const_cast(src1.data()); auto dst_ptrs = dst.data(); -#if WITH_MKL_BLAS() #if AT_MKL_SEQUENTIAL() // unlikely to happen - requires Torch to be built from source with // explicit flag denoting MKL sequential version @@ -402,9 +402,9 @@ void segment_matmul_out_kernel_mkl_impl(const at::Tensor& input, src0_ptrs, ld_src0.data(), src1_ptrs, ld_src1.data(), beta.data(), dst_ptrs, ld_dst.data(), group_count, group_sizes.data()); -#endif #endif }); +#endif } at::Tensor segment_matmul_kernel(const at::Tensor& input, From 7ffb312b288edde99b733c410c9d94c8eb2489ba Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 07:55:16 +0000 Subject: [PATCH 42/46] update --- CMakeLists.txt | 20 ++++++++++---------- pyg_lib/csrc/partition/cpu/metis_kernel.cpp | 20 +++++++++----------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 15973f5b7..1e971f167 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,16 +73,16 @@ else() target_include_directories(${PROJECT_NAME} PRIVATE ${PHMAP_DIR}) endif() -# set(METIS_DIR third_party/METIS) -# target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_DIR}/include) -# set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") -# set(GKLIB_PATH "${METIS_DIR}/GKlib") -# include(${GKLIB_PATH}/GKlibSystem.cmake) -# include_directories(${GKLIB_PATH}) -# include_directories("${METIS_DIR}/include") -# add_subdirectory("${METIS_DIR}/libmetis") -# target_link_libraries(${PROJECT_NAME} PRIVATE metis) +set(METIS_DIR third_party/METIS) +target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_DIR}/include) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") +set(GKLIB_PATH "${METIS_DIR}/GKlib") +include(${GKLIB_PATH}/GKlibSystem.cmake) +include_directories(${GKLIB_PATH}) +include_directories("${METIS_DIR}/include") +add_subdirectory("${METIS_DIR}/libmetis") +target_link_libraries(${PROJECT_NAME} PRIVATE metis) find_package(Torch REQUIRED) target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES}) diff --git a/pyg_lib/csrc/partition/cpu/metis_kernel.cpp 
b/pyg_lib/csrc/partition/cpu/metis_kernel.cpp index 7430574f2..df516224f 100644 --- a/pyg_lib/csrc/partition/cpu/metis_kernel.cpp +++ b/pyg_lib/csrc/partition/cpu/metis_kernel.cpp @@ -1,7 +1,7 @@ #include #include -/* #include */ +#include namespace pyg { namespace partition { @@ -31,16 +31,14 @@ at::Tensor metis_kernel(const at::Tensor& rowptr, auto part = at::empty({nvtxs}, rowptr.options()); auto part_data = part.data_ptr(); - /* if (recursive) { */ - /* METIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt, - */ - /* &num_partitions, NULL, NULL, NULL, &objval, */ - /* part_data); */ - /* } else { */ - /* METIS_PartGraphKway(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt, */ - /* &num_partitions, NULL, NULL, NULL, &objval, - * part_data); */ - /* } */ + if (recursive) { + METIS_PartGraphRecursive(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt, + &num_partitions, NULL, NULL, NULL, &objval, + part_data); + } else { + METIS_PartGraphKway(&nvtxs, &ncon, xadj, adjncy, vwgt, NULL, adjwgt, + &num_partitions, NULL, NULL, NULL, &objval, part_data); + } return part; } From 61a5725c42b49ae475e2822453dcb1d63dd32b34 Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 08:04:38 +0000 Subject: [PATCH 43/46] update --- .github/workflows/building.yml | 1 - .github/workflows/nightly.yml | 1 - CMakeLists.txt | 21 ++++++++++++++------- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.github/workflows/building.yml b/.github/workflows/building.yml index 1c19e9f21..442ed1c0c 100644 --- a/.github/workflows/building.yml +++ b/.github/workflows/building.yml @@ -15,7 +15,6 @@ jobs: torch-version: [1.12.0, 1.13.0, 2.0.0, 2.1.0, 2.2.0] cuda-version: ['cpu', 'cu113', 'cu116', 'cu117', 'cu118', 'cu121'] exclude: - - os: windows-2019 # No windows support yet :( - torch-version: 1.12.0 python-version: '3.12' - torch-version: 1.13.0 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 8aeeba1b1..f23c6721f 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -19,7 +19,6 @@ jobs: torch-version: [1.12.0, 1.13.0, 2.0.0, 2.1.0, 2.2.0] cuda-version: ['cpu', 'cu113', 'cu116', 'cu117', 'cu118', 'cu121'] exclude: - - os: windows-2019 # No windows support yet :( - torch-version: 1.12.0 python-version: '3.12' - torch-version: 1.13.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e971f167..cb54d51d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,13 +75,20 @@ endif() set(METIS_DIR third_party/METIS) target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_DIR}/include) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") -set(GKLIB_PATH "${METIS_DIR}/GKlib") -include(${GKLIB_PATH}/GKlibSystem.cmake) -include_directories(${GKLIB_PATH}) -include_directories("${METIS_DIR}/include") -add_subdirectory("${METIS_DIR}/libmetis") +if (MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") +else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") +endif() +if (NOT MSVC) + set(GKLIB_PATH "${METIS_DIR}/GKlib") + include(${GKLIB_PATH}/GKlibSystem.cmake) + include_directories(${GKLIB_PATH}) + include_directories("${METIS_DIR}/include") + add_subdirectory("${METIS_DIR}/libmetis") +endif() 
target_link_libraries(${PROJECT_NAME} PRIVATE metis) find_package(Torch REQUIRED) From 4f0dd5a5cf1bb91e999f85d472ef45f17faee74a Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 08:12:04 +0000 Subject: [PATCH 44/46] update --- CMakeLists.txt | 7 ++++--- pyg_lib/csrc/partition/cpu/metis_kernel.cpp | 6 ++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb54d51d6..187a81073 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,8 +73,6 @@ else() target_include_directories(${PROJECT_NAME} PRIVATE ${PHMAP_DIR}) endif() -set(METIS_DIR third_party/METIS) -target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_DIR}/include) if (MSVC) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /DIDXTYPEWIDTH=64 /DREALTYPEWIDTH=32") @@ -82,14 +80,17 @@ else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIDXTYPEWIDTH=64 -DREALTYPEWIDTH=32") endif() + if (NOT MSVC) + set(METIS_DIR third_party/METIS) + target_include_directories(${PROJECT_NAME} PRIVATE ${METIS_DIR}/include) set(GKLIB_PATH "${METIS_DIR}/GKlib") include(${GKLIB_PATH}/GKlibSystem.cmake) include_directories(${GKLIB_PATH}) include_directories("${METIS_DIR}/include") add_subdirectory("${METIS_DIR}/libmetis") + target_link_libraries(${PROJECT_NAME} PRIVATE metis) endif() -target_link_libraries(${PROJECT_NAME} PRIVATE metis) find_package(Torch REQUIRED) target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES}) diff --git a/pyg_lib/csrc/partition/cpu/metis_kernel.cpp b/pyg_lib/csrc/partition/cpu/metis_kernel.cpp index df516224f..56f6da886 100644 --- a/pyg_lib/csrc/partition/cpu/metis_kernel.cpp +++ b/pyg_lib/csrc/partition/cpu/metis_kernel.cpp @@ -1,7 +1,9 @@ #include #include +#ifndef _WIN32 #include +#endif namespace pyg { namespace partition { @@ -14,6 +16,9 @@ at::Tensor metis_kernel(const at::Tensor& rowptr, const c10::optional& node_weight, const c10::optional& edge_weight, bool recursive) { +#ifdef _WIN32 + TORCH_INTERNAL_ASSERT(false, "METIS not yet supported on Windows"); +#else int64_t nvtxs = rowptr.numel() - 1; int64_t ncon = 1; auto* xadj = rowptr.data_ptr(); @@ -41,6 +46,7 @@ at::Tensor metis_kernel(const at::Tensor& rowptr, } return part; +#endif } } // namespace From 55b636fc35d052a8311c4abbace9bb6f158eaedb Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 08:42:25 +0000 Subject: [PATCH 45/46] update --- .github/workflows/install.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/install.yml b/.github/workflows/install.yml index 5200ad263..809bca3c0 100644 --- a/.github/workflows/install.yml +++ b/.github/workflows/install.yml @@ -9,11 +9,10 @@ on: # yamllint disable-line rule:truthy jobs: import: - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest strategy: matrix: - os: [windows-2019] cuda-version: ['cpu', 'cu121'] steps: From e5edd0c3f1560643ece0461ebd2c9f579aec8a9f Mon Sep 17 00:00:00 2001 From: rusty1s Date: Mon, 8 Apr 2024 11:13:44 +0000 Subject: [PATCH 46/46] update --- .github/workflows/building.yml | 3 +++ .github/workflows/nightly.yml | 3 +++ README.md | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/building.yml b/.github/workflows/building.yml index 442ed1c0c..6c7f81447 100644 --- a/.github/workflows/building.yml +++ b/.github/workflows/building.yml @@ -71,6 +71,9 @@ jobs: python-version: '3.8' - os: macos-14 python-version: 
'3.9' + - os: windows-2019 + torch-version: 2.0.0 + cuda-version: 'cu121' steps: - name: Checkout repository diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index f23c6721f..d3653b719 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -75,6 +75,9 @@ jobs: python-version: '3.8' - os: macos-14 python-version: '3.9' + - os: windows-2019 + torch-version: 2.0.0 + cuda-version: 'cu121' steps: - name: Checkout repository diff --git a/README.md b/README.md index 81ec7fb0d..e71c6b79f 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ The following combinations are supported: | PyTorch 2.0 | `cpu` | `cu102` | `cu113` | `cu116` | `cu117` | `cu118` | `cu121` | |--------------|-------|---------|---------|---------|---------|---------|---------| -| **Linux** | ✅ | | | | ✅ | ✅ | | +| **Linux** | ✅ | | | | ✅ | ✅ | ✅ | | **Windows** | ✅ | | | | ✅ | ✅ | | | **macOS** | ✅ | | | | | | |
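The MKL code path restored in pyg_lib/csrc/ops/cpu/matmul_kernel.cpp (grouped_matmul_out_kernel_mkl_impl) buckets the input/other matrix pairs by their (m, n, k) shapes, so every bucket can be dispatched through a single batched GEMM call instead of one GEMM per pair. The Python sketch below only illustrates that grouping idea under simplifying assumptions — grouped_matmul_reference and the use of torch.bmm are placeholders for exposition, not pyg-lib's API or the kernel itself:

from collections import defaultdict

import torch


def grouped_matmul_reference(inputs, others):
    # Bucket (input, other) pairs by (m, n, k) so that each bucket maps onto
    # one batched GEMM, mirroring the grouping done in the MKL kernel.
    groups = defaultdict(list)
    for i, (a, b) in enumerate(zip(inputs, others)):
        (m, k), n = a.shape, b.shape[-1]
        groups[(m, n, k)].append(i)

    out = [None] * len(inputs)
    for indices in groups.values():
        a = torch.stack([inputs[i] for i in indices])  # [B, m, k]
        b = torch.stack([others[i] for i in indices])  # [B, k, n]
        c = torch.bmm(a, b)  # one batched GEMM per shape bucket
        for j, i in enumerate(indices):
            out[i] = c[j]
    return out


# Two shape buckets -> two batched GEMMs instead of four individual GEMMs:
inputs = [torch.randn(4, 8), torch.randn(4, 8), torch.randn(6, 8), torch.randn(6, 8)]
others = [torch.randn(8, 3) for _ in range(4)]
outs = grouped_matmul_reference(inputs, others)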
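With the CMAKE_CUDA_ARCHITECTURES handling dropped from setup.py, the CUDA architecture list is now supplied per build step through the TORCH_CUDA_ARCH_LIST environment variable ("5.0+PTX;6.0;7.0;7.5;8.0;8.6" in install.yml, building.yml, and nightly.yml) and presumably picked up by PyTorch's own CMake tooling during the extension build. A minimal local sanity check, assuming the same semicolon-separated format the workflows use (the fallback value here is simply the workflows' list, not a pyg-lib default):

import os

# Print the CUDA architectures a local build would target; falls back to the
# list the CI workflows set when TORCH_CUDA_ARCH_LIST is not exported.
arch_list = os.getenv('TORCH_CUDA_ARCH_LIST', '5.0+PTX;6.0;7.0;7.5;8.0;8.6')
print([arch for arch in arch_list.replace(';', ' ').split() if arch])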