From 427da00d053ccd1d36b826a03118a3b8e1446141 Mon Sep 17 00:00:00 2001
From: Manolis Papadakis <manopapad@gmail.com>
Date: Wed, 11 Sep 2024 11:20:36 -0700
Subject: [PATCH] 24.06.01 release

---
 .github/workflows/ci-gh-nightly-release.yml |  5 +++
 .github/workflows/ci-gh-release.yml         |  3 ++
 .github/workflows/gh-build-and-test.yml     | 28 +++++++++++------
 CMakeLists.txt                              |  2 +-
 README.md                                   |  8 +++++
 cmake/versions.json                         |  6 ++--
 conda/conda-build/meta.yaml                 | 25 +++++++++------
 src/cunumeric/item/write.cc                 |  8 ++---
 src/cunumeric/item/write.cu                 | 14 ++++-----
 src/cunumeric/item/write_template.inl       | 11 ++++---
 tests/cpp/integration/util.inl              | 35 ++++++++-------------
 tests/integration/test_singleton_access.py  |  8 ++---
 12 files changed, 88 insertions(+), 65 deletions(-)

diff --git a/.github/workflows/ci-gh-nightly-release.yml b/.github/workflows/ci-gh-nightly-release.yml
index 0540d2b8f..598d76b5a 100644
--- a/.github/workflows/ci-gh-nightly-release.yml
+++ b/.github/workflows/ci-gh-nightly-release.yml
@@ -23,6 +23,10 @@ jobs:
         upload-enabled:
           - true
           - false
+        python-version:
+          - "3.10"
+          - "3.11"
+          - "3.12"
     uses:
       ./.github/workflows/gh-build-and-test.yml
     with:
@@ -30,5 +34,6 @@ jobs:
       platform: ${{ matrix.platform }}
       build-type: release
       upload-enabled: ${{ matrix.upload-enabled }}
+      python-version: ${{ matrix.python-version }}
       dependencies-workflow: "ci-gh-nightly-release.yml"
     secrets: inherit
diff --git a/.github/workflows/ci-gh-release.yml b/.github/workflows/ci-gh-release.yml
index 98bb737c5..2e636687a 100644
--- a/.github/workflows/ci-gh-release.yml
+++ b/.github/workflows/ci-gh-release.yml
@@ -25,6 +25,8 @@ jobs:
           - cpu
         upload-enabled:
           - false
+        python-version:
+          - "3.12"
         exclude:
           - platform: linux-aarch64
             target-device: gpu
@@ -35,5 +37,6 @@ jobs:
       platform: ${{ matrix.platform }}
       build-type: release
       upload-enabled: ${{ matrix.upload-enabled }}
+      python-version: ${{ matrix.python-version }}
       dependencies-workflow: "ci-gh-nightly-release.yml"
     secrets: inherit
diff --git a/.github/workflows/gh-build-and-test.yml b/.github/workflows/gh-build-and-test.yml
index b71d771e5..4322b89ee 100644
--- a/.github/workflows/gh-build-and-test.yml
+++ b/.github/workflows/gh-build-and-test.yml
@@ -17,6 +17,10 @@ on:
         required: true
         type: string
         description: The workflow file name used by the dependency
+      python-version:
+        required: false
+        type: string
+        default: "3.12"
 
 jobs:
   setup-build:
@@ -41,9 +45,9 @@ jobs:
 
   build:
     needs: setup-build
-    name: "Build (${{ inputs.platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }})"
+    name: "Build (${{ inputs.platform }}, ${{ inputs.target-device }}, ${{ inputs.build-type }}, Python ${{ inputs.python-version }})"
     uses:
-      nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.8
+      nv-legate/legate-gh-ci/.github/workflows/gh-build.yml@v1.9
     with:
       client-repo: ${{ github.event.repository.name }}
       target-device: ${{ inputs.target-device }}
@@ -53,10 +57,11 @@ jobs:
       platform: ${{ inputs.platform }}
       dependencies-file: "cmake/versions.json"
       dependencies-workflow: ${{ inputs.dependencies-workflow }}
-      legate-gh-ci-tag: "v1.8"
+      legate-gh-ci-tag: "v1.9"
       build-mode: ""
       ucx-enabled: false
       upload-enabled: ${{ inputs.upload-enabled }}
+      python-version: ${{ inputs.python-version }}
     secrets: inherit
 
 
@@ -65,20 +70,21 @@ jobs:
     if: ${{ github.repository_owner == 'nv-legate' && contains(github.workflow, 'release') && inputs.upload-enabled == true }}
     name: Upload package to Server
     uses:
-      nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.8
+      nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.9
     with:
       client-repo: ${{ github.event.repository.name }}
       build-type: ${{ inputs.build-type }}
       name: Upload package to Server
       target-device: ${{ inputs.target-device }}
       platform: ${{ inputs.platform }}
-      legate-gh-ci-tag: "v1.8"
+      legate-gh-ci-tag: "v1.9"
       build-mode: ""
       ucx-enabled: false
       upload-enabled: ${{ inputs.upload-enabled }}
       upload-action: "upload-package"
       pkgSubString: "cunumeric-"
       repos-Root: "cunumeric"
+      python-version: ${{ inputs.python-version }}
     secrets: inherit    
 
   setup-test:
@@ -150,7 +156,7 @@ jobs:
       matrix: ${{fromJson(needs.setup-test.outputs.matrix)}}
 
     uses:
-      nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.8
+      nv-legate/legate-gh-ci/.github/workflows/gh-test-within-container.yml@v1.9
     with:
       client-repo: ${{ github.event.repository.name }}
       build-type: ${{ inputs.build-type }}
@@ -160,10 +166,11 @@ jobs:
       has-gpu: ${{ matrix.runner.type == 'gpu' }}
       test-options: ${{ matrix.test-config.test-options }}
       platform: ${{ inputs.platform }}
-      legate-gh-ci-tag: "v1.8"
+      legate-gh-ci-tag: "v1.9"
       build-mode: ""
       ucx-enabled: false
       upload-enabled: ${{ inputs.upload-enabled }}
+      python-version: ${{ inputs.python-version }}
     secrets: inherit
 
   updateTestStatus:
@@ -171,18 +178,19 @@ jobs:
     name: Update Test status on Server
     if: ${{ (github.repository_owner == 'nv-legate') && contains(github.workflow, 'Nightly') && (inputs.upload-enabled == true) }}
     uses:
-      nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.8
+      nv-legate/legate-gh-ci/.github/workflows/gh-upload.yml@v1.9
     with:
       client-repo: ${{ github.event.repository.name }}
       build-type: ${{ inputs.build-type }}
       name: UpdateTestStatus
       target-device: ${{ inputs.target-device }}
       platform: ${{ inputs.platform }}
-      legate-gh-ci-tag: "v1.8"
+      legate-gh-ci-tag: "v1.9"
       build-mode: ""
       ucx-enabled: false
       upload-enabled: true
       upload-action: "update-test-status"
       pkgSubString: "cunumeric-"
-      repos-Root: "cunumeric"  
+      repos-Root: "cunumeric"
+      python-version: ${{ inputs.python-version }}
     secrets: inherit
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f51994da..ced20b896 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,7 +57,7 @@ include(rapids-cuda)
 include(rapids-export)
 include(rapids-find)
 
-set(cunumeric_version 24.05.00)
+set(cunumeric_version 24.06.00)
 
 # For now we want the optimization flags to match on both normal make and cmake
 # builds so we override the cmake defaults here for release, this changes
diff --git a/README.md b/README.md
index 262b62873..a92dcbd81 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,14 @@ or install it into an existing environment:
 conda install -c conda-forge -c legate cunumeric
 ```
 
+In an environment without GPUs available, `conda install` will by default choose a CPU-only package.
+To install a version with GPU support in such an environment, set the `CONDA_OVERRIDE_CUDA` environment variable:
+
+```shell
+CONDA_OVERRIDE_CUDA="12.2" \
+  conda install -c conda-forge -c legate cunumeric
+```
+
 Once installed, you can verify the installation by running one of the examples
 from the cuNumeric repository, for instance:
 
diff --git a/cmake/versions.json b/cmake/versions.json
index 194580bed..ff2061f52 100644
--- a/cmake/versions.json
+++ b/cmake/versions.json
@@ -2,12 +2,12 @@
   "packages" : {
     "legate_core" : {
       "repo": "legate.core.internal",
-      "artifact_name": "${{ inputs.platform }}-${{ inputs.build-type }}-<<repo>>-${{ inputs.target-device }}-release-<<git_tag>>",
-      "version": "24.05.00",
+      "artifact_name": "${{ inputs.platform }}-${{ inputs.build-type }}-<<repo>>-python${{ env.PYTHON_VERSION }}-${{ inputs.target-device }}-release-<<git_tag>>",
+      "version": "24.06.00",
       "git_url" : "git@github.com:nv-legate/legate.core.internal.git",
       "git_shallow": false,
       "always_download": false,
-      "git_tag" : "2e1ca409a4f67593aeb859834085919907e9e531"
+      "git_tag" : "6f1c6e55789be286ec8e2e94dc1d95e5dbbc10a2"
     }
   }
 }
diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
index 98d457eb5..57747a3c7 100644
--- a/conda/conda-build/meta.yaml
+++ b/conda/conda-build/meta.yaml
@@ -84,12 +84,15 @@ build:
     - AWS_SECRET_ACCESS_KEY
 {% if not gpu_enabled_bool %}
     - CPU_ONLY=1
+  # Giving the CPU-only packages more track_features than the GPU builds makes
+  # the solver prefer the GPU builds when both are viable candidates.
+  # ref: https://docs.conda.io/projects/conda-build/en/latest/resources/define-metadata.html#track-features
   track_features:
     - cpu_only
 {% endif %}
 
 ignore_run_exports_from:
-  # scikit-build should really be a part of the build env, but then it installs its own Python.  Conda build stacks 
+  # scikit-build should really be a part of the build env, but then it installs its own Python.  Conda build stacks
   # the build environment on the host environment, and the build python takes over causing paths havoc.  So, we put
   # scikit-build into the host env, but we ignore any exports it may bring.
   - scikit-build
@@ -101,12 +104,11 @@ requirements:
     - cmake {{ cmake_version }}
     - {{ compiler('c') }} =11.2
     - {{ compiler('cxx') }} =11.2
-        # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported.
-
-    - cuda-nvcc ={{ cuda_version }}
+    # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported.
+    - cuda-nvcc
     # cudart needed for CPU and GPU builds because of curand
-
-    - cuda-cudart-dev ={{ cuda_version }}
+    - cuda-cudart-dev
+    - cuda-version ={{ cuda_version }}
 
 
   host:
@@ -125,6 +127,7 @@ requirements:
 {% else %}
     - legate-core >={{ core_version }} =*_cpu*
 {% endif %}
+    - cuda-version ={{ cuda_version }}
 
   run:
     - numpy {{ numpy_version }}
@@ -132,12 +135,16 @@ requirements:
     - libcusparse
     - opt_einsum >=3.3
     - scipy
+    - openblas =* =*openmp*
+    # Pin to the CUDA minor version built against, or any newer minor version within the same major version.
+    # cuda-version constrains the CUDA runtime version and ensures a compatible driver is available
+    - {{ pin_compatible('cuda-version', min_pin='x.x', max_pin='x') }}
+{% if gpu_enabled_bool %}
+    - __cuda >={{ cuda_version }}
+{% endif %}
 
   run_constrained:
     - __glibc >=2.17  # [linux]
-{% if gpu_enabled_bool %}
-    - __cuda
-{% endif %}
 
 about:
   home: https://github.com/nv-legate/cunumeric
diff --git a/src/cunumeric/item/write.cc b/src/cunumeric/item/write.cc
index bede37529..34cd747c6 100644
--- a/src/cunumeric/item/write.cc
+++ b/src/cunumeric/item/write.cc
@@ -21,11 +21,11 @@ namespace cunumeric {
 
 using namespace legate;
 
-template <typename VAL>
-struct WriteImplBody<VariantKind::CPU, VAL> {
-  void operator()(AccessorWO<VAL, 1> out, const AccessorRO<VAL, 1>& value) const
+template <typename VAL, int DIM>
+struct WriteImplBody<VariantKind::CPU, VAL, DIM> {
+  void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, DIM>& value) const
   {
-    out[0] = value[0];
+    out[0] = value[Point<DIM>::ZEROES()];
   }
 };
 
diff --git a/src/cunumeric/item/write.cu b/src/cunumeric/item/write.cu
index 73aeb615e..aa056c9b8 100644
--- a/src/cunumeric/item/write.cu
+++ b/src/cunumeric/item/write.cu
@@ -20,19 +20,19 @@
 
 namespace cunumeric {
 
-template <typename VAL>
+template <typename VAL, int DIM>
 static __global__ void __launch_bounds__(1, 1)
-  write_value(const AccessorWO<VAL, 1> out, const AccessorRO<VAL, 1> value)
+  write_value(const AccessorWO<VAL, 1> out, const AccessorRO<VAL, DIM> value)
 {
-  out[0] = value[0];
+  out[0] = value[Point<DIM>::ZEROES()];
 }
 
-template <typename VAL>
-struct WriteImplBody<VariantKind::GPU, VAL> {
-  void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, 1>& value) const
+template <typename VAL, int DIM>
+struct WriteImplBody<VariantKind::GPU, VAL, DIM> {
+  void operator()(const AccessorWO<VAL, 1>& out, const AccessorRO<VAL, DIM>& value) const
   {
     auto stream = get_cached_stream();
-    write_value<VAL><<<1, 1, 0, stream>>>(out, value);
+    write_value<VAL, DIM><<<1, 1, 0, stream>>>(out, value);
     CUNUMERIC_CHECK_CUDA_STREAM(stream);
   }
 };
diff --git a/src/cunumeric/item/write_template.inl b/src/cunumeric/item/write_template.inl
index e8ba95fa1..a7f828efa 100644
--- a/src/cunumeric/item/write_template.inl
+++ b/src/cunumeric/item/write_template.inl
@@ -23,18 +23,18 @@ namespace cunumeric {
 
 using namespace legate;
 
-template <VariantKind KIND, typename VAL>
+template <VariantKind KIND, typename VAL, int DIM>
 struct WriteImplBody;
 
 template <VariantKind KIND>
 struct WriteImpl {
-  template <Type::Code CODE>
+  template <Type::Code CODE, int DIM>
   void operator()(legate::PhysicalStore out_arr, legate::PhysicalStore in_arr) const
   {
     using VAL = type_of<CODE>;
     auto out  = out_arr.write_accessor<VAL, 1>();
-    auto in   = in_arr.read_accessor<VAL, 1>();
-    WriteImplBody<KIND, VAL>()(out, in);
+    auto in   = in_arr.read_accessor<VAL, DIM>();
+    WriteImplBody<KIND, VAL, DIM>()(out, in);
   }
 };
 
@@ -43,7 +43,8 @@ static void write_template(TaskContext& context)
 {
   auto in  = context.input(0);
   auto out = context.output(0);
-  type_dispatch(out.type().code(), WriteImpl<KIND>{}, out, in);
+  auto dim = std::max(1, in.dim());
+  legate::double_dispatch(dim, out.type().code(), WriteImpl<KIND>(), out, in);
 }
 
 }  // namespace cunumeric
diff --git a/tests/cpp/integration/util.inl b/tests/cpp/integration/util.inl
index ccb203ea9..79bb33156 100644
--- a/tests/cpp/integration/util.inl
+++ b/tests/cpp/integration/util.inl
@@ -15,32 +15,23 @@
  */
 
 namespace {
+
+template <typename T, typename U = void>
+struct has_operator_left_shift : std::false_type {};
+
 template <typename T>
-std::stringstream& print_value(std::stringstream& ss, T value)
-{
-  ss << value;
-  return ss;
-}
+struct has_operator_left_shift<T, std::void_t<decltype(std::cout << std::declval<T>())>>
+  : std::true_type {};
 
-template <>
-std::stringstream& print_value<complex<float>>(std::stringstream& ss, complex<float> value)
-{
-  // operator<< missing for cuda::std::complex
-  // The issue is going to be fixed in the next cuda release.
-#if CUDART_VERSION >= 12050
-  ss << value;
-#endif
-  return ss;
-}
+template <typename T>
+constexpr bool has_operator_left_shift_v = has_operator_left_shift<T>::value;
 
-template <>
-std::stringstream& print_value<complex<double>>(std::stringstream& ss, complex<double> value)
+template <typename T>
+std::stringstream& print_value(std::stringstream& ss, T value)
 {
-  // operator<< missing for cuda::std::complex
-  // The issue is going to be fixed in the next cuda release.
-#if CUDART_VERSION >= 12050
-  ss << value;
-#endif
+  if constexpr (has_operator_left_shift_v<T>) {
+    ss << value;
+  }
   return ss;
 }
 
diff --git a/tests/integration/test_singleton_access.py b/tests/integration/test_singleton_access.py
index a719f42bf..8d146a35b 100644
--- a/tests/integration/test_singleton_access.py
+++ b/tests/integration/test_singleton_access.py
@@ -64,11 +64,11 @@ def array_gen(lib):
         yield arr
     for arr in nonscalar_gen(lib):
         idx_tuple = arr.ndim * (2,)
-        arr[idx_tuple] = -1
+        arr[idx_tuple] = lib.full((1,), -1)
         yield arr
     for arr in nonscalar_gen(lib):
         idx_tuple = arr.ndim * (2,)
-        arr[idx_tuple] = -1
+        arr[idx_tuple] = lib.full((1, 1), -1)
         yield arr
     # set single item on scalar array
     for arr in scalar_gen(lib, 42):
@@ -77,11 +77,11 @@ def array_gen(lib):
         yield arr
     for arr in scalar_gen(lib, 42):
         idx_tuple = arr.ndim * (0,)
-        arr[idx_tuple] = -1
+        arr[idx_tuple] = lib.full((1,), -1)
         yield arr
     for arr in scalar_gen(lib, 42):
         idx_tuple = arr.ndim * (0,)
-        arr[idx_tuple] = -1
+        arr[idx_tuple] = lib.full((1, 1), -1)
         yield arr
     # set "multiple" items on scalar array
     for arr in scalar_gen(lib, 42):