From 7dc8f4e2c271467b50689e51e86e1561a59a82a6 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Fri, 30 Sep 2022 11:16:02 -0700 Subject: [PATCH 01/89] Run CI every 4 hours (#623) Run CI every 4 hours in addition to commit-triggered runs. Co-authored-by: Marcin Zalewski --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eda13fbda..9bf976c8a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,9 @@ on: pull_request: branches-ignore: - gh-pages # deployment target branch (this workflow should not exist on that branch anyway) + schedule: + # * is a special character in YAML so you have to quote this string + - cron: '* */4 * * *' env: COMMIT: ${{ github.event.pull_request.head.sha || github.sha }} PROJECT: github-cunumeric-ci From 298f094acbe3cb84757d12cabfc721f0bf15838c Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 30 Sep 2022 11:54:18 -0700 Subject: [PATCH 02/89] src/cunumeric/matrix: stop including coll.h in solve_template.inl (#620) (#637) This file was included unnecessarily, and led to build issues on distributed machines. In particular, including coll.h pulls in mpi.h, which is an unresolved header to NVCC. Signed-off-by: Rohan Yadav Co-authored-by: Rohan Yadav --- src/cunumeric/matrix/solve_template.inl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/cunumeric/matrix/solve_template.inl b/src/cunumeric/matrix/solve_template.inl index a4f0f7894..bff40ad9c 100644 --- a/src/cunumeric/matrix/solve_template.inl +++ b/src/cunumeric/matrix/solve_template.inl @@ -18,8 +18,6 @@ #include -#include "core/comm/coll.h" - // Useful for IDEs #include "cunumeric/matrix/solve.h" From e0601452f855163984e035aefc8e1e367e291279 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Wed, 5 Oct 2022 12:16:33 -0700 Subject: [PATCH 03/89] Adjust the schedule of the CI runs (#641) Co-authored-by: Marcin Zalewski --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9bf976c8a..6abe49c3b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,7 +8,7 @@ on: - gh-pages # deployment target branch (this workflow should not exist on that branch anyway) schedule: # * is a special character in YAML so you have to quote this string - - cron: '* */4 * * *' + - cron: '0 */6 * * *' env: COMMIT: ${{ github.event.pull_request.head.sha || github.sha }} PROJECT: github-cunumeric-ci From 52bb4f5b3fce583901e49c43a05b76f76fae4586 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Mon, 10 Oct 2022 11:51:21 -0700 Subject: [PATCH 04/89] Move test driver code to legate.core (#627) --- test.py | 8 +- tests/_utils/__init__.py | 74 ----- tests/_utils/args.py | 286 ------------------ tests/_utils/config.py | 161 ---------- tests/_utils/logger.py | 67 ---- tests/_utils/stages/__init__.py | 41 --- tests/_utils/stages/_linux/__init__.py | 24 -- tests/_utils/stages/_linux/cpu.py | 80 ----- tests/_utils/stages/_linux/eager.py | 71 ----- tests/_utils/stages/_linux/gpu.py | 82 ----- tests/_utils/stages/_linux/omp.py | 84 ----- tests/_utils/stages/_osx/__init__.py | 24 -- tests/_utils/stages/_osx/cpu.py | 64 ---- tests/_utils/stages/_osx/eager.py | 64 ---- tests/_utils/stages/_osx/gpu.py | 51 ---- tests/_utils/stages/_osx/omp.py | 70 ----- tests/_utils/stages/test_stage.py | 265 ---------------- tests/_utils/stages/util.py | 115 ------- tests/_utils/system.py | 170 ----------- 
tests/_utils/test_plan.py | 131 -------- tests/_utils/tests/__init__.py | 15 - tests/_utils/tests/stages/__init__.py | 38 --- tests/_utils/tests/stages/_linux/__init__.py | 22 -- tests/_utils/tests/stages/_linux/test_cpu.py | 131 -------- .../_utils/tests/stages/_linux/test_eager.py | 81 ----- tests/_utils/tests/stages/_linux/test_gpu.py | 100 ------ tests/_utils/tests/stages/_linux/test_omp.py | 163 ---------- tests/_utils/tests/stages/test_test_stage.py | 87 ------ tests/_utils/tests/stages/test_util.py | 48 --- tests/_utils/tests/test___init__.py | 73 ----- tests/_utils/tests/test_args.py | 132 -------- tests/_utils/tests/test_config.py | 177 ----------- tests/_utils/tests/test_logger.py | 74 ----- tests/_utils/tests/test_system.py | 78 ----- tests/_utils/tests/test_types.py | 30 -- tests/_utils/tests/test_ui.py | 103 ------- tests/_utils/types.py | 50 --- tests/_utils/ui.py | 229 -------------- 38 files changed, 4 insertions(+), 3559 deletions(-) delete mode 100644 tests/_utils/__init__.py delete mode 100644 tests/_utils/args.py delete mode 100644 tests/_utils/config.py delete mode 100644 tests/_utils/logger.py delete mode 100644 tests/_utils/stages/__init__.py delete mode 100644 tests/_utils/stages/_linux/__init__.py delete mode 100644 tests/_utils/stages/_linux/cpu.py delete mode 100644 tests/_utils/stages/_linux/eager.py delete mode 100644 tests/_utils/stages/_linux/gpu.py delete mode 100644 tests/_utils/stages/_linux/omp.py delete mode 100644 tests/_utils/stages/_osx/__init__.py delete mode 100644 tests/_utils/stages/_osx/cpu.py delete mode 100644 tests/_utils/stages/_osx/eager.py delete mode 100644 tests/_utils/stages/_osx/gpu.py delete mode 100644 tests/_utils/stages/_osx/omp.py delete mode 100644 tests/_utils/stages/test_stage.py delete mode 100644 tests/_utils/stages/util.py delete mode 100644 tests/_utils/system.py delete mode 100644 tests/_utils/test_plan.py delete mode 100644 tests/_utils/tests/__init__.py delete mode 100644 tests/_utils/tests/stages/__init__.py delete mode 100644 tests/_utils/tests/stages/_linux/__init__.py delete mode 100644 tests/_utils/tests/stages/_linux/test_cpu.py delete mode 100644 tests/_utils/tests/stages/_linux/test_eager.py delete mode 100644 tests/_utils/tests/stages/_linux/test_gpu.py delete mode 100644 tests/_utils/tests/stages/_linux/test_omp.py delete mode 100644 tests/_utils/tests/stages/test_test_stage.py delete mode 100644 tests/_utils/tests/stages/test_util.py delete mode 100644 tests/_utils/tests/test___init__.py delete mode 100644 tests/_utils/tests/test_args.py delete mode 100644 tests/_utils/tests/test_config.py delete mode 100644 tests/_utils/tests/test_logger.py delete mode 100644 tests/_utils/tests/test_system.py delete mode 100644 tests/_utils/tests/test_types.py delete mode 100644 tests/_utils/tests/test_ui.py delete mode 100644 tests/_utils/types.py delete mode 100644 tests/_utils/ui.py diff --git a/test.py b/test.py index edf9d772b..8dcda54be 100755 --- a/test.py +++ b/test.py @@ -18,14 +18,14 @@ import sys -from tests._utils.config import Config -from tests._utils.system import System -from tests._utils.test_plan import TestPlan +from legate.tester.config import Config +from legate.tester.test_plan import TestPlan +from legate.tester.test_system import TestSystem if __name__ == "__main__": config = Config(sys.argv) - system = System(dry_run=config.dry_run) + system = TestSystem(dry_run=config.dry_run) plan = TestPlan(config, system) diff --git a/tests/_utils/__init__.py b/tests/_utils/__init__.py deleted file mode 100644 
index 11b8f1d70..000000000 --- a/tests/_utils/__init__.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Utilities and helpers for implementing the Cunumeric custom test runner. - -""" -from __future__ import annotations - -from typing import Union -from typing_extensions import Literal, TypeAlias - -#: Define the available feature types for tests -FeatureType: TypeAlias = Union[ - Literal["cpus"], Literal["cuda"], Literal["eager"], Literal["openmp"] -] - -#: Value to use if --cpus is not specified. -DEFAULT_CPUS_PER_NODE = 4 - -#: Value to use if --gpus is not specified. -DEFAULT_GPUS_PER_NODE = 1 - -# Delay to introduce between GPU test invocations (ms) -DEFAULT_GPU_DELAY = 2000 - -# Value to use if --fbmem is not specified (MB) -DEFAULT_GPU_MEMORY_BUDGET = 4096 - -#: Value to use if --omps is not specified. -DEFAULT_OMPS_PER_NODE = 1 - -#: Value to use if --ompthreads is not specified. -DEFAULT_OMPTHREADS = 4 - -#: Default values to apply to normalize the testing environment. -DEFAULT_PROCESS_ENV = { - "LEGATE_TEST": "1", -} - -#: Width for terminal ouput headers and footers. -UI_WIDTH = 65 - -#: Feature values that are accepted for --use, in the relative order -#: that the corresponding test stages should always execute in -FEATURES: tuple[FeatureType, ...] = ( - "cpus", - "cuda", - "eager", - "openmp", -) - -#: Paths to example files that should be skipped. -SKIPPED_EXAMPLES = { - "examples/ingest.py", - "examples/kmeans_sort.py", - "examples/lstm_full.py", - "examples/wgrad.py", -} - -#: Extra arguments to supply when specific examples are executed. -PER_FILE_ARGS = { - "examples/lstm_full.py": ["--file", "resources/lstm_input.txt"], -} diff --git a/tests/_utils/args.py b/tests/_utils/args.py deleted file mode 100644 index d97ebf603..000000000 --- a/tests/_utils/args.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Provide an argparse ArgumentParser for the test runner. - -""" -from __future__ import annotations - -from argparse import Action, ArgumentParser, Namespace -from typing import ( - Any, - Generic, - Iterable, - Iterator, - Literal, - Sequence, - TypeVar, - Union, -) - -from typing_extensions import TypeAlias - -from . 
import ( - DEFAULT_CPUS_PER_NODE, - DEFAULT_GPU_DELAY, - DEFAULT_GPU_MEMORY_BUDGET, - DEFAULT_GPUS_PER_NODE, - DEFAULT_OMPS_PER_NODE, - DEFAULT_OMPTHREADS, - FEATURES, -) - -T = TypeVar("T") - -PinOptionsType: TypeAlias = Union[ - Literal["partial"], - Literal["none"], - Literal["strict"], -] - -PIN_OPTIONS: tuple[PinOptionsType, ...] = ( - "partial", - "none", - "strict", -) - - -class MultipleChoices(Generic[T]): - """A container that reports True for any item or subset inclusion. - - Parameters - ---------- - choices: Iterable[T] - The values to populate the containter. - - Examples - -------- - - >>> choices = MultipleChoices(["a", "b", "c"]) - - >>> "a" in choices - True - - >>> ("b", "c") in choices - True - - """ - - def __init__(self, choices: Iterable[T]) -> None: - self.choices = set(choices) - - def __contains__(self, x: Union[T, Iterable[T]]) -> bool: - if isinstance(x, (list, tuple)): - return set(x).issubset(self.choices) - return x in self.choices - - def __iter__(self) -> Iterator[T]: - return self.choices.__iter__() - - -class ExtendAction(Action): - """A custom argparse action to collect multiple values into a list.""" - - def __call__( - self, - parser: ArgumentParser, - namespace: Namespace, - values: Union[str, Sequence[Any], None], - option_string: Union[str, None] = None, - ) -> None: - items = getattr(namespace, self.dest, None) or [] - if isinstance(values, list): - items.extend(values) - else: - items.append(values) - setattr(namespace, self.dest, items) - - -#: The argument parser for test.py -parser = ArgumentParser( - description="Run the Cunumeric test suite", - epilog="Any extra arguments will be forwarded to the Legate script", -) - - -stages = parser.add_argument_group("Feature stage selection") - - -stages.add_argument( - "--use", - dest="features", - action=ExtendAction, - choices=MultipleChoices(sorted(FEATURES)), - # argpase evidently only expects string returns from the type converter - # here, but returning a list of strings seems to work in practice - type=lambda s: s.split(","), # type: ignore[return-value, arg-type] - help="Test Legate with features (also via USE_*)", -) - - -selection = parser.add_argument_group("Test file selection") - - -selection.add_argument( - "--files", - nargs="+", - default=None, - help="Explicit list of test files to run", -) - - -selection.add_argument( - "--unit", - dest="unit", - action="store_true", - default=False, - help="Include unit tests", -) - - -feature_opts = parser.add_argument_group("Feature stage configuration options") - - -feature_opts.add_argument( - "--cpus", - dest="cpus", - type=int, - default=DEFAULT_CPUS_PER_NODE, - help="Number of CPUs per node to use", -) - - -feature_opts.add_argument( - "--gpus", - dest="gpus", - type=int, - default=DEFAULT_GPUS_PER_NODE, - help="Number of GPUs per node to use", -) - - -feature_opts.add_argument( - "--omps", - dest="omps", - type=int, - default=DEFAULT_OMPS_PER_NODE, - help="Number OpenMP processors per node to use", -) - - -feature_opts.add_argument( - "--utility", - dest="utility", - type=int, - default=1, - help="Number of of utility CPUs to reserve for runtime services", -) - - -feature_opts.add_argument( - "--cpu-pin", - dest="cpu_pin", - choices=PIN_OPTIONS, - default="partial", - help="CPU pinning behavior on platforms that support CPU pinning", -) - -feature_opts.add_argument( - "--gpu-delay", - dest="gpu_delay", - type=int, - default=DEFAULT_GPU_DELAY, - help="Delay to introduce between GPU tests (ms)", -) - - -feature_opts.add_argument( - "--fbmem", - 
dest="fbmem", - type=int, - default=DEFAULT_GPU_MEMORY_BUDGET, - help="GPU framebuffer memory (MB)", -) - - -feature_opts.add_argument( - "--ompthreads", - dest="ompthreads", - metavar="THREADS", - type=int, - default=DEFAULT_OMPTHREADS, - help="Number of threads per OpenMP processor", -) - - -test_opts = parser.add_argument_group("Test run configuration options") - - -test_opts.add_argument( - "--legate", - dest="legate_dir", - metavar="LEGATE_DIR", - action="store", - default=None, - required=False, - help="Path to Legate installation directory", -) - - -test_opts.add_argument( - "-C", - "--directory", - dest="test_root", - metavar="DIR", - action="store", - default=None, - required=False, - help="Root directory containing the tests subdirectory", -) - - -test_opts.add_argument( - "-j", - "--workers", - dest="workers", - type=int, - default=None, - help="Number of parallel workers for testing", -) - - -test_opts.add_argument( - "-v", - "--verbose", - dest="verbose", - action="count", - default=0, - help="Display verbose output. Use -vv for even more output (test stdout)", -) - - -test_opts.add_argument( - "--dry-run", - dest="dry_run", - action="store_true", - help="Print the test plan but don't run anything", -) - - -test_opts.add_argument( - "--debug", - dest="debug", - action="store_true", - help="Print out the commands that are to be executed", -) diff --git a/tests/_utils/config.py b/tests/_utils/config.py deleted file mode 100644 index 06b61e9de..000000000 --- a/tests/_utils/config.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -import os -from argparse import Namespace -from pathlib import Path - -from . import DEFAULT_PROCESS_ENV, FEATURES, SKIPPED_EXAMPLES, FeatureType -from .args import parser -from .types import ArgList, EnvDict - - -class Config: - """A centralized configuration object that provides the information - needed by test stages in order to run. 
- - Parameters - ---------- - argv : ArgList - command-line arguments to use when building the configuration - - """ - - def __init__(self, argv: ArgList) -> None: - args, self._extra_args = parser.parse_known_args(argv[1:]) - - # which tests to run - self.examples = True - self.integration = True - self.unit = args.unit - self.files = args.files - - # feature configuration - self.features = self._compute_features(args) - - # feature options for integration tests - self.cpus = args.cpus - self.gpus = args.gpus - self.omps = args.omps - self.utility = args.utility - self.cpu_pin = args.cpu_pin - self.fbmem = args.fbmem - self.gpu_delay = args.gpu_delay - self.ompthreads = args.ompthreads - - # test run configuration - self.debug = args.debug - self.dry_run = args.dry_run - self.verbose = args.verbose - self.test_root = args.test_root - self.requested_workers = args.workers - self.legate_dir = self._compute_legate_dir(args) - - @property - def env(self) -> EnvDict: - """Custom environment settings used for process exectution.""" - return dict(DEFAULT_PROCESS_ENV) - - @property - def extra_args(self) -> ArgList: - """Extra command-line arguments to pass on to individual test files.""" - return self._extra_args - - @property - def root_dir(self) -> Path: - """Path to the directory containing the tests.""" - if self.test_root: - return Path(self.test_root) - return Path(__file__).parents[2] - - @property - def test_files(self) -> tuple[Path, ...]: - """List of all test files to use for each stage. - - An explicit list of files from the command line will take precedence. - - Otherwise, the files are computed based on command-line options, etc. - - """ - if self.files: - return self.files - - files = [] - - if self.examples: - examples = ( - path.relative_to(self.root_dir) - for path in self.root_dir.joinpath("examples").glob("*.py") - if str(path.relative_to(self.root_dir)) not in SKIPPED_EXAMPLES - ) - files.extend(sorted(examples)) - - if self.integration: - integration_tests = ( - path.relative_to(self.root_dir) - for path in self.root_dir.joinpath("tests/integration").glob( - "*.py" - ) - ) - files.extend(sorted(integration_tests)) - - if self.unit: - unit_tests = ( - path.relative_to(self.root_dir) - for path in self.root_dir.joinpath("tests/unit").glob( - "**/*.py" - ) - ) - files.extend(sorted(unit_tests)) - - return tuple(files) - - @property - def legate_path(self) -> str: - """Computed path to the legate driver script""" - if self.legate_dir is None: - return "legate" - return str(self.legate_dir / "bin" / "legate") - - def _compute_features(self, args: Namespace) -> tuple[FeatureType, ...]: - if args.features is not None: - computed = args.features - else: - computed = [ - feature - for feature in FEATURES - if os.environ.get(f"USE_{feature.upper()}", None) == "1" - ] - - # if nothing is specified any other way, at least run CPU stage - if len(computed) == 0: - computed.append("cpus") - - return tuple(computed) - - def _compute_legate_dir(self, args: Namespace) -> Path: - # self._legate_source below is purely for testing - if args.legate_dir: - self._legate_source = "cmd" - return Path(args.legate_dir) - elif "LEGATE_DIR" in os.environ: - self._legate_source = "env" - return Path(os.environ["LEGATE_DIR"]) - self._legate_source = "install" - return None diff --git a/tests/_utils/logger.py b/tests/_utils/logger.py deleted file mode 100644 index f40904219..000000000 --- a/tests/_utils/logger.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the 
Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Provide a basic logger that can scrub ANSI color codes. - -""" -from __future__ import annotations - -import re - -# ref: https://stackoverflow.com/a/14693789 -_ANSI_ESCAPE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])") - - -class Log: - def __init__(self) -> None: - self._record: list[str] = [] - - def __call__(self, *lines: str) -> tuple[int, int]: - return self.record(*lines) - - def record(self, *lines: str) -> tuple[int, int]: - if len(lines) == 1 and "\n" in lines[0]: - lines = tuple(lines[0].split("\n")) - - start = len(self._record) - for line in lines: - self._record.append(line) - print(line, flush=True) - return (start, len(self._record)) - - def clear(self) -> None: - self._record = [] - - def dump( - self, - *, - start: int = 0, - end: int | None = None, - filter_ansi: bool = True, - ) -> str: - lines = self._record[start:end] - - if filter_ansi: - full_text = _ANSI_ESCAPE.sub("", "\n".join(lines)) - else: - full_text = "\n".join(lines) - - return full_text - - @property - def lines(self) -> tuple[str, ...]: - return tuple(self._record) - - -LOG = Log() diff --git a/tests/_utils/stages/__init__.py b/tests/_utils/stages/__init__.py deleted file mode 100644 index fa8f916d5..000000000 --- a/tests/_utils/stages/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Provide TestStage subclasses for running configured test files using -specific features. - -""" -from __future__ import annotations - -import sys -from typing import Dict, Type - -from .. import FeatureType -from .test_stage import TestStage -from .util import log_proc - -if sys.platform == "darwin": - from ._osx import CPU, Eager, GPU, OMP -elif sys.platform.startswith("linux"): - from ._linux import CPU, Eager, GPU, OMP -else: - raise RuntimeError(f"unsupported platform: {sys.platform}") - -#: All the available test stages that can be selected -STAGES: Dict[FeatureType, Type[TestStage]] = { - "cpus": CPU, - "cuda": GPU, - "openmp": OMP, - "eager": Eager, -} diff --git a/tests/_utils/stages/_linux/__init__.py b/tests/_utils/stages/_linux/__init__.py deleted file mode 100644 index 032305f9c..000000000 --- a/tests/_utils/stages/_linux/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Provide TestStage subclasses for running configured test files using -specific features on linux platforms. - -""" -from __future__ import annotations - -from .cpu import CPU -from .gpu import GPU -from .eager import Eager -from .omp import OMP diff --git a/tests/_utils/stages/_linux/cpu.py b/tests/_utils/stages/_linux/cpu.py deleted file mode 100644 index 665793081..000000000 --- a/tests/_utils/stages/_linux/cpu.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from itertools import chain - -from ... import FeatureType -from ...config import Config -from ...system import System -from ...types import ArgList, EnvDict -from ..test_stage import TestStage -from ..util import ( - CUNUMERIC_TEST_ARG, - UNPIN_ENV, - Shard, - StageSpec, - adjust_workers, -) - - -class CPU(TestStage): - """A test stage for exercising CPU features. - - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType = "cpus" - - args = [CUNUMERIC_TEST_ARG] - - def __init__(self, config: Config, system: System) -> None: - self._init(config, system) - - def env(self, config: Config, system: System) -> EnvDict: - return {} if config.cpu_pin == "strict" else dict(UNPIN_ENV) - - def shard_args(self, shard: Shard, config: Config) -> ArgList: - args = [ - "--cpus", - str(config.cpus), - ] - if config.cpu_pin != "none": - args += [ - "--cpu-bind", - ",".join(str(x) for x in shard), - ] - return args - - def compute_spec(self, config: Config, system: System) -> StageSpec: - cpus = system.cpus - - procs = config.cpus + config.utility + int(config.cpu_pin == "strict") - workers = adjust_workers(len(cpus) // procs, config.requested_workers) - - shards: list[tuple[int, ...]] = [] - for i in range(workers): - shard_cpus = range(i * procs, (i + 1) * procs) - shard = chain.from_iterable(cpus[j].ids for j in shard_cpus) - shards.append(tuple(sorted(shard))) - - return StageSpec(workers, shards) diff --git a/tests/_utils/stages/_linux/eager.py b/tests/_utils/stages/_linux/eager.py deleted file mode 100644 index 8e63fc49b..000000000 --- a/tests/_utils/stages/_linux/eager.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from ... import FeatureType -from ...config import Config -from ...system import System -from ...types import ArgList, EnvDict -from ..test_stage import TestStage -from ..util import Shard, StageSpec, adjust_workers - - -class Eager(TestStage): - """A test stage for exercising Eager Numpy execution features. - - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType = "eager" - - args: ArgList = [] - - def __init__(self, config: Config, system: System) -> None: - self._init(config, system) - - def env(self, config: Config, system: System) -> EnvDict: - # Raise min chunk sizes for deferred codepaths to force eager execution - env = { - "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", - "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", - "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", - } - return env - - def shard_args(self, shard: Shard, config: Config) -> ArgList: - return [ - "--cpus", - "1", - "--cpu-bind", - ",".join(str(x) for x in shard), - ] - - def compute_spec(self, config: Config, system: System) -> StageSpec: - N = len(system.cpus) - - degree = min(N, 60) # ~LEGION_MAX_NUM_PROCS just in case - workers = adjust_workers(degree, config.requested_workers) - - # Just put each worker on its own full CPU for eager tests - shards = [cpu.ids for cpu in system.cpus] - - return StageSpec(workers, shards) diff --git a/tests/_utils/stages/_linux/gpu.py b/tests/_utils/stages/_linux/gpu.py deleted file mode 100644 index 12012a481..000000000 --- a/tests/_utils/stages/_linux/gpu.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -import time - -from ... import FeatureType -from ...config import Config -from ...system import System -from ...types import ArgList, EnvDict -from ..test_stage import TestStage -from ..util import CUNUMERIC_TEST_ARG, Shard, StageSpec, adjust_workers - -BLOAT_FACTOR = 1.5 # hard coded for now - - -class GPU(TestStage): - """A test stage for exercising GPU features. 
- - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType = "cuda" - - args = [CUNUMERIC_TEST_ARG] - - def __init__(self, config: Config, system: System) -> None: - self._init(config, system) - - def env(self, config: Config, system: System) -> EnvDict: - return {} - - def delay(self, shard: Shard, config: Config, system: System) -> None: - time.sleep(config.gpu_delay / 1000) - - def shard_args(self, shard: Shard, config: Config) -> ArgList: - return [ - "--fbmem", - str(config.fbmem), - "--gpus", - str(len(shard)), - "--gpu-bind", - ",".join(str(x) for x in shard), - ] - - def compute_spec(self, config: Config, system: System) -> StageSpec: - N = len(system.gpus) - degree = N // config.gpus - - fbsize = min(gpu.total for gpu in system.gpus) / (2 << 20) # MB - oversub_factor = int(fbsize // (config.fbmem * BLOAT_FACTOR)) - workers = adjust_workers( - degree * oversub_factor, config.requested_workers - ) - - # https://docs.python.org/3/library/itertools.html#itertools-recipes - # grouper('ABCDEF', 3) --> ABC DEF - args = [iter(range(degree * config.gpus))] * config.gpus - per_worker_shards = list(zip(*args)) - - shards = per_worker_shards * workers - - return StageSpec(workers, shards) diff --git a/tests/_utils/stages/_linux/omp.py b/tests/_utils/stages/_linux/omp.py deleted file mode 100644 index 84a954412..000000000 --- a/tests/_utils/stages/_linux/omp.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from itertools import chain - -from ... import FeatureType -from ...config import Config -from ...system import System -from ...types import ArgList, EnvDict -from ..test_stage import TestStage -from ..util import ( - CUNUMERIC_TEST_ARG, - UNPIN_ENV, - Shard, - StageSpec, - adjust_workers, -) - - -class OMP(TestStage): - """A test stage for exercising OpenMP features. 
- - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType = "openmp" - - args = [CUNUMERIC_TEST_ARG] - - def __init__(self, config: Config, system: System) -> None: - self._init(config, system) - - def env(self, config: Config, system: System) -> EnvDict: - return {} if config.cpu_pin == "strict" else dict(UNPIN_ENV) - - def shard_args(self, shard: Shard, config: Config) -> ArgList: - args = [ - "--omps", - str(config.omps), - "--ompthreads", - str(config.ompthreads), - ] - if config.cpu_pin != "none": - args += [ - "--cpu-bind", - ",".join(str(x) for x in shard), - ] - return args - - def compute_spec(self, config: Config, system: System) -> StageSpec: - cpus = system.cpus - omps, threads = config.omps, config.ompthreads - procs = ( - omps * threads + config.utility + int(config.cpu_pin == "strict") - ) - workers = adjust_workers(len(cpus) // procs, config.requested_workers) - - shards: list[tuple[int, ...]] = [] - for i in range(workers): - shard_cpus = range(i * procs, (i + 1) * procs) - shard = chain.from_iterable(cpus[j].ids for j in shard_cpus) - shards.append(tuple(sorted(shard))) - - return StageSpec(workers, shards) diff --git a/tests/_utils/stages/_osx/__init__.py b/tests/_utils/stages/_osx/__init__.py deleted file mode 100644 index 80a7c368d..000000000 --- a/tests/_utils/stages/_osx/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Provide TestStage subclasses for running configured test files using -specific features on OSX. - -""" -from __future__ import annotations - -from .cpu import CPU -from .gpu import GPU -from .eager import Eager -from .omp import OMP diff --git a/tests/_utils/stages/_osx/cpu.py b/tests/_utils/stages/_osx/cpu.py deleted file mode 100644 index ec6d23f20..000000000 --- a/tests/_utils/stages/_osx/cpu.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from ... import FeatureType -from ...config import Config -from ...system import System -from ...types import ArgList, EnvDict -from ..test_stage import TestStage -from ..util import ( - CUNUMERIC_TEST_ARG, - UNPIN_ENV, - Shard, - StageSpec, - adjust_workers, -) - - -class CPU(TestStage): - """A test stage for exercising CPU features. 
- - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType = "cpus" - - args = [CUNUMERIC_TEST_ARG] - - def __init__(self, config: Config, system: System) -> None: - self._init(config, system) - - def env(self, config: Config, system: System) -> EnvDict: - return UNPIN_ENV - - def shard_args(self, shard: Shard, config: Config) -> ArgList: - return ["--cpus", str(config.cpus)] - - def compute_spec(self, config: Config, system: System) -> StageSpec: - procs = config.cpus + config.utility - workers = adjust_workers( - len(system.cpus) // procs, config.requested_workers - ) - - # return a dummy set of shards just for the runner to iterate over - return StageSpec(workers, [(i,) for i in range(workers)]) diff --git a/tests/_utils/stages/_osx/eager.py b/tests/_utils/stages/_osx/eager.py deleted file mode 100644 index 5cc5d557d..000000000 --- a/tests/_utils/stages/_osx/eager.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from ... import FeatureType -from ...config import Config -from ...system import System -from ...types import ArgList, EnvDict -from ..test_stage import TestStage -from ..util import UNPIN_ENV, Shard, StageSpec, adjust_workers - - -class Eager(TestStage): - """A test stage for exercising Eager Numpy execution features. - - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType = "eager" - - args: ArgList = [] - - def __init__(self, config: Config, system: System) -> None: - self._init(config, system) - - def env(self, config: Config, system: System) -> EnvDict: - # Raise min chunk sizes for deferred codepaths to force eager execution - env = { - "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", - "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", - "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", - } - env.update(UNPIN_ENV) - return env - - def shard_args(self, shard: Shard, config: Config) -> ArgList: - return ["--cpus", "1"] - - def compute_spec(self, config: Config, system: System) -> StageSpec: - N = len(system.cpus) - degree = min(N, 60) # ~LEGION_MAX_NUM_PROCS just in case - workers = adjust_workers(degree, config.requested_workers) - - # return a dummy set of shards just for the runner to iterate over - return StageSpec(workers, [(i,) for i in range(workers)]) diff --git a/tests/_utils/stages/_osx/gpu.py b/tests/_utils/stages/_osx/gpu.py deleted file mode 100644 index f89fe7377..000000000 --- a/tests/_utils/stages/_osx/gpu.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -import time - -from ... import FeatureType -from ...config import Config -from ...system import System -from ...types import ArgList, EnvDict -from ..test_stage import TestStage -from ..util import CUNUMERIC_TEST_ARG, UNPIN_ENV, Shard - - -class GPU(TestStage): - """A test stage for exercising GPU features. - - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType = "cuda" - - args: ArgList = [CUNUMERIC_TEST_ARG] - - def __init__(self, config: Config, system: System) -> None: - raise RuntimeError("GPU test are not supported on OSX") - - def env(self, config: Config, system: System) -> EnvDict: - return UNPIN_ENV - - def delay(self, shard: Shard, config: Config, system: System) -> None: - time.sleep(config.gpu_delay / 1000) diff --git a/tests/_utils/stages/_osx/omp.py b/tests/_utils/stages/_osx/omp.py deleted file mode 100644 index f5f19194d..000000000 --- a/tests/_utils/stages/_osx/omp.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from ... import FeatureType -from ...config import Config -from ...system import System -from ...types import ArgList, EnvDict -from ..test_stage import TestStage -from ..util import ( - CUNUMERIC_TEST_ARG, - UNPIN_ENV, - Shard, - StageSpec, - adjust_workers, -) - - -class OMP(TestStage): - """A test stage for exercising OpenMP features. 
- - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType = "openmp" - - args = [CUNUMERIC_TEST_ARG] - - def __init__(self, config: Config, system: System) -> None: - self._init(config, system) - - def env(self, config: Config, system: System) -> EnvDict: - return UNPIN_ENV - - def shard_args(self, shard: Shard, config: Config) -> ArgList: - return [ - "--omps", - str(config.omps), - "--ompthreads", - str(config.ompthreads), - ] - - def compute_spec(self, config: Config, system: System) -> StageSpec: - omps, threads = config.omps, config.ompthreads - procs = omps * threads + config.utility - workers = adjust_workers( - len(system.cpus) // procs, config.requested_workers - ) - - # return a dummy set of shards just for the runner to iterate over - return StageSpec(workers, [(i,) for i in range(workers)]) diff --git a/tests/_utils/stages/test_stage.py b/tests/_utils/stages/test_stage.py deleted file mode 100644 index 0bfbe4f06..000000000 --- a/tests/_utils/stages/test_stage.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -import multiprocessing -from datetime import datetime -from pathlib import Path - -from typing_extensions import Protocol - -from .. import PER_FILE_ARGS, FeatureType -from ..config import Config -from ..system import ProcessResult, System -from ..types import ArgList, EnvDict -from ..ui import banner, summary, yellow -from .util import Shard, StageResult, StageSpec, log_proc - - -class TestStage(Protocol): - """Encapsulate running configured test files using specific features. - - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - kind: FeatureType - - #: The computed specification for processes to launch to run the - #: configured test files. - spec: StageSpec - - #: The computed sharding id sets to use for job runs - shards: multiprocessing.Queue[Shard] - - #: After the stage completes, results will be stored here - result: StageResult - - #: Any fixed stage-specific command-line args to pass - args: ArgList - - # --- Protocol methods - - def __init__(self, config: Config, system: System) -> None: - ... - - def env(self, config: Config, system: System) -> EnvDict: - """Generate stage-specific customizations to the process env - - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - ... - - def delay(self, shard: Shard, config: Config, system: System) -> None: - """Wait any delay that should be applied before running the next - test. - - Parameters - ---------- - shard: Shard - The shard to be used for the next test that is run - - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - ... 
- - def shard_args(self, shard: Shard, config: Config) -> ArgList: - """Generate the command line arguments necessary to launch - the next test process on the given shard. - - Parameters - ---------- - shard: Shard - The shard to be used for the next test that is run - - config: Config - Test runner configuration - - """ - ... - - def compute_spec(self, config: Config, system: System) -> StageSpec: - """Compute the number of worker processes to launch and stage shards - to use for running the configured test files. - - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - ... - - # --- Shared implementation methods - - def __call__(self, config: Config, system: System) -> None: - """Execute this test stage. - - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - t0 = datetime.now() - procs = self._launch(config, system) - t1 = datetime.now() - - self.result = StageResult(procs, t1 - t0) - - @property - def name(self) -> str: - """A stage name to display for tests in this stage.""" - return self.__class__.__name__ - - @property - def intro(self) -> str: - """An informative banner to display at stage end.""" - workers = self.spec.workers - workers_text = f"{workers} worker{'s' if workers > 1 else ''}" - return ( - banner(f"Entering stage: {self.name} (with {workers_text})") + "\n" - ) - - @property - def outro(self) -> str: - """An informative banner to display at stage end.""" - total, passed = self.result.total, self.result.passed - - result = summary(self.name, total, passed, self.result.time) - - footer = banner( - f"Exiting stage: {self.name}", - details=( - "* Results : " - + yellow( - f"{passed} / {total} files passed " # noqa E500 - f"({passed/total*100:0.1f}%)" - if total > 0 - else "0 tests are running, Please check " - ), - "* Elapsed time : " + yellow(f"{self.result.time}"), - ), - ) - - return f"{result}\n{footer}" - - def file_args(self, test_file: Path, config: Config) -> ArgList: - """Extra command line arguments based on the test file. - - Parameters - ---------- - test_file : Path - Path to a test file - - config: Config - Test runner configuration - - """ - test_file_string = str(test_file) - args = PER_FILE_ARGS.get(test_file_string, []) - - # These are a bit ugly but necessary in order to make pytest generate - # more verbose output for integration tests when -v, -vv is specified - if "integration" in test_file_string and config.verbose > 0: - args += ["-v"] - if "integration" in test_file_string and config.verbose > 1: - args += ["-s"] - - return args - - def run( - self, test_file: Path, config: Config, system: System - ) -> ProcessResult: - """Execute a single test files with appropriate environment and - command-line options for a feature test stage. 
- - Parameters - ---------- - test_file : Path - Test file to execute - - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - test_path = config.root_dir / test_file - - shard = self.shards.get() - - stage_args = self.args + self.shard_args(shard, config) - file_args = self.file_args(test_file, config) - - cmd = [str(config.legate_path), str(test_path)] - cmd += stage_args + file_args + config.extra_args - - self.delay(shard, config, system) - - result = system.run(cmd, test_file, env=self._env(config, system)) - log_proc(self.name, result, config, verbose=config.verbose) - - self.shards.put(shard) - - return result - - def _env(self, config: Config, system: System) -> EnvDict: - env = dict(config.env) - env.update(self.env(config, system)) - return env - - def _init(self, config: Config, system: System) -> None: - self.spec = self.compute_spec(config, system) - self.shards = system.manager.Queue(len(self.spec.shards)) - for shard in self.spec.shards: - self.shards.put(shard) - - def _launch(self, config: Config, system: System) -> list[ProcessResult]: - - pool = multiprocessing.pool.ThreadPool(self.spec.workers) - - jobs = [ - pool.apply_async(self.run, (path, config, system)) - for path in config.test_files - ] - pool.close() - - return [job.get() for job in jobs] diff --git a/tests/_utils/stages/util.py b/tests/_utils/stages/util.py deleted file mode 100644 index 357474c90..000000000 --- a/tests/_utils/stages/util.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from dataclasses import dataclass -from datetime import timedelta -from typing import Tuple, Union - -from typing_extensions import TypeAlias - -from ..config import Config -from ..logger import LOG -from ..system import ProcessResult -from ..ui import failed, passed, shell, skipped - -CUNUMERIC_TEST_ARG = "-cunumeric:test" - -UNPIN_ENV = {"REALM_SYNTHETIC_CORE_MAP": ""} - -Shard: TypeAlias = Tuple[int, ...] - - -@dataclass(frozen=True) -class StageSpec: - """Specify the operation of a test run""" - - #: The number of worker processes to start for running tests - workers: int - - # A list of (cpu or gpu) shards to draw on for each test - shards: list[Shard] - - -@dataclass(frozen=True) -class StageResult: - """Collect results from all tests in a TestStage.""" - - #: Individual test process results including return code and stdout. - procs: list[ProcessResult] - - #: Cumulative execution time for all tests in a stage. - time: timedelta - - @property - def total(self) -> int: - """The total number of tests run in this stage.""" - return len(self.procs) - - @property - def passed(self) -> int: - """The number of tests in this stage that passed.""" - return sum(p.returncode == 0 for p in self.procs) - - -def adjust_workers(workers: int, requested_workers: Union[int, None]) -> int: - """Adjust computed workers according to command line requested workers. 
- - The final number of workers will only be adjusted down by this function. - - Parameters - ---------- - workers: int - The computed number of workers to use - - requested_workers: int | None, optional - Requested number of workers from the user, if supplied (default: None) - - Returns - ------- - int - The number of workers to actually use - - """ - if requested_workers is not None and requested_workers < 0: - raise ValueError("requested workers must be non-negative") - - if requested_workers is not None: - if requested_workers > workers: - raise RuntimeError( - "Requested workers greater than assignable workers" - ) - workers = requested_workers - - if workers == 0: - raise RuntimeError("Current configuration results in zero workers") - - return workers - - -def log_proc( - name: str, proc: ProcessResult, config: Config, *, verbose: bool -) -> None: - """Log a process result according to the current configuration""" - if config.debug or config.dry_run: - LOG(shell(proc.invocation)) - msg = f"({name}) {proc.test_file}" - details = proc.output.split("\n") if verbose else None - if proc.skipped: - LOG(skipped(msg)) - elif proc.returncode == 0: - LOG(passed(msg, details=details)) - else: - LOG(failed(msg, details=details)) diff --git a/tests/_utils/system.py b/tests/_utils/system.py deleted file mode 100644 index 71411b45b..000000000 --- a/tests/_utils/system.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Provide a System class to encapsulate process execution and reporting -system information (number of CPUs present, etc). - -""" -from __future__ import annotations - -import multiprocessing -import os -import sys -from dataclasses import dataclass -from functools import cached_property -from pathlib import Path -from subprocess import PIPE, STDOUT, run as stdlib_run -from typing import Sequence - -from .types import CPUInfo, EnvDict, GPUInfo - - -@dataclass -class ProcessResult: - - #: The command invovation, including relevant environment vars - invocation: str - - # User-friendly test file path to use in reported output - test_file: Path - - #: Whether this process was actually invoked - skipped: bool = False - - #: The returncode from the process - returncode: int = 0 - - #: The collected stdout and stderr output from the process - output: str = "" - - -class System: - """A facade class for system-related functions. - - Parameters - ---------- - dry_run : bool, optional - If True, no commands will be executed, but a log of any commands - submitted to ``run`` will be made. (default: False) - - """ - - def __init__( - self, - *, - dry_run: bool = False, - ) -> None: - self.manager = multiprocessing.Manager() - self.dry_run: bool = dry_run - - def run( - self, - cmd: Sequence[str], - test_file: Path, - *, - env: EnvDict | None = None, - cwd: str | None = None, - ) -> ProcessResult: - """Wrapper for subprocess.run that encapsulates logging. 
- - Parameters - ---------- - cmd : sequence of str - The command to run, split on whitespace into a sequence - of strings - - test_file : Path - User-friendly test file path to use in reported output - - env : dict[str, str] or None, optional, default: None - Environment variables to apply when running the command - - cwd: str or None, optional, default: None - A current working directory to pass to stdlib ``run``. - - """ - - env = env or {} - - envstr = ( - " ".join(f"{k}={v}" for k, v in env.items()) - + min(len(env), 1) * " " - ) - - invocation = envstr + " ".join(cmd) - - if self.dry_run: - return ProcessResult(invocation, test_file, skipped=True) - - full_env = dict(os.environ) - full_env.update(env) - - proc = stdlib_run( - cmd, cwd=cwd, env=full_env, stdout=PIPE, stderr=STDOUT, text=True - ) - - return ProcessResult( - invocation, - test_file, - returncode=proc.returncode, - output=proc.stdout, - ) - - @cached_property - def cpus(self) -> tuple[CPUInfo, ...]: - """A list of CPUs on the system.""" - - N = multiprocessing.cpu_count() - - if sys.platform == "darwin": - return tuple(CPUInfo((i,)) for i in range(N)) - - sibling_sets: set[tuple[int, ...]] = set() - for i in range(N): - line = open( - f"/sys/devices/system/cpu/cpu{i}/topology/thread_siblings_list" - ).read() - sibling_sets.add( - tuple(sorted(int(x) for x in line.strip().split(","))) - ) - return tuple(CPUInfo(siblings) for siblings in sorted(sibling_sets)) - - @cached_property - def gpus(self) -> tuple[GPUInfo, ...]: - """A list of GPUs on the system, including total memory information.""" - - try: - # This pynvml import is protected inside this method so that in - # case pynvml is not installed, tests stages that don't need gpu - # info (e.g. cpus, eager) will proceed unaffected. Test stages - # that do require gpu info will fail here with an ImportError. - import pynvml # type: ignore[import] - - # Also a pynvml package is available on some platforms that won't - # have GPUs for some reason. In which case this init call will - # fail. - pynvml.nvmlInit() - except Exception: - return () - - num_gpus = pynvml.nvmlDeviceGetCount() - - results = [] - for i in range(num_gpus): - info = pynvml.nvmlDeviceGetMemoryInfo( - pynvml.nvmlDeviceGetHandleByIndex(i) - ) - results.append(GPUInfo(i, info.total)) - - return tuple(results) diff --git a/tests/_utils/test_plan.py b/tests/_utils/test_plan.py deleted file mode 100644 index 9e2a92532..000000000 --- a/tests/_utils/test_plan.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Provide a TestPlan class to coordinate multiple feature test stages. - -""" -from __future__ import annotations - -from datetime import timedelta -from itertools import chain - -from .config import Config -from .logger import LOG -from .stages import STAGES, log_proc -from .system import System -from .ui import banner, rule, summary, yellow - - -class TestPlan: - """Encapsulate an entire test run with multiple feature test stages. 
- - Parameters - ---------- - config: Config - Test runner configuration - - system: System - Process execution wrapper - - """ - - def __init__(self, config: Config, system: System) -> None: - self._config = config - self._system = system - self._stages = [ - STAGES[feature](config, system) for feature in config.features - ] - - def execute(self) -> int: - """Execute the entire test run with all configured feature stages.""" - LOG.clear() - - LOG(self.intro) - - for stage in self._stages: - LOG(stage.intro) - stage(self._config, self._system) - LOG(stage.outro) - - all_procs = tuple( - chain.from_iterable(s.result.procs for s in self._stages) - ) - total = len(all_procs) - passed = sum(proc.returncode == 0 for proc in all_procs) - - LOG(f"\n{rule()}") - - self._log_failures(total, passed) - - LOG(self.outro(total, passed)) - - return int((total - passed) > 0) - - @property - def intro(self) -> str: - """An informative banner to display at test run start.""" - - cpus = len(self._system.cpus) - try: - gpus = len(self._system.gpus) - except ImportError: - gpus = 0 - - details = ( - f"* Feature stages : {', '.join(yellow(x) for x in self._config.features)}", # noqa E501 - f"* Test files per stage : {yellow(str(len(self._config.test_files)))}", # noqa E501 - f"* System description : {yellow(str(cpus) + ' cpus')} / {yellow(str(gpus) + ' gpus')}", # noqa E501 - ) - return banner("Test Suite Configuration", details=details) - - def outro(self, total: int, passed: int) -> str: - """An informative banner to display at test run end. - - Parameters - ---------- - total: int - Number of total tests that ran in all stages - - passed: int - Number of tests that passed in all stages - - """ - details = [ - f"* {s.name: <6}: " - + yellow( - f"{s.result.passed} / {s.result.total} passed in {s.result.time.total_seconds():0.2f}s" # noqa E501 - ) - for s in self._stages - ] - - time = sum((s.result.time for s in self._stages), timedelta(0, 0)) - details.append("") - details.append( - summary("All tests", total, passed, time, justify=False) - ) - - overall = banner("Overall summary", details=details) - - return f"{overall}\n" - - def _log_failures(self, total: int, passed: int) -> None: - if total == passed: - return - - LOG(f"{banner('FAILURES')}\n") - - for stage in self._stages: - procs = (proc for proc in stage.result.procs if proc.returncode) - for proc in procs: - log_proc(stage.name, proc, self._config, verbose=True) diff --git a/tests/_utils/tests/__init__.py b/tests/_utils/tests/__init__.py deleted file mode 100644 index f0b271624..000000000 --- a/tests/_utils/tests/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
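
The exit code computed at the end of TestPlan.execute above is
all-or-nothing: one failing test process makes the whole run exit nonzero.
A small self-contained illustration (Proc is a hypothetical stand-in for
ProcessResult):

    from dataclasses import dataclass

    @dataclass
    class Proc:
        returncode: int

    def exit_code(procs):
        total = len(procs)
        passed = sum(p.returncode == 0 for p in procs)
        # mirrors int((total - passed) > 0) in TestPlan.execute
        return int((total - passed) > 0)

    assert exit_code([Proc(0), Proc(0)]) == 0  # all passed
    assert exit_code([Proc(0), Proc(3)]) == 1  # any failure fails the run
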
-# -from __future__ import annotations diff --git a/tests/_utils/tests/stages/__init__.py b/tests/_utils/tests/stages/__init__.py deleted file mode 100644 index 6e3992dc1..000000000 --- a/tests/_utils/tests/stages/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -from typing import Any - -from ...system import System -from ...types import CPUInfo, GPUInfo - - -class FakeSystem(System): - def __init__( - self, cpus: int = 6, gpus: int = 6, fbmem: int = 6 << 32, **kwargs: Any - ) -> None: - self._cpus = cpus - self._gpus = gpus - self._fbmem = fbmem - super().__init__(**kwargs) - - @property - def cpus(self) -> tuple[CPUInfo, ...]: - return tuple(CPUInfo((i,)) for i in range(self._cpus)) - - @property - def gpus(self) -> tuple[GPUInfo, ...]: - return tuple(GPUInfo(i, self._fbmem) for i in range(self._gpus)) diff --git a/tests/_utils/tests/stages/_linux/__init__.py b/tests/_utils/tests/stages/_linux/__init__.py deleted file mode 100644 index 345983919..000000000 --- a/tests/_utils/tests/stages/_linux/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -from __future__ import annotations - -import sys - -import pytest - -if sys.platform != "linux": - pytestmark = pytest.mark.skip() diff --git a/tests/_utils/tests/stages/_linux/test_cpu.py b/tests/_utils/tests/stages/_linux/test_cpu.py deleted file mode 100644 index cc2825c31..000000000 --- a/tests/_utils/tests/stages/_linux/test_cpu.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -import pytest - -from ....config import Config -from ....stages._linux import cpu as m -from ....stages.util import UNPIN_ENV -from .. 
import FakeSystem - - -def test_default() -> None: - c = Config([]) - s = FakeSystem(cpus=12) - stage = m.CPU(c, s) - assert stage.kind == "cpus" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == UNPIN_ENV - assert stage.spec.workers > 0 - - shard = (1, 2, 3) - assert "--cpu-bind" in stage.shard_args(shard, c) - - -def test_cpu_pin_strict() -> None: - c = Config(["test.py", "--cpu-pin", "strict"]) - s = FakeSystem(cpus=12) - stage = m.CPU(c, s) - assert stage.kind == "cpus" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == {} - assert stage.spec.workers > 0 - - shard = (1, 2, 3) - assert "--cpu-bind" in stage.shard_args(shard, c) - - -def test_cpu_pin_none() -> None: - c = Config(["test.py", "--cpu-pin", "none"]) - s = FakeSystem(cpus=12) - stage = m.CPU(c, s) - assert stage.kind == "cpus" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == UNPIN_ENV - assert stage.spec.workers > 0 - - shard = (1, 2, 3) - assert "--cpu-bind" not in stage.shard_args(shard, c) - - -@pytest.mark.parametrize("shard,expected", [[(2,), "2"], [(1, 2, 3), "1,2,3"]]) -def test_shard_args(shard: tuple[int, ...], expected: str) -> None: - c = Config([]) - s = FakeSystem() - stage = m.CPU(c, s) - result = stage.shard_args(shard, c) - assert result == ["--cpus", f"{c.cpus}", "--cpu-bind", expected] - - -def test_spec_with_cpus_1() -> None: - c = Config(["test.py", "--cpus", "1"]) - s = FakeSystem() - stage = m.CPU(c, s) - assert stage.spec.workers == 3 - assert stage.spec.shards == [(0, 1), (2, 3), (4, 5)] - - -def test_spec_with_cpus_2() -> None: - c = Config(["test.py", "--cpus", "2"]) - s = FakeSystem() - stage = m.CPU(c, s) - assert stage.spec.workers == 2 - assert stage.spec.shards == [(0, 1, 2), (3, 4, 5)] - - -def test_spec_with_utility() -> None: - c = Config(["test.py", "--cpus", "1", "--utility", "2"]) - s = FakeSystem() - stage = m.CPU(c, s) - assert stage.spec.workers == 2 - assert stage.spec.shards == [(0, 1, 2), (3, 4, 5)] - - -def test_spec_with_requested_workers() -> None: - c = Config(["test.py", "--cpus", "1", "-j", "2"]) - s = FakeSystem() - stage = m.CPU(c, s) - assert stage.spec.workers == 2 - assert stage.spec.shards == [(0, 1), (2, 3)] - - -def test_spec_with_requested_workers_zero() -> None: - s = FakeSystem() - c = Config(["test.py", "-j", "0"]) - assert c.requested_workers == 0 - with pytest.raises(RuntimeError): - m.CPU(c, s) - - -def test_spec_with_requested_workers_bad() -> None: - s = FakeSystem() - c = Config(["test.py", "-j", f"{len(s.cpus)+1}"]) - assert c.requested_workers > len(s.cpus) - with pytest.raises(RuntimeError): - m.CPU(c, s) - - -def test_spec_with_verbose() -> None: - args = ["test.py", "--cpus", "2"] - c = Config(args) - cv = Config(args + ["--verbose"]) - s = FakeSystem() - - spec, vspec = m.CPU(c, s).spec, m.CPU(cv, s).spec - assert vspec == spec diff --git a/tests/_utils/tests/stages/_linux/test_eager.py b/tests/_utils/tests/stages/_linux/test_eager.py deleted file mode 100644 index 8fc21ecb6..000000000 --- a/tests/_utils/tests/stages/_linux/test_eager.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -import pytest - -from ....config import Config -from ....stages._linux import eager as m -from .. import FakeSystem - - -def test_default() -> None: - c = Config([]) - s = FakeSystem() - stage = m.Eager(c, s) - assert stage.kind == "eager" - assert stage.args == [] - assert stage.env(c, s) == { - "CUNUMERIC_MIN_CPU_CHUNK": "2000000000", - "CUNUMERIC_MIN_OMP_CHUNK": "2000000000", - "CUNUMERIC_MIN_GPU_CHUNK": "2000000000", - } - assert stage.spec.workers > 0 - - -@pytest.mark.parametrize("shard,expected", [[(2,), "2"], [(1, 2, 3), "1,2,3"]]) -def test_shard_args(shard: tuple[int, ...], expected: str) -> None: - c = Config([]) - s = FakeSystem() - stage = m.Eager(c, s) - result = stage.shard_args(shard, c) - assert result == ["--cpus", "1", "--cpu-bind", expected] - - -def test_spec() -> None: - c = Config([]) - s = FakeSystem() - stage = m.Eager(c, s) - assert stage.spec.workers == len(s.cpus) - # [cpu.ids for cpu in system.cpus] - assert stage.spec.shards == [(i,) for i in range(stage.spec.workers)] - - -def test_spec_with_requested_workers_zero() -> None: - s = FakeSystem() - c = Config(["test.py", "-j", "0"]) - assert c.requested_workers == 0 - with pytest.raises(RuntimeError): - m.Eager(c, s) - - -def test_spec_with_requested_workers_bad() -> None: - s = FakeSystem() - c = Config(["test.py", "-j", f"{len(s.cpus)+1}"]) - assert c.requested_workers > len(s.cpus) - with pytest.raises(RuntimeError): - m.Eager(c, s) - - -def test_spec_with_verbose() -> None: - c = Config(["test.py"]) - cv = Config(["test.py", "--verbose"]) - s = FakeSystem() - - spec, vspec = m.Eager(c, s).spec, m.Eager(cv, s).spec - assert vspec == spec diff --git a/tests/_utils/tests/stages/_linux/test_gpu.py b/tests/_utils/tests/stages/_linux/test_gpu.py deleted file mode 100644 index 13c7bb836..000000000 --- a/tests/_utils/tests/stages/_linux/test_gpu.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -import pytest - -from ....config import Config -from ....stages._linux import gpu as m -from .. 
import FakeSystem - - -def test_default() -> None: - c = Config([]) - s = FakeSystem() - stage = m.GPU(c, s) - assert stage.kind == "cuda" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == {} - assert stage.spec.workers > 0 - - -@pytest.mark.parametrize("shard,expected", [[(2,), "2"], [(1, 2, 3), "1,2,3"]]) -def test_shard_args(shard: tuple[int, ...], expected: str) -> None: - c = Config([]) - s = FakeSystem() - stage = m.GPU(c, s) - result = stage.shard_args(shard, c) - assert result == [ - "--fbmem", - "4096", - "--gpus", - f"{len(shard)}", - "--gpu-bind", - expected, - ] - - -def test_spec_with_gpus_1() -> None: - c = Config(["test.py", "--gpus", "1"]) - s = FakeSystem() - stage = m.GPU(c, s) - assert stage.spec.workers == 12 - assert stage.spec.shards == [(0,), (1,), (2,), (3,), (4,), (5,)] * 12 - - -def test_spec_with_gpus_2() -> None: - c = Config(["test.py", "--gpus", "2"]) - s = FakeSystem() - stage = m.GPU(c, s) - assert stage.spec.workers == 6 - assert stage.spec.shards == [(0, 1), (2, 3), (4, 5)] * 6 - - -def test_spec_with_requested_workers() -> None: - c = Config(["test.py", "--gpus", "1", "-j", "2"]) - s = FakeSystem() - stage = m.GPU(c, s) - assert stage.spec.workers == 2 - assert stage.spec.shards == [(0,), (1,), (2,), (3,), (4,), (5,)] * 2 - - -def test_spec_with_requested_workers_zero() -> None: - s = FakeSystem() - c = Config(["test.py", "-j", "0"]) - assert c.requested_workers == 0 - with pytest.raises(RuntimeError): - m.GPU(c, s) - - -def test_spec_with_requested_workers_bad() -> None: - s = FakeSystem() - c = Config(["test.py", "-j", f"{len(s.gpus)+100}"]) - assert c.requested_workers > len(s.gpus) - with pytest.raises(RuntimeError): - m.GPU(c, s) - - -def test_spec_with_verbose() -> None: - args = ["test.py", "--gpus", "2"] - c = Config(args) - cv = Config(args + ["--verbose"]) - s = FakeSystem() - - spec, vspec = m.GPU(c, s).spec, m.GPU(cv, s).spec - assert vspec == spec diff --git a/tests/_utils/tests/stages/_linux/test_omp.py b/tests/_utils/tests/stages/_linux/test_omp.py deleted file mode 100644 index fd836759e..000000000 --- a/tests/_utils/tests/stages/_linux/test_omp.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -import pytest - -from ....config import Config -from ....stages._linux import omp as m -from ....stages.util import UNPIN_ENV -from .. 
import FakeSystem - - -def test_default() -> None: - c = Config([]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.kind == "openmp" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == UNPIN_ENV - assert stage.spec.workers > 0 - - shard = (1, 2, 3) - assert "--cpu-bind" in stage.shard_args(shard, c) - - -def test_cpu_pin_strict() -> None: - c = Config(["test.py", "--cpu-pin", "strict"]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.kind == "openmp" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == {} - assert stage.spec.workers > 0 - - shard = (1, 2, 3) - assert "--cpu-bind" in stage.shard_args(shard, c) - - -def test_cpu_pin_none() -> None: - c = Config(["test.py", "--cpu-pin", "none"]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.kind == "openmp" - assert stage.args == ["-cunumeric:test"] - assert stage.env(c, s) == UNPIN_ENV - assert stage.spec.workers > 0 - - shard = (1, 2, 3) - assert "--cpu-bind" not in stage.shard_args(shard, c) - - -@pytest.mark.parametrize("shard,expected", [[(2,), "2"], [(1, 2, 3), "1,2,3"]]) -def test_shard_args(shard: tuple[int, ...], expected: str) -> None: - c = Config([]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - result = stage.shard_args(shard, c) - assert result == [ - "--omps", - f"{c.omps}", - "--ompthreads", - f"{c.ompthreads}", - "--cpu-bind", - expected, - ] - - -def test_spec_with_omps_1_threads_1() -> None: - c = Config(["test.py", "--omps", "1", "--ompthreads", "1"]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.spec.workers == 6 - assert stage.spec.shards == [ - (0, 1), - (2, 3), - (4, 5), - (6, 7), - (8, 9), - (10, 11), - ] - - -def test_spec_with_omps_1_threads_2() -> None: - c = Config(["test.py", "--omps", "1", "--ompthreads", "2"]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.spec.workers == 4 - assert stage.spec.shards == [(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11)] - - -def test_spec_with_omps_2_threads_1() -> None: - c = Config(["test.py", "--omps", "2", "--ompthreads", "1"]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.spec.workers == 4 - assert stage.spec.shards == [(0, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11)] - - -def test_spec_with_omps_2_threads_2() -> None: - c = Config(["test.py", "--omps", "2", "--ompthreads", "2"]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.spec.workers == 2 - assert stage.spec.shards == [(0, 1, 2, 3, 4), (5, 6, 7, 8, 9)] - - -def test_spec_with_utility() -> None: - c = Config( - ["test.py", "--omps", "2", "--ompthreads", "2", "--utility", "3"] - ) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.spec.workers == 1 - assert stage.spec.shards == [(0, 1, 2, 3, 4, 5, 6)] - - -def test_spec_with_requested_workers() -> None: - c = Config(["test.py", "--omps", "1", "--ompthreads", "1", "-j", "2"]) - s = FakeSystem(cpus=12) - stage = m.OMP(c, s) - assert stage.spec.workers == 2 - assert stage.spec.shards == [(0, 1), (2, 3)] - - -def test_spec_with_requested_workers_zero() -> None: - s = FakeSystem(cpus=12) - c = Config(["test.py", "-j", "0"]) - assert c.requested_workers == 0 - with pytest.raises(RuntimeError): - m.OMP(c, s) - - -def test_spec_with_requested_workers_bad() -> None: - s = FakeSystem(cpus=12) - c = Config(["test.py", "-j", f"{len(s.cpus)+1}"]) - assert c.requested_workers > len(s.cpus) - with pytest.raises(RuntimeError): - m.OMP(c, s) - - -def test_spec_with_verbose() -> None: - args = ["test.py", "--cpus", "2"] - c = 
Config(args) - cv = Config(args + ["--verbose"]) - s = FakeSystem(cpus=12) - - spec, vspec = m.OMP(c, s).spec, m.OMP(cv, s).spec - assert vspec == spec diff --git a/tests/_utils/tests/stages/test_test_stage.py b/tests/_utils/tests/stages/test_test_stage.py deleted file mode 100644 index 393ac18bc..000000000 --- a/tests/_utils/tests/stages/test_test_stage.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -from datetime import timedelta -from pathlib import Path - -from ... import FeatureType -from ...config import Config -from ...stages import test_stage as m -from ...stages.util import StageResult, StageSpec -from ...system import ProcessResult, System -from . import FakeSystem - -s = FakeSystem() - - -class MockTestStage(m.TestStage): - - kind: FeatureType = "eager" - - name = "mock" - - args = ["-foo", "-bar"] - - def __init__(self, config: Config, system: System) -> None: - self._init(config, system) - - def compute_spec(self, config: Config, system: System) -> StageSpec: - return StageSpec(2, [(0,), (1,), (2,)]) - - -class TestTestStage: - def test_name(self) -> None: - c = Config([]) - stage = MockTestStage(c, s) - assert stage.name == "mock" - - def test_intro(self) -> None: - c = Config([]) - stage = MockTestStage(c, s) - assert "Entering stage: mock" in stage.intro - - def test_outro(self) -> None: - c = Config([]) - stage = MockTestStage(c, s) - stage.result = StageResult( - [ProcessResult("invoke", Path("test/file"))], - timedelta(seconds=2.123), - ) - outro = stage.outro - assert "Exiting stage: mock" in outro - assert "Passed 1 of 1 tests (100.0%)" in outro - assert "2.123" in outro - - def test_file_args_default(self) -> None: - c = Config([]) - stage = MockTestStage(c, s) - assert stage.file_args(Path("integration/foo"), c) == [] - assert stage.file_args(Path("unit/foo"), c) == [] - - def test_file_args_v(self) -> None: - c = Config(["test.py", "-v"]) - stage = MockTestStage(c, s) - assert stage.file_args(Path("integration/foo"), c) == ["-v"] - assert stage.file_args(Path("unit/foo"), c) == [] - - def test_file_args_vv(self) -> None: - c = Config(["test.py", "-vv"]) - stage = MockTestStage(c, s) - assert stage.file_args(Path("integration/foo"), c) == ["-v", "-s"] - assert stage.file_args(Path("unit/foo"), c) == [] diff --git a/tests/_utils/tests/stages/test_util.py b/tests/_utils/tests/stages/test_util.py deleted file mode 100644 index 7d9dfe143..000000000 --- a/tests/_utils/tests/stages/test_util.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -import pytest - -from ...stages import util as m - - -class Test_adjust_workers: - @pytest.mark.parametrize("n", (1, 5, 100)) - def test_None_requested(self, n: int) -> None: - assert m.adjust_workers(n, None) == n - - @pytest.mark.parametrize("n", (1, 2, 9)) - def test_requested(self, n: int) -> None: - assert m.adjust_workers(10, n) == n - - def test_negative_requested(self) -> None: - with pytest.raises(ValueError): - assert m.adjust_workers(10, -1) - - def test_zero_requested(self) -> None: - with pytest.raises(RuntimeError): - assert m.adjust_workers(10, 0) - - def test_zero_computed(self) -> None: - with pytest.raises(RuntimeError): - assert m.adjust_workers(0, None) - - def test_requested_too_large(self) -> None: - with pytest.raises(RuntimeError): - assert m.adjust_workers(10, 11) diff --git a/tests/_utils/tests/test___init__.py b/tests/_utils/tests/test___init__.py deleted file mode 100644 index 393f5d7bc..000000000 --- a/tests/_utils/tests/test___init__.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -from .. 
import ( - DEFAULT_CPUS_PER_NODE, - DEFAULT_GPU_DELAY, - DEFAULT_GPU_MEMORY_BUDGET, - DEFAULT_GPUS_PER_NODE, - DEFAULT_OMPS_PER_NODE, - DEFAULT_OMPTHREADS, - DEFAULT_PROCESS_ENV, - FEATURES, - PER_FILE_ARGS, - SKIPPED_EXAMPLES, - UI_WIDTH, -) - - -class TestConsts: - def test_DEFAULT_CPUS_PER_NODE(self) -> None: - assert DEFAULT_CPUS_PER_NODE == 4 - - def test_DEFAULT_GPUS_PER_NODE(self) -> None: - assert DEFAULT_GPUS_PER_NODE == 1 - - def test_DEFAULT_GPU_DELAY(self) -> None: - assert DEFAULT_GPU_DELAY == 2000 - - def test_DEFAULT_GPU_MEMORY_BUDGET(self) -> None: - assert DEFAULT_GPU_MEMORY_BUDGET == 4096 - - def test_DEFAULT_OMPS_PER_NODE(self) -> None: - assert DEFAULT_OMPS_PER_NODE == 1 - - def test_DEFAULT_OMPTHREADS(self) -> None: - assert DEFAULT_OMPTHREADS == 4 - - def test_DEFAULT_PROCESS_ENV(self) -> None: - assert DEFAULT_PROCESS_ENV == { - "LEGATE_TEST": "1", - } - - def test_UI_WIDTH(self) -> None: - assert UI_WIDTH == 65 - - def test_FEATURES(self) -> None: - assert FEATURES == ("cpus", "cuda", "eager", "openmp") - - def test_SKIPPED_EXAMPLES(self) -> None: - assert isinstance(SKIPPED_EXAMPLES, set) - assert all(isinstance(x, str) for x in SKIPPED_EXAMPLES) - assert all(x.startswith("examples") for x in SKIPPED_EXAMPLES) - - def test_PER_FILE_ARGS(self) -> None: - assert isinstance(PER_FILE_ARGS, dict) - assert all(isinstance(x, str) for x in PER_FILE_ARGS.keys()) - assert all(isinstance(x, list) for x in PER_FILE_ARGS.values()) diff --git a/tests/_utils/tests/test_args.py b/tests/_utils/tests/test_args.py deleted file mode 100644 index 1f17a9bdb..000000000 --- a/tests/_utils/tests/test_args.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -from itertools import chain, combinations -from typing import Iterable, TypeVar - -import pytest - -from .. 
import (
-    DEFAULT_CPUS_PER_NODE,
-    DEFAULT_GPU_DELAY,
-    DEFAULT_GPU_MEMORY_BUDGET,
-    DEFAULT_GPUS_PER_NODE,
-    DEFAULT_OMPS_PER_NODE,
-    DEFAULT_OMPTHREADS,
-    args as m,
-)
-
-T = TypeVar("T")
-
-
-# https://docs.python.org/3/library/itertools.html#itertools-recipes
-def powerset(iterable: Iterable[T]) -> Iterable[Iterable[T]]:
-    xs = list(iterable)
-    return chain.from_iterable(combinations(xs, n) for n in range(len(xs) + 1))
-
-
-class TestParserDefaults:
-    def test_features(self) -> None:
-        assert m.parser.get_default("features") is None
-
-    def test_files(self) -> None:
-        assert m.parser.get_default("files") is None
-
-    def test_unit(self) -> None:
-        assert m.parser.get_default("unit") is False
-
-    def test_cpus(self) -> None:
-        assert m.parser.get_default("cpus") == DEFAULT_CPUS_PER_NODE
-
-    def test_gpus(self) -> None:
-        assert m.parser.get_default("gpus") == DEFAULT_GPUS_PER_NODE
-
-    def test_cpu_pin(self) -> None:
-        assert m.parser.get_default("cpu_pin") == "partial"
-
-    def test_gpu_delay(self) -> None:
-        assert m.parser.get_default("gpu_delay") == DEFAULT_GPU_DELAY
-
-    def test_fbmem(self) -> None:
-        assert m.parser.get_default("fbmem") == DEFAULT_GPU_MEMORY_BUDGET
-
-    def test_omps(self) -> None:
-        assert m.parser.get_default("omps") == DEFAULT_OMPS_PER_NODE
-
-    def test_ompthreads(self) -> None:
-        assert m.parser.get_default("ompthreads") == DEFAULT_OMPTHREADS
-
-    def test_legate_dir(self) -> None:
-        assert m.parser.get_default("legate_dir") is None
-
-    def test_test_root(self) -> None:
-        assert m.parser.get_default("test_root") is None
-
-    def test_workers(self) -> None:
-        assert m.parser.get_default("workers") is None
-
-    def test_verbose(self) -> None:
-        assert m.parser.get_default("verbose") == 0
-
-    def test_dry_run(self) -> None:
-        assert m.parser.get_default("dry_run") is False
-
-    def test_debug(self) -> None:
-        assert m.parser.get_default("debug") is False
-
-
-class TestParserConfig:
-    def test_parser_epilog(self) -> None:
-        assert (
-            m.parser.epilog
-            == "Any extra arguments will be forwarded to the Legate script"
-        )
-
-    def test_parser_description(self) -> None:
-        assert m.parser.description == "Run the Cunumeric test suite"
-
-
-class TestMultipleChoices:
-    @pytest.mark.parametrize("choices", ([1, 2, 3], range(4), ("a", "b")))
-    def test_init(self, choices: Iterable[T]) -> None:
-        mc = m.MultipleChoices(choices)
-        assert mc.choices == set(choices)
-
-    def test_contains_item(self) -> None:
-        choices = [1, 2, 3]
-        mc = m.MultipleChoices(choices)
-        for item in choices:
-            assert item in mc
-
-    def test_contains_subset(self) -> None:
-        choices = [1, 2, 3]
-        mc = m.MultipleChoices(choices)
-        for subset in powerset(choices):
-            assert subset in mc
-
-    def test_iter(self) -> None:
-        choices = [1, 2, 3]
-        mc = m.MultipleChoices(choices)
-        assert list(mc) == choices
-
-
-# Testing this directly would require getting into argparse
-# internals. See test_config.py for indirect tests with --use
-class TestExtendAction:
-    pass
diff --git a/tests/_utils/tests/test_config.py b/tests/_utils/tests/test_config.py
deleted file mode 100644
index 76f71d7e7..000000000
--- a/tests/_utils/tests/test_config.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Copyright 2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -from pathlib import Path, PurePath - -import pytest - -from .. import ( - DEFAULT_CPUS_PER_NODE, - DEFAULT_GPU_DELAY, - DEFAULT_GPU_MEMORY_BUDGET, - DEFAULT_GPUS_PER_NODE, - DEFAULT_OMPS_PER_NODE, - DEFAULT_OMPTHREADS, - FEATURES, - config as m, -) -from ..args import PIN_OPTIONS, PinOptionsType - - -class TestConfig: - def test_default_init(self) -> None: - c = m.Config([]) - - assert c.examples is True - assert c.integration is True - assert c.unit is False - assert c.files is None - - assert c.features == ("cpus",) - - assert c.cpus == DEFAULT_CPUS_PER_NODE - assert c.gpus == DEFAULT_GPUS_PER_NODE - assert c.cpu_pin == "partial" - assert c.gpu_delay == DEFAULT_GPU_DELAY - assert c.fbmem == DEFAULT_GPU_MEMORY_BUDGET - assert c.omps == DEFAULT_OMPS_PER_NODE - assert c.ompthreads == DEFAULT_OMPTHREADS - - assert c.debug is False - assert c.dry_run is False - assert c.verbose == 0 - assert c.test_root is None - assert c.requested_workers is None - assert c.legate_dir is None - - assert c.extra_args == [] - assert c.root_dir == PurePath(m.__file__).parents[2] - assert len(c.test_files) > 0 - assert any("examples" in str(x) for x in c.test_files) - assert any("integration" in str(x) for x in c.test_files) - assert all("unit" not in str(x) for x in c.test_files) - assert c.legate_path == "legate" - - @pytest.mark.parametrize("feature", FEATURES) - def test_env_features( - self, monkeypatch: pytest.MonkeyPatch, feature: str - ) -> None: - monkeypatch.setenv(f"USE_{feature.upper()}", "1") - - # test default config - c = m.Config([]) - assert set(c.features) == {feature} - - # also test with a --use value provided - c = m.Config(["test.py", "--use", "cuda"]) - assert set(c.features) == {"cuda"} - - @pytest.mark.parametrize("feature", FEATURES) - def test_cmd_features(self, feature: str) -> None: - - # test a single value - c = m.Config(["test.py", "--use", feature]) - assert set(c.features) == {feature} - - # also test with multiple / duplication - c = m.Config(["test.py", "--use", f"cpus,{feature}"]) - assert set(c.features) == {"cpus", feature} - - def test_unit(self) -> None: - c = m.Config(["test.py", "--unit"]) - assert len(c.test_files) > 0 - assert any("examples" in str(x) for x in c.test_files) - assert any("integration" in str(x) for x in c.test_files) - assert any("unit" in str(x) for x in c.test_files) - - def test_files(self) -> None: - c = m.Config(["test.py", "--files", "a", "b", "c"]) - assert c.files == ["a", "b", "c"] - - @pytest.mark.parametrize( - "opt", ("cpus", "gpus", "gpu-delay", "fbmem", "omps", "ompthreads") - ) - def test_feature_options(self, opt: str) -> None: - c = m.Config(["test.py", f"--{opt}", "1234"]) - assert getattr(c, opt.replace("-", "_")) == 1234 - - @pytest.mark.parametrize("value", PIN_OPTIONS) - def test_cpu_pin(self, value: PinOptionsType) -> None: - c = m.Config(["test.py", "--cpu-pin", value]) - assert c.cpu_pin == value - - def test_workers(self) -> None: - c = m.Config(["test.py", "-j", "1234"]) - assert c.requested_workers 
== 1234 - - def test_debug(self) -> None: - c = m.Config(["test.py", "--debug"]) - assert c.debug is True - - def test_dry_run(self) -> None: - c = m.Config(["test.py", "--dry-run"]) - assert c.dry_run is True - - @pytest.mark.parametrize("arg", ("-v", "--verbose")) - def test_verbose1(self, arg: str) -> None: - c = m.Config(["test.py", arg]) - assert c.verbose == 1 - - def test_verbose2(self) -> None: - c = m.Config(["test.py", "-vv"]) - assert c.verbose == 2 - - @pytest.mark.parametrize("arg", ("-C", "--directory")) - def test_test_root(self, arg: str) -> None: - c = m.Config(["test.py", arg, "some/path"]) - assert c.test_root == "some/path" - - def test_legate_dir(self) -> None: - c = m.Config([]) - assert c.legate_dir is None - assert c.legate_path == "legate" - assert c._legate_source == "install" - - def test_cmd_legate_dir_good(self) -> None: - legate_dir = Path("/usr/local") - c = m.Config(["test.py", "--legate", str(legate_dir)]) - assert c.legate_dir == legate_dir - assert c.legate_path == str(legate_dir / "bin" / "legate") - assert c._legate_source == "cmd" - - def test_env_legate_dir_good( - self, monkeypatch: pytest.MonkeyPatch - ) -> None: - legate_dir = Path("/usr/local") - monkeypatch.setenv("LEGATE_DIR", str(legate_dir)) - c = m.Config([]) - assert c.legate_dir == legate_dir - assert c.legate_path == str(legate_dir / "bin" / "legate") - assert c._legate_source == "env" - - def test_extra_args(self) -> None: - extra = ["-foo", "--bar", "--baz", "10"] - c = m.Config(["test.py"] + extra) - assert c.extra_args == extra - - # also test with --files since that option collects arguments - c = m.Config(["test.py", "--files", "a", "b"] + extra) - assert c.extra_args == extra - c = m.Config(["test.py"] + extra + ["--files", "a", "b"]) - assert c.extra_args == extra diff --git a/tests/_utils/tests/test_logger.py b/tests/_utils/tests/test_logger.py deleted file mode 100644 index 637b4a5c7..000000000 --- a/tests/_utils/tests/test_logger.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -from .. 
import logger as m - -TEST_LINES = ( - "line 1", - "\x1b[31mfoo\x1b[0m", # ui.red("foo") - "bar", - "last line", -) - - -class TestLogger: - def test_init(self) -> None: - log = m.Log() - assert log.lines == () - assert log.dump() == "" - - def test_record_lines(self) -> None: - log = m.Log() - log.record(*TEST_LINES) - assert log.lines == TEST_LINES - assert log.dump(filter_ansi=False) == "\n".join(TEST_LINES) - - def test_record_line_with_newlines(self) -> None: - log = m.Log() - log.record("\n".join(TEST_LINES)) - assert log.lines == TEST_LINES - assert log.dump(filter_ansi=False) == "\n".join(TEST_LINES) - - def test_call(self) -> None: - log = m.Log() - log(*TEST_LINES) - assert log.lines == TEST_LINES - assert log.dump() == "line 1\nfoo\nbar\nlast line" - - def test_dump_filter(self) -> None: - log = m.Log() - log.record(*TEST_LINES) - assert log.lines == TEST_LINES - assert log.dump() == "line 1\nfoo\nbar\nlast line" - - def test_dump_index(self) -> None: - log = m.Log() - log.record(*TEST_LINES) - assert log.dump(start=1, end=3) == "foo\nbar" - - def test_clear(self) -> None: - log = m.Log() - log.record(*TEST_LINES) - assert len(log.lines) > 0 - log.clear() - assert len(log.lines) == 0 - - -def test_LOG() -> None: - assert isinstance(m.LOG, m.Log) diff --git a/tests/_utils/tests/test_system.py b/tests/_utils/tests/test_system.py deleted file mode 100644 index d110e260f..000000000 --- a/tests/_utils/tests/test_system.py +++ /dev/null @@ -1,78 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -import sys -from pathlib import Path -from subprocess import CompletedProcess -from unittest.mock import MagicMock - -import pytest -from pytest_mock import MockerFixture - -from .. 
import system as m - - -@pytest.fixture -def mock_subprocess_run(mocker: MockerFixture) -> MagicMock: - return mocker.patch.object(m, "stdlib_run") - - -CMD = "legate script.py --cpus 4" - - -class TestSystem: - def test_init(self) -> None: - s = m.System() - assert s.dry_run is False - - def test_run(self, mock_subprocess_run: MagicMock) -> None: - s = m.System() - - expected = m.ProcessResult( - CMD, Path("test/file"), returncode=10, output="" - ) - mock_subprocess_run.return_value = CompletedProcess( - CMD, 10, stdout="" - ) - - result = s.run(CMD.split(), Path("test/file")) - mock_subprocess_run.assert_called() - - assert result == expected - - def test_dry_run(self, mock_subprocess_run: MagicMock) -> None: - s = m.System(dry_run=True) - - result = s.run(CMD.split(), Path("test/file")) - mock_subprocess_run.assert_not_called() - - assert result.output == "" - assert result.skipped - - def test_cpus(self) -> None: - s = m.System() - cpus = s.cpus - assert len(cpus) > 0 - assert all(len(cpu.ids) > 0 for cpu in cpus) - - @pytest.mark.skipif(sys.platform != "linux", reason="pynvml required") - def test_gpus(self) -> None: - s = m.System() - # can't really assume / test much here - s.gpus diff --git a/tests/_utils/tests/test_types.py b/tests/_utils/tests/test_types.py deleted file mode 100644 index 30fe05a37..000000000 --- a/tests/_utils/tests/test_types.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -from .. import types as m - - -class TestCPUInfo: - def test_fields(self) -> None: - assert set(m.CPUInfo.__dataclass_fields__) == {"ids"} - - -class TestGPUInfo: - def test_fields(self) -> None: - assert set(m.GPUInfo.__dataclass_fields__) == {"id", "total"} diff --git a/tests/_utils/tests/test_ui.py b/tests/_utils/tests/test_ui.py deleted file mode 100644 index 9cc92948a..000000000 --- a/tests/_utils/tests/test_ui.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright 2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Consolidate test configuration from command-line and environment. - -""" -from __future__ import annotations - -from datetime import timedelta - -import pytest -from pytest_mock import MockerFixture - -from .. 
import UI_WIDTH, ui as m
-
-
-@pytest.fixture(autouse=True)
-def use_plain_text(mocker: MockerFixture) -> None:
-    mocker.patch.object(m, "bright", m._text)
-    mocker.patch.object(m, "dim", m._text)
-    mocker.patch.object(m, "white", m._text)
-    mocker.patch.object(m, "cyan", m._text)
-    mocker.patch.object(m, "red", m._text)
-    mocker.patch.object(m, "green", m._text)
-    mocker.patch.object(m, "yellow", m._text)
-
-
-def test_banner_simple() -> None:
-    assert (
-        m.banner("some text")
-        == "\n" + "#" * UI_WIDTH + "\n### some text\n" + "#" * UI_WIDTH
-    )
-
-
-def test_banner_full() -> None:
-    assert (
-        m.banner("some text", char="*", width=100, details=["a", "b"])
-        == "\n"
-        + "*" * 100
-        + "\n*** \n*** some text\n*** \n*** a\n*** b\n*** \n"
-        + "*" * 100
-    )
-
-
-def test_rule_default() -> None:
-    assert m.rule() == "    " + "~" * (UI_WIDTH - 4)
-
-
-def test_rule_with_args() -> None:
-    assert m.rule(10, "-") == " " * 10 + "-" * (UI_WIDTH - 10)
-
-
-def test_shell() -> None:
-    assert m.shell("cmd --foo") == "+cmd --foo"
-
-
-def test_shell_with_char() -> None:
-    assert m.shell("cmd --foo", char="") == "cmd --foo"
-
-
-def test_passed() -> None:
-    assert m.passed("msg") == "[PASS] msg"
-
-
-def test_passed_with_details() -> None:
-    assert m.passed("msg", details=["a", "b"]) == "[PASS] msg\n   a\n   b"
-
-
-def test_failed() -> None:
-    assert m.failed("msg") == "[FAIL] msg"
-
-
-def test_failed_with_details() -> None:
-    assert m.failed("msg", details=["a", "b"]) == "[FAIL] msg\n   a\n   b"
-
-
-def test_skipped() -> None:
-    assert m.skipped("msg") == "[SKIP] msg"
-
-
-def test_summary() -> None:
-    assert (
-        m.summary("foo", 12, 11, timedelta(seconds=2.123))
-        == f"{'foo: Passed 11 of 12 tests (91.7%) in 2.12s': >{UI_WIDTH}}"
-    )
-
-
-def test_summary_no_justify() -> None:
-    assert (
-        m.summary("foo", 12, 11, timedelta(seconds=2.123), justify=False)
-        == "foo: Passed 11 of 12 tests (91.7%) in 2.12s"
-    )
diff --git a/tests/_utils/types.py b/tests/_utils/types.py
deleted file mode 100644
index 1641bd597..000000000
--- a/tests/_utils/types.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Provide types that are useful throughout the test driver code.
-
-"""
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Dict, List
-
-from typing_extensions import TypeAlias
-
-
-@dataclass(frozen=True)
-class CPUInfo:
-    """Encapsulate information about a single CPU"""
-
-    #: IDs of hyperthreading sibling cores for a given physical core
-    ids: tuple[int, ...]
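
Because CPUInfo above is declared with dataclass(frozen=True), instances are
immutable and hashable, so they compare by value and deduplicate cleanly in
sets. A standalone sketch mirroring the class (not an import from the deleted
module):

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class CPUInfo:
        ids: tuple  # hyperthread sibling IDs for one physical core

    core = CPUInfo(ids=(0, 6))
    assert core.ids == (0, 6)
    # frozen=True generates __hash__, so set-based deduplication works:
    assert {core, CPUInfo(ids=(0, 6))} == {CPUInfo(ids=(0, 6))}
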
-
-
-@dataclass(frozen=True)
-class GPUInfo:
-    """Encapsulate information about a single GPU"""
-
-    #: ID of the GPU to specify in test shards
-    id: int
-
-    #: The total framebuffer memory of this GPU
-    total: int
-
-
-#: Represent command line arguments
-ArgList = List[str]
-
-
-#: Represent str->str environment variable mappings
-EnvDict: TypeAlias = Dict[str, str]
diff --git a/tests/_utils/ui.py b/tests/_utils/ui.py
deleted file mode 100644
index eaa97d7c0..000000000
--- a/tests/_utils/ui.py
+++ /dev/null
@@ -1,229 +0,0 @@
-# Copyright 2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Helper functions for simple text UI output.
-
-The color functions in this module require ``colorama`` to be installed in
-order to generate color output. If ``colorama`` is not available, plain
-text output (i.e. without ANSI color codes) will be generated.
-
-"""
-from __future__ import annotations
-
-import sys
-from datetime import timedelta
-from typing import Iterable
-
-from typing_extensions import TypeAlias
-
-from . import UI_WIDTH
-
-Details: TypeAlias = Iterable[str]
-
-
-def _text(text: str) -> str:
-    return text
-
-
-try:
-    import colorama  # type: ignore[import]
-
-    def bright(text: str) -> str:
-        return f"{colorama.Style.BRIGHT}{text}{colorama.Style.RESET_ALL}"
-
-    def dim(text: str) -> str:
-        return f"{colorama.Style.DIM}{text}{colorama.Style.RESET_ALL}"
-
-    def white(text: str) -> str:
-        return f"{colorama.Fore.WHITE}{text}{colorama.Style.RESET_ALL}"
-
-    def cyan(text: str) -> str:
-        return f"{colorama.Fore.CYAN}{text}{colorama.Style.RESET_ALL}"
-
-    def red(text: str) -> str:
-        return f"{colorama.Fore.RED}{text}{colorama.Style.RESET_ALL}"
-
-    def green(text: str) -> str:
-        return f"{colorama.Fore.GREEN}{text}{colorama.Style.RESET_ALL}"
-
-    def yellow(text: str) -> str:
-        return f"{colorama.Fore.YELLOW}{text}{colorama.Style.RESET_ALL}"
-
-    if sys.platform == "win32":
-        colorama.init()
-
-except ImportError:
-
-    bright = dim = white = cyan = red = green = yellow = _text
-
-
-def _format_details(
-    details: Iterable[str] | None = None, pre: str = "   "
-) -> str:
-    if details:
-        return f"{pre}" + f"\n{pre}".join(f"{line}" for line in details)
-    return ""
-
-
-def banner(
-    heading: str,
-    *,
-    char: str = "#",
-    width: int = UI_WIDTH,
-    details: Iterable[str] | None = None,
-) -> str:
-    """Generate a title banner, with optional details included.
-
-    Parameters
-    ----------
-    heading : str
-        Text to use for the title
-
-    char : str, optional
-        A character to use to frame the banner. (default: "#")
-
-    width : int, optional
-        How wide to draw the banner. 
(Note: user-supplied heading or
-        details will not be truncated if they exceed this width)
-
-    details : Iterable[str], optional
-        A list of lines to display inside the banner area below the heading
-
-    """
-    pre = f"{char*3} "
-    divider = char * width
-    if not details:
-        return f"\n{divider}\n{pre}{heading}\n{divider}"
-    return f"""
-{divider}
-{pre}
-{pre}{heading}
-{pre}
-{_format_details(details, pre)}
-{pre}
-{divider}"""
-
-
-def failed(msg: str, *, details: Details | None = None) -> str:
-    """Report a failed test result with a bright red [FAIL].
-
-    Parameters
-    ----------
-    msg : str
-        Text to display after [FAIL]
-
-    details : Iterable[str], optional
-        A sequence of text lines to display below the ``msg`` line
-
-    """
-    if details:
-        return f"{bright(red('[FAIL]'))} {msg}\n{_format_details(details)}"
-    return f"{bright(red('[FAIL]'))} {msg}"
-
-
-def passed(msg: str, *, details: Details | None = None) -> str:
-    """Report a passed test result with a bright green [PASS].
-
-    Parameters
-    ----------
-    msg : str
-        Text to display after [PASS]
-
-    details : Iterable[str], optional
-        A sequence of text lines to display below the ``msg`` line
-
-    """
-    if details:
-        return f"{bright(green('[PASS]'))} {msg}\n{_format_details(details)}"
-    return f"{bright(green('[PASS]'))} {msg}"
-
-
-def rule(pad: int = 4, char: str = "~") -> str:
-    """Generate a horizontal rule.
-
-    Parameters
-    ----------
-    pad : int, optional
-        How much whitespace to precede the rule. (default: 4)
-
-    char : str, optional
-        A character to use to "draw" the rule. (default: "~")
-
-    """
-    w = UI_WIDTH - pad
-    return f"{char*w: >{UI_WIDTH}}"
-
-
-def shell(cmd: str, *, char: str = "+") -> str:
-    """Report a shell command in a dim white color.
-
-    Parameters
-    ----------
-    cmd : str
-        The shell command string to display
-
-    char : str, optional
-        A character to prefix the ``cmd`` with. (default: "+")
-
-    """
-    return dim(white(f"{char}{cmd}"))
-
-
-def skipped(msg: str) -> str:
-    """Report a skipped test with a cyan [SKIP]
-
-    Parameters
-    ----------
-    msg : str
-        Text to display after [SKIP]
-
-    """
-    return f"{cyan('[SKIP]')} {msg}"
-
-
-def summary(
-    name: str,
-    total: int,
-    passed: int,
-    time: timedelta,
-    *,
-    justify: bool = True,
-) -> str:
-    """Generate a test result summary line.
-
-    The output is bright green if all tests passed, otherwise bright red.
-
-    Parameters
-    ----------
-    name : str
-        A name to display in this summary line.
-
-    total : int
-        The total number of tests to report.
-
-    passed : int
-        The number of passed tests to report.
- - time : timedelta - The time taken to run the tests - - """ - summary = ( - f"{name}: Passed {passed} of {total} tests ({passed/total*100:0.1f}%) " - f"in {time.total_seconds():0.2f}s" - if total > 0 - else f"{name}: 0 tests are running, Please check" - ) - color = green if passed == total and total > 0 else red - return bright(color(f"{summary: >{UI_WIDTH}}" if justify else summary)) From 31d6bb928ceb1f48d4e36a6497defd5b8c239fda Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 10 Oct 2022 15:05:52 -0700 Subject: [PATCH 05/89] Make the code compile with bounds checks (#648) * Make the code compile with bounds checks * Fix bounds checks issues in the CPU sorting code * Fix "out-of-bounds" accesses in unary reductions on GPUs * Update the comments to make them accurate --- src/cunumeric/sort/sort_cpu.inl | 24 ++++++++++++------- .../unary/scalar_unary_red_template.inl | 2 +- src/cunumeric/unary/unary_red.cu | 7 ++++++ 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/cunumeric/sort/sort_cpu.inl b/src/cunumeric/sort/sort_cpu.inl index 3f04ed303..25323801b 100644 --- a/src/cunumeric/sort/sort_cpu.inl +++ b/src/cunumeric/sort/sort_cpu.inl @@ -482,8 +482,9 @@ void sample_sort_nd(SortPiece> local_sorted, comm::coll::collAllgather( worker_counts.ptr(my_rank), worker_counts.ptr(0), 1, comm::coll::CollDataType::CollInt, comm); + auto p_worker_count = worker_counts.ptr(0); int32_t worker_count = - std::accumulate(worker_counts.ptr(0), worker_counts.ptr(num_ranks), 0, std::plus()); + std::accumulate(p_worker_count, p_worker_count + num_ranks, 0, std::plus()); if (worker_count < num_ranks) { const size_t number_sort_groups = num_ranks / num_sort_ranks; @@ -565,7 +566,8 @@ void sample_sort_nd(SortPiece> local_sorted, for (size_t sort_rank = 0; sort_rank < num_sort_ranks; ++sort_rank) { comm_size[sort_ranks[sort_rank]] = num_samples_l * sizeof(SegmentSample); } - thrust::exclusive_scan(exec, comm_size.ptr(0), comm_size.ptr(num_ranks), rdispls.ptr(0), 0); + auto p_comm_size = comm_size.ptr(0); + thrust::exclusive_scan(exec, p_comm_size, p_comm_size + num_ranks, rdispls.ptr(0), 0); comm::coll::collAlltoallv(samples_l.ptr(0), comm_size.ptr(0), // num_samples_l*size for all in sort group @@ -612,8 +614,9 @@ void sample_sort_nd(SortPiece> local_sorted, auto segment_blocks = create_buffer(num_sort_ranks * num_segments_l); // initialize sizes to send [r][segment] - auto size_send = create_buffer(num_sort_ranks * (num_segments_l + 1)); - std::fill(size_send.ptr(0), size_send.ptr(num_sort_ranks * (num_segments_l + 1)), 0); + auto size_send = create_buffer(num_sort_ranks * (num_segments_l + 1)); + auto p_size_send = size_send.ptr(0); + std::fill(p_size_send, p_size_send + num_sort_ranks * (num_segments_l + 1), 0); { for (int32_t segment = 0; segment < num_segments_l; ++segment) { @@ -685,7 +688,8 @@ void sample_sort_nd(SortPiece> local_sorted, for (size_t sort_rank = 0; sort_rank < num_sort_ranks; ++sort_rank) { comm_size[sort_ranks[sort_rank]] = num_segments_l + 1; } - thrust::exclusive_scan(exec, comm_size.ptr(0), comm_size.ptr(num_ranks), displs.ptr(0), 0); + auto p_comm_size = comm_size.ptr(0); + thrust::exclusive_scan(exec, p_comm_size, p_comm_size + num_ranks, displs.ptr(0), 0); comm::coll::collAlltoallv( size_send.ptr(0), @@ -781,10 +785,12 @@ void sample_sort_nd(SortPiece> local_sorted, recv_size_total[sort_ranks[sort_rank]] = sizeof(VAL) * size_recv[sort_rank * (num_segments_l + 1) + num_segments_l]; } + auto p_send_size_total = send_size_total.ptr(0); + auto p_recv_size_total = 
recv_size_total.ptr(0); thrust::exclusive_scan( - exec, send_size_total.ptr(0), send_size_total.ptr(num_ranks), sdispls.ptr(0), 0); + exec, p_send_size_total, p_send_size_total + num_ranks, sdispls.ptr(0), 0); thrust::exclusive_scan( - exec, recv_size_total.ptr(0), recv_size_total.ptr(num_ranks), rdispls.ptr(0), 0); + exec, p_recv_size_total, p_recv_size_total + num_ranks, rdispls.ptr(0), 0); comm::coll::collAlltoallv(val_send_buffer.ptr(0), send_size_total.ptr(0), @@ -804,9 +810,9 @@ void sample_sort_nd(SortPiece> local_sorted, } thrust::exclusive_scan( - exec, send_size_total.ptr(0), send_size_total.ptr(num_ranks), sdispls.ptr(0), 0); + exec, p_send_size_total, p_send_size_total + num_ranks, sdispls.ptr(0), 0); thrust::exclusive_scan( - exec, recv_size_total.ptr(0), recv_size_total.ptr(num_ranks), rdispls.ptr(0), 0); + exec, p_recv_size_total, p_recv_size_total + num_ranks, rdispls.ptr(0), 0); comm::coll::collAlltoallv(idc_send_buffer.ptr(0), send_size_total.ptr(0), sdispls.ptr(0), diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 482d96187..3c097aaf8 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -100,8 +100,8 @@ struct ScalarUnaryRed { void execute() const noexcept { -#ifndef LEGION_BOUNDS_CHECKS auto identity = LG_OP::identity; +#ifndef LEGION_BOUNDS_CHECKS // The constexpr if here prevents the DenseReduction from being instantiated for GPU kernels // which limits compile times and binary sizes. if constexpr (KIND != VariantKind::GPU) { diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index 1cc0d4653..99682fc8b 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -270,6 +270,13 @@ static __device__ __forceinline__ Point local_reduce(LHS& result, } #endif +#ifdef LEGION_BOUNDS_CHECKS + // Note: this isn't necessary because we know that the affine transformation on the output + // accessor will ignore coordinates of the collapsed dimension. However, Legion's bounds checks + // want the accessor to honor the sub-rectangle passed when it was created, so we need to + // put points back in the bounds to appease the checks. 
+  point[collapsed_dim] = domain.lo[collapsed_dim];
+#endif
   return point;
 }

From 9023e5bec3db087cacfa10beb6d5b84c3c577eed Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 10 Oct 2022 16:34:23 -0700
Subject: [PATCH 06/89] [pre-commit.ci] pre-commit autoupdate (#650)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

updates:
- [github.com/psf/black: 22.8.0 → 22.10.0](https://github.com/psf/black/compare/22.8.0...22.10.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 878ef81ac..bdc37baff 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,7 +4,7 @@ repos:
     hooks:
       - id: isort
   - repo: https://github.com/psf/black
-    rev: 22.8.0
+    rev: 22.10.0
    hooks:
       - id: black
   - repo: https://github.com/PyCQA/flake8

From d0231c32b0ea129ec57a3896f2b77dacf9225437 Mon Sep 17 00:00:00 2001
From: Marcin Zalewski
Date: Mon, 10 Oct 2022 22:39:53 -0700
Subject: [PATCH 07/89] Add changelog config (#605)

Co-authored-by: Marcin Zalewski
---
 .github/release.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 .github/release.yml

diff --git a/.github/release.yml b/.github/release.yml
new file mode 100644
index 000000000..0a37704fb
--- /dev/null
+++ b/.github/release.yml
@@ -0,0 +1,17 @@
+changelog:
+  exclude:
+    labels:
+      - category:task
+  categories:
+    - title: 🐛 Bug Fixes
+      labels:
+        - category:bug-fix
+    - title: 🚀 New Features
+      labels:
+        - category:new-feature
+    - title: 🛠️ Improvements
+      labels:
+        - category:improvement
+    - title: 📖 Documentation
+      labels:
+        - category:documentation
\ No newline at end of file

From 2d476815bf252af6f4b38dcabec1dcf63e6e789e Mon Sep 17 00:00:00 2001
From: Manolis Papadakis
Date: Wed, 12 Oct 2022 14:18:23 -0700
Subject: [PATCH 08/89] MatVec & MatVecMul use reduction stores, not outputs (#646)

Co-authored-by: Manolis Papadakis
---
 src/cunumeric/mapper.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc
index 855121cd2..ada6ca268 100644
--- a/src/cunumeric/mapper.cc
+++ b/src/cunumeric/mapper.cc
@@ -119,14 +119,14 @@ std::vector CuNumericMapper::store_mappings(
       // TODO: Our actual requirements are a little less strict than this; we require each array or
       // vector to have a stride of 1 on at least one dimension.
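       // The loop below now requests exact mappings for the task's reduction stores rather
       // than its outputs, since MatVec/MatVecMul accumulate their results through reduction
       // stores instead of writing to output stores.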
       std::vector mappings;
-      auto& inputs = task.inputs();
-      auto& outputs = task.outputs();
+      auto& inputs = task.inputs();
+      auto& reductions = task.reductions();
       for (auto& input : inputs) {
         mappings.push_back(StoreMapping::default_mapping(input, options.front()));
         mappings.back().policy.exact = true;
       }
-      for (auto& output : outputs) {
-        mappings.push_back(StoreMapping::default_mapping(output, options.front()));
+      for (auto& reduction : reductions) {
+        mappings.push_back(StoreMapping::default_mapping(reduction, options.front()));
         mappings.back().policy.exact = true;
       }
       return std::move(mappings);

From 7144efefba8443cde1a644a74eb94ee6a0f2b3df Mon Sep 17 00:00:00 2001
From: Manolis Papadakis
Date: Wed, 12 Oct 2022 16:51:53 -0700
Subject: [PATCH 09/89] Remove leftover files from old build (#615)

---
 src/Makefile                   |  87 -----------------
 src/cunumeric.mk               | 166 ---------------------------------
 src/cunumeric/random/random.mk |  24 -----
 src/cunumeric/sort/sort.mk     |  50 ----------
 4 files changed, 327 deletions(-)
 delete mode 100644 src/Makefile
 delete mode 100644 src/cunumeric.mk
 delete mode 100644 src/cunumeric/random/random.mk
 delete mode 100644 src/cunumeric/sort/sort.mk

diff --git a/src/Makefile b/src/Makefile
deleted file mode 100644
index 76ecd56d8..000000000
--- a/src/Makefile
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2021-2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-ifndef LEGATE_DIR
-$(error LEGATE_DIR variable is not defined, aborting build)
-endif
-ifndef OPENBLAS_PATH
-$(error OPENBLAS_PATH variable is not defined, aborting build)
-endif
-ifndef OPENBLAS_LIBNAME
-$(error OPENBLAS_PATH variable is not defined, aborting build)
-endif
-ifndef TBLIS_PATH
-$(error TBLIS_PATH variable is not defined, aborting build)
-endif
-ifeq ($(strip $(USE_CUDA)),1)
-ifndef CUTENSOR_PATH
-$(error CUTENSOR_PATH variable is not defined, aborting build)
-endif
-ifndef NCCL_PATH
-$(error NCCL_PATH variable is not defined, aborting build)
-endif
-endif # ifeq ($(strip $(USE_CUDA)),1)
-ifndef THRUST_PATH
-$(error THRUST_PATH variable is not defined, aborting build)
-endif
-
-include $(LEGATE_DIR)/share/legate/config.mk
-
-LIBNAME = libcunumeric
-
-CURAND_PATH ?=
-
-CC_FLAGS ?=
-CC_FLAGS += -I. -I$(OPENBLAS_PATH)/include -I$(TBLIS_PATH)/include -I$(THRUST_PATH)
-CC_FLAGS += -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP
-
-ifdef CURAND_PATH
-BUILD_CURAND_TASKS = 1
-CC_FLAGS += -I$(CURAND_PATH)/include -DCUNUMERIC_CURAND_FOR_CPU_BUILD
-else
-ifeq ($(strip $(USE_CUDA)),1)
-BUILD_CURAND_TASKS = 1
-else
-BUILD_CURAND_TASKS = 0
-endif
-endif
-
-LD_FLAGS ?=
-LD_FLAGS += -L$(OPENBLAS_PATH)/lib -l$(OPENBLAS_LIBNAME) -Wl,-rpath,$(OPENBLAS_PATH)/lib
-LD_FLAGS += -L$(TBLIS_PATH)/lib -ltblis -Wl,-rpath,$(TBLIS_PATH)/lib
-ifeq ($(strip $(USE_CUDA)),1)
-LD_FLAGS += -lcublas -lcusolver -lcufft
-LD_FLAGS += -L$(CUTENSOR_PATH)/lib -lcutensor -Wl,-rpath,$(CUTENSOR_PATH)/lib
-LD_FLAGS += -L$(NCCL_PATH)/lib -lnccl -Wl,-rpath,$(NCCL_PATH)/lib
-endif
-NVCC_FLAGS ?=
-NVCC_FLAGS += -I. -I$(THRUST_PATH) -I$(CUTENSOR_PATH)/include -I$(NCCL_PATH)/include -Wno-deprecated-declarations
-
-ifeq ($(strip $(DEBUG)),1)
-CC_FLAGS += -DDEBUG_CUNUMERIC
-NVCC_FLAGS += -DDEBUG_CUNUMERIC
-endif
-
-CHECK_BOUNDS ?= 0
-ifeq ($(strip $(CHECK_BOUNDS)),1)
-CC_FLAGS += -DBOUNDS_CHECKS
-endif
-
-GEN_CPU_SRC =
-GEN_GPU_SRC =
-
-include cunumeric.mk
-
-include $(LEGATE_DIR)/share/legate/legate.mk
diff --git a/src/cunumeric.mk b/src/cunumeric.mk
deleted file mode 100644
index 1b7f17080..000000000
--- a/src/cunumeric.mk
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright 2021-2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# List all the application source files that need OpenMP separately
-# since we have to add the -fopenmp flag to CC_FLAGS for them
-GEN_CPU_SRC += cunumeric/ternary/where.cc            \
-               cunumeric/scan/scan_global.cc         \
-               cunumeric/scan/scan_local.cc          \
-               cunumeric/binary/binary_op.cc         \
-               cunumeric/binary/binary_red.cc        \
-               cunumeric/bits/packbits.cc            \
-               cunumeric/bits/unpackbits.cc          \
-               cunumeric/unary/scalar_unary_red.cc   \
-               cunumeric/unary/unary_op.cc           \
-               cunumeric/unary/unary_red.cc          \
-               cunumeric/unary/convert.cc            \
-               cunumeric/nullary/arange.cc           \
-               cunumeric/nullary/eye.cc              \
-               cunumeric/nullary/fill.cc             \
-               cunumeric/nullary/window.cc           \
-               cunumeric/index/advanced_indexing.cc  \
-               cunumeric/index/choose.cc             \
-               cunumeric/index/repeat.cc             \
-               cunumeric/index/wrap.cc               \
-               cunumeric/index/zip.cc                \
-               cunumeric/item/read.cc                \
-               cunumeric/item/write.cc               \
-               cunumeric/matrix/contract.cc          \
-               cunumeric/matrix/diag.cc              \
-               cunumeric/matrix/gemm.cc              \
-               cunumeric/matrix/matmul.cc            \
-               cunumeric/matrix/matvecmul.cc         \
-               cunumeric/matrix/dot.cc               \
-               cunumeric/matrix/potrf.cc             \
-               cunumeric/matrix/solve.cc             \
-               cunumeric/matrix/syrk.cc              \
-               cunumeric/matrix/tile.cc              \
-               cunumeric/matrix/transpose.cc         \
-               cunumeric/matrix/trilu.cc             \
-               cunumeric/matrix/trsm.cc              \
-               cunumeric/matrix/util.cc              \
-               cunumeric/random/rand.cc              \
-               cunumeric/search/argwhere.cc          \
-               cunumeric/search/nonzero.cc           \
-               cunumeric/set/unique.cc               \
-               cunumeric/set/unique_reduce.cc        \
-               cunumeric/stat/bincount.cc            \
-               cunumeric/convolution/convolve.cc     \
-               cunumeric/transform/flip.cc           \
-               cunumeric/arg.cc                      \
-               cunumeric/mapper.cc
-
-GEN_CPU_SRC += cunumeric/cephes/chbevl.cc \
-               cunumeric/cephes/i0.cc
-
-ifeq ($(strip $(USE_OPENMP)),1)
-GEN_CPU_SRC += cunumeric/ternary/where_omp.cc            \
-               cunumeric/scan/scan_global_omp.cc         \
-               cunumeric/scan/scan_local_omp.cc          \
-               cunumeric/binary/binary_op_omp.cc         \
-               cunumeric/binary/binary_red_omp.cc        \
-               cunumeric/bits/packbits_omp.cc            \
-               cunumeric/bits/unpackbits_omp.cc          \
-               cunumeric/unary/unary_op_omp.cc           \
-               cunumeric/unary/scalar_unary_red_omp.cc   \
-               cunumeric/unary/unary_red_omp.cc          \
-               cunumeric/unary/convert_omp.cc            \
-               cunumeric/nullary/arange_omp.cc           \
-               cunumeric/nullary/eye_omp.cc              \
-               cunumeric/nullary/fill_omp.cc             \
-               cunumeric/nullary/window_omp.cc           \
-               cunumeric/index/advanced_indexing_omp.cc  \
-               cunumeric/index/choose_omp.cc             \
-               cunumeric/index/repeat_omp.cc             \
-               cunumeric/index/wrap_omp.cc               \
-               cunumeric/index/zip_omp.cc                \
-               cunumeric/matrix/contract_omp.cc          \
-               cunumeric/matrix/diag_omp.cc              \
-               cunumeric/matrix/gemm_omp.cc              \
-               cunumeric/matrix/matmul_omp.cc            \
-               cunumeric/matrix/matvecmul_omp.cc         \
-               cunumeric/matrix/dot_omp.cc               \
-               cunumeric/matrix/potrf_omp.cc             \
-               cunumeric/matrix/solve_omp.cc             \
-               cunumeric/matrix/syrk_omp.cc              \
-               cunumeric/matrix/tile_omp.cc              \
-               cunumeric/matrix/transpose_omp.cc         \
-               cunumeric/matrix/trilu_omp.cc             \
-               cunumeric/matrix/trsm_omp.cc              \
-               cunumeric/matrix/util_omp.cc              \
-               cunumeric/random/rand_omp.cc              \
-               cunumeric/search/argwhere_omp.cc          \
-               cunumeric/search/nonzero_omp.cc           \
-               cunumeric/set/unique_omp.cc               \
-               cunumeric/stat/bincount_omp.cc            \
-               cunumeric/convolution/convolve_omp.cc     \
-               cunumeric/transform/flip_omp.cc
-endif
-
-GEN_GPU_SRC += cunumeric/ternary/where.cu            \
-               cunumeric/scan/scan_global.cu         \
-               cunumeric/scan/scan_local.cu          \
-               cunumeric/binary/binary_op.cu         \
-               cunumeric/binary/binary_red.cu        \
-               cunumeric/bits/packbits.cu            \
-               cunumeric/bits/unpackbits.cu          \
-               cunumeric/unary/scalar_unary_red.cu   \
-               cunumeric/unary/unary_red.cu          \
-               cunumeric/unary/unary_op.cu           \
-               cunumeric/unary/convert.cu            \
-               cunumeric/nullary/arange.cu           \
-               cunumeric/nullary/eye.cu              \
-               cunumeric/nullary/fill.cu             \
-               cunumeric/nullary/window.cu           \
-               cunumeric/index/advanced_indexing.cu  \
-               cunumeric/index/choose.cu             \
-               cunumeric/index/repeat.cu             \
-               cunumeric/index/wrap.cu               \
-               cunumeric/index/zip.cu                \
-               cunumeric/item/read.cu                \
-               cunumeric/item/write.cu               \
-               cunumeric/matrix/contract.cu          \
-               cunumeric/matrix/diag.cu              \
-               cunumeric/matrix/gemm.cu              \
-               cunumeric/matrix/matmul.cu            \
-               cunumeric/matrix/matvecmul.cu         \
-               cunumeric/matrix/dot.cu               \
-               cunumeric/matrix/potrf.cu             \
-               cunumeric/matrix/solve.cu             \
-               cunumeric/matrix/syrk.cu              \
-               cunumeric/matrix/tile.cu              \
-               cunumeric/matrix/transpose.cu         \
-               cunumeric/matrix/trilu.cu             \
-               cunumeric/matrix/trsm.cu              \
-               cunumeric/random/rand.cu              \
-               cunumeric/search/argwhere.cu          \
-               cunumeric/search/nonzero.cu           \
-               cunumeric/set/unique.cu               \
-               cunumeric/stat/bincount.cu            \
-               cunumeric/convolution/convolve.cu     \
-               cunumeric/fft/fft.cu                  \
-               cunumeric/transform/flip.cu           \
-               cunumeric/cudalibs.cu                 \
-               cunumeric/cunumeric.cu
-
-include cunumeric/sort/sort.mk
-
-ifeq ($(strip $(BUILD_CURAND_TASKS)),1)
-include cunumeric/random/random.mk
-endif
-
-GEN_CPU_SRC += cunumeric/cunumeric.cc # This must always be the last file!
-                                      # It guarantees we do our registration callback
-                                      # only after all task variants are recorded
diff --git a/src/cunumeric/random/random.mk b/src/cunumeric/random/random.mk
deleted file mode 100644
index e2b2f20a6..000000000
--- a/src/cunumeric/random/random.mk
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright 2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-GEN_CPU_SRC += cunumeric/random/bitgenerator.cc                              \
-               cunumeric/random/randutil/generator_host.cc                  \
-               cunumeric/random/randutil/generator_host_straightforward.cc  \
-               cunumeric/random/randutil/generator_host_advanced.cc
-
-GEN_GPU_SRC += cunumeric/random/bitgenerator.cu                                \
-               cunumeric/random/randutil/generator_device.cu                  \
-               cunumeric/random/randutil/generator_device_straightforward.cu  \
-               cunumeric/random/randutil/generator_device_advanced.cu
\ No newline at end of file
diff --git a/src/cunumeric/sort/sort.mk b/src/cunumeric/sort/sort.mk
deleted file mode 100644
index f13422c69..000000000
--- a/src/cunumeric/sort/sort.mk
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright 2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-GEN_CPU_SRC += cunumeric/sort/sort.cc \
-               cunumeric/sort/searchsorted.cc
-ifeq ($(strip $(USE_OPENMP)),1)
-GEN_CPU_SRC += cunumeric/sort/sort_omp.cc \
-               cunumeric/sort/searchsorted_omp.cc
-endif
-
-GEN_GPU_SRC += cunumeric/sort/sort.cu                   \
-               cunumeric/sort/searchsorted.cu           \
-               cunumeric/sort/cub_sort_bool.cu          \
-               cunumeric/sort/cub_sort_int8.cu          \
-               cunumeric/sort/cub_sort_int16.cu         \
-               cunumeric/sort/cub_sort_int32.cu         \
-               cunumeric/sort/cub_sort_int64.cu         \
-               cunumeric/sort/cub_sort_uint8.cu         \
-               cunumeric/sort/cub_sort_uint16.cu        \
-               cunumeric/sort/cub_sort_uint32.cu        \
-               cunumeric/sort/cub_sort_uint64.cu        \
-               cunumeric/sort/cub_sort_half.cu          \
-               cunumeric/sort/cub_sort_float.cu         \
-               cunumeric/sort/cub_sort_double.cu        \
-               cunumeric/sort/thrust_sort_bool.cu       \
-               cunumeric/sort/thrust_sort_int8.cu       \
-               cunumeric/sort/thrust_sort_int16.cu      \
-               cunumeric/sort/thrust_sort_int32.cu      \
-               cunumeric/sort/thrust_sort_int64.cu      \
-               cunumeric/sort/thrust_sort_uint8.cu      \
-               cunumeric/sort/thrust_sort_uint16.cu     \
-               cunumeric/sort/thrust_sort_uint32.cu     \
-               cunumeric/sort/thrust_sort_uint64.cu     \
-               cunumeric/sort/thrust_sort_half.cu       \
-               cunumeric/sort/thrust_sort_float.cu      \
-               cunumeric/sort/thrust_sort_double.cu     \
-               cunumeric/sort/thrust_sort_complex64.cu  \
-               cunumeric/sort/thrust_sort_complex128.cu

From b01ee12c6e70713cfcbd2c47ed6627aef59bdf28 Mon Sep 17 00:00:00 2001
From: Jeremy
Date: Wed, 12 Oct 2022 16:52:34 -0700
Subject: [PATCH 10/89] Set default generator based on whether ninja is available (#602)

* check for ninja to determine default CMake generator

* Address PR comments, fix typos

Co-authored-by: Manolis Papadakis
Co-authored-by: Manolis Papadakis
---
 install.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/install.py b/install.py
index 11d838b32..febf25b8b 100755
--- a/install.py
+++ b/install.py
@@ -308,7 +308,7 @@ def validate_path(path):

     cmake_flags = []
     if cmake_generator:
-        cmake_flags += [f"-G{cmake_generator}"]
+        cmake_flags += [f"-G'{cmake_generator}'"]
     if debug or verbose:
         cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")]
@@ -520,8 +520,8 @@ def driver():
         "--cmake-generator",
         dest="cmake_generator",
         required=False,
-        default="Ninja",
-        choices=["Ninja", "Unix Makefiles"],
+        default=(None if shutil.which("ninja") is None else "Ninja"),
+        choices=["Ninja", "Unix Makefiles", None],
         help="The CMake makefiles generator",
     )
     parser.add_argument(

From 50d837082c0f8faa6a49fad3467cbe39d58d4877 Mon Sep 17 00:00:00 2001
From: Manolis Papadakis
Date: Wed, 12 Oct 2022 16:53:28 -0700
Subject: [PATCH 11/89] Allow args to be passed by position and name in auto_convert (#640)

* Allow args to be passed by position and name in auto_convert

* Address PR comments

* Address PR comments
---
 cunumeric/array.py    | 11 +++---
 cunumeric/deferred.py | 91 +++++++++++++++++++++++++------------------
 2 files changed, 59 insertions(+), 43 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index dd389f995..64784d59a 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -94,7 +94,8 @@ def add_boilerplate(
     parameter (if present), to cuNumeric ndarrays.

    * Convert the special "where" parameter (if present) to a valid predicate.

    """
-    keys: Set[str] = set(array_params)
+    keys = set(array_params)
+    assert len(keys) == len(array_params)

     def decorator(func: Callable[P, R]) -> Callable[P, R]:
         assert not hasattr(
@@ -104,18 +105,18 @@ def decorator(func: Callable[P, R]) -> Callable[P, R]:
         # For each parameter specified by name, also consider the case where
         # it's passed as a positional parameter.
         indices: Set[int] = set()
-        all_formals: Set[str] = set()
         where_idx: Optional[int] = None
         out_idx: Optional[int] = None
-        for (idx, param) in enumerate(signature(func).parameters):
-            all_formals.add(param)
+        params = signature(func).parameters
+        extra = keys - set(params)
+        assert len(extra) == 0, f"unknown parameter(s): {extra}"
+        for (idx, param) in enumerate(params):
             if param == "where":
                 where_idx = idx
             elif param == "out":
                 out_idx = idx
             elif param in keys:
                 indices.add(idx)
-        assert len(keys - all_formals) == 0, "unkonwn parameter(s)"

         @wraps(func)
         def wrapper(*args: Any, **kwargs: Any) -> R:
diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 04fe6e829..54f481977 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -18,13 +18,13 @@
 from collections import Counter
 from collections.abc import Iterable
 from enum import IntEnum, unique
-from functools import reduce
+from functools import reduce, wraps
+from inspect import signature
 from itertools import product
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
-    Collection,
     Dict,
     Optional,
     Sequence,
@@ -95,24 +95,39 @@ def _prod(tpl: Sequence[int]) -> int:

 def auto_convert(
-    indices: Collection[int], keys: Sequence[str] = []
+    *thunk_params: str,
 ) -> Callable[[Callable[P, R]], Callable[P, R]]:
-    indices = set(indices)
+    """
+    Converts all named parameters to DeferredArrays.
+    """
+    keys = set(thunk_params)
+    assert len(keys) == len(thunk_params)

     def decorator(func: Callable[P, R]) -> Callable[P, R]:
-        def wrapper(*args: Any, **kwargs: Any) -> Any:
+        assert not hasattr(
+            func, "__wrapped__"
+        ), "this decorator must be the innermost"
+
+        # For each parameter specified by name, also consider the case where
+        # it's passed as a positional parameter.
+        params = signature(func).parameters
+        extra = keys - set(params)
+        assert len(extra) == 0, f"unknown parameter(s): {extra}"
+        indices = {idx for (idx, param) in enumerate(params) if param in keys}
+
+        @wraps(func)
+        def wrapper(*args: Any, **kwargs: Any) -> R:
+            # Convert relevant arguments to DeferredArrays
             self = args[0]
             args = tuple(
-                self.runtime.to_deferred_array(arg) if idx in indices else arg
+                self.runtime.to_deferred_array(arg)
+                if idx in indices and arg is not None
+                else arg
                 for (idx, arg) in enumerate(args)
             )
-            for key in keys:
-                v = kwargs.get(key, None)
-                if v is None:
-                    continue
-                v = self.runtime.to_deferred_array(v)
-                kwargs[key] = v
+            for (k, v) in kwargs.items():
+                if k in keys and v is not None:
+                    kwargs[k] = self.runtime.to_deferred_array(v)

             return func(*args, **kwargs)
@@ -350,7 +365,7 @@ def conj(self) -> NumPyThunk:
         return result

     # Copy source array to the destination array
-    @auto_convert([1])
+    @auto_convert("rhs")
     def copy(self, rhs: Any, deep: bool = False) -> None:
         if self.scalar and rhs.scalar:
             self.base.set_storage(rhs.base.storage)
@@ -858,7 +873,7 @@ def get_item(self, key: Any) -> NumPyThunk:

         return result

-    @auto_convert([2])
+    @auto_convert("rhs")
     def set_item(self, key: Any, rhs: Any) -> None:
         assert self.dtype == rhs.dtype
         # Check to see if this is advanced indexing or not
@@ -1178,7 +1193,7 @@ def swapaxes(self, axis1: int, axis2: int) -> DeferredArray:
         return result

     # Convert the source array to the destination array
-    @auto_convert([1])
+    @auto_convert("rhs")
     def convert(
         self,
         rhs: Any,
@@ -1214,7 +1229,7 @@
         if temporary:
             lhs.set_linear()

-    @auto_convert([1, 2])
+    @auto_convert("v", "lhs")
     def convolve(self, v: Any, lhs: Any, mode: ConvolveMode) -> None:
         input = self.base
         filter = v.base
@@ -1249,7 +1264,7 @@

         task.execute()

-    @auto_convert([1])
+    @auto_convert("rhs")
     def fft(
         self,
         rhs: Any,
@@ -1327,7 +1342,7 @@ def fill(self, numpy_array: Any) -> None:
         )
         self._fill(store)

-    @auto_convert([2, 4])
+    @auto_convert("rhs1_thunk", "rhs2_thunk")
     def contract(
         self,
         lhs_modes: list[str],
@@ -1595,7 +1610,7 @@ def choose(self, rhs: Any, *args: Any) -> None:
         task.execute()

     # Create or extract a diagonal from a matrix
-    @auto_convert([1])
+    @auto_convert("rhs")
     def _diag_helper(
         self,
         rhs: Any,
@@ -1712,7 +1727,7 @@ def create_scalar(value: Any, dtype: np.dtype[Any]) -> Any:
         task.execute()

     # Tile the src array onto the destination array
-    @auto_convert([1])
+    @auto_convert("rhs")
     def tile(self, rhs: Any, reps: Union[Any, Sequence[int]]) -> None:
         src_array = rhs
         dst_array = self
@@ -1739,7 +1754,7 @@ def transpose(
         result = DeferredArray(self.runtime, result, self.dtype)
         return result

-    @auto_convert([1])
+    @auto_convert("rhs")
     def trilu(self, rhs: Any, k: int, lower: bool) -> None:
         lhs = self.base
         rhs = rhs._broadcast(lhs.shape)
@@ -1780,7 +1795,7 @@
         task.execute()
         return out

-    @auto_convert([1])
+    @auto_convert("rhs")
     def flip(self, rhs: Any, axes: Union[None, int, tuple[int, ...]]) -> None:
         input = rhs.base
         output = self.base
@@ -1801,7 +1816,7 @@ def flip(self, rhs: Any, axes: Union[None, int, tuple[int, ...]]) -> None:
         task.execute()

     # Perform a bin count operation on the array
-    @auto_convert([1], ["weights"])
+    @auto_convert("rhs", "weights")
     def bincount(self, rhs: Any, weights: Optional[NumPyThunk] = None) -> None:
         weight_array = weights
         src_array = rhs
@@ -2872,7 +2887,7 @@ def random_integer(
         self.random(RandGenCode.INTEGER, [low, high])

     # Perform the unary operation and put the result in the array
-    @auto_convert([2])
+    @auto_convert("src")
     def unary_op(
         self,
         op: UnaryOpCode,
@@ -2901,7 +2916,7 @@

     # Perform a unary reduction operation from one set of dimensions down to
     # fewer
-    @auto_convert([2])
+    @auto_convert("src")
     def unary_reduction(
         self,
         op: UnaryRedCode,
@@ -3017,7 +3032,7 @@ def isclose(
         self.binary_op(BinaryOpCode.ISCLOSE, rhs1, rhs2, True, args)

     # Perform the binary operation and put the result in the lhs array
-    @auto_convert([2, 3])
+    @auto_convert("src1", "src2")
     def binary_op(
         self,
         op_code: BinaryOpCode,
@@ -3043,7 +3058,7 @@

         task.execute()

-    @auto_convert([2, 3])
+    @auto_convert("src1", "src2")
     def binary_reduction(
         self,
         op: BinaryOpCode,
@@ -3079,7 +3094,7 @@

         task.execute()

-    @auto_convert([1, 2, 3])
+    @auto_convert("src1", "src2", "src3")
     def where(self, src1: Any, src2: Any, src3: Any) -> None:
         lhs = self.base
         rhs1 = src1._broadcast(lhs.shape)
@@ -3138,15 +3153,15 @@ def compute_strides(shape: NdShape) -> tuple[int, ...]:
             stride *= dim
         return result

-    @auto_convert([1])
+    @auto_convert("src")
     def cholesky(self, src: Any, no_tril: bool = False) -> None:
         cholesky(self, src, no_tril)

-    @auto_convert([1, 2])
+    @auto_convert("a", "b")
     def solve(self, a: Any, b: Any) -> None:
         solve(self, a, b)

-    @auto_convert([2])
+    @auto_convert("rhs")
     def scan(
         self,
         op: int,
@@ -3223,7 +3238,7 @@ def unique(self) -> NumPyThunk:

         return result

-    @auto_convert([1, 2])
+    @auto_convert("rhs", "v")
     def searchsorted(self, rhs: Any, v: Any, side: SortSide = "left") -> None:
         task = self.context.create_task(CuNumericOpCode.SEARCHSORTED)
@@ -3249,7 +3264,7 @@ def searchsorted(self, rhs: Any, v: Any, side: SortSide = "left") -> None:
         task.add_scalar_arg(rhs.size, ty.int64)
         task.execute()

-    @auto_convert([1])
+    @auto_convert("rhs")
     def sort(
         self,
         rhs: Any,
@@ -3274,7 +3289,7 @@

         sort(self, rhs, argsort, axis, stable)

-    @auto_convert([1])
+    @auto_convert("rhs")
     def partition(
         self,
         rhs: Any,
@@ -3305,7 +3320,7 @@ def create_window(self, op_code: WindowOpCode, M: int, *args: Any) -> None:
             task.add_scalar_arg(arg, ty.float64)
         task.execute()

-    @auto_convert([1])
+    @auto_convert("src")
     def packbits(
         self, src: Any, axis: Union[int, None], bitorder: BitOrder
     ) -> None:
@@ -3321,7 +3336,7 @@
         task.add_constraint(p_in <= p_out * scale)  # type: ignore
         task.execute()

-    @auto_convert([1])
+    @auto_convert("src")
     def unpackbits(
         self, src: Any, axis: Union[int, None], bitorder: BitOrder
     ) -> None:
@@ -3337,7 +3352,7 @@
         task.add_constraint(p_out <= p_in * scale)  # type: ignore
         task.execute()

-    @auto_convert([1])
+    @auto_convert("src")
     def _wrap(self, src: Any, new_len: int) -> None:
         if src.base.kind == Future or src.base.transformed:
             src = src._convert_future_to_regionfield()

From 51b027ea958d05e3d2c84bcefded5a7a2a1c6392 Mon Sep 17 00:00:00 2001
From: Jeremy
Date: Thu, 13 Oct 2022 06:57:57 -0700
Subject: [PATCH 12/89] force positive values for log and sqrt tests (#580)

---
 tests/integration/test_unary_ufunc.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_unary_ufunc.py b/tests/integration/test_unary_ufunc.py
index ae624850f..5f43fbf37 100644
--- a/tests/integration/test_unary_ufunc.py
+++ b/tests/integration/test_unary_ufunc.py
@@ -215,7 +215,7 @@ def test_log_ops(op):
     check_op_input(op, astype="F", out_dtype="D")

     check_op_input(op, randint=True, a_min=3, a_max=10)
-    check_op_input(op, shape=(1,), offset=3)
+    check_op_input(op, shape=(1,), a_min=0.1, offset=3)


 even_root_ops = ("sqrt",)
@@ -231,7 +231,7 @@ def test_even_root_ops(op):
     # Complex inputs can be negative
     check_op_input(op, astype="F", out_dtype="D")
     check_op_input(op, randint=True, a_min=3, a_max=10)
-    check_op_input(op, shape=(1,), offset=3)
+    check_op_input(op, shape=(1,), a_min=0.1, offset=3)


 odd_root_ops = ("cbrt",)

From 1e16e9c5c3ca5afed17bcdcd4d59de3e8519f9d2 Mon Sep 17 00:00:00 2001
From: Manolis Papadakis
Date: Fri, 14 Oct 2022 09:03:36 -0700
Subject: [PATCH 13/89] Use right type in shmem calculation for kernels using reduce_output (#659)

Co-authored-by: Manolis Papadakis
---
 src/cunumeric/index/advanced_indexing.cu | 2 +-
 src/cunumeric/index/repeat.cu            | 2 +-
 src/cunumeric/search/nonzero.cuh         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu
index fde5590fd..a7d3f2f94 100644
--- a/src/cunumeric/index/advanced_indexing.cu
+++ b/src/cunumeric/index/advanced_indexing.cu
@@ -94,7 +94,7 @@ struct AdvancedIndexingImplBody {

     const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t);
+    size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(uint64_t);

     if (blocks >= MAX_REDUCTION_CTAS) {
       const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
diff --git a/src/cunumeric/index/repeat.cu b/src/cunumeric/index/repeat.cu
index 30f0c2aff..1b658874a 100644
--- a/src/cunumeric/index/repeat.cu
+++ b/src/cunumeric/index/repeat.cu
@@ -139,7 +139,7 @@ struct RepeatImplBody {
     DeviceScalarReductionBuffer> sum(stream);
     const size_t blocks_count = (extent + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    const size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t);
+    const size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(uint64_t);

     if (blocks_count > MAX_REDUCTION_CTAS) {
       const size_t iters = (blocks_count + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
diff --git a/src/cunumeric/search/nonzero.cuh b/src/cunumeric/search/nonzero.cuh
index 1b777b34c..e9af92578 100644
--- a/src/cunumeric/search/nonzero.cuh
+++ b/src/cunumeric/search/nonzero.cuh
@@ -63,7 +63,7 @@ int64_t compute_offsets(const AccessorRO& in,
   DeviceScalarReductionBuffer> size(stream);

   const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-  size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t);
+  size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(uint64_t);

   if (blocks >= MAX_REDUCTION_CTAS) {
     const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;

From 7a781331bba431cfadd3f5151ed253e7a1d3459c Mon Sep 17 00:00:00 2001
From: robinw0928 <104830875+robinw0928@users.noreply.github.com>
Date: Mon, 17 Oct 2022 13:04:23 +0800
Subject: [PATCH 14/89] Enhance test_tri*.py. (#658)

---
 tests/integration/test_tri.py           | 139 +++++++++++--
 tests/integration/test_trilu.py         |  90 ++++++----
 tests/integration/test_trilu_indices.py | 226 ++++++++++++++++++++----
 tests/integration/utils/utils.py        |   2 +-
 4 files changed, 368 insertions(+), 89 deletions(-)

diff --git a/tests/integration/test_tri.py b/tests/integration/test_tri.py
index 13b9db665..127180064 100644
--- a/tests/integration/test_tri.py
+++ b/tests/integration/test_tri.py
@@ -15,36 +15,137 @@
 import numpy as np
 import pytest
+from utils.utils import check_module_function

 import cunumeric as num

-KS = [0, -1, 1, -2, 2]
+KS = (0, -1, 1, -2, 2)
+N = 100

-def _test(func, k):
-    num_f = getattr(num, func)
-    np_f = getattr(np, func)
+@pytest.mark.parametrize("n", (0, 1, N), ids=lambda n: f"(n={n})")
+def test_tri_n(n):
+    print_msg = f"np & cunumeric.tri({n})"
+    check_module_function("tri", [n], {}, print_msg)

-    a = num_f(100, k=k)
-    an = np_f(100, k=k)
-    assert num.array_equal(a, an)

-    a = num_f(100, 50, k=k)
-    an = np_f(100, 50, k=k)
-    assert num.array_equal(a, an)
+@pytest.mark.parametrize("k", KS + (-N, N), ids=lambda k: f"(k={k})")
+@pytest.mark.parametrize("m", (1, 10, N), ids=lambda m: f"(M={m})")
+@pytest.mark.parametrize("n", (1, N), ids=lambda n: f"(n={n})")
+def test_tri_full(n, m, k):
+    print_msg = f"np & cunumeric.tri({n}, k={k}, M={m})"
+    check_module_function("tri", [n], {"k": k, "M": m}, print_msg)

-    a = num_f(100, k=k, dtype=int)
-    an = np_f(100, k=k, dtype=int)
-    assert num.array_equal(a, an)

-    a = num_f(100, k=k, dtype=bool)
-    an = np_f(100, k=k, dtype=bool)
-    assert num.array_equal(a, an)
+@pytest.mark.parametrize("m", (0, None), ids=lambda m: f"(M={m})")
+def test_tri_m(m):
+    print_msg = f"np & cunumeric.tri({N}, M={m})"
+    check_module_function("tri", [N], {"M": m}, print_msg)


-@pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})")
-def test_tri(k):
-    _test("tri", k)
+DTYPES = (
+    int,
+    float,
+    bool,
+    pytest.param(None, marks=pytest.mark.xfail),
+)
+
+
+@pytest.mark.parametrize("dtype", DTYPES, ids=str)
+def test_tri_dtype(dtype):
+    # cuNumeric: returns an array with dtype=int
+    # Numpy: returns an array with dtype=float
+    print_msg = f"np & cunumeric.tri({N}, dtype={dtype})"
+    check_module_function("tri", [N], {"dtype": dtype}, print_msg)
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("k", (-10.5, 0.0, 10.5), ids=lambda k: f"(k={k})")
+def test_tri_float_k(k):
+    # cuNumeric: struct.error: required argument is not an integer
+    # Numpy: pass
+    print_msg = f"np & cunumeric.tri({N}, k={k})"
+    check_module_function("tri", [N], {"k": k}, print_msg)
+
+
+class TestTriErrors:
+    def test_negative_n(self):
+        with pytest.raises(ValueError):
+            num.tri(-100)
+
+    @pytest.mark.xfail
+    def test_negative_n_DIVERGENCE(self):
+        # np.tri(-100) returns empty array
+        # num.tri(-100) raises ValueError
+        n = -100
+        np_res = np.tri(n)
+        num_res = num.tri(n)
+        assert np.array_equal(np_res, num_res)
+
+    @pytest.mark.parametrize("n", (-10.5, 0.0, 10.5))
+    def test_float_n(self, n):
+        msg = "expected a sequence of integers or a single integer"
+        with pytest.raises(TypeError, match=msg):
+            num.tri(n)
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize("n", (-10.5, 0.0, 10.5))
+    def test_float_n_DIVERGENCE(self, n):
+        # np.tri(-10.5) returns empty array
+        # np.tri(0.0) returns empty array
+        # np.tri(10.5) returns array
+        # num.tri(-10.5) raises TypeError
+        # num.tri(0.0) raises TypeError
+        # num.tri(10.5) raises TypeError
+        np_res = np.tri(n)
+        num_res = num.tri(n)
+        assert np.array_equal(np_res, num_res)
+
+    def test_negative_m(self):
+        with pytest.raises(ValueError):
+            num.tri(N, M=-10)
+
+    @pytest.mark.xfail
+    def test_negative_m_DIVERGENCE(self):
+        # np.tri(100, M=-10) returns empty array
+        # num.tri(100, M=-10) raises ValueError
+        m = -10
+        np_res = np.tri(N, M=m)
+        num_res = num.tri(N, M=m)
+        assert np.array_equal(np_res, num_res)
+
+    @pytest.mark.parametrize("m", (-10.5, 0.0, 10.5))
+    def test_float_m(self, m):
+        msg = "expected a sequence of integers or a single integer"
+        with pytest.raises(TypeError, match=msg):
+            num.tri(N, M=m)
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize("m", (-10.5, 0.0, 10.5))
+    def test_float_m_DIVERGENCE(self, m):
+        # np.tri(100, M=-10.5) returns empty array
+        # np.tri(100, M=0.0) returns empty array
+        # np.tri(100, M=10.5) returns array
+        # num.tri(100, M=-10.5) raises TypeError
+        # num.tri(100, M=0.0) raises TypeError
+        # num.tri(100, M=10.5) raises TypeError
+        np_res = np.tri(N, M=m)
+        num_res = num.tri(N, M=m)
+        assert np.array_equal(np_res, num_res)
+
+    def test_n_none(self):
+        msg = "expected a sequence of integers or a single integer"
+        with pytest.raises(TypeError, match=msg):
+            num.tri(None)
+
+    @pytest.mark.xfail
+    def test_k_none(self):
+        # In cuNumeric, it raises struct.error,
+        # msg is required argument is not an integer
+        # In Numpy, it raises TypeError,
+        # msg is bad operand type for unary -: 'NoneType'
+        with pytest.raises(TypeError):
+            num.tri(N, k=None)


 if __name__ == "__main__":
diff --git a/tests/integration/test_trilu.py b/tests/integration/test_trilu.py
index 395d2dd4c..80e9ae7d8 100644
--- a/tests/integration/test_trilu.py
+++ b/tests/integration/test_trilu.py
@@ -18,51 +18,73 @@

 import cunumeric as num

-KS = [0, -1, 1, -2, 2]
-
-a = num.array(
-    [
-        [1, 2, 3, 4],
-        [5, 6, 7, 8],
-        [9, 10, 11, 12],
-        [13, 14, 15, 16],
-        [17, 18, 19, 20],
-    ]
-)
-
-anp = np.array(
-    [
-        [1, 2, 3, 4],
-        [5, 6, 7, 8],
-        [9, 10, 11, 12],
-        [13, 14, 15, 16],
-        [17, 18, 19, 20],
-    ]
-)
+KS = (0, -1, 1, -2, 2)
+FUNCTIONS = ("tril", "triu")


-@pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})")
-@pytest.mark.parametrize("func", ("tril", "triu"))
-def test_full(func, k):
+def _test(func, anp, a, k):
     num_f = getattr(num, func)
     np_f = getattr(np, func)

     b = num_f(a, k=k)
-    bn = np_f(anp, k=k)
+    bnp = np_f(anp, k=k)

-    assert num.array_equal(b, bn)
+    assert num.array_equal(b, bnp)


-@pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})")
-@pytest.mark.parametrize("func", ("tril", "triu"))
-def test_slice(func, k):
-    num_f = getattr(num, func)
-    np_f = getattr(np, func)
+ARRAY_SHAPE = (
+    (0,),
+    (1,),
+    (10,),
+    (1, 10),
+    (10, 10),
+    (1, 1, 10),
+    (1, 10, 10),
+    (10, 10, 10),
+)

-    b = num_f(a[0, :], k=k)
-    bn = np_f(anp[0, :], k=k)

-    assert num.array_equal(b, bn)
+@pytest.mark.parametrize("k", KS + (-10, 10), ids=lambda k: f"(k={k})")
+@pytest.mark.parametrize("dtype", (int, float), ids=str)
+@pytest.mark.parametrize(
+    "shape", ARRAY_SHAPE, ids=lambda shape: f"(shape={shape})"
+)
+@pytest.mark.parametrize("func", FUNCTIONS)
+def test_trilu(func, shape, dtype, k):
+    anp = np.ones(shape, dtype=dtype)
+    a = num.ones(shape, dtype=dtype)
+
+    _test(func, anp, a, k)
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("k", (-2.5, 0.0, 2.5), ids=lambda k: f"(k={k})")
+@pytest.mark.parametrize("func", FUNCTIONS)
+def test_trilu_float_k(func, k):
+    # cuNumeric: struct.error: required argument is not an integer
+    # Numpy: pass
+    shape = (10, 10)
+    anp = np.ones(shape)
+    a = num.ones(shape)
+
+    _test(func, anp, a, k)
+
+
+class TestTriluErrors:
+    def test_arr_none(self):
+        msg = "'NoneType' object has no attribute 'ndim'"
+        with pytest.raises(AttributeError, match=msg):
+            num.tril(None)
+
+    @pytest.mark.xfail
+    def test_k_none(self):
+        # In cuNumeric, it raises struct.error,
+        # msg is required argument is not an integer
+        # In Numpy, it raises TypeError,
+        # msg is bad operand type for unary -: 'NoneType'
+        a = num.ones((3, 3))
+        with pytest.raises(TypeError):
+            num.tril(a, k=None)


 if __name__ == "__main__":
diff --git a/tests/integration/test_trilu_indices.py b/tests/integration/test_trilu_indices.py
index d25e4e718..6962d283a 100644
--- a/tests/integration/test_trilu_indices.py
+++ b/tests/integration/test_trilu_indices.py
@@ -15,54 +15,210 @@
 import numpy as np
 import pytest
+from utils.utils import check_module_function

 import cunumeric as num

-KS = [0, -1, 1, -2, 2]
+KS = (0, -1, 1, -2, 2)
+FUNCTIONS_INDICES = ("tril_indices", "triu_indices")
+FUNCTIONS_INDICES_FROM = ("tril_indices_from", "triu_indices_from")
+N = 100

-def _test(func, k):
+def _test_from(func, shape, k):
     num_f = getattr(num, func)
     np_f = getattr(np, func)
-
-    a = num_f(100, k=k)
-    an = np_f(100, k=k)
-    assert num.array_equal(a, an)
-
-    a = num_f(100, k=k, m=30)
-    an = np_f(100, k=k, m=30)
-    assert num.array_equal(a, an)
-
-
-def _test_from(func, k):
-    num_f = getattr(num, func)
-    np_f = getattr(np, func)
-    a = num.ones((70, 40), dtype=int)
-    an = np.ones((70, 40), dtype=int)
+    a = num.ones(shape, dtype=int)
+    an = np.ones(shape, dtype=int)

     b = num_f(a, k=k)
     bn = np_f(an, k=k)
     assert num.array_equal(b, bn)


-@pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})")
-def test_tril_indices_from(k):
-    _test_from("tril_indices_from", k)
-
-
-@pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})")
-def test_triu_indices_from(k):
-    _test_from("triu_indices_from", k)
-
-
-@pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})")
-def test_tril_indices(k):
-    _test("tril_indices", k)
-
-
-@pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})")
-def test_triu_indices(k):
-    _test("triu_indices", k)
+@pytest.mark.parametrize("n", (0, 1, 100), ids=lambda n: f"(n={n})")
+@pytest.mark.parametrize("func", FUNCTIONS_INDICES)
+def test_trilu_indices_default(func, n):
+    print_msg = f"np & cunumeric.{func}({n})"
+    check_module_function(func, [n], {}, print_msg)
+
+
+@pytest.mark.parametrize("k", KS + (-N, N), ids=lambda k: f"(k={k})")
+@pytest.mark.parametrize("m", (1, 10, N), ids=lambda m: f"(m={m})")
+@pytest.mark.parametrize("n", (1, N), ids=lambda n: f"(n={n})")
+@pytest.mark.parametrize("func", FUNCTIONS_INDICES)
+def test_trilu_indices_full(func, n, m, k):
+    print_msg = f"np & cunumeric.{func}({n}, k={k}, m={m})"
+    check_module_function(func, [n], {"k": k, "m": m}, print_msg)
+
+
+@pytest.mark.parametrize("m", (0, None), ids=lambda m: f"(m={m})")
+@pytest.mark.parametrize("func", FUNCTIONS_INDICES)
+def test_trilu_indices_m(func, m):
+    print_msg = f"np & cunumeric.{func}({N}, m={m})"
+    check_module_function(func, [N], {"m": m}, print_msg)
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("k", (-10.5, 0.0, 10.5), ids=lambda k: f"(k={k})")
+@pytest.mark.parametrize("func", FUNCTIONS_INDICES)
+def test_trilu_indices_float_k(func, k):
+    # cuNumeric: struct.error: required argument is not an integer
+    # Numpy: pass
+    print_msg = f"np & cunumeric.{func}({N}, k={k})"
+    check_module_function(func, [N], {"k": k}, print_msg)
+
+
+class TestTriluIndicesErrors:
+    def test_negative_n(self):
+        with pytest.raises(ValueError):
+            num.tril_indices(-100)
+
+    @pytest.mark.xfail
+    def test_negative_n_DIVERGENCE(self):
+        # np.tril_indices(-100) returns empty array, dtype=int64
+        # num.tril_indices(-100) raises ValueError
+        n = -100
+        np_res = np.tril_indices(n)
+        num_res = num.tril_indices(n)
+        assert np.array_equal(np_res, num_res)
+
+    @pytest.mark.parametrize("n", (-10.5, 0.0, 10.5))
+    def test_float_n(self, n):
+        msg = "expected a sequence of integers or a single integer"
+        with pytest.raises(TypeError, match=msg):
+            num.tril_indices(n)
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize("n", (-10.5, 0.0, 10.5))
+    def test_float_n_DIVERGENCE(self, n):
+        # np.tril_indices(-10.5) returns empty array, dtype=int64
+        # np.tril_indices(0.0) returns empty array, dtype=int64
+        # np.tril_indices(10.5) returns array, dtype=int64
+        # num.tril_indices(-10.5) raises TypeError
+        # num.tril_indices(0.0) raises TypeError
+        # num.tril_indices(10.5) raises TypeError
+        np_res = np.tril_indices(n)
+        num_res = num.tril_indices(n)
+        assert np.array_equal(np_res, num_res)
+
+    def test_negative_m(self):
+        with pytest.raises(ValueError):
+            num.tril_indices(N, m=-10)
+
+    @pytest.mark.xfail
+    def test_negative_m_DIVERGENCE(self):
+        # np.tril_indices(100, m=-10) returns empty array, dtype=int64
+        # num.tril_indices(100, m=-10) raises ValueError
+        m = -10
+        np_res = np.tril_indices(N, m=m)
+        num_res = num.tril_indices(N, m=m)
+        assert np.array_equal(np_res, num_res)
+
+    @pytest.mark.parametrize("m", (-10.5, 0.0, 10.5))
+    def test_float_m(self, m):
+        msg = "expected a sequence of integers or a single integer"
+        with pytest.raises(TypeError, match=msg):
+            num.tril_indices(N, m=m)
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize("m", (-10.5, 0.0, 10.5))
+    def test_float_m_DIVERGENCE(self, m):
+        # np.tril_indices(100, m=-10.5) returns empty array, dtype=int64
+        # np.tril_indices(100, m=0.0) returns empty array, dtype=int64
+        # np.tril_indices(100, m=10.5) returns array, dtype=int64
+        # num.tril_indices(100, m=-10.5) raises TypeError
+        # num.tril_indices(100, m=0.0) raises TypeError
+        # num.tril_indices(100, m=10.5) raises TypeError
+        np_res = np.tril_indices(N, m=m)
+        num_res = num.tril_indices(N, m=m)
+        assert np.array_equal(np_res, num_res)
+
+    def test_n_none(self):
+        msg = "expected a sequence of integers or a single integer"
+        with pytest.raises(TypeError, match=msg):
+            num.tril_indices(None)
+
+    @pytest.mark.xfail
+    def test_k_none(self):
+        # In cuNumeric, it raises struct.error,
+        # msg is required argument is not an integer
+        # In Numpy, it raises TypeError,
+        # msg is bad operand type for unary -: 'NoneType'
+        with pytest.raises(TypeError):
+            num.tril_indices(N, k=None)
+
+
+ARRAY_SHAPE = (
+    (1, 1),
+    (1, N),
+    (10, N),
+    (N, N),
+    (N, 10),
+    (N, 1),
+)
+
+
+@pytest.mark.parametrize("k", KS + (-N, N), ids=lambda k: f"(k={k})")
+@pytest.mark.parametrize(
+    "shape", ARRAY_SHAPE, ids=lambda shape: f"(shape={shape})"
+)
+@pytest.mark.parametrize("func", FUNCTIONS_INDICES_FROM)
+def test_trilu_indices_from(func, shape, k):
+    _test_from(func, shape, k)
+
+
+@pytest.mark.parametrize(
+    "shape", ((10, 0), (0, 10), (0, 0)), ids=lambda shape: f"(shape={shape})"
+)
+@pytest.mark.parametrize("func", FUNCTIONS_INDICES_FROM)
+def test_trilu_indices_from_empty_array(func, shape):
+    k = 0
+    _test_from(func, shape, k)
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("k", (-10.5, 0.0, 10.5), ids=lambda k: f"(k={k})")
+@pytest.mark.parametrize("func", FUNCTIONS_INDICES_FROM)
+def test_trilu_indices_from_float_k(func, k):
+    # cuNumeric: struct.error: required argument is not an integer
+    # Numpy: pass
+    shape = (10, 10)
+    _test_from(func, shape, k)
+
+
+class TestTriluIndicesFromErrors:
+    @pytest.mark.parametrize("size", ((5,), (0,)), ids=str)
+    @pytest.mark.parametrize(
+        "dimension", (1, 3), ids=lambda dimension: f"(dim={dimension})"
+    )
+    def test_arr_non_2d(self, dimension, size):
+        shape = size * dimension
+        a = num.ones(shape, dtype=int)
+        msg = "input array must be 2-d"
+        with pytest.raises(ValueError, match=msg):
+            num.tril_indices_from(a)
+
+    def test_arr_0d(self):
+        a = num.array(3)
+        msg = "input array must be 2-d"
+        with pytest.raises(ValueError, match=msg):
+            num.tril_indices_from(a)
+
+    def test_arr_none(self):
+        msg = "'NoneType' object has no attribute 'ndim'"
+        with pytest.raises(AttributeError, match=msg):
+            num.tril_indices_from(None)
+
+    @pytest.mark.xfail
+    def test_k_none(self):
+        # In cuNumeric, it raises struct.error,
+        # msg is required argument is not an integer
+        # In Numpy, it raises TypeError,
+        # msg is bad operand type for unary -: 'NoneType'
+        a = num.ones((3, 3))
+        with pytest.raises(TypeError):
+            num.tril_indices_from(a, k=None)


 if __name__ == "__main__":
diff --git a/tests/integration/utils/utils.py b/tests/integration/utils/utils.py
index 505249321..892154d45 100644
--- a/tests/integration/utils/utils.py
+++ b/tests/integration/utils/utils.py
@@ -39,7 +39,7 @@ def compare_array_and_print_results(a, b, print_msg, check_type=True):
     """
     Compare two arrays and print results.
     """
-    if isinstance(a, list):
+    if isinstance(a, list) or isinstance(a, tuple):
         is_equal, err_arr = compare_array(a, b, check_type=False)
         assert is_equal, (
             f"Failed, {print_msg}\n"

From bf5b7f279c281ebd1c3c0f2ce1509cd959447a60 Mon Sep 17 00:00:00 2001
From: Bryan Van de Ven
Date: Tue, 18 Oct 2022 09:26:08 -0700
Subject: [PATCH 15/89] Remove --install-dir option (#656)

* remove --install-dir option

* remove --with-core as well

* remove legate-url and legate-branch

* suggestions
---
 install.py | 63 +++++++++++------------------------------------------
 1 file changed, 12 insertions(+), 51 deletions(-)

diff --git a/install.py b/install.py
index febf25b8b..c6ee2d80d 100755
--- a/install.py
+++ b/install.py
@@ -139,10 +139,6 @@ def install_cunumeric(
     gasnet_dir,
     networks,
     hdf,
-    install_dir,
-    legate_branch,
-    legate_dir,
-    legate_url,
     llvm,
     march,
     maxdim,
@@ -187,10 +183,6 @@ def install_cunumeric(
         print("gasnet_dir: ", gasnet_dir)
         print("networks: ", networks)
         print("hdf: ", hdf)
-        print("install_dir: ", install_dir)
-        print("legate_branch: ", legate_branch)
-        print("legate_dir: ", legate_dir)
-        print("legate_url: ", legate_url)
         print("llvm: ", llvm)
         print("march: ", march)
         print("maxdim: ", maxdim)
@@ -226,20 +218,21 @@ def validate_path(path):
     cuda_dir = validate_path(cuda_dir)
     nccl_dir = validate_path(nccl_dir)
     tblis_dir = validate_path(tblis_dir)
-    legate_dir = validate_path(legate_dir)
     thrust_dir = validate_path(thrust_dir)
     curand_dir = validate_path(curand_dir)
     gasnet_dir = validate_path(gasnet_dir)
     cutensor_dir = validate_path(cutensor_dir)
     openblas_dir = validate_path(openblas_dir)

-    if legate_dir is None:
-        try:
-            import legate.install_info as lg_install_info
+    try:
+        import legate.install_info as lg_install_info
+    except ImportError:
+        raise RuntimeError(
+            "Cannot determine Legate install directory. Please make sure "
+            "legate.core is installed in the current Python environment."
+        )

-            legate_dir = dirname(lg_install_info.libpath)
-        except Exception:
-            pass
+    legate_dir = dirname(lg_install_info.libpath)

     if verbose:
         print("cuda_dir: ", cuda_dir)
@@ -274,6 +267,8 @@ def validate_path(path):
     pip_install_cmd = [sys.executable, "-m", "pip", "install"]
     cmd_env = dict(os.environ.items())

+    install_dir = None
+
     if unknown is not None:
         try:
             prefix_loc = unknown.index("--prefix")
@@ -350,12 +345,8 @@ def validate_path(path):
     # A custom path to cuRAND is ignored when CUDA support is available
     if cuda and curand_dir is not None:
         cmake_flags += ["-Dcunumeric_cuRAND_INCLUDE_DIR=%s" % curand_dir]
-    if legate_dir:
-        cmake_flags += ["-Dlegate_core_ROOT=%s" % legate_dir]
-    if legate_url:
-        cmake_flags += ["-Dcunumeric_LEGATE_CORE_REPOSITORY=%s" % legate_url]
-    if legate_branch:
-        cmake_flags += ["-Dcunumeric_LEGATE_CORE_BRANCH=%s" % legate_branch]
+
+    cmake_flags += ["-Dlegate_core_ROOT=%s" % legate_dir]

     cmake_flags += extra_flags
     cmd_env.update(
@@ -370,14 +361,6 @@ def validate_path(path):

 def driver():
     parser = argparse.ArgumentParser(description="Install cuNumeric.")
-    parser.add_argument(
-        "--install-dir",
-        dest="install_dir",
-        metavar="DIR",
-        required=False,
-        default=None,
-        help="Path to install cuNumeric software",
-    )
     parser.add_argument(
         "--debug",
         dest="debug",
@@ -434,28 +417,6 @@ def driver():
         default=os.environ.get("GASNET"),
         help="Path to GASNet installation directory.",
    )
-    parser.add_argument(
-        "--with-core",
-        dest="legate_dir",
-        metavar="DIR",
-        required=False,
-        default=os.environ.get("LEGATE_DIR"),
-        help="Path to Legate Core installation directory.",
-    )
-    parser.add_argument(
-        "--legate-url",
-        dest="legate_url",
-        required=False,
-        default="https://github.com/nv-legate/legate.core.git",
-        help="Legate git URL to build cuNumeric with.",
-    )
-    parser.add_argument(
-        "--legate-branch",
-        dest="legate_branch",
-        required=False,
-        default="branch-22.10",
-        help="Legate branch to build cuNumeric with.",
-    )
     parser.add_argument(
         "--with-openblas",
         dest="openblas_dir",

From 81b6ac3c86d6c8c8375bb3d7cbffb9cec47663cf Mon Sep 17 00:00:00 2001
From: Mark Vaz
Date: Wed, 19 Oct 2022 09:09:59 +1100
Subject: [PATCH 16/89] Fix missing legate-core run requirement (#661)

---
 conda/conda-build/meta.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
index 4adf59927..7a274a1d3 100644
--- a/conda/conda-build/meta.yaml
+++ b/conda/conda-build/meta.yaml
@@ -132,7 +132,10 @@ requirements:
   run:
     - numpy {{ numpy_version }}
     - libopenblas =* =*openmp*
-{% if gpu_enabled_bool %}
+{% if not gpu_enabled_bool %}
+    - legate-core ={{ core_version }} =*_cpu
+{% else %}
+    - legate-core ={{ core_version }}
     - cuda-cudart >={{ cuda_version }}
 #    - libcutensor >=1.3
     - cutensor >=1.3

From 29a56c31bab448c011c5a1c8b2b4c6f5500c74fe Mon Sep 17 00:00:00 2001
From: xialu00 <110973296+xialu00@users.noreply.github.com>
Date: Wed, 19 Oct 2022 10:12:54 +0800
Subject: [PATCH 17/89] add test cases for test_tile.py and test_repeat.py (#657)

* add test cases for test_tile.py and test_repeat.py

* fix bug

* fix bug
---
 cunumeric/module.py              |  34 +++++-
 tests/integration/test_repeat.py | 188 +++++++++++++++++++++++++++----
 tests/integration/test_tile.py   |  81 +++++++------
 3 files changed, 245 insertions(+), 58 deletions(-)

diff --git a/cunumeric/module.py b/cunumeric/module.py
index 7a3024e55..69647b3cb 100644
--- a/cunumeric/module.py
+++ b/cunumeric/module.py
@@ -2327,17 +2327,44 @@ def repeat(a: ndarray, repeats: Any, axis: Optional[int] = None) -> ndarray:
     Multiple GPUs, Multiple CPUs

     """
+    if repeats is None:
+        raise TypeError(
+            "int() argument must be a string, a bytes-like object or a number,"
+            " not 'NoneType'"
+        )
+
+    if np.ndim(repeats) > 1:
+        raise ValueError("`repeats` should be scalar or 1D array")
+
+    # axes should be integer type
+    if axis is not None and not isinstance(axis, int):
+        raise TypeError("Axis should be integer type")
+
     # when array is a scalar
     if np.ndim(a) == 0:
+        if axis is not None and axis != 0:
+            raise np.AxisError("axis is out of bounds for array of dimension")
         if np.ndim(repeats) == 0:
+            if not isinstance(repeats, int):
+                runtime.warn(
+                    "converting repeats to an integer type",
+                    category=UserWarning,
+                )
+            repeats = np.int64(repeats)
             return full((repeats,), cast(Union[int, float], a))
+        elif np.ndim(repeats) == 1 and len(repeats) == 1:
+            if not isinstance(repeats, int):
+                runtime.warn(
+                    "converting repeats to an integer type",
+                    category=UserWarning,
+                )
+            repeats = np.int64(repeats)
+            return full((repeats[0],), cast(Union[int, float], a))
         else:
             raise ValueError(
                 "`repeat` with a scalar parameter `a` is only "
                 "implemented for scalar values of the parameter `repeats`."
             )
-    if np.ndim(repeats) > 1:
-        raise ValueError("`repeats` should be scalar or 1D array")

     # array is an array
     array = convert_to_cunumeric_ndarray(a)
@@ -2349,9 +2376,6 @@ def repeat(a: ndarray, repeats: Any, axis: Optional[int] = None) -> ndarray:
         array = array.ravel()
         axis = 0

-    # axes should be integer type
-    if not isinstance(axis, int):
-        raise TypeError("Axis should be integer type")
     axis_int = np.int32(axis)

     if axis_int >= array.ndim:
diff --git a/tests/integration/test_repeat.py b/tests/integration/test_repeat.py
index a704d884a..1128a34a3 100644
--- a/tests/integration/test_repeat.py
+++ b/tests/integration/test_repeat.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
 import numpy as np
 import pytest
 from legate.core import LEGATE_MAX_DIM

 np.random.seed(12345)

-def test_basic():
-    assert np.array_equal(num.repeat(3, 4), np.repeat(3, 4))
-    assert np.array_equal(num.repeat([3, 1], 4), np.repeat([3, 1], 4))
+@pytest.mark.parametrize(
+    "array", (None, [], 4, [2, 3], mk_seq_array(num, (3, 4, 2)))
+)
+def test_repeats_none(array):
+    with pytest.raises(TypeError):
+        num.repeat(array, None)
+
+
+@pytest.mark.parametrize("repeats", (-3, [], [-3], [2, 3]))
+def test_array_none_invalid(repeats):
+    with pytest.raises(ValueError):
+        num.repeat(None, repeats)
+
+
+@pytest.mark.parametrize("repeats", (3, [0], [3], 4.7, [4.7]))
+def test_array_none_valid(repeats):
+    res_num = num.repeat(None, repeats)
+    res_np = np.repeat(None, repeats)
+    assert np.array_equal(res_np, res_num)
+
+
+@pytest.mark.parametrize("repeats", (-3, 0, 3, 4.7, [], [-3], [0], [3], [4.7]))
+def test_array_empty_repeats_valid(repeats):
+    res_np = np.repeat([], repeats)
+    res_num = num.repeat([], repeats)
+    assert np.array_equal(res_np, res_num)
+
+
+@pytest.mark.parametrize("repeats", ([3, 4], [1, 2, 3]))
+def test_array_empty_repeats_invalid_negative(repeats):
+    # numpy raises:
+    # ValueError: operands could not be broadcast together with shape (0,) (2,)
+    # while cuNumeric passes and returns []
+    res_num = num.repeat([], repeats)
+    assert np.array_equal(res_num, [])
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("repeats", ([3, 4], [1, 2, 3]))
+def test_array_empty_repeats_invalid(repeats):
+    res_np = np.repeat([], repeats)
+    res_num = num.repeat([], repeats)
+    assert np.array_equal(res_num, res_np)
+
+
+@pytest.mark.parametrize("repeats", (-3, 0, 3, 4.7, [], [-3], [0], [3], [4.7]))
+def test_array_empty_axis_valid(repeats):
+    res_np = np.repeat([], repeats, axis=0)
+    res_num = num.repeat([], repeats, axis=0)
+    assert np.array_equal(res_np, res_num)
+
+
+@pytest.mark.parametrize("repeats", (-3, 0, 3, 4.7, [], [-3], [0], [3], [4.7]))
+def test_array_empty_axis_invalid(repeats):
+    with pytest.raises(ValueError):
+        num.repeat([], repeats, axis=1)
+
+
+@pytest.mark.parametrize("repeats", (-3, [-3]))
+def test_array_int_repeats_negative(repeats):
+    with pytest.raises(ValueError):
+        num.repeat(3, repeats)
+
+
+@pytest.mark.parametrize("repeats", (0, 3, 4.7, [0], [3], [4.7]))
+def test_array_int_repeats_valid(repeats):
+    res_np = np.repeat(3, repeats)
+    res_num = num.repeat(3, repeats)
+    assert np.array_equal(res_np, res_num)
+
+
+@pytest.mark.parametrize("repeats", ([], [1, 2]))
+def test_array_int_repeats_invalid(repeats):
+    msg = r"scalar"
+    with pytest.raises(ValueError, match=msg):
+        num.repeat(3, repeats)
+
+
+@pytest.mark.parametrize("repeats", (0, 3, 4.7, [0], [3], [4.7], [2, 3, 4]))
+def test_array_1d_repeats_valid(repeats):
+    anp = np.array([1, 2, 3])
+    res_np = np.repeat(anp, repeats)
+    res_num = num.repeat(anp, repeats)
+    assert np.array_equal(res_np, res_num)
+
+
+@pytest.mark.parametrize("repeats", ([], [2, 3]))
+def test_array_1d_repeats_invalid(repeats):
+    anp = np.array([1, 2, 3])
+    with pytest.raises(ValueError):
+        num.repeat(anp, repeats)
+
+
+@pytest.mark.parametrize("repeats", (0, [0], 3, 4.7, [3], [4.7]))
+def test_array_2d_repeats_valid(repeats):
+    anp = np.array([[1, 3], [2, 4]])
+    res_np = np.repeat(anp, repeats)
+    res_num = num.repeat(anp, repeats)
+    assert np.array_equal(res_np, res_num)
+
+
+@pytest.mark.parametrize("repeats", ([], [2, 3]))
+def test_array_2d_repeats_invalid(repeats):
+    anp = np.array([[1, 3], [2, 4]])
+    with pytest.raises(ValueError):
+        num.repeat(anp, repeats)
+
+
+@pytest.mark.skip()
+@pytest.mark.parametrize("arr", ([1, 2, 3], [[1, 3], [2, 4]]))
+@pytest.mark.parametrize("repeats", (-3, [-3]))
+def test_array_1d_repeats_fatal_error(arr, repeats):
+    anp = np.array(arr)
+    # numpy raises "ValueError: negative dimensions are not allowed"
+    # while cuNumeric aborts with "Fatal Python error: Aborted"
+    num.repeat(anp, repeats)
+
+
+@pytest.mark.parametrize("arr", (None, [], 3, [1, 2, 3], [[1, 3], [2, 4]]))
+@pytest.mark.parametrize(
+    "repeats",
+    ([[2, 3], [3, 3]], np.random.randint(low=-10.0, high=10, size=(3, 3, 3))),
+)
+def test_repeats_nd(arr, repeats):
+    anp = np.array(arr)
+    msg = r"should be scalar or 1D array"
+    with pytest.raises(ValueError, match=msg):
+        num.repeat(anp, repeats)
+
+
+@pytest.mark.parametrize(("arr", "repeats"), ((3, 3), ([1, 2, 3], [1, 2, 3])))
+@pytest.mark.parametrize("axis", ("hello", 0.9))
+def test_axis_string(arr, repeats, axis):
+    msg = r"integer"
+    with pytest.raises(TypeError, match=msg):
+        num.repeat(arr, repeats, axis=axis)
+
+
+def test_array_axis_out_bound():
+    anp = np.array([1, 2, 3, 4, 5])
+    # np.repeat(anp, 4, 2)
+    # numpy.AxisError: axis 2 is out of bounds for array of dimension 1
+    msg = r"dimension"
+    with pytest.raises(ValueError, match=msg):
+        num.repeat(anp, 4, 2)
+
+
+@pytest.mark.xfail()
+def test_array_axis_negative_equal():
     anp = np.array([1, 2, 3, 4, 5])
-    a = num.array(anp)
-    repnp = np.array([1, 2, 1, 2, 1])
-    rep = num.array(repnp)
-    print(num.repeat(a, rep, axis=0))
-    print(np.repeat(anp, repnp, axis=0))
-    assert np.array_equal(
-        num.repeat(a, rep, axis=0), np.repeat(anp, repnp, axis=0)
-    )
-    xnp = np.array([[1, 2], [3, 4]])
-    x = num.array([[1, 2], [3, 4]])
-    assert np.array_equal(
-        num.repeat(x, [1, 2], axis=0), np.repeat(xnp, [1, 2], axis=0)
-    )
-    assert np.array_equal(num.repeat(x, 0, axis=0), np.repeat(xnp, 0, axis=0))
+    res_np = np.repeat(anp, 4, -1)  # [1 1 1 1 2 2 2 2 3 3 3 3 4 4 4 4 5 5 5 5]
+    res_num = num.repeat(anp, 4, -1)  # [1 1 1 1 2]
+    # They have different outputs.
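+    # The assert below records the expected (NumPy) result; the xfail marker
+    # above tracks this divergence until negative axes behave the same way.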
+ assert np.array_equal(res_np, res_num) @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) -def test_nd(ndim): +def test_nd_basic(ndim): a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) np_array = mk_seq_array(np, a_shape) num_array = mk_seq_array(num, a_shape) @@ -55,10 +187,26 @@ def test_nd(ndim): res_num = num.repeat(num_array, repeats) res_np = np.repeat(np_array, repeats) assert np.array_equal(res_num, res_np) + + +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +def test_nd_axis(ndim): for axis in range(0, ndim): + a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) + np_array = mk_seq_array(np, a_shape) + num_array = mk_seq_array(num, a_shape) + repeats = np.random.randint(0, 15) res_num2 = num.repeat(num_array, repeats, axis) res_np2 = np.repeat(np_array, repeats, axis) assert np.array_equal(res_num2, res_np2) + + +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +def test_nd_repeats(ndim): + a_shape = tuple(np.random.randint(1, 9) for _ in range(ndim)) + np_array = mk_seq_array(np, a_shape) + num_array = mk_seq_array(num, a_shape) + for axis in range(0, ndim): rep_shape = (a_shape[axis],) rep_arr_np = mk_seq_array(np, rep_shape) rep_arr_num = mk_seq_array(num, rep_shape) diff --git a/tests/integration/test_tile.py b/tests/integration/test_tile.py index d9ec3e1c7..1bfc1dcf8 100644 --- a/tests/integration/test_tile.py +++ b/tests/integration/test_tile.py @@ -12,46 +12,61 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import numpy as np import pytest import cunumeric as num -def test_1d(): +def test_negative(): a = num.array([0, 1, 2]) + with pytest.raises(ValueError): + num.tile(a, -4) - b = num.tile(a, 4) - assert num.array_equal(b, [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]) - - c = num.tile(a, (3, 4)) - assert num.array_equal( - c, - [ - [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2], - ], - ) - - d = num.tile(a, (3, 1, 4)) - assert num.array_equal( - d, - [ - [[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]], - [[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]], - [[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]], - ], - ) - - -def test_2d(): - e = num.array([[1, 2], [3, 4]]) - - f = num.tile(e, 2) - assert num.array_equal(f, [[1, 2, 1, 2], [3, 4, 3, 4]]) - - g = num.tile(e, (2, 1)) - assert num.array_equal(g, [[1, 2], [3, 4], [1, 2], [3, 4]]) + +def test_float(): + a = num.array([0, 1, 2]) + msg = r"float" + with pytest.raises(TypeError, match=msg): + num.tile(a, 2.2) + + +def test_list(): + a = num.array([0, 1, 2]) + msg = r"1d sequence" + with pytest.raises(TypeError, match=msg): + num.tile(a, [[1, 2], [3, 4]]) + + +def test_tuple(): + a = num.array([0, 1, 2]) + msg = r"1d sequence" + with pytest.raises(TypeError, match=msg): + num.tile(a, ((1, 2), (3, 4))) + + +DIM = 5 +SIZES = [ + (0,), + (1), + (0, 1), + (1, 0), + (1, 1), + (1, DIM), + (DIM, 1), + (DIM, DIM), + (1, 1, 1), + (DIM, DIM, DIM), +] + + +@pytest.mark.parametrize("size", SIZES, ids=str) +@pytest.mark.parametrize("value", (0, DIM, (DIM, DIM), (DIM, DIM, DIM))) +def test_basic(size, value): + a = np.random.randint(low=-10.0, high=10, size=size) + res_np = np.tile(a, value) + res_num = num.tile(a, value) + assert np.array_equal(res_np, res_num) if __name__ == "__main__": From 87c7d450fd69ee883d268d1df14bb4ee206b9a86 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 19 Oct 2022 11:28:03 -0600 Subject: [PATCH 18/89] Implementing PUT routine (#582) Adding 
support for PUT --- cunumeric/array.py | 82 +++++++++++++--- cunumeric/deferred.py | 69 ++++++++++++- cunumeric/eager.py | 7 ++ cunumeric/module.py | 45 +++++++-- cunumeric/thunk.py | 4 + docs/cunumeric/source/api/indexing.rst | 1 + src/cunumeric/index/wrap.cc | 16 +-- src/cunumeric/index/wrap.cu | 83 +++++++++++++--- src/cunumeric/index/wrap.h | 29 ++++++ src/cunumeric/index/wrap_omp.cc | 10 +- src/cunumeric/index/wrap_template.inl | 22 ++++- src/cunumeric/index/zip.cu | 98 ++++++++++++++---- src/cunumeric/index/zip.h | 6 ++ tests/integration/test_put.py | 131 +++++++++++++++++++++++++ 14 files changed, 536 insertions(+), 67 deletions(-) create mode 100644 tests/integration/test_put.py diff --git a/cunumeric/array.py b/cunumeric/array.py index 64784d59a..cd14eda7c 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -920,12 +920,8 @@ def _convert_key(self, key: Any, first: bool = True) -> Any: key = convert_to_cunumeric_ndarray(key) if key.dtype != bool and not np.issubdtype(key.dtype, np.integer): raise TypeError("index arrays should be int or bool type") - if key.dtype != bool and key.dtype != np.int64: - runtime.warn( - "converting index array to int64 type", - category=RuntimeWarning, - ) - key = key.astype(np.int64) + if key.dtype != bool: + key = key._warn_and_convert(np.dtype(np.int64)) return key._thunk @@ -2104,12 +2100,8 @@ def compress( raise ValueError( "Dimension mismatch: condition must be a 1D array" ) - if condition.dtype != bool: - runtime.warn( - "converting condition to bool type", - category=RuntimeWarning, - ) - condition = condition.astype(bool) + + condition = condition._warn_and_convert(np.dtype(bool)) if axis is None: axis = 0 @@ -2476,6 +2468,62 @@ def diagonal( raise ValueError("Either axis1/axis2 or axes must be supplied") return self._diag_helper(offset=offset, axes=axes, extract=extract) + @add_boilerplate("indices", "values") + def put( + self, indices: ndarray, values: ndarray, mode: str = "raise" + ) -> None: + """ + Replaces specified elements of the array with given values. + + Refer to :func:`cunumeric.put` for full documentation. 
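+
+        If ``values`` is shorter than ``indices``, it is repeated as
+        necessary.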
+ + See Also + -------- + cunumeric.put : equivalent function + + Availability + -------- + Multiple GPUs, Multiple CPUs + + """ + + if values.size == 0 or indices.size == 0 or self.size == 0: + return + + if mode not in ("raise", "wrap", "clip"): + raise ValueError( + "mode must be one of 'clip', 'raise', or 'wrap' " + f"(got {mode})" + ) + + if mode == "wrap": + indices = indices % self.size + elif mode == "clip": + indices = indices.clip(0, self.size - 1) + + indices = indices._warn_and_convert(np.dtype(np.int64)) + values = values._warn_and_convert(self.dtype) + + if indices.ndim > 1: + indices = indices.ravel() + + if self.shape == (): + if mode == "raise": + if indices.min() < -1 or indices.max() > 0: + raise ValueError("Indices out of bounds") + if values.shape == (): + v = values + else: + v = values[0] + self._thunk.copy(v._thunk, deep=False) + return + + # call _wrap on the values if they need to be wrapped + if values.ndim != indices.ndim or values.size != indices.size: + values = values._wrap(indices.size) + + self._thunk.put(indices._thunk, values._thunk) + @add_boilerplate() def trace( self, @@ -3822,6 +3870,16 @@ def _maybe_convert(self, dtype: np.dtype[Any], hints: Any) -> ndarray: copy._thunk.convert(self._thunk) return copy + def _warn_and_convert(self, dtype: np.dtype[Any]) -> ndarray: + if self.dtype != dtype: + runtime.warn( + f"converting array to {dtype} type", + category=RuntimeWarning, + ) + return self.astype(dtype) + else: + return self + # For performing normal/broadcast unary operations @classmethod def _perform_unary_op( diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 54f481977..3bb5c4db7 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -796,10 +796,16 @@ def _broadcast(self, shape: NdShape) -> Any: return result - def _convert_future_to_regionfield(self) -> DeferredArray: + def _convert_future_to_regionfield( + self, change_shape: bool = False + ) -> DeferredArray: + if change_shape and self.shape == (): + shape: NdShape = (1,) + else: + shape = self.shape store = self.context.create_store( self.dtype, - shape=self.shape, + shape=shape, optimize_scalar=False, ) thunk_copy = DeferredArray( @@ -1679,6 +1685,60 @@ def _diag_helper( task.execute() + @auto_convert("indices", "values") + def put(self, indices: Any, values: Any) -> None: + + if indices.base.kind == Future or indices.base.transformed: + change_shape = indices.base.kind == Future + indices = indices._convert_future_to_regionfield(change_shape) + if values.base.kind == Future or values.base.transformed: + change_shape = values.base.kind == Future + values = values._convert_future_to_regionfield(change_shape) + + if self.base.kind == Future or self.base.transformed: + change_shape = self.base.kind == Future + self_tmp = self._convert_future_to_regionfield(change_shape) + else: + self_tmp = self + + assert indices.size == values.size + + # first, we create indirect array with PointN type that + # (indices.size,) shape and is used to copy data from values + # to the target ND array (self) + N = self_tmp.ndim + pointN_dtype = self.runtime.get_point_type(N) + indirect = cast( + DeferredArray, + self.runtime.create_empty_thunk( + shape=indices.shape, + dtype=pointN_dtype, + inputs=[indices], + ), + ) + + shape = self_tmp.shape + task = self.context.create_task(CuNumericOpCode.WRAP) + task.add_output(indirect.base) + task.add_scalar_arg(shape, (ty.int64,)) + task.add_scalar_arg(True, bool) # has_input + task.add_input(indices.base) + task.add_alignment(indices.base, 
indirect.base) + task.throws_exception(IndexError) + task.execute() + if indirect.base.kind == Future: + indirect = indirect._convert_future_to_regionfield() + + copy = self.context.create_copy() + copy.set_target_indirect_out_of_range(False) + copy.add_input(values.base) + copy.add_target_indirect(indirect.base) + copy.add_output(self_tmp.base) + copy.execute() + + if self_tmp is not self: + self.copy(self_tmp, deep=True) + # Create an identity array with the ones offset from the diagonal by k def eye(self, k: int) -> None: assert self.ndim == 2 # Only 2-D arrays should be here @@ -2896,6 +2956,7 @@ def unary_op( args: Any, multiout: Optional[Any] = None, ) -> None: + lhs = self.base rhs = src._broadcast(lhs.shape) @@ -3355,7 +3416,8 @@ def unpackbits( @auto_convert("src") def _wrap(self, src: Any, new_len: int) -> None: if src.base.kind == Future or src.base.transformed: - src = src._convert_future_to_regionfield() + change_shape = src.base.kind == Future + src = src._convert_future_to_regionfield(change_shape) # first, we create indirect array with PointN type that # (len,) shape and is used to copy data from original array @@ -3374,6 +3436,7 @@ def _wrap(self, src: Any, new_len: int) -> None: task = self.context.create_task(CuNumericOpCode.WRAP) task.add_output(indirect.base) task.add_scalar_arg(src.shape, (ty.int64,)) + task.add_scalar_arg(False, bool) # has_input task.execute() copy = self.context.create_copy() diff --git a/cunumeric/eager.py b/cunumeric/eager.py index fdb8f7989..b8cb36ecd 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -620,6 +620,13 @@ def _diag_helper( axes = tuple(range(ndims - naxes, ndims)) self.array = diagonal_reference(rhs.array, axes) + def put(self, indices: Any, values: Any) -> None: + self.check_eager_args(indices, values) + if self.deferred is not None: + self.deferred.put(indices, values) + else: + np.put(self.array, indices.array, values.array) + def eye(self, k: int) -> None: if self.deferred is not None: self.deferred.eye(k) diff --git a/cunumeric/module.py b/cunumeric/module.py index 69647b3cb..0a4e97a5a 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -2410,12 +2410,7 @@ def repeat(a: ndarray, repeats: Any, axis: Optional[int] = None) -> ndarray: # repeats is an array else: # repeats should be integer type - if repeats.dtype != np.int64: - runtime.warn( - "converting repeats to an integer type", - category=RuntimeWarning, - ) - repeats = repeats.astype(np.int64) + repeats = repeats._warn_and_convert(np.int64) if repeats.shape[0] != array.shape[axis]: raise ValueError("incorrect shape of repeats array") result = array._thunk.repeat( @@ -3473,6 +3468,44 @@ def diagonal( ) +@add_boilerplate("a", "indices", "values") +def put( + a: ndarray, indices: ndarray, values: ndarray, mode: str = "raise" +) -> None: + """ + Replaces specified elements of an array with given values. + The indexing works as if the target array is first flattened. + + Parameters + ---------- + a : array_like + Array to put data into + indices : array_like + Target indices, interpreted as integers. + WARNING: In case there are repeated entries in the + indices array, Legate doesn't guarantee the order in + which values are updated. + + values : array_like + Values to place in `a` at target indices. If values array is shorter + than indices, it will be repeated as necessary. + mode : {'raise', 'wrap', 'clip'}, optional + Specifies how out-of-bounds indices will behave. + 'raise' : raise an error. + 'wrap' : wrap around. + 'clip' : clip to the range. 
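+
+    Examples
+    --------
+    A minimal illustrative sketch (not part of this patch); the
+    expected output assumes numpy-compatible behavior, matching the
+    canonical ``numpy.put`` example:
+
+    >>> a = num.arange(5)
+    >>> num.put(a, [0, 2], [-44, -55])
+    >>> a
+    array([-44,   1, -55,   3,   4])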
+ + See Also + -------- + numpy.put + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ + a.put(indices=indices, values=values, mode=mode) + + @add_boilerplate("a", "val") def fill_diagonal(a: ndarray, val: ndarray, wrap: bool = False) -> None: """ diff --git a/cunumeric/thunk.py b/cunumeric/thunk.py index bdc773aeb..e1f1dab77 100644 --- a/cunumeric/thunk.py +++ b/cunumeric/thunk.py @@ -197,6 +197,10 @@ def _diag_helper( ) -> None: ... + @abstractmethod + def put(self, indices: Any, values: Any) -> None: + ... + @abstractmethod def eye(self, k: int) -> None: ... diff --git a/docs/cunumeric/source/api/indexing.rst b/docs/cunumeric/source/api/indexing.rst index 1023ed1d4..1ace111d4 100644 --- a/docs/cunumeric/source/api/indexing.rst +++ b/docs/cunumeric/source/api/indexing.rst @@ -43,5 +43,6 @@ Inserting data into arrays :toctree: generated/ fill_diagonal + put put_along_axis place diff --git a/src/cunumeric/index/wrap.cc b/src/cunumeric/index/wrap.cc index 33dfcfe4b..a5483cbdd 100644 --- a/src/cunumeric/index/wrap.cc +++ b/src/cunumeric/index/wrap.cc @@ -24,28 +24,30 @@ using namespace legate; template struct WrapImplBody { + template void operator()(const AccessorWO, 1>& out, const Pitches<0>& pitches_out, const Rect<1>& out_rect, const Pitches& pitches_in, const Rect& in_rect, - const bool dense) const + const bool dense, + const IND& indices) const { const int64_t start = out_rect.lo[0]; const int64_t end = out_rect.hi[0]; const auto in_volume = in_rect.volume(); if (dense) { - int64_t out_idx = 0; - auto outptr = out.ptr(out_rect); + auto outptr = out.ptr(out_rect); for (int64_t i = start; i <= end; i++) { - const int64_t input_idx = i % in_volume; + check_idx(i, in_volume, indices); + const int64_t input_idx = compute_idx(i, in_volume, indices); auto point = pitches_in.unflatten(input_idx, in_rect.lo); - outptr[out_idx] = point; - out_idx++; + outptr[i - start] = point; } } else { for (int64_t i = start; i <= end; i++) { - const int64_t input_idx = i % in_volume; + check_idx(i, in_volume, indices); + const int64_t input_idx = compute_idx(i, in_volume, indices); auto point = pitches_in.unflatten(input_idx, in_rect.lo); out[i] = point; } diff --git a/src/cunumeric/index/wrap.cu b/src/cunumeric/index/wrap.cu index 0f118eadf..af81073d6 100644 --- a/src/cunumeric/index/wrap.cu +++ b/src/cunumeric/index/wrap.cu @@ -23,7 +23,28 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + check_kernel(Output out, + const AccessorRO indices, + const int64_t start, + const int64_t volume, + const int64_t in_volume, + const int64_t iters) +{ + bool value = false; + for (size_t i = 0; i < iters; i++) { + const auto idx = (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= volume) break; + auto index_tmp = indices[idx + start]; + int64_t index = index_tmp < 0 ? 
index_tmp + in_volume : index_tmp; + bool val = (index < 0 || index >= in_volume); + SumReduction::fold(value, val); + } + reduce_output(out, value); +} + +template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) wrap_kernel(const AccessorWO, 1> out, const int64_t start, @@ -32,53 +53,93 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const Point<1> out_lo, const Pitches pitches_in, const Point in_lo, - const size_t in_volume) + const size_t in_volume, + const IND indices) { const auto idx = global_tid_1d(); if (idx >= volume) return; - const int64_t input_idx = (idx + start) % in_volume; + const int64_t input_idx = compute_idx((idx + start), in_volume, indices); auto out_p = pitches_out.unflatten(idx, out_lo); auto p = pitches_in.unflatten(input_idx, in_lo); out[out_p] = p; } -template +template __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) wrap_kernel_dense(Point* out, const int64_t start, const int64_t volume, const Pitches pitches_in, const Point in_lo, - const size_t in_volume) + const size_t in_volume, + const IND indices) { const auto idx = global_tid_1d(); if (idx >= volume) return; - const int64_t input_idx = (idx + start) % in_volume; + const int64_t input_idx = compute_idx((idx + start), in_volume, indices); auto p = pitches_in.unflatten(input_idx, in_lo); out[idx] = p; } +// don't do anything when indices is a boolean +void check_out_of_bounds(const bool& indices, + const int64_t start, + const int64_t volume, + const int64_t volume_in, + cudaStream_t stream) +{ +} + +void check_out_of_bounds(const AccessorRO& indices, + const int64_t start, + const int64_t volume, + const int64_t volume_in, + cudaStream_t stream) +{ + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool); + DeviceScalarReductionBuffer> out_of_bounds(stream); + + if (blocks >= MAX_REDUCTION_CTAS) { + const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; + check_kernel<<>>( + out_of_bounds, indices, start, volume, volume_in, iters); + } else { + check_kernel<<>>( + out_of_bounds, indices, start, volume, volume_in, 1); + } + CHECK_CUDA_STREAM(stream); + + bool res = out_of_bounds.read(stream); + if (res) throw legate::TaskException("index is out of bounds in index array"); +} + template struct WrapImplBody { + template void operator()(const AccessorWO, 1>& out, const Pitches<0>& pitches_out, const Rect<1>& out_rect, const Pitches& pitches_in, const Rect& in_rect, - const bool dense) const + const bool dense, + const IND& indices) const { auto stream = get_cached_stream(); const int64_t start = out_rect.lo[0]; const int64_t volume = out_rect.volume(); const auto in_volume = in_rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + check_out_of_bounds(indices, start, volume, in_volume, stream); + if (dense) { auto outptr = out.ptr(out_rect); - wrap_kernel_dense<<>>( - outptr, start, volume, pitches_in, in_rect.lo, in_volume); + wrap_kernel_dense<<>>( + outptr, start, volume, pitches_in, in_rect.lo, in_volume, indices); } else { - wrap_kernel<<>>( - out, start, volume, pitches_out, out_rect.lo, pitches_in, in_rect.lo, in_volume); + wrap_kernel<<>>( + out, start, volume, pitches_out, out_rect.lo, pitches_in, in_rect.lo, in_volume, indices); } CHECK_CUDA_STREAM(stream); } diff --git a/src/cunumeric/index/wrap.h b/src/cunumeric/index/wrap.h index 91c3f2326..181a9b97c 100644 --- 
a/src/cunumeric/index/wrap.h +++ b/src/cunumeric/index/wrap.h @@ -25,6 +25,8 @@ struct WrapArgs { // copy information from original array to the // `wrapped` one const Legion::DomainPoint shape; // shape of the original array + const bool has_input; + const Array& in = Array(); }; class WrapTask : public CuNumericTask { @@ -41,4 +43,31 @@ class WrapTask : public CuNumericTask { #endif }; +__CUDA_HD__ static int64_t compute_idx(const int64_t i, const int64_t volume, const bool&) +{ + return i % volume; +} + +__CUDA_HD__ static int64_t compute_idx(const int64_t i, + const int64_t volume, + const legate::AccessorRO& indices) +{ + int64_t idx = indices[i]; + int64_t index = idx < 0 ? idx + volume : idx; + return index; +} + +static void check_idx(const int64_t i, + const int64_t volume, + const legate::AccessorRO& indices) +{ + int64_t idx = indices[i]; + int64_t index = idx < 0 ? idx + volume : idx; + if (index < 0 || index >= volume) + throw legate::TaskException("index is out of bounds in index array"); +} +static void check_idx(const int64_t i, const int64_t volume, const bool&) +{ + // don't do anything when wrapping indices +} } // namespace cunumeric diff --git a/src/cunumeric/index/wrap_omp.cc b/src/cunumeric/index/wrap_omp.cc index f95e9123c..531592df9 100644 --- a/src/cunumeric/index/wrap_omp.cc +++ b/src/cunumeric/index/wrap_omp.cc @@ -24,12 +24,14 @@ using namespace legate; template struct WrapImplBody { + template void operator()(const AccessorWO, 1>& out, const Pitches<0>& pitches_out, const Rect<1>& out_rect, const Pitches& pitches_in, const Rect& in_rect, - const bool dense) const + const bool dense, + const IND& indices) const { const int64_t start = out_rect.lo[0]; const int64_t end = out_rect.hi[0]; @@ -38,14 +40,16 @@ struct WrapImplBody { auto outptr = out.ptr(out_rect); #pragma omp parallel for schedule(static) for (int64_t i = start; i <= end; i++) { - const int64_t input_idx = i % in_volume; + check_idx(i, in_volume, indices); + const int64_t input_idx = compute_idx(i, in_volume, indices); auto point = pitches_in.unflatten(input_idx, in_rect.lo); outptr[i - start] = point; } } else { #pragma omp parallel for schedule(static) for (int64_t i = start; i <= end; i++) { - const int64_t input_idx = i % in_volume; + check_idx(i, in_volume, indices); + const int64_t input_idx = compute_idx(i, in_volume, indices); auto point = pitches_in.unflatten(input_idx, in_rect.lo); out[i] = point; } diff --git a/src/cunumeric/index/wrap_template.inl b/src/cunumeric/index/wrap_template.inl index 46885f24e..093f5f5b1 100644 --- a/src/cunumeric/index/wrap_template.inl +++ b/src/cunumeric/index/wrap_template.inl @@ -60,16 +60,30 @@ struct WrapImpl { assert(volume_in != 0); #endif - WrapImplBody()(out, pitches_out, out_rect, pitches_in, input_rect, dense); + if (args.has_input) { + auto in_rect = args.in.shape<1>(); + auto in = args.in.read_accessor(in_rect); // input should be always integer type +#ifdef DEBUG_CUNUMERIC + assert(in_rect == out_rect); +#endif + WrapImplBody()(out, pitches_out, out_rect, pitches_in, input_rect, dense, in); + + } else { + bool tmp = false; + WrapImplBody()(out, pitches_out, out_rect, pitches_in, input_rect, dense, tmp); + } // else } }; template static void wrap_template(TaskContext& context) { - auto shape = context.scalars()[0].value(); - int dim = shape.dim; - WrapArgs args{context.outputs()[0], shape}; + auto shape = context.scalars()[0].value(); + int dim = shape.dim; + bool has_input = context.scalars()[1].value(); + Array tmp_array = Array(); + WrapArgs 
args{ + context.outputs()[0], shape, has_input, has_input ? context.inputs()[0] : tmp_array}; dim_dispatch(dim, WrapImpl{}, args); } diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu index 8bdfcd3f0..82d162126 100644 --- a/src/cunumeric/index/zip.cu +++ b/src/cunumeric/index/zip.cu @@ -28,15 +28,15 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const Buffer, 1> index_arrays, const Rect rect, const Pitches pitches, - size_t volume, - DomainPoint shape, + const size_t volume, + const DomainPoint shape, std::index_sequence) { const size_t idx = global_tid_1d(); if (idx >= volume) return; auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; - for (size_t i = 0; i < N; i++) { new_point[i] = compute_idx(index_arrays[i][p], shape[i]); } + for (size_t i = 0; i < N; i++) { new_point[i] = compute_idx_cuda(index_arrays[i][p], shape[i]); } out[p] = new_point; } @@ -45,14 +45,16 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) zip_kernel_dense(Point* out, const Buffer index_arrays, const Rect rect, - size_t volume, - DomainPoint shape, + const size_t volume, + const DomainPoint shape, std::index_sequence) { const size_t idx = global_tid_1d(); if (idx >= volume) return; Legion::Point new_point; - for (size_t i = 0; i < N; i++) { new_point[i] = compute_idx(index_arrays[i][idx], shape[i]); } + for (size_t i = 0; i < N; i++) { + new_point[i] = compute_idx_cuda(index_arrays[i][idx], shape[i]); + } out[idx] = new_point; } @@ -62,11 +64,11 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const Buffer, 1> index_arrays, const Rect rect, const Pitches pitches, - int narrays, - size_t volume, - int64_t key_dim, - int64_t start_index, - DomainPoint shape) + const int64_t narrays, + const size_t volume, + const int64_t key_dim, + const int64_t start_index, + const DomainPoint shape) { const size_t idx = global_tid_1d(); if (idx >= volume) return; @@ -74,7 +76,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) Legion::Point new_point; for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } for (size_t i = 0; i < narrays; i++) { - new_point[start_index + i] = compute_idx(index_arrays[i][p], shape[start_index + i]); + new_point[start_index + i] = compute_idx_cuda(index_arrays[i][p], shape[start_index + i]); } for (size_t i = (start_index + narrays); i < N; i++) { int64_t j = key_dim + i - narrays; @@ -83,10 +85,63 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) out[p] = new_point; } +template +__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + check_kernel(Output out, + const Buffer, 1> index_arrays, + const int64_t volume, + const int64_t iters, + const Rect rect, + const Pitches pitches, + const int64_t narrays, + const int64_t start_index, + const DomainPoint shape) +{ + bool value = false; + for (size_t i = 0; i < iters; i++) { + const auto idx = (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; + if (idx >= volume) break; + auto p = pitches.unflatten(idx, rect.lo); + for (size_t n = 0; n < narrays; n++) { + const int64_t extent = shape[start_index + n]; + coord_t index = index_arrays[n][p] < 0 ? 
index_arrays[n][p] + extent : index_arrays[n][p]; + bool val = (index < 0 || index >= extent); + SumReduction::fold(value, val); + } // for n + } + reduce_output(out, value); +} + template struct ZipImplBody { using VAL = int64_t; + void check_out_of_bounds(const Buffer, 1>& index_arrays, + const int64_t volume, + const Rect& rect, + const Pitches& pitches, + const int64_t narrays, + const int64_t start_index, + const DomainPoint& shape, + cudaStream_t stream) const + { + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool); + DeviceScalarReductionBuffer> out_of_bounds(stream); + if (blocks >= MAX_REDUCTION_CTAS) { + const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; + check_kernel<<>>( + out_of_bounds, index_arrays, volume, iters, rect, pitches, narrays, start_index, shape); + } else { + check_kernel<<>>( + out_of_bounds, index_arrays, volume, 1, rect, pitches, narrays, start_index, shape); + } + CHECK_CUDA_STREAM(stream); + + bool res = out_of_bounds.read(stream); + if (res) throw legate::TaskException("index is out of bounds in index array"); + } + template void operator()(const AccessorWO, DIM>& out, const std::vector>& index_arrays, @@ -101,19 +156,23 @@ struct ZipImplBody { auto stream = get_cached_stream(); const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + auto index_buf = + create_buffer, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM); + for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx]; + check_out_of_bounds( + index_buf, volume, rect, pitches, index_arrays.size(), start_index, shape, stream); + if (index_arrays.size() == N) { if (dense) { - auto index_buf = + auto index_buf_dense = create_buffer(index_arrays.size(), Memory::Kind::Z_COPY_MEM); for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) { - index_buf[idx] = index_arrays[idx].ptr(rect); + index_buf_dense[idx] = index_arrays[idx].ptr(rect); } zip_kernel_dense<<>>( - out.ptr(rect), index_buf, rect, volume, shape, std::make_index_sequence()); + out.ptr(rect), index_buf_dense, rect, volume, shape, std::make_index_sequence()); } else { - auto index_buf = - create_buffer, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM); - for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx]; zip_kernel<<>>( out, index_buf, rect, pitches, volume, shape, std::make_index_sequence()); } @@ -121,9 +180,6 @@ struct ZipImplBody { #ifdef DEBUG_CUNUMERIC assert(index_arrays.size() < N); #endif - auto index_buf = - create_buffer, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM); - for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx]; int num_arrays = index_arrays.size(); zip_kernel<<>>( out, index_buf, rect, pitches, num_arrays, volume, key_dim, start_index, shape); diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h index 61a87104c..ffa5941d5 100644 --- a/src/cunumeric/index/zip.h +++ b/src/cunumeric/index/zip.h @@ -51,4 +51,10 @@ constexpr coord_t compute_idx(coord_t index, coord_t extent) return new_index; } +constexpr coord_t compute_idx_cuda(coord_t index, coord_t extent) +{ + coord_t new_index = index < 0 ? 
index + extent : index; + return new_index; +} + } // namespace cunumeric diff --git a/tests/integration/test_put.py b/tests/integration/test_put.py new file mode 100644 index 000000000..1c69a705b --- /dev/null +++ b/tests/integration/test_put.py @@ -0,0 +1,131 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import numpy as np +import pytest +from legate.core import LEGATE_MAX_DIM +from utils.generators import mk_seq_array + +import cunumeric as num + + +@pytest.mark.parametrize("mode", ("wrap", "clip")) +def test_scalar(mode): + # testing the case when indices is a scalar + x = mk_seq_array(np, (3, 4, 5)) + x_num = mk_seq_array(num, (3, 4, 5)) + values = mk_seq_array(np, (6,)) * 10 + values_num = num.array(values) + + np.put(x, 0, values) + num.put(x_num, 0, values_num) + assert np.array_equal(x_num, x) + + np.put(x, 1, -10, mode) + num.put(x_num, 1, -10, mode) + assert np.array_equal(x_num, x) + + # checking transformed array + y = x[:1] + y_num = x_num[:1] + np.put(y, 0, values) + num.put(y_num, 0, values_num) + assert np.array_equal(x_num, x) + + x = np.zeros(1) + x_num = num.zeros(1) + np.put(x, np.arange(4), np.ones(4), mode="clip") + num.put(x_num, num.arange(4), num.ones(4), mode="clip") + assert np.array_equal(x_num, x) + + x = np.arange(5) + x_num = num.array(x) + indices = np.array([1, 4]) + indices_num = num.array(indices) + np.put(x, indices, 10) + num.put(x_num, indices_num, 10) + assert np.array_equal(x_num, x) + + x = np.zeros(()) + x_num = num.zeros(()) + np.put(x, 0, 1) + num.put(x_num, 0, 1) + assert np.array_equal(x_num, x) + + x = np.zeros(()) + x_num = num.zeros(()) + np.put(x, [0], 1) + num.put(x_num, [0], 1) + assert np.array_equal(x_num, x) + + x = np.zeros(()) + x_num = num.zeros(()) + np.put(x, [0], [1]) + num.put(x_num, [0], [1]) + assert np.array_equal(x_num, x) + + +def test_indices_type_convert(): + x = mk_seq_array(np, (3, 4, 5)) + x_num = mk_seq_array(num, (3, 4, 5)) + values = mk_seq_array(np, (6,)) * 10 + values_num = num.array(values) + indices = np.array([-2, 2], dtype=np.int32) + indices_num = num.array(indices) + np.put(x, indices, values) + num.put(x_num, indices_num, values_num) + assert np.array_equal(x_num, x) + + +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +def test_ndim(ndim): + shape = (5,) * ndim + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + shape_in = (3,) * ndim + np_indices = mk_seq_array(np, shape_in) + num_indices = mk_seq_array(num, shape_in) + shape_val = (2,) * ndim + np_values = mk_seq_array(np, shape_val) * 10 + num_values = mk_seq_array(num, shape_val) * 10 + + np.put(np_arr, np_indices, np_values) + num.put(num_arr, num_indices, num_values) + assert np.array_equal(np_arr, num_arr) + + +INDICES = ([1, 2, 3, 100], [[2, 1], [3, 100]], [1], [100]) + + +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("mode", ("wrap", "clip")) +@pytest.mark.parametrize("indices", INDICES) +def test_ndim_mode(ndim, mode, 
indices): + shape = (5,) * ndim + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + shape_val = (2,) * ndim + np_values = mk_seq_array(np, shape_val) * 10 + num_values = mk_seq_array(num, shape_val) * 10 + + np.put(np_arr, indices, np_values, mode=mode) + num.put(num_arr, indices, num_values, mode=mode) + assert np.array_equal(np_arr, num_arr) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) From 47d1e8b492df59244218529655de68fa1b36812d Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Wed, 19 Oct 2022 11:23:57 -0700 Subject: [PATCH 19/89] Update version number (#663) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53bbc0790..ee10d8337 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,7 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -set(cunumeric_version 22.10.00) +set(cunumeric_version 22.12.00) # For now we want the optimization flags to match on both normal make and cmake # builds so we override the cmake defaults here for release, this changes From 8e902287330f990e78071d17da756ed91a89817a Mon Sep 17 00:00:00 2001 From: xialu00 <110973296+xialu00@users.noreply.github.com> Date: Sat, 22 Oct 2022 09:17:23 +0800 Subject: [PATCH 20/89] Testcase add test cases for test_squeeze.py and test_transpose.py (#662) * add test cases for test_tile.py and test_repeat.py * fix bug * fix bug * add test cases for test_squeeze.py and test_transpose.py * fix bug * fix bug --- tests/integration/test_squeeze.py | 166 +++++++++++++---- tests/integration/test_transpose.py | 278 +++++++++++++++++++++++++--- 2 files changed, 385 insertions(+), 59 deletions(-) diff --git a/tests/integration/test_squeeze.py b/tests/integration/test_squeeze.py index 68067422d..84ac8be2e 100644 --- a/tests/integration/test_squeeze.py +++ b/tests/integration/test_squeeze.py @@ -13,39 +13,143 @@ # limitations under the License. 
# +import numpy as np import pytest -import cunumeric as np - -x = np.array([[[1, 2, 3]]]) - - -def test_default(): - y = x.squeeze() - - assert np.array_equal(y, [1, 2, 3]) - - -def test_axis_1d(): - y = x.squeeze(axis=1) - - assert np.array_equal(y, [[1, 2, 3]]) - - -def test_axis_2d(): - x = np.array([[[1], [2], [3]]]) - - y = x.squeeze(axis=(0, 2)) - - assert np.array_equal(y, [1, 2, 3]) - - -def test_idempotent(): - x = np.array([1, 2, 3]) - - y = x.squeeze() - - assert x is y +import cunumeric as num + +DIM = 5 +SIZES = [ + (0,), + (1), + (DIM), + (0, 1), + (1, 0), + (1, 1), + (1, DIM), + (DIM, 1), + (DIM, DIM), + (1, 0, 0), + (1, 1, 0), + (1, 0, 1), + (1, 1, 1), + (DIM, 1, 1), + (1, DIM, 1), + (1, 1, DIM), + (DIM, DIM, DIM), +] + + +@pytest.mark.xfail +def test_none_array_compare(): + res_num = num.squeeze(None) # AttributeError: 'NoneType' + res_np = np.squeeze(None) # return None + assert np.array_equal(res_num, res_np, equal_nan=True) + + +def test_none_array(): + # numpy returned None + msg = r"NoneType" + with pytest.raises(AttributeError, match=msg): + num.squeeze(None) + + +def test_num_invalid_axis(): + size = (1, 2, 1) + a = num.random.randint(low=-10, high=10, size=size) + msg = r"one" + with pytest.raises(ValueError, match=msg): + num.squeeze(a, axis=1) + + +def test_array_invalid_axis(): + size = (1, 2, 1) + a = num.random.randint(low=-10, high=10, size=size) + msg = r"one" + with pytest.raises(ValueError, match=msg): + a.squeeze(axis=1) + + +def test_num_axis_out_bound(): + size = (1, 2, 1) + a = num.random.randint(low=-10, high=10, size=size) + msg = r"bounds" + with pytest.raises(np.AxisError, match=msg): + num.squeeze(a, axis=3) + + +def test_array_axis_out_bound(): + size = (1, 2, 1) + a = num.random.randint(-10, 10, size=size) + msg = r"bounds" + with pytest.raises(np.AxisError, match=msg): + a.squeeze(axis=3) + + +@pytest.mark.parametrize("axes", (-1, -3)) +def test_num_axis_negative(axes): + size = (1, 2, 1) + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = np.squeeze(a, axis=axes) + res_num = num.squeeze(b, axis=axes) + assert np.array_equal(res_num, res_np) + + +@pytest.mark.parametrize("axes", (-1, -3)) +def test_array_axis_negative(axes): + size = (1, 2, 1) + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = a.squeeze(axis=axes) + res_num = b.squeeze(axis=axes) + assert np.array_equal(res_num, res_np) + + +@pytest.mark.parametrize("size", SIZES, ids=str) +def test_num_basic(size): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = np.squeeze(a) + res_num = num.squeeze(b) + assert np.array_equal(res_num, res_np) + + +@pytest.mark.parametrize("size", SIZES, ids=str) +def test_array_basic(size): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = a.squeeze() + res_num = b.squeeze() + assert np.array_equal(res_num, res_np) + + +@pytest.mark.parametrize( + "size", (s for s in SIZES if type(s) == tuple if 1 in s), ids=str +) +def test_num_axis(size): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + + for k, axis in enumerate(a.shape): + if axis == 1: + res_np = np.squeeze(a, axis=k) + res_num = num.squeeze(b, axis=k) + assert np.array_equal(res_num, res_np) + + +@pytest.mark.parametrize( + "size", (s for s in SIZES if type(s) == tuple if 1 in s), ids=str +) +def test_array_axis(size): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + + for k, axis in enumerate(a.shape): + if axis == 1: + 
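+            # Only axes of extent 1 can be squeezed; squeezing any
+            # other axis raises ValueError (tested separately above).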
res_np = a.squeeze(axis=k) + res_num = b.squeeze(axis=k) + assert np.array_equal(res_num, res_np) if __name__ == "__main__": diff --git a/tests/integration/test_transpose.py b/tests/integration/test_transpose.py index 1bc9c9ffb..4162df713 100644 --- a/tests/integration/test_transpose.py +++ b/tests/integration/test_transpose.py @@ -17,34 +17,256 @@ import cunumeric as num -rect = num.array([[1, 2, 3], [4, 5, 6]]) -square = num.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - - -@pytest.mark.parametrize("x", (rect, square), ids=("rect", "square")) -class Test_free_function: - def test_forward(self, x): - y = num.transpose(x) - npx = np.array(x) - assert num.array_equal(y, np.transpose(npx)) - - def test_round_trip(self, x): - y = num.transpose(x) - z = num.transpose(y) - assert num.array_equal(x, z) - - -@pytest.mark.parametrize("x", (rect, square), ids=("rect", "square")) -class Test_method: - def test_forward(self, x): - y = x.transpose() - npx = np.array(x) - assert num.array_equal(y, npx.transpose()) - - def test_round_trip(self, x): - y = x.transpose() - z = y.transpose() - assert num.array_equal(x, z) +DIM = 5 +SIZES = [ + 1, + DIM, + (0,), + (1, 1), + (1, DIM), + (DIM, 1), + (DIM, DIM), + (1, 1, 1), + (DIM, 1, 1), + (1, DIM, 1), + (1, DIM - 1, DIM), + (2, DIM - 1, DIM), + (DIM, DIM, DIM), +] + + +class TestModule: + @pytest.mark.xfail + def test_none_array_compare(self): + res_num = num.transpose(None) # AttributeError: 'NoneType' + res_np = np.transpose(None) # return None + assert np.array_equal(res_num, res_np, equal_nan=True) + + def test_none_array(self): + # numpy returned None + msg = r"NoneType" + with pytest.raises(AttributeError, match=msg): + num.transpose(None) + + @pytest.mark.parametrize( + "axes", ((1, 1, 1), (1, 2, 3), (1, 2), (1, 2, 0, 1)) + ) + def test_invalid_axis(self, axes): + size = (2, 3, 4) + a = num.random.randint(low=-10, high=10, size=size) + with pytest.raises(ValueError): + num.transpose(a, axes=axes) + + def test_int_axis(self): + size = (2, 3, 4) + a = num.random.randint(low=-10, high=10, size=size) + # numpy raises "ValueError: axes don't match array". + # cunumeric raises "TypeError". + with pytest.raises(TypeError): + num.transpose(a, axes=2) + + @pytest.mark.xfail + def test_int_axis_compare(self): + size = (2, 3, 4) + a = num.random.randint(low=-10, high=10, size=size) + # numpy raises "ValueError: axes don't match array". + # cunumeric raises "TypeError". + with pytest.raises(ValueError): + num.transpose(a, axes=2) + + @pytest.mark.parametrize("size", SIZES, ids=str) + def test_round(self, size): + a = num.random.randint(low=-10, high=10, size=size) + b = num.transpose(a) + c = num.transpose(b) + assert num.array_equal(c, a) + + @pytest.mark.parametrize("size", SIZES, ids=str) + def test_basic(self, size): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = np.transpose(a) + res_num = num.transpose(b) + assert np.array_equal(res_num, res_np) + + @pytest.mark.parametrize("size", (0, 1, DIM)) + def test_axes_1d(self, size): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = np.transpose(a, axes=0) + res_num = num.transpose(b, axes=0) + assert num.array_equal(res_num, res_np) + + @pytest.mark.xfail + @pytest.mark.parametrize("size", (0, 1, DIM)) + @pytest.mark.parametrize("axes", (-3, 3)) + def test_axes_1d_int(self, size, axes): + # For cunumeric, if array.dim==1, it returns the array itself directly, + # no matter what the axes value is. 
+ # For numpy, it raises + # "numpy.AxisError: axis * is out of bounds for array of dimension 1". + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = np.transpose(a, axes=axes) + res_num = num.transpose(b, axes=axes) + assert num.array_equal(res_num, res_np) + + @pytest.mark.xfail + @pytest.mark.parametrize("size", (0, 1, DIM)) + @pytest.mark.parametrize("axes", ((1,), (3, 1))) + def test_axes_1d_tuple(self, size, axes): + # For cunumeric, if array.dim==1, it returns the array itself directly, + # no matter what the axes value is. + # For numpy, it raises "ValueError: axes don't match array". + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = np.transpose(a, axes=axes) + res_num = num.transpose(b, axes=axes) + assert num.array_equal(res_num, res_np) + + @pytest.mark.parametrize( + "size", + ((1, 0), (1, 1), (1, DIM), (DIM, 1), (DIM - 1, DIM - 2), (DIM, DIM)), + ) + @pytest.mark.parametrize("axes", ((0, 1), (1, 0))) + def test_axes_2d(self, size, axes): + a = num.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = np.transpose(a, axes=axes) + res_num = num.transpose(b, axes=axes) + assert num.array_equal(res_num, res_np) + + @pytest.mark.parametrize( + "size", + ( + (1, 0, 1), + (1, 1, 1), + (DIM, DIM - 1, 1), + (1, 1, DIM), + (2, 3, 4), + (DIM, DIM, DIM), + ), + ) + @pytest.mark.parametrize( + "axes", ((0, 2, 1), (1, 0, 2), (1, 2, 0), (2, 0, 1), (2, 1, 0)) + ) + def test_axes_3d(self, size, axes): + a = num.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = np.transpose(a, axes=axes) + res_num = num.transpose(b, axes=axes) + assert num.array_equal(res_num, res_np) + + +class TestArrayMethod: + @pytest.mark.parametrize( + "axes", ((1, 1, 1), (1, 2, 3), (1, 2), (1, 2, 0, 1)) + ) + def test_invalid_axis(self, axes): + size = (2, 3, 4) + a = num.random.randint(low=-10, high=10, size=size) + with pytest.raises(ValueError): + a.transpose(axes=axes) + + def test_int_axis(self): + size = (2, 3, 4) + a = num.random.randint(low=-10, high=10, size=size) + # numpy raises "ValueError: axes don't match array". + # cunumeric raises "TypeError". + with pytest.raises(TypeError): + a.transpose(axes=2) + + @pytest.mark.xfail + def test_int_axis_compare(self): + size = (2, 3, 4) + a = num.random.randint(low=-10, high=10, size=size) + # numpy raises "ValueError: axes don't match array". + # cunumeric raises "TypeError". + with pytest.raises(ValueError): + a.transpose(axes=2) + + @pytest.mark.parametrize("size", SIZES, ids=str) + def test_round(self, size): + a = num.random.randint(low=-10, high=10, size=size) + b = a.transpose() + c = b.transpose() + assert num.array_equal(c, a) + + @pytest.mark.parametrize("size", SIZES, ids=str) + def test_basic(self, size): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = a.transpose() + res_num = b.transpose() + assert np.array_equal(res_num, res_np) + + @pytest.mark.parametrize("size", (0, 1, DIM)) + def test_axes_1d(self, size): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = a.transpose(0) + res_num = b.transpose(0) + assert num.array_equal(res_num, res_np) + + @pytest.mark.xfail + @pytest.mark.parametrize("size", (0, 1, DIM)) + @pytest.mark.parametrize("axes", (-3, 3)) + def test_axes_1d_int(self, size, axes): + # For cunumeric, if array.dim==1, it returns the array itself directly, + # no matter what the axes value is. 
+ # For Numpy, it raises + # "numpy.AxisError: axis * is out of bounds for array of dimension 1". + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = a.transpose(axes) + res_num = b.transpose(axes) + assert num.array_equal(res_num, res_np) + + @pytest.mark.xfail + @pytest.mark.parametrize("size", (0, 1, DIM)) + @pytest.mark.parametrize("axes", ((1,), (3, 1))) + def test_axes_1d_tuple(self, size, axes): + # For cunumeric, if array.dim==1, it returns the array itself directly, + # no matter what the axes value is. + # For Numpy, it raises "ValueError: axes don't match array". + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = a.transpose(axes) + res_num = b.transpose(axes) + assert num.array_equal(res_num, res_np) + + @pytest.mark.parametrize( + "size", + ((1, 0), (1, 1), (1, DIM), (DIM, 1), (DIM - 1, DIM - 2), (DIM, DIM)), + ) + @pytest.mark.parametrize("axes", ((0, 1), (1, 0))) + def test_axes_2d(self, size, axes): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = a.transpose(axes) + res_num = b.transpose(axes) + assert num.array_equal(res_num, res_np) + + @pytest.mark.parametrize( + "size", + ( + (1, 0, 1), + (1, 1, 1), + (DIM, DIM - 1, 1), + (1, 1, DIM), + (2, 3, 4), + (DIM, DIM, DIM), + ), + ) + @pytest.mark.parametrize( + "axes", ((0, 2, 1), (1, 0, 2), (1, 2, 0), (2, 0, 1), (2, 1, 0)) + ) + def test_axes_3d(self, size, axes): + a = np.random.randint(low=-10, high=10, size=size) + b = num.array(a) + res_np = a.transpose(axes) + res_num = b.transpose(axes) + assert num.array_equal(res_num, res_np) if __name__ == "__main__": From 0d5f84e6bb2c331deedfdcf85faa0e687ef146d1 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 25 Oct 2022 10:44:49 -0700 Subject: [PATCH 21/89] Updates for new script-based conda env generation (#651) * Remove old conda env files * Update build documentation * Fix a file link --- BUILD.md | 98 +++++++++------------------------ README.md | 67 +--------------------- conda/environment-test-3.10.yml | 61 -------------------- conda/environment-test-3.8.yml | 61 -------------------- conda/environment-test-3.9.yml | 61 -------------------- 5 files changed, 27 insertions(+), 321 deletions(-) delete mode 100644 conda/environment-test-3.10.yml delete mode 100644 conda/environment-test-3.8.yml delete mode 100644 conda/environment-test-3.9.yml diff --git a/BUILD.md b/BUILD.md index cea0ce4d1..589d7c016 100644 --- a/BUILD.md +++ b/BUILD.md @@ -15,89 +15,42 @@ limitations under the License. --> -# Overview +# Dependencies -The build system is designed to enable two different modes of use: -1. Simple `pip install` for users -2. Highly customizable incremental builds for developers - -We review each of these modes with examples. +Users must have a working installation of the +[Legate Core](https://github.com/nv-legate/legate.core) +library prior to installing cuNumeric. +As for other dependencies, the Dependencies section on the +[Legate Core build instructions](https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md) +also covers cuNumeric. # Building for Users -## Using install.py - -For releases <= 22.07, the main method for building cuNumeric was the `install.py` script. -Although the underlying implementation has significantly changed, `install.py` still supports the -same usage and same set of flags. 
For a full list of flags, users can run: - -``` -$ ./install.py --help -``` - -## Using Conda - -cuNumeric can be installed using Conda by pointing to the required channels (`-c`): - -``` -conda install -c nvidia -c conda-forge -c legate legate-core -``` - -## Using pip - -cuNumeric is not yet registered in a standard pip repository. However, users can still use the -pip installer to build and install cuNumeric. After downloading or cloning the cunumeric source, -users can run the following in the cunumeric folder: - -``` -$ pip install . -``` -or -``` -$ python3 -m pip install . -``` - -This will install cuNumeric in the standard packages directory for the environment Python. -Note: This is currently not sufficient for running cuNumeric programs. cuNumeric relies -on the `legate` launcher from Legate core, which must be installed separately. -For details on installing Legate, consult the [Legate repository](https://github.com/nv-legate/legate.core). - -### Advanced Customization +cuNumeric provides the same source-based installation scripts as Legate Core (a +custom `install.py` script, that is backed by `pip install`). See the +[Legate Core build instructions](https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md) +for help on using these. -If users need to customize details of the underlying CMake build, they can pass -CMake flags through the `SKBUILD_CONFIGURE_OPTIONS` environment variable: - -``` -$ SKBUILD_CONFIGURE_OPTIONS="-D Legion_USE_CUDA:BOOL=ON" \ - pip install . -``` -An alternative syntax using `setup.py` with `scikit-build` is -``` -$ python setup.py install -- -DLegion_USE_CUDA:BOOL=ON -``` +Note: Installing cuNumeric by itself will *not* automatically install Legate Core. # Building for Developers ## Overview -pip uses [scikit-build](https://scikit-build.readthedocs.io/en/latest/) -in `setup.py` to drive the build and installation. A `pip install` will trigger three general actions: +cuNumeric uses the same cmake/scikit-build-based build workflow as Legate Core. +See the +[Legate Core build instructions](https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md) +for an overview. -1. CMake build and installation of C++ libraries -2. CMake generation of configuration files and build-dependent Python files -3. pip installation of Python files +## Example -The CMake build can be configured independently of `pip`, allowing incremental C++ builds directly through CMake. -This simplifies rebuilding `libcunumeric.so` either via command-line or via IDE. -After building the C++ libraries, the `pip install` can be done in "editable" mode using the `-e` flag. -This configures the Python site packages to import the Python source tree directly. -The Python source can then be edited and used directly for testing without requiring a `pip install`. +There are several examples in the `scripts` folder. We walk through the steps in +`build-with-legate-separately-no-install.sh` here. -## Example +We assume a pre-existing Legate Core build. For details on building Legate Core, +consult the [Legate Core repository](https://github.com/nv-legate/legate.core). -There are several examples in the `scripts` folder. We walk through the steps in the `build-with-legate-separately-no-install.sh` here. -We assume a pre-existing Legate CUDA build. For details on building Legate, consult the [Legate repository](https://github.com/nv-legate/legate.core). First, the CMake build needs to be configured: ``` @@ -106,6 +59,7 @@ $ cmake -S . 
-B build -GNinja -D legate_core_ROOT:STRING=path/to/legate/build We point cuNumeric to the Legate *build* tree, not an installation. This generates all build-dependent headers and Python files. + Once configured, we can build the C++ libraries: ``` @@ -118,14 +72,12 @@ Once the C++ libraries are available, we can do an editable (development) pip in ``` $ SKBUILD_BUILD_OPTIONS="-D FIND_CUNUMERIC_CPP=ON -D cunumeric_ROOT=$(pwd)/build" \ python3 -m pip install \ - --root / --no-deps --no-build-isolation + --root / --no-deps --no-build-isolation --editable . ``` -The Python source tree and CMake build tree are now available with the environment Python -for running cuNumeric programs. The diagram below illustrates the +The Python source tree and CMake build tree are now available with the environment Python +for running cuNumeric programs. The diagram below illustrates the complete workflow for building both Legate core and cuNumeric. drawing - - diff --git a/README.md b/README.md index 1d85c650d..93fee01ef 100644 --- a/README.md +++ b/README.md @@ -36,8 +36,6 @@ canonical NumPy implementation. If you have questions, please contact us at legate(at)nvidia.com. 1. [Installation](#installation) -1. [Dependencies](#dependencies) -1. [Building from Source](#building-from-source) 1. [Usage and Execution](#usage-and-execution) 1. [Supported and Planned Features](#supported-and-planned-features) 1. [Supported Types and Dimensions](#supported-types-and-dimensions) @@ -53,6 +51,7 @@ cuNumeric is available [on conda](https://anaconda.org/legate/cunumeric): ``` conda install -c nvidia -c conda-forge -c legate cunumeric ``` + The conda package is compatible with CUDA >= 11.4 (CUDA driver version >= r470), and Volta or later GPU architectures. @@ -60,69 +59,7 @@ Docker image build scripts, as well as specialized install scripts for supported clusters are available on the [quickstart](https://github.com/nv-legate/quickstart) repo. -Read on for general instructions on building cuNumeric from source. - -## Dependencies - -Users must have a working installation of the -[Legate Core](https://github.com/nv-legate/legate.core) -library prior to installing cuNumeric. - -cuNumeric requires the following: - - - Python >= 3.8 - - [CUDA](https://developer.nvidia.com/cuda-downloads) >= 10.2 - - GNU Make - - C++17 compatible compiler (g++, clang, or nvc++) - - Fortran compiler (for building OpenBLAS; not necessary if you provide a pre-built version of OpenBLAS) - - the Python packages listed in any one of the conda environment files: - - `conda/environment-test-3.8.yml` - - `conda/environment-test-3.9.yml` - - `conda/environment-test-3.10.yml` - -See the [corresponding section](https://github.com/nv-legate/legate.core#dependencies) -on the Legate Core instructions for help on installing the required Python packages -using conda. - -cuNumeric is tested and guaranteed to be compatible with Volta and later GPU -architectures. You can use cuNumeric with Pascal GPUs as well, but there could -be issues due to lack of independent thread scheduling. Please report any such -issues on GitHub. - -## Building from Source - -Installation can be done the `install.py` script. -For releases >= 22.10, `pip install` is now available. -The most common installation command is: - -``` -./install.py --with-core -``` - -This will build cuNumeric against the Legate Core installation and then -install cuNumeric into the same location. 
- -If Legate Core has been installed with CUDA support, a working cuTENSOR -installation must also be provided to the installation command with the -`--with-cutensor` option: -``` -./install.py --with-core --with-cutensor -``` - -You can also specify an installation of [OpenBLAS](https://www.openblas.net/) -to use for the build. If you already have an installation of OpenBLAS on your -machine, you can inform the installation script using the `--with-openblas` -option: - -``` -./install.py --with-openblas -``` - -Advanced users can also invoke `install.py --help` to see options for -configuring cuNumeric by invoking the `install.py` script directly. -More information on building - including development workflows - can be found -in the [build instructions](BUILD.md) - +See [BUILD.md]() for instructions on building cuNumeric from source. ## Usage and Execution diff --git a/conda/environment-test-3.10.yml b/conda/environment-test-3.10.yml deleted file mode 100644 index 1066db97e..000000000 --- a/conda/environment-test-3.10.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: cunumeric-test -channels: - - conda-forge -dependencies: - - python=3.10 - - # build - - git - - nccl - - make - - zlib - - cmake>=3.24 - - ninja - - openmpi - - c-compiler - - cxx-compiler - - gcc_linux-64 # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - setuptools>=60 - - cutensor>=1.3.3 - - scikit-build>=0.13.1 - - # runtime - - cffi - - numpy>=1.22 - - opt_einsum - - pyarrow>=5 - - scipy - - typing_extensions - - llvm-openmp - - openblas=*=*openmp* - - # tests - - clang>=8 - - clang-tools>=8 - - colorama - - coverage - - mock - - mypy>=0.961 - - pre-commit - - pynvml - - pytest - - pytest-cov - - pytest-mock - - pytest-lazy-fixture - - types-docutils - - # pip dependencies - - pip - - pip: - # docs - - jinja2 - - pydata-sphinx-theme - - recommonmark - - markdown<3.4.0 - - sphinx>=4.4.0 - - sphinx-copybutton - - sphinx-markdown-tables - - # examples - - tifffile diff --git a/conda/environment-test-3.8.yml b/conda/environment-test-3.8.yml deleted file mode 100644 index 9049ec0b9..000000000 --- a/conda/environment-test-3.8.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: cunumeric-test -channels: - - conda-forge -dependencies: - - python=3.8 - - # build - - git - - nccl - - make - - zlib - - cmake>=3.24 - - ninja - - openmpi - - c-compiler - - cxx-compiler - - gcc_linux-64 # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - setuptools>=60 - - cutensor>=1.3.3 - - scikit-build>=0.13.1 - - # runtime - - cffi - - numpy>=1.22 - - opt_einsum - - pyarrow>=5 - - scipy - - typing_extensions - - llvm-openmp - - openblas=*=*openmp* - - # tests - - clang>=8 - - clang-tools>=8 - - colorama - - coverage - - mock - - mypy>=0.961 - - pre-commit - - pynvml - - pytest - - pytest-cov - - pytest-mock - - pytest-lazy-fixture - - types-docutils - - # pip dependencies - - pip - - pip: - # docs - - jinja2 - - pydata-sphinx-theme - - recommonmark - - markdown<3.4.0 - - sphinx>=4.4.0 - - sphinx-copybutton - - sphinx-markdown-tables - - # examples - - tifffile diff --git a/conda/environment-test-3.9.yml b/conda/environment-test-3.9.yml deleted file mode 100644 index 482277bae..000000000 --- a/conda/environment-test-3.9.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: cunumeric-test -channels: - - conda-forge -dependencies: - - python=3.9 - - # build - - git - - nccl - - make - - zlib - - cmake>=3.24 - - ninja - - openmpi - - c-compiler - - cxx-compiler - - gcc_linux-64 # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - setuptools>=60 - - cutensor>=1.3.3 - - 
scikit-build>=0.13.1 - - # runtime - - cffi - - numpy>=1.22 - - opt_einsum - - pyarrow>=5 - - scipy - - typing_extensions - - llvm-openmp - - openblas=*=*openmp* - - # tests - - clang>=8 - - clang-tools>=8 - - colorama - - coverage - - mock - - mypy>=0.961 - - pre-commit - - pynvml - - pytest - - pytest-cov - - pytest-mock - - pytest-lazy-fixture - - types-docutils - - # pip dependencies - - pip - - pip: - # docs - - jinja2 - - pydata-sphinx-theme - - recommonmark - - markdown<3.4.0 - - sphinx>=4.4.0 - - sphinx-copybutton - - sphinx-markdown-tables - - # examples - - tifffile From f072d660d016286aae70458d755255981b5e46be Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Wed, 26 Oct 2022 06:22:44 -0700 Subject: [PATCH 22/89] Update upload artifact action version (#669) v2 -> v3 to avoid GitHub warnings. Co-authored-by: Marcin Zalewski --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6abe49c3b..499dce58a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,7 +60,7 @@ jobs: if: always() - name: Upload Build Log if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: build-log path: ./**/${{ env.COMMIT }}-build.log.gpg @@ -128,7 +128,7 @@ jobs: cat *artifacts/*/* - name: Upload Log if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: test-${{ matrix.log }}-log path: ./**/${{ env.COMMIT }}-test-${{ matrix.log }}.log.gpg From 5e5105fb396a1be2c43b7ebaf54cdf5c17135bd1 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 26 Oct 2022 14:03:11 -0700 Subject: [PATCH 23/89] Following on comments for PUT (#665) * cheking bounds only for the raise mode * renaming some variables for better readability * fix issue with assertion inside omp loop * Typo in comment Co-authored-by: Manolis Papadakis --- cunumeric/array.py | 2 +- cunumeric/deferred.py | 4 +- cunumeric/eager.py | 4 +- cunumeric/thunk.py | 2 +- src/cunumeric/index/wrap.cc | 27 ++++++----- src/cunumeric/index/wrap.cu | 68 +++++++++++++++------------ src/cunumeric/index/wrap.h | 12 +++++ src/cunumeric/index/wrap_omp.cc | 32 ++++++++----- src/cunumeric/index/wrap_template.inl | 44 +++++++++-------- src/cunumeric/index/zip.h | 7 +++ src/cunumeric/index/zip_omp.cc | 16 +++++-- 11 files changed, 134 insertions(+), 84 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index cd14eda7c..0ac528531 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -2522,7 +2522,7 @@ def put( if values.ndim != indices.ndim or values.size != indices.size: values = values._wrap(indices.size) - self._thunk.put(indices._thunk, values._thunk) + self._thunk.put(indices._thunk, values._thunk, mode == "raise") @add_boilerplate() def trace( diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 3bb5c4db7..8e6d8cacb 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1686,7 +1686,7 @@ def _diag_helper( task.execute() @auto_convert("indices", "values") - def put(self, indices: Any, values: Any) -> None: + def put(self, indices: Any, values: Any, check_bounds: bool) -> None: if indices.base.kind == Future or indices.base.transformed: change_shape = indices.base.kind == Future @@ -1722,6 +1722,7 @@ def put(self, indices: Any, values: Any) -> None: task.add_output(indirect.base) task.add_scalar_arg(shape, (ty.int64,)) task.add_scalar_arg(True, bool) # has_input + task.add_scalar_arg(check_bounds, bool) 
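The new `check_bounds` scalar threaded through the task launch above mirrors NumPy's `mode` argument to `put`: bounds are validated only for `mode="raise"`, while `mode="wrap"` folds out-of-range indices back into range. A minimal NumPy-level illustration, not part of the patch:

```
import numpy as np

a = np.arange(5)

# mode="wrap" maps out-of-range indices back into bounds, so no bounds
# check is needed (the check_bounds=False path).
np.put(a, [7], 100, mode="wrap")  # 7 wraps to 7 % 5 == 2
print(a)  # [  0   1 100   3   4]

# mode="raise" (the default) must validate every index first
# (the check_bounds=True path).
try:
    np.put(a, [7], 200, mode="raise")
except IndexError as e:
    print(e)  # index 7 is out of bounds for axis 0 with size 5
```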
task.add_input(indices.base) task.add_alignment(indices.base, indirect.base) task.throws_exception(IndexError) @@ -3437,6 +3438,7 @@ def _wrap(self, src: Any, new_len: int) -> None: task.add_output(indirect.base) task.add_scalar_arg(src.shape, (ty.int64,)) task.add_scalar_arg(False, bool) # has_input + task.add_scalar_arg(False, bool) # check bounds task.execute() copy = self.context.create_copy() diff --git a/cunumeric/eager.py b/cunumeric/eager.py index b8cb36ecd..cef2b7b49 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -620,10 +620,10 @@ def _diag_helper( axes = tuple(range(ndims - naxes, ndims)) self.array = diagonal_reference(rhs.array, axes) - def put(self, indices: Any, values: Any) -> None: + def put(self, indices: Any, values: Any, check_bounds: bool) -> None: self.check_eager_args(indices, values) if self.deferred is not None: - self.deferred.put(indices, values) + self.deferred.put(indices, values, check_bounds) else: np.put(self.array, indices.array, values.array) diff --git a/cunumeric/thunk.py b/cunumeric/thunk.py index e1f1dab77..7ade503d0 100644 --- a/cunumeric/thunk.py +++ b/cunumeric/thunk.py @@ -198,7 +198,7 @@ def _diag_helper( ... @abstractmethod - def put(self, indices: Any, values: Any) -> None: + def put(self, indices: Any, values: Any, check_bounds: bool) -> None: ... @abstractmethod diff --git a/src/cunumeric/index/wrap.cc b/src/cunumeric/index/wrap.cc index a5483cbdd..9d8fef331 100644 --- a/src/cunumeric/index/wrap.cc +++ b/src/cunumeric/index/wrap.cc @@ -27,28 +27,29 @@ struct WrapImplBody { template void operator()(const AccessorWO, 1>& out, const Pitches<0>& pitches_out, - const Rect<1>& out_rect, - const Pitches& pitches_in, - const Rect& in_rect, + const Rect<1>& rect_out, + const Pitches& pitches_base, + const Rect& rect_base, const bool dense, + const bool check_bounds, const IND& indices) const { - const int64_t start = out_rect.lo[0]; - const int64_t end = out_rect.hi[0]; - const auto in_volume = in_rect.volume(); + const int64_t start = rect_out.lo[0]; + const int64_t end = rect_out.hi[0]; + const auto volume_base = rect_base.volume(); if (dense) { - auto outptr = out.ptr(out_rect); + auto outptr = out.ptr(rect_out); for (int64_t i = start; i <= end; i++) { - check_idx(i, in_volume, indices); - const int64_t input_idx = compute_idx(i, in_volume, indices); - auto point = pitches_in.unflatten(input_idx, in_rect.lo); + if (check_bounds) check_idx(i, volume_base, indices); + const int64_t input_idx = compute_idx(i, volume_base, indices); + auto point = pitches_base.unflatten(input_idx, rect_base.lo); outptr[i - start] = point; } } else { for (int64_t i = start; i <= end; i++) { - check_idx(i, in_volume, indices); - const int64_t input_idx = compute_idx(i, in_volume, indices); - auto point = pitches_in.unflatten(input_idx, in_rect.lo); + if (check_bounds) check_idx(i, volume_base, indices); + const int64_t input_idx = compute_idx(i, volume_base, indices); + auto point = pitches_base.unflatten(input_idx, rect_base.lo); out[i] = point; } } // else diff --git a/src/cunumeric/index/wrap.cu b/src/cunumeric/index/wrap.cu index af81073d6..cc82418a0 100644 --- a/src/cunumeric/index/wrap.cu +++ b/src/cunumeric/index/wrap.cu @@ -29,7 +29,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const AccessorRO indices, const int64_t start, const int64_t volume, - const int64_t in_volume, + const int64_t volume_base, const int64_t iters) { bool value = false; @@ -37,8 +37,8 @@ __global__ static void 
__launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const auto idx = (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (idx >= volume) break; auto index_tmp = indices[idx + start]; - int64_t index = index_tmp < 0 ? index_tmp + in_volume : index_tmp; - bool val = (index < 0 || index >= in_volume); + int64_t index = index_tmp < 0 ? index_tmp + volume_base : index_tmp; + bool val = (index < 0 || index >= volume_base); SumReduction::fold(value, val); } reduce_output(out, value); @@ -51,16 +51,16 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const int64_t volume, const Pitches<0> pitches_out, const Point<1> out_lo, - const Pitches pitches_in, - const Point in_lo, - const size_t in_volume, + const Pitches pitches_base, + const Point base_lo, + const size_t volume_base, const IND indices) { const auto idx = global_tid_1d(); if (idx >= volume) return; - const int64_t input_idx = compute_idx((idx + start), in_volume, indices); + const int64_t input_idx = compute_idx((idx + start), volume_base, indices); auto out_p = pitches_out.unflatten(idx, out_lo); - auto p = pitches_in.unflatten(input_idx, in_lo); + auto p = pitches_base.unflatten(input_idx, base_lo); out[out_p] = p; } @@ -69,15 +69,15 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) wrap_kernel_dense(Point* out, const int64_t start, const int64_t volume, - const Pitches pitches_in, - const Point in_lo, - const size_t in_volume, + const Pitches pitches_base, + const Point base_lo, + const size_t volume_base, const IND indices) { const auto idx = global_tid_1d(); if (idx >= volume) return; - const int64_t input_idx = compute_idx((idx + start), in_volume, indices); - auto p = pitches_in.unflatten(input_idx, in_lo); + const int64_t input_idx = compute_idx((idx + start), volume_base, indices); + auto p = pitches_base.unflatten(input_idx, base_lo); out[idx] = p; } @@ -85,7 +85,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) void check_out_of_bounds(const bool& indices, const int64_t start, const int64_t volume, - const int64_t volume_in, + const int64_t volume_base, cudaStream_t stream) { } @@ -93,7 +93,7 @@ void check_out_of_bounds(const bool& indices, void check_out_of_bounds(const AccessorRO& indices, const int64_t start, const int64_t volume, - const int64_t volume_in, + const int64_t volume_base, cudaStream_t stream) { const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; @@ -103,10 +103,10 @@ void check_out_of_bounds(const AccessorRO& indices, if (blocks >= MAX_REDUCTION_CTAS) { const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; check_kernel<<>>( - out_of_bounds, indices, start, volume, volume_in, iters); + out_of_bounds, indices, start, volume, volume_base, iters); } else { check_kernel<<>>( - out_of_bounds, indices, start, volume, volume_in, 1); + out_of_bounds, indices, start, volume, volume_base, 1); } CHECK_CUDA_STREAM(stream); @@ -119,27 +119,35 @@ struct WrapImplBody { template void operator()(const AccessorWO, 1>& out, const Pitches<0>& pitches_out, - const Rect<1>& out_rect, - const Pitches& pitches_in, - const Rect& in_rect, + const Rect<1>& rect_out, + const Pitches& pitches_base, + const Rect& rect_base, const bool dense, + const bool check_bounds, const IND& indices) const { - auto stream = get_cached_stream(); - const int64_t start = out_rect.lo[0]; - const int64_t volume = out_rect.volume(); - const auto in_volume = in_rect.volume(); - const size_t blocks = (volume + 
THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + auto stream = get_cached_stream(); + const int64_t start = rect_out.lo[0]; + const int64_t volume = rect_out.volume(); + const auto volume_base = rect_base.volume(); + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - check_out_of_bounds(indices, start, volume, in_volume, stream); + if (check_bounds) check_out_of_bounds(indices, start, volume, volume_base, stream); if (dense) { - auto outptr = out.ptr(out_rect); + auto outptr = out.ptr(rect_out); wrap_kernel_dense<<>>( - outptr, start, volume, pitches_in, in_rect.lo, in_volume, indices); + outptr, start, volume, pitches_base, rect_base.lo, volume_base, indices); } else { - wrap_kernel<<>>( - out, start, volume, pitches_out, out_rect.lo, pitches_in, in_rect.lo, in_volume, indices); + wrap_kernel<<>>(out, + start, + volume, + pitches_out, + rect_out.lo, + pitches_base, + rect_base.lo, + volume_base, + indices); } CHECK_CUDA_STREAM(stream); } diff --git a/src/cunumeric/index/wrap.h b/src/cunumeric/index/wrap.h index 181a9b97c..8c4167983 100644 --- a/src/cunumeric/index/wrap.h +++ b/src/cunumeric/index/wrap.h @@ -26,6 +26,7 @@ struct WrapArgs { // `wrapped` one const Legion::DomainPoint shape; // shape of the original array const bool has_input; + const bool check_bounds; const Array& in = Array(); }; @@ -70,4 +71,15 @@ static void check_idx(const int64_t i, const int64_t volume, const bool&) { // don't do anything when wrapping indices } + +static bool check_idx_omp(const int64_t i, + const int64_t volume, + const legate::AccessorRO& indices) +{ + int64_t idx = indices[i]; + int64_t index = idx < 0 ? idx + volume : idx; + return (index < 0 || index >= volume); +} +static bool check_idx_omp(const int64_t i, const int64_t volume, const bool&) { return false; } + } // namespace cunumeric diff --git a/src/cunumeric/index/wrap_omp.cc b/src/cunumeric/index/wrap_omp.cc index 531592df9..9387e2e3b 100644 --- a/src/cunumeric/index/wrap_omp.cc +++ b/src/cunumeric/index/wrap_omp.cc @@ -27,33 +27,39 @@ struct WrapImplBody { template void operator()(const AccessorWO, 1>& out, const Pitches<0>& pitches_out, - const Rect<1>& out_rect, - const Pitches& pitches_in, - const Rect& in_rect, + const Rect<1>& rect_out, + const Pitches& pitches_base, + const Rect& rect_base, const bool dense, + const bool check_bounds, const IND& indices) const { - const int64_t start = out_rect.lo[0]; - const int64_t end = out_rect.hi[0]; - const auto in_volume = in_rect.volume(); + const int64_t start = rect_out.lo[0]; + const int64_t end = rect_out.hi[0]; + const auto volume_base = rect_base.volume(); + std::atomic is_out_of_bounds = false; if (dense) { - auto outptr = out.ptr(out_rect); + auto outptr = out.ptr(rect_out); #pragma omp parallel for schedule(static) for (int64_t i = start; i <= end; i++) { - check_idx(i, in_volume, indices); - const int64_t input_idx = compute_idx(i, in_volume, indices); - auto point = pitches_in.unflatten(input_idx, in_rect.lo); + if (check_bounds) + if (check_idx_omp(i, volume_base, indices)) is_out_of_bounds = true; + const int64_t input_idx = compute_idx(i, volume_base, indices); + auto point = pitches_base.unflatten(input_idx, rect_base.lo); outptr[i - start] = point; } } else { #pragma omp parallel for schedule(static) for (int64_t i = start; i <= end; i++) { - check_idx(i, in_volume, indices); - const int64_t input_idx = compute_idx(i, in_volume, indices); - auto point = pitches_in.unflatten(input_idx, in_rect.lo); + if (check_bounds) + if (check_idx_omp(i, volume_base, 
indices)) is_out_of_bounds = true; + const int64_t input_idx = compute_idx(i, volume_base, indices); + auto point = pitches_base.unflatten(input_idx, rect_base.lo); out[i] = point; } } // else + + if (is_out_of_bounds) throw legate::TaskException("index is out of bounds in index array"); } }; diff --git a/src/cunumeric/index/wrap_template.inl b/src/cunumeric/index/wrap_template.inl index 093f5f5b1..9a9fc3b28 100644 --- a/src/cunumeric/index/wrap_template.inl +++ b/src/cunumeric/index/wrap_template.inl @@ -34,15 +34,15 @@ struct WrapImpl { void operator()(WrapArgs& args) const { using VAL = Point; - auto out_rect = args.out.shape<1>(); // output array is always 1D - auto out = args.out.write_accessor, 1>(out_rect); + auto rect_out = args.out.shape<1>(); // output array is always 1D + auto out = args.out.write_accessor, 1>(rect_out); Pitches<0> pitches_out; - size_t volume_out = pitches_out.flatten(out_rect); + size_t volume_out = pitches_out.flatten(rect_out); if (volume_out == 0) return; #ifndef LEGION_BOUNDS_CHECKS - bool dense = out.accessor.is_dense_row_major(out_rect); + bool dense = out.accessor.is_dense_row_major(rect_out); #else bool dense = false; #endif @@ -52,25 +52,27 @@ struct WrapImpl { point_lo[dim] = 0; point_hi[dim] = args.shape[dim] - 1; } - Rect input_rect(point_lo, point_hi); + Rect rect_base(point_lo, point_hi); - Pitches pitches_in; - size_t volume_in = pitches_in.flatten(input_rect); + Pitches pitches_base; + size_t volume_base = pitches_base.flatten(rect_base); #ifdef DEBUG_CUNUMERIC - assert(volume_in != 0); + assert(volume_base != 0); #endif if (args.has_input) { - auto in_rect = args.in.shape<1>(); - auto in = args.in.read_accessor(in_rect); // input should be always integer type + auto rect_in = args.in.shape<1>(); + auto in = args.in.read_accessor(rect_in); // input should be always integer type #ifdef DEBUG_CUNUMERIC - assert(in_rect == out_rect); + assert(rect_in == rect_out); #endif - WrapImplBody()(out, pitches_out, out_rect, pitches_in, input_rect, dense, in); + WrapImplBody()( + out, pitches_out, rect_out, pitches_base, rect_base, dense, args.check_bounds, in); } else { bool tmp = false; - WrapImplBody()(out, pitches_out, out_rect, pitches_in, input_rect, dense, tmp); + WrapImplBody()( + out, pitches_out, rect_out, pitches_base, rect_base, dense, args.check_bounds, tmp); } // else } }; @@ -78,12 +80,16 @@ struct WrapImpl { template static void wrap_template(TaskContext& context) { - auto shape = context.scalars()[0].value(); - int dim = shape.dim; - bool has_input = context.scalars()[1].value(); - Array tmp_array = Array(); - WrapArgs args{ - context.outputs()[0], shape, has_input, has_input ? context.inputs()[0] : tmp_array}; + auto shape = context.scalars()[0].value(); + int dim = shape.dim; + bool has_input = context.scalars()[1].value(); + bool check_bounds = context.scalars()[2].value(); + Array tmp_array = Array(); + WrapArgs args{context.outputs()[0], + shape, + has_input, + check_bounds, + has_input ? context.inputs()[0] : tmp_array}; dim_dispatch(dim, WrapImpl{}, args); } diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h index ffa5941d5..e3c7af8a7 100644 --- a/src/cunumeric/index/zip.h +++ b/src/cunumeric/index/zip.h @@ -51,6 +51,13 @@ constexpr coord_t compute_idx(coord_t index, coord_t extent) return new_index; } +constexpr std::pair compute_idx_omp(coord_t index, coord_t extent) +{ + coord_t new_index = index < 0 ? 
index + extent : index; + bool out_of_bounds = (new_index < 0 || new_index >= extent); + return {new_index, out_of_bounds}; +} + constexpr coord_t compute_idx_cuda(coord_t index, coord_t extent) { coord_t new_index = index < 0 ? index + extent : index; diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc index 14a3c4b25..aa014547e 100644 --- a/src/cunumeric/index/zip_omp.cc +++ b/src/cunumeric/index/zip_omp.cc @@ -37,7 +37,8 @@ struct ZipImplBody { const DomainPoint& shape, std::index_sequence) const { - const size_t volume = rect.volume(); + const size_t volume = rect.volume(); + std::atomic is_out_of_bounds = false; if (index_arrays.size() == N) { if (dense) { std::vector indx_ptrs = {index_arrays[Is].ptr(rect)...}; @@ -46,7 +47,9 @@ struct ZipImplBody { for (size_t idx = 0; idx < volume; ++idx) { Legion::Point new_point; for (size_t i = 0; i < N; i++) { - new_point[i] = compute_idx(indx_ptrs[i][idx], shape[i]); + auto pair = compute_idx_omp(indx_ptrs[i][idx], shape[i]); + new_point[i] = pair.first; + if (pair.second) is_out_of_bounds = true; } outptr[idx] = new_point; } @@ -56,7 +59,9 @@ struct ZipImplBody { auto p = pitches.unflatten(idx, rect.lo); Legion::Point new_point; for (size_t i = 0; i < N; i++) { - new_point[i] = compute_idx(index_arrays[i][p], shape[i]); + auto pair = compute_idx_omp(index_arrays[i][p], shape[i]); + new_point[i] = pair.first; + if (pair.second) is_out_of_bounds = true; } out[p] = new_point; } @@ -71,7 +76,9 @@ struct ZipImplBody { Legion::Point new_point; for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; } for (size_t i = 0; i < index_arrays.size(); i++) { - new_point[start_index + i] = compute_idx(index_arrays[i][p], shape[start_index + i]); + auto pair = compute_idx_omp(index_arrays[i][p], shape[start_index + i]); + new_point[start_index + i] = pair.first; + if (pair.second) is_out_of_bounds = true; } for (size_t i = (start_index + index_arrays.size()); i < N; i++) { int64_t j = key_dim + i - index_arrays.size(); @@ -80,6 +87,7 @@ struct ZipImplBody { out[p] = new_point; } } + if (is_out_of_bounds) throw legate::TaskException("index is out of bounds in index array"); } }; From bbd0887ffe0f7b8d559b45af872f94013183853d Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 27 Oct 2022 10:10:58 -0700 Subject: [PATCH 24/89] Fix BUILD.md link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 93fee01ef..62eecb153 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ Docker image build scripts, as well as specialized install scripts for supported clusters are available on the [quickstart](https://github.com/nv-legate/quickstart) repo. -See [BUILD.md]() for instructions on building cuNumeric from source. +See [BUILD.md](BUILD.md) for instructions on building cuNumeric from source. ## Usage and Execution From 63e6206ed627396c2376656db9b2864787cd666a Mon Sep 17 00:00:00 2001 From: robinw0928 <104830875+robinw0928@users.noreply.github.com> Date: Tue, 1 Nov 2022 09:09:06 +0800 Subject: [PATCH 25/89] Enhance test_put_along_axis, test_take_along_axis. (#671) * Enhance test_put_along_axis, test_take_along_axis. 
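For context on the behavior the enhanced tests exercise: `take_along_axis` and `put_along_axis` accept index arrays that broadcast against the target array everywhere except along the chosen axis. A small NumPy illustration, not part of the patch (`keepdims` on `argmax` needs NumPy >= 1.22, which the environment files above already require):

```
import numpy as np

a = np.array([[10, 30, 20],
              [60, 40, 50]])

# A (2, 1) index array broadcasts against the (2, 3) array everywhere
# except along axis 1.
idx = np.argmax(a, axis=1, keepdims=True)
print(np.take_along_axis(a, idx, axis=1))  # [[30], [60]]

# put_along_axis writes through the same broadcasting; the scalar 0 is
# broadcast to every selected position.
np.put_along_axis(a, idx, 0, axis=1)
print(a)  # [[10  0 20], [ 0 40 50]]
```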
* Address comments --- tests/integration/test_put_along_axis.py | 203 +++++++++++++++++++++- tests/integration/test_take_along_axis.py | 106 ++++++++++- tests/integration/utils/generators.py | 14 ++ 3 files changed, 320 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_put_along_axis.py b/tests/integration/test_put_along_axis.py index 4df2108c1..a289ffc2a 100644 --- a/tests/integration/test_put_along_axis.py +++ b/tests/integration/test_put_along_axis.py @@ -16,13 +16,26 @@ import numpy as np import pytest from legate.core import LEGATE_MAX_DIM -from utils.generators import mk_seq_array +from utils.generators import ( + broadcasts_to, + broadcasts_to_along_axis, + mk_seq_array, +) import cunumeric as num -def test_None(): +def equivalent_shapes_gen(shape): + yield shape + for i in range(len(shape) - 1): + if shape[i] == 1: + i += 1 + yield shape[i:] + else: + break + +def test_axis_None(): x = mk_seq_array(np, (256,)) x_num = mk_seq_array(num, (256,)) @@ -54,6 +67,192 @@ def test_ndim(ndim): assert np.array_equal(np_a, num_a) +@pytest.mark.parametrize( + "axis", range(-1, 3), ids=lambda axis: f"(axis={axis})" +) +def test_full(axis): + shape = (3, 4, 5) + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + + size = shape[axis] + axis_values = (0, size - 1, size * 2) + + for shape_idx in broadcasts_to_along_axis(shape, axis, axis_values): + np_indices = mk_seq_array(np, shape_idx) % shape[axis] + num_indices = mk_seq_array(num, shape_idx) % shape[axis] + np_a = np_arr.copy() + num_a = num_arr.copy() + np.put_along_axis(np_a, np_indices, 100, axis=axis) + num.put_along_axis(num_a, num_indices, 100, axis=axis) + assert np.array_equal(np_a, num_a) + + +def test_values(): + shape = (3, 4, 5) + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + shape_idx = (3, 4, 5) + axis = 0 + np_indices = mk_seq_array(np, shape_idx) % shape[axis] + num_indices = mk_seq_array(num, shape_idx) % shape[axis] + + for shape_values in broadcasts_to(shape_idx): + for s in equivalent_shapes_gen(shape_values): + np_values = mk_seq_array(np, s) + num_values = mk_seq_array(num, s) + np_a = np_arr.copy() + num_a = num_arr.copy() + np.put_along_axis(np_a, np_indices, np_values, axis=axis) + num.put_along_axis(num_a, num_indices, num_values, axis=axis) + assert np.array_equal(np_a, num_a) + + +def test_empty_indice(): + x = mk_seq_array(np, (10,)) + x_num = mk_seq_array(num, (10,)) + + indices = np.array([], dtype=int) + indices_num = num.array([], dtype=int) + + np.put_along_axis(x, indices, 99, axis=0) + num.put_along_axis(x_num, indices_num, 99, axis=0) + assert np.array_equal(x_num, x) + + +class TestPutAlongAxisErrors: + def setup(self): + self.a = num.ones((3, 3)) + self.ai = num.ones((3, 3), dtype=int) + + @pytest.mark.parametrize("dtype", (bool, float), ids=str) + def test_indices_bad_type(self, dtype): + ai = num.ones((3, 3), dtype=dtype) + msg = "`indices` must be an integer array" + with pytest.raises(TypeError, match=msg): + num.put_along_axis(self.a, ai, 100, axis=0) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "shape", ((3, 2), (3, 0)), ids=lambda shape: f"(shape={shape})" + ) + def test_indices_bad_shape(self, shape): + # In Numpy, it raises IndexError. + # In cuNumeric, it raises ValueError. 
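        # (Context, not from the patch: NumPy's IndexError here comes
        # from broadcasting. With the axis-0 dimension removed, the
        # index shape (3, 2) reduces to (2,) while a's shape (3, 3)
        # reduces to (3,); 2 vs. 3 cannot broadcast, and NumPy reports
        # that as an IndexError rather than a ValueError.)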
+ ai = num.ones(shape, dtype=int) + msg = "shape mismatch: indexing arrays could not be broadcast" + with pytest.raises(IndexError, match=msg): + num.put_along_axis(self.a, ai, 100, axis=0) + + @pytest.mark.parametrize( + "shape", ((1,), (3, 3, 1)), ids=lambda shape: f"(shape={shape})" + ) + def test_indices_bad_dims(self, shape): + ai = num.ones(shape, dtype=int) + msg = "`indices` and `a` must have the same number of dimensions" + with pytest.raises(ValueError, match=msg): + num.put_along_axis(self.a, ai, 100, axis=0) + + @pytest.mark.parametrize( + "value", (-4, 3), ids=lambda value: f"(value={value})" + ) + def test_indices_out_of_bound(self, value): + ai = num.full((3, 3), value, dtype=int) + msg = "out of bounds" + with pytest.raises(IndexError, match=msg): + num.put_along_axis(self.a, ai, 100, axis=0) + + @pytest.mark.parametrize( + "axis", (2, -3), ids=lambda axis: f"(axis={axis})" + ) + def test_axis_out_of_bound(self, axis): + msg = "out of bounds" + # In Numpy, it raises AxisError + with pytest.raises(ValueError, match=msg): + num.put_along_axis(self.a, self.ai, 100, axis=axis) + + def test_axis_float(self): + axis = 0.0 + msg = "integer argument expected" + with pytest.raises(TypeError, match=msg): + num.put_along_axis(self.a, self.ai, 100, axis=axis) + + def test_axis_none_indice_not_1d(self): + axis = None + msg = "indices must be 1D if axis=None" + with pytest.raises(ValueError, match=msg): + num.put_along_axis(self.a, self.ai, 100, axis=axis) + + def test_axis_none_andim_greater_than_one(self): + ai = num.ones((3 * 3), dtype=int) + axis = None + msg = "a.ndim>1 case is not supported when axis=None" + with pytest.raises(ValueError, match=msg): + num.put_along_axis(self.a, ai, 100, axis=axis) + + @pytest.mark.parametrize( + "shape", + ((1, 2), (4, 1), (0,), (2,), (4,), (1, 0)), + ids=lambda shape: f"(shape={shape})", + ) + def test_values_bad_shape(self, shape): + values = num.ones(shape) + with pytest.raises(ValueError): + num.put_along_axis(self.a, self.ai, values, axis=0) + + def test_values_bad_shape2(self): + shape = (3, 3, 1) + values = num.ones(shape) + with pytest.raises(ValueError): + num.put_along_axis(self.a, self.ai, values, axis=0) + + @pytest.mark.parametrize( + "shape", ((0,), (5,), (4, 5)), ids=lambda shape: f"(shape={shape})" + ) + def test_values_axis_none(self, shape): + a = mk_seq_array(num, (10,)) + ai = mk_seq_array(num, (7,)) + values = mk_seq_array(num, shape) + with pytest.raises(ValueError): + num.put_along_axis(a, ai, values, None) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "shape", ((0,), (5,), (4, 5)), ids=lambda shape: f"(shape={shape})" + ) + def test_values_axis_none_DIVERGENC(self, shape): + # In Numpy, all 3 cases pass + # In cuNumeric, all 3 cases raise ValueError "Shape did not match" + np_arr = mk_seq_array(np, (10,)) + num_arr = mk_seq_array(num, (10,)) + + indices = mk_seq_array(np, (7,)) + indices_num = mk_seq_array(num, (7,)) + + values = mk_seq_array(np, shape) + values_num = mk_seq_array(num, shape) + + np.put_along_axis(np_arr, indices, values, None) + num.put_along_axis(num_arr, indices_num, values_num, None) + assert np.array_equal(np_arr, num_arr) + + def test_a_none(self): + ai = num.array([1, 1, 1]) + msg = "object has no attribute 'ndim'" + with pytest.raises(AttributeError, match=msg): + num.put_along_axis(None, ai, 100, axis=0) + + def test_indice_none(self): + msg = "'NoneType' object has no attribute 'dtype'" + with pytest.raises(AttributeError, match=msg): + num.put_along_axis(self.a, None, 100, axis=0) + + def 
test_values_none(self): + msg = "'NoneType' object has no attribute 'dtype'" + with pytest.raises(AttributeError, match=msg): + num.put_along_axis(self.a, self.ai, None, axis=0) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_take_along_axis.py b/tests/integration/test_take_along_axis.py index 4d98680a4..b19638ae9 100644 --- a/tests/integration/test_take_along_axis.py +++ b/tests/integration/test_take_along_axis.py @@ -16,7 +16,7 @@ import numpy as np import pytest from legate.core import LEGATE_MAX_DIM -from utils.generators import mk_seq_array +from utils.generators import broadcasts_to_along_axis, mk_seq_array import cunumeric as num @@ -42,6 +42,110 @@ def test_ndim(ndim): assert np.array_equal(res_num, res_np) +@pytest.mark.parametrize( + "axis", range(-1, 3), ids=lambda axis: f"(axis={axis})" +) +def test_full(axis): + shape = (3, 4, 5) + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + + size = shape[axis] + axis_values = (0, size - 1, size * 2) + + for shape_idx in broadcasts_to_along_axis(shape, axis, axis_values): + np_indices = mk_seq_array(np, shape_idx) % shape[axis] + num_indices = mk_seq_array(num, shape_idx) % shape[axis] + res_np = np.take_along_axis(np_arr, np_indices, axis=axis) + res_num = num.take_along_axis(num_arr, num_indices, axis=axis) + assert np.array_equal(res_num, res_np) + + +def test_empty_indice(): + np_arr = mk_seq_array(np, (10,)) + num_arr = mk_seq_array(num, (10,)) + np_indices = np.array([], dtype=int) + num_indices = num.array([], dtype=int) + res_np = np.take_along_axis(np_arr, np_indices, axis=0) + res_num = num.take_along_axis(num_arr, num_indices, axis=0) + assert np.array_equal(res_num, res_np) + + +class TestTakeAlongAxisErrors: + def setup(self): + self.a = num.ones((3, 3)) + self.ai = num.ones((3, 3), dtype=int) + + @pytest.mark.parametrize("dtype", (bool, float), ids=str) + def test_indices_bad_type(self, dtype): + ai = num.ones((3, 3), dtype=dtype) + msg = "`indices` must be an integer array" + with pytest.raises(TypeError, match=msg): + num.take_along_axis(self.a, ai, axis=0) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "shape", ((3, 2), (3, 0)), ids=lambda shape: f"(shape={shape})" + ) + def test_indices_bad_shape(self, shape): + # In Numpy, it raises IndexError. + # In cuNumeric, it raises ValueError. 
+ ai = num.ones(shape, dtype=int) + msg = "shape mismatch: indexing arrays could not be broadcast" + with pytest.raises(IndexError, match=msg): + num.take_along_axis(self.a, ai, axis=0) + + @pytest.mark.parametrize( + "shape", ((1,), (3, 3, 1)), ids=lambda shape: f"(shape={shape})" + ) + def test_indices_bad_dims(self, shape): + ai = num.ones(shape, dtype=int) + msg = "`indices` and `a` must have the same number of dimensions" + with pytest.raises(ValueError, match=msg): + num.take_along_axis(self.a, ai, axis=0) + + @pytest.mark.parametrize( + "value", (-4, 3), ids=lambda value: f"(value={value})" + ) + def test_indices_out_of_bound(self, value): + ai = num.full((3, 3), value, dtype=int) + msg = "out of bounds" + with pytest.raises(IndexError, match=msg): + num.take_along_axis(self.a, ai, axis=0) + + @pytest.mark.parametrize( + "axis", (2, -3), ids=lambda axis: f"(axis={axis})" + ) + def test_axis_out_of_bound(self, axis): + msg = "out of bounds" + # In Numpy, it raises AxisError + with pytest.raises(ValueError, match=msg): + num.take_along_axis(self.a, self.ai, axis=axis) + + def test_axis_float(self): + axis = 0.0 + msg = "integer argument expected" + with pytest.raises(TypeError, match=msg): + num.take_along_axis(self.a, self.ai, axis=axis) + + def test_axis_none_indice_not_1d(self): + axis = None + msg = "indices must be 1D if axis=None" + with pytest.raises(ValueError, match=msg): + num.take_along_axis(self.a, self.ai, axis=axis) + + def test_a_none(self): + ai = num.array([1, 1, 1]) + msg = "object has no attribute 'ndim'" + with pytest.raises(AttributeError, match=msg): + num.take_along_axis(None, ai, axis=0) + + def test_indice_none(self): + msg = "'NoneType' object has no attribute 'dtype'" + with pytest.raises(AttributeError, match=msg): + num.take_along_axis(self.a, None, axis=0) + + if __name__ == "__main__": import sys diff --git a/tests/integration/utils/generators.py b/tests/integration/utils/generators.py index cc5c521b3..624f34b9d 100644 --- a/tests/integration/utils/generators.py +++ b/tests/integration/utils/generators.py @@ -80,3 +80,17 @@ def permutes_to(tgt_shape): for (i, j) in enumerate(axes): src_shape[j] = tgt_shape[i] yield (axes, tuple(src_shape)) + + +def broadcasts_to_along_axis(tgt_shape, axis, values): + """ + Generates all shapes that broadcast to `tgt_shape` along axis for + each value. 
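    For example (illustrative, not part of the patch): with
    tgt_shape=(3, 5), axis=1 and values=(1, 10), each shape `s` that
    broadcasts to the axis-removed shape (3,) is recombined as
    s[:axis] + (v,) + s[axis:], yielding shapes such as (3, 1),
    (3, 10), (1, 1) and (1, 10).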
+ """ + axis = axis % (len(tgt_shape)) + tgt_shape_axis_removed = tgt_shape[:axis] + tgt_shape[axis + 1 :] + + for s in broadcasts_to(tgt_shape_axis_removed): + for v in values: + shape = s[:axis] + (v,) + s[axis:] + yield shape From 8f0233c3bbcdc54a6ab7d1c839fd73f70ea11522 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Tue, 1 Nov 2022 14:50:38 -0700 Subject: [PATCH 26/89] Eliminate empty kernel launch in `cunumeric.unique` (#675) * Handle empty sub-stores correctly unique for GPUs * Add a test case with empty subregions --- src/cunumeric/set/unique.cu | 33 ++++++++++++++++++-------------- tests/integration/test_unique.py | 10 ++++++++++ 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/cunumeric/set/unique.cu b/src/cunumeric/set/unique.cu index 38528b633..11e9e6fc1 100644 --- a/src/cunumeric/set/unique.cu +++ b/src/cunumeric/set/unique.cu @@ -153,27 +153,32 @@ struct UniqueImplBody { // Make a copy of the input as we're going to sort it auto temp = create_buffer(volume); VAL* ptr = temp.ptr(0); - if (in.accessor.is_dense_arbitrary(rect)) { - auto* src = in.ptr(rect.lo); - CHECK_CUDA(cudaMemcpyAsync(ptr, src, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); - } else { - const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - copy_into_buffer<<>>( - ptr, in, rect.lo, pitches, volume); - } - CHECK_CUDA_STREAM(stream); + VAL* end = ptr; + if (volume > 0) { + if (in.accessor.is_dense_arbitrary(rect)) { + auto* src = in.ptr(rect.lo); + CHECK_CUDA( + cudaMemcpyAsync(ptr, src, sizeof(VAL) * volume, cudaMemcpyDeviceToDevice, stream)); + } else { + const size_t num_blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + copy_into_buffer<<>>( + ptr, in, rect.lo, pitches, volume); + } + CHECK_CUDA_STREAM(stream); - // Find unique values - thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + volume); - auto* end = thrust::unique(thrust::cuda::par.on(stream), ptr, ptr + volume); + // Find unique values + thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + volume); + auto* end = thrust::unique(thrust::cuda::par.on(stream), ptr, ptr + volume); + } Piece result; result.second = end - ptr; auto buf_size = (get_aligned_size(result.second * sizeof(VAL)) + sizeof(VAL) - 1) / sizeof(VAL); assert(end - ptr <= buf_size); result.first = create_buffer(buf_size); - CHECK_CUDA(cudaMemcpyAsync( - result.first.ptr(0), ptr, sizeof(VAL) * result.second, cudaMemcpyDeviceToDevice, stream)); + if (result.second > 0) + CHECK_CUDA(cudaMemcpyAsync( + result.first.ptr(0), ptr, sizeof(VAL) * result.second, cudaMemcpyDeviceToDevice, stream)); if (comms.size() > 0) { // The launch domain is 1D because of the output region diff --git a/tests/integration/test_unique.py b/tests/integration/test_unique.py index 54eb17092..4f3d84274 100644 --- a/tests/integration/test_unique.py +++ b/tests/integration/test_unique.py @@ -20,6 +20,16 @@ import cunumeric as num +def test_with_nonzero(): + (a,) = num.nonzero(num.array([1, 1, 0, 0])) + a_np = a.__array__() + + b = num.unique(a) + b_np = num.unique(a_np) + + assert np.array_equal(b, b_np) + + @pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) def test_ndim(ndim): shape = (4,) * ndim From 329339b7f6bd4717e34a5640772bcd236b5d5b3b Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Tue, 1 Nov 2022 15:44:20 -0700 Subject: [PATCH 27/89] Some quality-of-life changes (#674) * Catch up changes in the type traits * Allow ThreadLocalStorage to be used for values bigger than 64B --- src/cunumeric/binary/binary_op_util.h | 18 
++++---- src/cunumeric/omp_help.h | 7 +++- src/cunumeric/scan/scan_local_template.inl | 15 +++---- src/cunumeric/unary/convert_template.inl | 4 +- src/cunumeric/unary/convert_util.h | 48 +++++++++++----------- src/cunumeric/unary/unary_op_util.h | 46 ++++++++++----------- src/cunumeric/unary/unary_red_util.h | 8 ++-- 7 files changed, 73 insertions(+), 73 deletions(-) diff --git a/src/cunumeric/binary/binary_op_util.h b/src/cunumeric/binary/binary_op_util.h index 6d1375e13..a4c1538ec 100644 --- a/src/cunumeric/binary/binary_op_util.h +++ b/src/cunumeric/binary/binary_op_util.h @@ -311,7 +311,7 @@ template struct BinaryOp { using T = legate::legate_type_of; static constexpr bool valid = - not(CODE == legate::LegateTypeCode::BOOL_LT or legate::is_complex::value); + not(CODE == legate::LegateTypeCode::BOOL_LT or legate::is_complex::value); BinaryOp(const std::vector& args) {} template ::value>* = nullptr> @@ -459,7 +459,7 @@ struct BinaryOp { atol_ = args[1].scalar(); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const T& a, const T& b) const { using std::fabs; @@ -469,7 +469,7 @@ struct BinaryOp { atol_ + rtol_ * static_cast(fabs(b)); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const T& a, const T& b) const { return static_cast(abs(a - b)) <= atol_ + rtol_ * static_cast(abs(b)); @@ -606,13 +606,13 @@ struct BinaryOp { static constexpr bool valid = true; BinaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const _T& a, const _T& b) const { return static_cast(a.real()) && static_cast(b.real()); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const _T& a, const _T& b) const { return static_cast(a) && static_cast(b); @@ -626,13 +626,13 @@ struct BinaryOp { BinaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const _T& a, const _T& b) const { return static_cast(a.real()) || static_cast(b.real()); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const _T& a, const _T& b) const { return static_cast(a) || static_cast(b); @@ -645,13 +645,13 @@ struct BinaryOp { static constexpr bool valid = true; BinaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const _T& a, const _T& b) const { return static_cast(a.real()) != static_cast(b.real()); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const _T& a, const _T& b) const { return static_cast(a) != static_cast(b); diff --git a/src/cunumeric/omp_help.h b/src/cunumeric/omp_help.h index 0093d265c..e4b4cb49f 100644 --- a/src/cunumeric/omp_help.h +++ b/src/cunumeric/omp_help.h @@ -25,10 +25,13 @@ template struct ThreadLocalStorage { private: static constexpr size_t CACHE_LINE_SIZE = 64; + // Round the element size to the nearest multiple of cache line size + static constexpr size_t PER_THREAD_SIZE = + (sizeof(VAL) + CACHE_LINE_SIZE - 1) / CACHE_LINE_SIZE * CACHE_LINE_SIZE; public: ThreadLocalStorage(size_t num_threads) - : storage_(CACHE_LINE_SIZE * num_threads), num_threads_(num_threads) + : storage_(PER_THREAD_SIZE * num_threads), num_threads_(num_threads) { } ~ThreadLocalStorage() {} @@ -36,7 +39,7 @@ struct ThreadLocalStorage { public: VAL& operator[](size_t idx) { - return *reinterpret_cast(storage_.data() + 
CACHE_LINE_SIZE * idx); + return *reinterpret_cast(storage_.data() + PER_THREAD_SIZE * idx); } private: diff --git a/src/cunumeric/scan/scan_local_template.inl b/src/cunumeric/scan/scan_local_template.inl index 0680ee34a..56c038fcc 100644 --- a/src/cunumeric/scan/scan_local_template.inl +++ b/src/cunumeric/scan/scan_local_template.inl @@ -33,9 +33,8 @@ struct ScanLocalImpl { // Case where NANs are transformed template ::value || - legate::is_complex>::value)>* = nullptr> + std::enable_if_t::value || + legate::is_complex::value)>* = nullptr> void operator()(ScanLocalArgs& args) const { using OP = ScanOp; @@ -58,12 +57,10 @@ struct ScanLocalImpl { ScanLocalNanImplBody()(func, out, in, args.sum_vals, pitches, rect); } // Case where NANs are as is - template < - LegateTypeCode CODE, - int DIM, - std::enable_if_t::value || - legate::is_complex>::value))>* = nullptr> + template ::value || + legate::is_complex::value))>* = nullptr> void operator()(ScanLocalArgs& args) const { using OP = ScanOp; diff --git a/src/cunumeric/unary/convert_template.inl b/src/cunumeric/unary/convert_template.inl index 41265892a..fe35005d2 100644 --- a/src/cunumeric/unary/convert_template.inl +++ b/src/cunumeric/unary/convert_template.inl @@ -75,7 +75,7 @@ template struct ConvertDispatch { template ::value || - legate::is_complex>::value) || + legate::is_complex::value) || NAN_OP == ConvertCode::NOOP>* = nullptr> void operator()(ConvertArgs& args) const { @@ -85,7 +85,7 @@ struct ConvertDispatch { template ::value || - legate::is_complex>::value) || + legate::is_complex::value) || (NAN_OP == ConvertCode::NOOP))>* = nullptr> void operator()(ConvertArgs& args) const { diff --git a/src/cunumeric/unary/convert_util.h b/src/cunumeric/unary/convert_util.h index 3d4a10d48..03e3692c8 100644 --- a/src/cunumeric/unary/convert_util.h +++ b/src/cunumeric/unary/convert_util.h @@ -52,17 +52,17 @@ struct ConvertOp { using SRC = legate::legate_type_of; using DST = legate::legate_type_of; - template < - typename _SRC = SRC, - std::enable_if_t::value or legate::is_complex::value>* = nullptr> + template ::value or + legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { return static_cast(src); } - template ::value and !legate::is_complex::value>* = - nullptr> + template ::value and + !legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { if constexpr (DST_TYPE == legate::LegateTypeCode::BOOL_LT) @@ -79,13 +79,13 @@ template struct ConvertOp { using SRC = legate::legate_type_of; - template ::value>* = nullptr> + template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { return static_cast<__half>(static_cast(src)); } - template ::value>* = nullptr> + template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { return static_cast<__half>(static_cast(src.real())); @@ -107,17 +107,17 @@ struct ConvertOp { using SRC = legate::legate_type_of; using DST = legate::legate_type_of; - template < - typename _SRC = SRC, - std::enable_if_t::value or legate::is_complex::value>* = nullptr> + template ::value or + legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { return cunumeric::is_nan(src) ? static_cast(1) : static_cast(src); } - template ::value and !legate::is_complex::value>* = - nullptr> + template ::value and + !legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { return cunumeric::is_nan(src) ? 
static_cast(1) : static_cast(src.real()); @@ -128,14 +128,14 @@ template struct ConvertOp { using SRC = legate::legate_type_of; - template ::value>* = nullptr> + template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { return cunumeric::is_nan(src) ? static_cast<__half>(1) : static_cast<__half>(static_cast(src)); } - template ::value>* = nullptr> + template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { return cunumeric::is_nan(src) ? static_cast<__half>(1) @@ -159,17 +159,17 @@ struct ConvertOp { using SRC = legate::legate_type_of; using DST = legate::legate_type_of; - template < - typename _SRC = SRC, - std::enable_if_t::value or legate::is_complex::value>* = nullptr> + template ::value or + legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { return cunumeric::is_nan(src) ? static_cast(0) : static_cast(src); } - template ::value and !legate::is_complex::value>* = - nullptr> + template ::value and + !legate::is_complex_type::value>* = nullptr> constexpr DST operator()(const _SRC& src) const { return cunumeric::is_nan(src) ? static_cast(0) : static_cast(src.real()); @@ -180,14 +180,14 @@ template struct ConvertOp { using SRC = legate::legate_type_of; - template ::value>* = nullptr> + template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { return cunumeric::is_nan(src) ? static_cast<__half>(0) : static_cast<__half>(static_cast(src)); } - template ::value>* = nullptr> + template ::value>* = nullptr> __CUDA_HD__ __half operator()(const _SRC& src) const { return cunumeric::is_nan(src) ? static_cast<__half>(0) diff --git a/src/cunumeric/unary/unary_op_util.h b/src/cunumeric/unary/unary_op_util.h index 61d11da17..f5012df0d 100644 --- a/src/cunumeric/unary/unary_op_util.h +++ b/src/cunumeric/unary/unary_op_util.h @@ -189,7 +189,7 @@ static constexpr bool is_floating_point = template static constexpr bool is_floating_or_complex = - is_floating_point || legate::is_complex>::value; + is_floating_point || legate::is_complex::value; template struct UnaryOp { @@ -203,7 +203,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const _T& x) const { return abs(x); @@ -225,9 +225,9 @@ struct UnaryOp { return x; } - template < - typename _T = T, - std::enable_if_t::value and !std::is_integral<_T>::value>* = nullptr> + template ::value and + !std::is_integral<_T>::value>* = nullptr> constexpr _T operator()(const _T& x) const { using std::fabs; @@ -428,13 +428,13 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr T operator()(const T& x) const { return T{x.real(), -x.imag()}; } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr T operator()(const T& x) const { return x; @@ -537,13 +537,13 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr T operator()(const T& x) const { return std::exp2(x); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr T operator()(const T& x) const { using std::exp; @@ -578,14 +578,14 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { using std::expm1; return expm1(x); } - template ::value>* = nullptr> + template ::value>* = 
nullptr> constexpr decltype(auto) operator()(const T& x) const { using std::exp; @@ -634,7 +634,7 @@ struct UnaryOp { template struct UnaryOp { using T = legate::legate_type_of; - static constexpr bool valid = legate::is_complex::value; + static constexpr bool valid = legate::is_complex_type::value; UnaryOp(const std::vector& args) {} @@ -789,14 +789,14 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { using std::log1p; return log1p(x); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { using std::log; @@ -826,14 +826,14 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { using std::log2; return log2(x); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const T& x) const { using std::log; @@ -862,13 +862,13 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const T& x) const { return !static_cast(x); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr bool operator()(const T& x) const { return !static_cast(x.real()); @@ -911,7 +911,7 @@ struct UnaryOp { template struct UnaryOp { using T = legate::legate_type_of; - static constexpr bool valid = legate::is_complex::value; + static constexpr bool valid = legate::is_complex_type::value; UnaryOp(const std::vector& args) {} @@ -952,13 +952,13 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const _T& x) const { return _T(std::rint(x.real()), std::rint(x.imag())); } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const _T& x) const { return std::rint(x); @@ -1002,7 +1002,7 @@ struct UnaryOp { UnaryOp(const std::vector& args) {} - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const _T& x) const { if (x.real() != 0) { @@ -1012,7 +1012,7 @@ struct UnaryOp { } } - template ::value>* = nullptr> + template ::value>* = nullptr> constexpr decltype(auto) operator()(const _T& x) const { return detail::sign(x); diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index 04edd774e..ab193a7df 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -151,7 +151,7 @@ struct UnaryRedOp { template struct UnaryRedOp { - static constexpr bool valid = !legate::is_complex>::value; + static constexpr bool valid = !legate::is_complex::value; using RHS = legate::legate_type_of; using VAL = RHS; @@ -174,7 +174,7 @@ struct UnaryRedOp { template struct UnaryRedOp { - static constexpr bool valid = !legate::is_complex>::value; + static constexpr bool valid = !legate::is_complex::value; using RHS = legate::legate_type_of; using VAL = RHS; @@ -243,7 +243,7 @@ struct UnaryRedOp { template struct UnaryRedOp { - static constexpr bool valid = !legate::is_complex>::value; + static constexpr bool valid = !legate::is_complex::value; using RHS = legate::legate_type_of; using VAL = Argval; @@ -276,7 +276,7 @@ struct UnaryRedOp { template struct UnaryRedOp { - static constexpr bool valid = !legate::is_complex>::value; + 
static constexpr bool valid = !legate::is_complex::value; using RHS = legate::legate_type_of; using VAL = Argval; From 03b1d3c4e9de770442f168156046555430e70850 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Tue, 1 Nov 2022 17:34:32 -0700 Subject: [PATCH 28/89] Mark sort and unique variants as concurrent (#676) --- src/cunumeric/set/unique.cc | 6 +++++- src/cunumeric/sort/sort.cc | 7 ++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/cunumeric/set/unique.cc b/src/cunumeric/set/unique.cc index ad07ec718..997d99cd6 100644 --- a/src/cunumeric/set/unique.cc +++ b/src/cunumeric/set/unique.cc @@ -58,7 +58,11 @@ struct UniqueImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { UniqueTask::register_variants(); } +static void __attribute__((constructor)) register_tasks(void) +{ + UniqueTask::register_variants( + {{LEGATE_GPU_VARIANT, legate::VariantOptions{}.with_concurrent(true)}}); +} } // namespace } // namespace cunumeric diff --git a/src/cunumeric/sort/sort.cc b/src/cunumeric/sort/sort.cc index bf273b59f..3f3a192f3 100644 --- a/src/cunumeric/sort/sort.cc +++ b/src/cunumeric/sort/sort.cc @@ -73,7 +73,12 @@ struct SortImplBody { namespace // unnamed { -static void __attribute__((constructor)) register_tasks(void) { SortTask::register_variants(); } +static void __attribute__((constructor)) register_tasks(void) +{ + auto options = legate::VariantOptions{}.with_concurrent(true); + SortTask::register_variants( + {{LEGATE_CPU_VARIANT, options}, {LEGATE_GPU_VARIANT, options}, {LEGATE_OMP_VARIANT, options}}); +} } // namespace } // namespace cunumeric From 5d3f743ba21e6458d7ee611935ec952314edad77 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Fri, 4 Nov 2022 13:18:35 -0700 Subject: [PATCH 29/89] Make `install.py` reconfigure editable installs when build type changes (#670) * pass -mindepth 1 so we don't accidentally delete the search root if it matches one of the `-d` names * pass unknown flags to `pip install` command * use CMAKE_ARGS instead of SKBUILD_CONFIGURE_OPTIONS to work around scikit-build bug * replace SKBUILD_CONFIGURE_OPTIONS with CMAKE_ARGS everywhere --- install.py | 13 ++++++++++--- scripts/build-install.sh | 4 ++-- scripts/build-no-install.sh | 4 ++-- scripts/build-separately-no-install.sh | 4 ++-- scripts/build-with-legate-no-install.sh | 4 ++-- scripts/build-with-legate-separately-no-install.sh | 4 ++-- ...tall-global-legion-legate-core-and-cunumeric.sh | 14 +++++++------- 7 files changed, 27 insertions(+), 20 deletions(-) diff --git a/install.py b/install.py index c6ee2d80d..8bed64992 100755 --- a/install.py +++ b/install.py @@ -296,14 +296,21 @@ def validate_path(path): pip_install_cmd += ["--no-deps", "--no-build-isolation"] pip_install_cmd += ["--upgrade"] + if unknown is not None: + pip_install_cmd += unknown + pip_install_cmd += ["."] if verbose: pip_install_cmd += ["-vv"] - cmake_flags = [] + # Also use preexisting CMAKE_ARGS from conda if set + cmake_flags = cmd_env.get("CMAKE_ARGS", "").split(" ") if cmake_generator: - cmake_flags += [f"-G'{cmake_generator}'"] + if " " not in cmake_generator: + cmake_flags += [f"-G{cmake_generator}"] + else: + cmake_flags += [f"-G'{cmake_generator}'"] if debug or verbose: cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] @@ -352,7 +359,7 @@ def validate_path(path): cmd_env.update( { "SKBUILD_BUILD_OPTIONS": f"-j{str(thread_count)}", - "SKBUILD_CONFIGURE_OPTIONS": "\n".join(cmake_flags), + "CMAKE_ARGS": " ".join(cmake_flags), } ) diff --git 
a/scripts/build-install.sh b/scripts/build-install.sh index 4d9bdbfc8..8adb472d2 100755 --- a/scripts/build-install.sh +++ b/scripts/build-install.sh @@ -13,7 +13,7 @@ source ./scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -29,7 +29,7 @@ ninja_args="-j$(nproc --ignore=2)" # Build cunumeric + cunumeric_python and install into the current Python environment SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ --no-deps --no-build-isolation \ diff --git a/scripts/build-no-install.sh b/scripts/build-no-install.sh index 623ca788d..c398eda58 100755 --- a/scripts/build-no-install.sh +++ b/scripts/build-no-install.sh @@ -11,7 +11,7 @@ source ./scripts/util/compiler-flags.sh rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -27,7 +27,7 @@ ninja_args="-j$(nproc --ignore=2)" # Build legion_core + legion_core_python and perform an "editable" install SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ diff --git a/scripts/build-separately-no-install.sh b/scripts/build-separately-no-install.sh index b9de045b4..8d8078723 100644 --- a/scripts/build-separately-no-install.sh +++ b/scripts/build-separately-no-install.sh @@ -11,7 +11,7 @@ source ./scripts/util/compiler-flags.sh rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -44,7 +44,7 @@ cmake_args+=" # Build legion_core_python and perform an "editable" install SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ diff --git a/scripts/build-with-legate-no-install.sh b/scripts/build-with-legate-no-install.sh index ad1da812a..498745e31 100644 --- a/scripts/build-with-legate-no-install.sh +++ b/scripts/build-with-legate-no-install.sh @@ -13,7 +13,7 @@ source ./scripts/util/read-legate-core-root.sh "$0" rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -28,7 +28,7 @@ ninja_args="-j$(nproc --ignore=2)" # Build legion_core + legion_core_python and perform an "editable" install SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ diff --git a/scripts/build-with-legate-separately-no-install.sh b/scripts/build-with-legate-separately-no-install.sh index c04e7f9ed..fa9e97d05 100755 --- a/scripts/build-with-legate-separately-no-install.sh +++ b/scripts/build-with-legate-separately-no-install.sh @@ -13,7 +13,7 
@@ source ./scripts/util/read-legate-core-root.sh "$0" rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} # Define CMake configuration arguments -cmake_args= +cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi @@ -45,7 +45,7 @@ cmake_args+=" # Build legion_core_python and perform an "editable" install SKBUILD_BUILD_OPTIONS="$ninja_args" \ -SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \ +CMAKE_ARGS="$cmake_args" \ SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \ python -m pip install \ --root / --prefix "$CONDA_PREFIX" \ diff --git a/scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh b/scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh index a759dd37f..4f37467bb 100755 --- a/scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh +++ b/scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh @@ -1,10 +1,10 @@ #! /usr/bin/env bash -rm -rf $(find "$CONDA_PREFIX/lib" -type d -name '*cunumeric*') \ - $(find "$CONDA_PREFIX/lib" -type f -name 'libcunumeric*') \ - $(find "$CONDA_PREFIX/lib" -type f -name 'cunumeric.egg-link') \ - $(find "$CONDA_PREFIX/include" -type f -name 'tci.h') \ - $(find "$CONDA_PREFIX/include" -type d -name 'tci') \ - $(find "$CONDA_PREFIX/include" -type d -name 'tblis') \ - $(find "$CONDA_PREFIX/include" -type d -name 'cunumeric') \ +rm -rf $(find "$CONDA_PREFIX/lib" -mindepth 1 -type d -name '*cunumeric*') \ + $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'libcunumeric*') \ + $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'cunumeric.egg-link') \ + $(find "$CONDA_PREFIX/include" -mindepth 1 -type f -name 'tci.h') \ + $(find "$CONDA_PREFIX/include" -mindepth 1 -type d -name 'tci') \ + $(find "$CONDA_PREFIX/include" -mindepth 1 -type d -name 'tblis') \ + $(find "$CONDA_PREFIX/include" -mindepth 1 -type d -name 'cunumeric') \ ; From 1aba3a01f76b99997340bfae63f267c896e3a431 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Mon, 7 Nov 2022 13:50:11 -0800 Subject: [PATCH 30/89] configure test overrides in the project test.py (#678) --- test.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test.py b/test.py index 8dcda54be..50e22ee88 100755 --- a/test.py +++ b/test.py @@ -18,10 +18,27 @@ import sys +from legate.tester import PER_FILE_ARGS, SKIPPED_EXAMPLES from legate.tester.config import Config from legate.tester.test_plan import TestPlan from legate.tester.test_system import TestSystem +SKIPPED_EXAMPLES.update( + { + "examples/ingest.py", + "examples/kmeans_sort.py", + "examples/lstm_full.py", + "examples/wgrad.py", + } +) + +PER_FILE_ARGS.update( + { + "examples/lstm_full.py": ["--file", "resources/lstm_input.txt"], + } +) + + if __name__ == "__main__": config = Config(sys.argv) From 22b3f172f5ea631bfd4591e7839ea6a883efb56b Mon Sep 17 00:00:00 2001 From: xialu00 <110973296+xialu00@users.noreply.github.com> Date: Tue, 8 Nov 2022 10:25:20 +0800 Subject: [PATCH 31/89] add test case for test_compress.py and test_extract.py (#672) * add test case for test_compress.py * add test case for test_extract.py * fix comments * fix comments --- cunumeric/array.py | 8 +- tests/integration/test_compress.py | 173 ++++++++++++++++++-------- tests/integration/test_extract.py | 187 +++++++++++++++++++---------- 3 files changed, 255 insertions(+), 113 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 0ac528531..6dc818112 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -2096,11 +2096,15 @@ def compress( """ a = self 
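+        # condition objects without an ``ndim`` attribute (e.g. None or a
+        # plain sequence) hit the AttributeError branch below, so they get
+        # the same dimension-mismatch ValueError as a mis-shaped array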
- if condition.ndim != 1: + try: + if condition.ndim != 1: + raise ValueError( + "Dimension mismatch: condition must be a 1D array" + ) + except AttributeError: raise ValueError( "Dimension mismatch: condition must be a 1D array" ) - condition = condition._warn_and_convert(np.dtype(bool)) if axis is None: diff --git a/tests/integration/test_compress.py b/tests/integration/test_compress.py index 2523dbd18..7247685e6 100644 --- a/tests/integration/test_compress.py +++ b/tests/integration/test_compress.py @@ -21,25 +21,93 @@ import cunumeric as num -def test_1d(): - a = mk_seq_array(np, (10,)) - a_num = num.array(a) - - res = np.compress([True, False, True], a, axis=0) - res_num = num.compress([True, False, True], a_num, axis=0) - - assert np.array_equal(res_num, res) - - -@pytest.mark.parametrize("axis", (0, 1)) -def test_2d_axis(axis): - a = np.array([[1, 2], [3, 4], [5, 6]]) - num_a = num.array(a) - - res_np = np.compress([0, 1], a, axis=axis) - res_num = num.compress([0, 1], num_a, axis=axis) - - assert np.array_equal(res_num, res_np) +@pytest.mark.xfail +def test_none_array(): + res_np = np.compress([0], None) # numpy return [] + # cuNumeric raises: + # AttributeError: 'NoneType' object has no attribute 'compress' + res_num = num.compress([0], None) + assert np.array_equal(res_np, res_num) + + +@pytest.mark.xfail +def test_empty_array(): + res_np = np.compress([0], []) # numpy return [] + # cuNumeric raises: ValueError: + # Shape mismatch: condition contains entries that are out of bounds + res_num = num.compress([0], []) + assert np.array_equal(res_np, res_num) + + +@pytest.mark.parametrize("con", (-3, 0, 3, None, False, True)) +def test_negative_condition(con): + a = num.array([1, 2, 3, 4]) + with pytest.raises(ValueError): + num.compress(con, a) + + +def test_condition_out_bound(): + a = num.array([1, 2, 3, 4]) + msg = r"bounds" + with pytest.raises(ValueError, match=msg): + num.compress([1, 2, 3, 4, 5], a) + + +def test_axis_out_bound(): + a = num.array([1, 2, 3, 4]) + msg = r"bounds" + with pytest.raises(ValueError, match=msg): + num.compress([1, 2, 3, 4], a, axis=1) + + +@pytest.mark.parametrize( + "con", ([True, True], [True, True, True, True, True, True]) +) +def test_out_bounds(con): + a = num.array([1, 2, 3, 4]) + b = num.array([-1, -2, -3, -4]) + with pytest.raises(ValueError): + num.compress(con, a, out=b) + + +@pytest.mark.xfail +def test_dtype_out1(): + a = mk_seq_array(np, (4,)) + b = mk_seq_array(num, (4,)) + out_np = np.random.random((4,)) + out_num = num.random.random((4,)) + # for Numpy, it will raise TypeError: + # "Cannot cast array data from dtype('float64') to dtype('int64') + # according to the rule 'safe'". + # cuNumeric passed. 
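+    # ('safe' casting only allows value-preserving conversions, so NumPy
+    # rejects the float64 -> int64 write into out_np below, while cuNumeric
+    # performs the cast.)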
+ np.compress([True, True, True, True], a, out=out_np) + num.compress([True, True, True, True], b, out=out_num) + assert np.array_equal(out_np, out_num) + + +def test_dtype_out2(): + # both Numpy and cuNumeric turn float into int + a = np.random.random((4,)) * 10 + b = num.array(a) + out_np = np.random.randint(1, 10, (4,)) + out_num = num.random.randint(-10, -1, (4,)) + np.compress([True, True, True, True], a, out=out_np) + num.compress([True, True, True, True], b, out=out_num) + assert np.array_equal(out_np, out_num) + + +@pytest.mark.xfail +def test_out_parameter(): + a = mk_seq_array(np, (4,)) + b = mk_seq_array(num, (4,)) + out_np = np.random.randint(1, 5, (4,)) + out_num = np.random.randint(1, 5, (4,)) + np.compress([True, True, True, True], a, 0, out_np) + num.compress([True, True, True, True], b, 0, out_num) + # for cuNumeric, the last parameter 'out', + # it should be written as 'out=out_num' + # otherwise it raises error + assert np.array_equal(out_num, out_np) def test_bool_condition(): @@ -52,51 +120,58 @@ def test_bool_condition(): assert np.array_equal(res_num, res_np) -def test_out(): - a = np.array([[1, 2], [3, 4], [5, 6]]) - num_a = num.array(a) - out_np = np.array([[1], [1], [1]]) - out_num = num.array(out_np) - - res_np = np.compress([0, 1], a, axis=1, out=out_np) - res_num = num.compress([0, 1], num_a, axis=1, out=out_num) - - assert np.array_equal(res_num, res_np) - assert np.array_equal(out_num, out_np) - - -def test_different_types(): - a = np.array([[1, 2], [3, 4], [5, 6]], dtype=float) - num_a = num.array(a) - out_np = np.array([[1], [1], [1]]) - out_num = num.array(out_np) - - res_np = np.compress([0, 1], a, axis=1, out=out_np) - res_num = num.compress([0, 1], num_a, axis=1, out=out_num) - - assert np.array_equal(res_num, res_np) - assert np.array_equal(out_num, out_np) - - @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) -def test_ndim(ndim): +def test_ndim_basic(ndim): shape = (5,) * ndim np_arr = mk_seq_array(np, shape) num_arr = mk_seq_array(num, shape) - # make sure condition is between 1 and 2 - np_condition = mk_seq_array(np, (5,)) % 2 - num_condition = mk_seq_array(num, (5,)) % 2 + # make sure condition is between 0 and 1 + np_condition = np.array((mk_seq_array(np, (5,)) % 2).astype(bool)) + num_condition = num.array((mk_seq_array(num, (5,)) % 2).astype(bool)) res_np = np.compress(np_condition, np_arr) res_num = num.compress(num_condition, num_arr) assert np.array_equal(res_num, res_np) + +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +def test_ndim_axis(ndim): + shape = (5,) * ndim + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + # make sure condition is between 0 and 1 + np_condition = np.array((mk_seq_array(np, (5,)) % 2).astype(bool)) + num_condition = num.array((mk_seq_array(num, (5,)) % 2).astype(bool)) + for axis in range(ndim): res_np = np.compress(np_condition, np_arr, axis) res_num = num.compress(num_condition, num_arr, axis) assert np.array_equal(res_num, res_np) +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +def test_ndim_out(ndim): + shape = (5,) * ndim + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + # make sure condition is between 0 and 1 + np_condition = np.array((mk_seq_array(np, (5,)) % 2).astype(bool)) + num_condition = num.array((mk_seq_array(num, (5,)) % 2).astype(bool)) + + for axis in range(ndim): + shape_list = list(shape) + shape_list[axis] = 3 + shape_new = tuple(shape_list) + + out_np = np.random.randint(1, 10, shape_new) + out_num 
= np.random.randint(-10, -1, shape_new) + + np.compress(np_condition, np_arr, axis, out_np) + num.compress(num_condition, num_arr, axis, out=out_num) + + assert np.array_equal(out_num, out_np) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_extract.py b/tests/integration/test_extract.py index d6d369c8b..105609f05 100644 --- a/tests/integration/test_extract.py +++ b/tests/integration/test_extract.py @@ -15,103 +15,166 @@ import numpy as np import pytest +from utils.generators import mk_seq_array import cunumeric as num -np.random.seed(42) +DIM = 5 +SIZES = [ + (0,), + 1, + 5, + (0, 1), + (1, 0), + (1, 1), + (1, DIM), + (DIM, 1), + (DIM, DIM), + (1, 0, 0), + (1, 1, 0), + (1, 0, 1), + (1, 1, 1), + (DIM, 1, 1), + (1, DIM, 1), + (1, 1, DIM), + (DIM, DIM, DIM), +] + +VALUES = [ + [0], + [42], + [42 + 3j], + [11, 12, 13], + [True, False, False, True], + [42.3, 42.3, 42.3, 42.3, 42.3], + [np.inf, np.Inf], +] + +@pytest.mark.xfail +def test_none_array(): + res_np = np.extract([0], None) # return [] + res_num = num.extract( + [0], None + ) # AttributeError: 'NoneType' object has no attribute 'size' + assert np.array_equal(res_np, res_num) -def test_extract(): - cnp = np.array( - [1, 54, 4, 4, 0, 45, 5, 58, 0, 9, 0, 4, 0, 0, 0, 5, 0, 1] - ).reshape( - (6, 3) - ) # noqa E501 - c = num.array(cnp) - bnp = np.random.randn(6, 3) - b = num.array(bnp) - assert num.array_equal(num.extract(c, b), np.extract(cnp, bnp)) + +@pytest.mark.xfail +def test_empty_array(): + res_np = np.extract([0], []) # return [] + res_num = num.extract( + [0], [] + ) # ValueError: arr array and condition array must be of same size + assert np.array_equal(res_np, res_num) + + +@pytest.mark.xfail +def test_none_condition(): + a = num.array([1, 2, 3, 4]) + res_np = np.extract(None, a) # all return [] + res_num = num.extract( + None, a + ) # AttributeError: 'NoneType' object has no attribute 'size' + assert np.array_equal(res_np, res_num) + + +@pytest.mark.parametrize( + "con", (-3, 0, 3, False, True, [2], [2, 3], [2, -3, 4], [1, 2, 3, 4, 5]) +) +def test_negative_condition(con): + a = num.array([1, 2, 3, 4]) + with pytest.raises(ValueError): + num.extract(con, a) + + +@pytest.mark.xfail +def test_complex_condition(): + # when condition is complex type a+bj, + # if a==0, cuNumeric take it as 0, while Numpy take it as 1 + a = np.array([1, 2, 3, 4]) + b = num.array([1, 2, 3, 4]) + condition = [1 + 2j, 2, 2, 5j] + res_np = np.extract(condition, a) # array([1, 2, 3, 4]) + res_num = num.extract(condition, b) # array([1, 2, 3]) + assert np.array_equal(res_np, res_num) ARR = [ - [1, 54, 4, 4, 0, 45, 5, 58, 0, 9, 0, 4, 0, 0, 0, 5, 0, 1], - [[1, 54, 4], [4, 0, 45], [5, 58, 0], [9, 0, 4], [0, 0, 0], [5, 0, 1]], [ - [[1, 54, 4], [4, 0, 45]], - [[5, 58, 0], [9, 0, 4]], - [[0, 0, 0], [5, 0, 1]], + [[1 + 2j, 54, 4], [4, 3 + 1j, 45]], + [[5.5, 58.3, 0.6], [9, 0, 4]], + [[0, 0, 0], [-9, 0, -4]], ], - [[[1 + 2j, 54, 4], [4, 0 + 1j, 45]], [[5, 58, 0], [9, 0, 4]]], [[True, False], [True, True], [True, False]], [[]], - [], + [[], []], [ [[0, 0, 0], [0, 0, 0]], [[0, 0, 0], [0, 0, 1]], ], - [False, False, False], - [ - [[0, 0, 0], [0, 0, 0]], - [[0, 0, 0], [0, 0, 0]], - ], ] +def array_condition(): + arr_list = [] + for arr in ARR: + arr_np = np.array(arr) + condition_np = arr_np.copy() + arr_list.append((condition_np, arr_np)) + arr_list.append((condition_np.flatten(), arr_np)) + arr_list.append((condition_np, arr_np.flatten())) + arr_list.append( + (condition_np.swapaxes(0, condition_np.ndim - 1), arr_np) + ) + 
arr_list.append( + (condition_np, arr_np.swapaxes(0, condition_np.ndim - 1)) + ) + return arr_list + + def check_extract(condition_np, arr_np): arr_num = num.array(arr_np) condition_num = num.array(condition_np) - result_np = np.extract(condition_np, arr_np) result_np2 = arr_np[condition_np.reshape(arr_np.shape).astype(bool)] - assert np.array_equal(result_np, result_np2) result_num = num.extract(condition_num, arr_num) - assert np.array_equal(result_np, result_num) + assert np.array_equal(result_np2, result_num) -@pytest.mark.parametrize("arr", ARR, ids=str) -def test_extract_bool(arr): - arr_np = np.array(arr) - condition_np = arr_np != 0 - check_extract(condition_np, arr_np) - check_extract(condition_np.flatten(), arr_np) - check_extract(condition_np, arr_np.flatten()) - check_extract(condition_np.swapaxes(0, condition_np.ndim - 1), arr_np) - check_extract(condition_np, arr_np.swapaxes(0, condition_np.ndim - 1)) +@pytest.mark.parametrize( + "con, arr", (data for data in array_condition()), ids=str +) +def test_extract_nonzero1(con, arr): + check_extract(con, arr) -@pytest.mark.parametrize("arr", ARR, ids=str) -def test_extract_nonzero(arr): - arr_np = np.array(arr) - condition_np = arr_np.copy() - check_extract(condition_np, arr_np) - check_extract(condition_np.flatten(), arr_np) - check_extract(condition_np, arr_np.flatten()) - check_extract(condition_np.swapaxes(0, condition_np.ndim - 1), arr_np) - check_extract(condition_np, arr_np.swapaxes(0, condition_np.ndim - 1)) +@pytest.mark.parametrize("shape", SIZES, ids=str) +def test_extract_basic(shape): + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + # make sure condition is between 0 and 1 + np_condition = np.array((mk_seq_array(np, shape) % 2).astype(bool)) + num_condition = num.array((mk_seq_array(num, shape) % 2).astype(bool)) + res_np = np.extract(np_condition, np_arr) + res_num = num.extract(num_condition, num_arr) + assert np.array_equal(res_num, res_np) -VALUES = [ - [11, 12, 13], - [99, 93, 76, 65, 76, 87, 43, 23, 12, 54, 756, 2345, 232, 2323, 12145], - [42], - [True, False, False, True], - [42.3, 42.3, 42.3, 42.3, 42.3, 42.3, 42.3, 42.3], - [42 + 3j], -] - -@pytest.mark.parametrize("arr", ARR, ids=str) +@pytest.mark.parametrize("shape", SIZES, ids=str) @pytest.mark.parametrize("vals", VALUES, ids=str) -def test_place(arr, vals): - arr_np = np.array(arr) - vals_np = np.array(vals).astype(arr_np.dtype) - condition_np = arr_np != 0 +def test_place_basic(shape, vals): + arr_np = mk_seq_array(np, shape) + arr_num = num.array(mk_seq_array(num, shape)) - arr_num = num.array(arr_np) - condition_num = num.array(condition_np) + mask_np = np.array((mk_seq_array(np, shape) % 2).astype(bool)) + mask_num = num.array((mk_seq_array(np, shape) % 2).astype(bool)) + + vals_np = np.array(vals).astype(arr_np.dtype) vals_num = num.array(vals_np) - np.place(arr_np, condition_np, vals_np) - num.place(arr_num, condition_num, vals_num) + np.place(arr_np, mask_np, vals_np) + num.place(arr_num, mask_num, vals_num) assert np.array_equal(arr_np, arr_num) From d7ca2782f75dbb533e060dab75889592a3b0afcf Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 9 Nov 2022 20:03:42 -0800 Subject: [PATCH 32/89] Mypy fix (#688) * Fix mypy errors * Add mypy to pre-commit hooks * Update .pre-commit-config.yaml Co-authored-by: Bryan Van de Ven * Remove 'tests' from the mypy args * Remove the mypy hook for now Co-authored-by: Bryan Van de Ven --- cunumeric/config.py | 2 ++ cunumeric/random/bitgenerator.py | 3 ++- 2 files changed, 4 insertions(+), 1 
deletion(-) diff --git a/cunumeric/config.py b/cunumeric/config.py index 88802b911..c45ae6313 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -15,6 +15,7 @@ from __future__ import annotations import os +from abc import abstractmethod from enum import IntEnum, unique from typing import TYPE_CHECKING, Any, List, Union, cast @@ -269,6 +270,7 @@ class _CunumericSharedLib: CUNUMERIC_WRITE: int CUNUMERIC_ZIP: int + @abstractmethod def cunumeric_has_curand(self) -> int: ... diff --git a/cunumeric/random/bitgenerator.py b/cunumeric/random/bitgenerator.py index 2c5dfc577..1bd0aaa03 100644 --- a/cunumeric/random/bitgenerator.py +++ b/cunumeric/random/bitgenerator.py @@ -15,6 +15,7 @@ from __future__ import annotations import time +from abc import abstractproperty from typing import TYPE_CHECKING, Union import numpy as np @@ -66,7 +67,7 @@ def __init__( self.generatorType, seed, self.flags, forceBuild ) - @property + @abstractproperty def generatorType(self) -> BitGeneratorType: ... From a93ea498815a26ca8f4b13b61f9bba2997555363 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 9 Nov 2022 23:01:53 -0800 Subject: [PATCH 33/89] Add a test case for 0D region-backed stores (#666) --- tests/integration/test_0d_store.py | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/integration/test_0d_store.py diff --git a/tests/integration/test_0d_store.py b/tests/integration/test_0d_store.py new file mode 100644 index 000000000..1701983f4 --- /dev/null +++ b/tests/integration/test_0d_store.py @@ -0,0 +1,38 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from itertools import product + +import pytest + +import cunumeric as num + +SIZE = 3 + + +def test_0d_region_backed_stores(): + arr = num.arange(9).reshape(3, 3) + + for i, j in product(range(SIZE), range(SIZE)): + i_ind = num.array(i) + j_ind = num.array(j) + v = arr[i_ind, j_ind] + assert int(v) == i * SIZE + j + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) From 3162f5eb5c362c2a0b21df6187d22ba573ebb65f Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 9 Nov 2022 23:03:18 -0800 Subject: [PATCH 34/89] Fix a silly mistake. Fixes #684. 
(#686) --- src/cunumeric/set/unique.cu | 2 +- tests/integration/test_unique.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cunumeric/set/unique.cu b/src/cunumeric/set/unique.cu index 11e9e6fc1..908a87664 100644 --- a/src/cunumeric/set/unique.cu +++ b/src/cunumeric/set/unique.cu @@ -168,7 +168,7 @@ struct UniqueImplBody { // Find unique values thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + volume); - auto* end = thrust::unique(thrust::cuda::par.on(stream), ptr, ptr + volume); + end = thrust::unique(thrust::cuda::par.on(stream), ptr, ptr + volume); } Piece result; diff --git a/tests/integration/test_unique.py b/tests/integration/test_unique.py index 4f3d84274..c657c2c57 100644 --- a/tests/integration/test_unique.py +++ b/tests/integration/test_unique.py @@ -25,7 +25,7 @@ def test_with_nonzero(): a_np = a.__array__() b = num.unique(a) - b_np = num.unique(a_np) + b_np = np.unique(a_np) assert np.array_equal(b, b_np) From 9c28d6738f63441879bb06d3843670641a61f5fe Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 10 Nov 2022 12:52:43 -0800 Subject: [PATCH 35/89] Add missing put method to docs (#689) --- docs/cunumeric/source/api/_ndarray.rst | 1 + docs/cunumeric/source/api/ndarray.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/cunumeric/source/api/_ndarray.rst b/docs/cunumeric/source/api/_ndarray.rst index 317772104..3320f0857 100644 --- a/docs/cunumeric/source/api/_ndarray.rst +++ b/docs/cunumeric/source/api/_ndarray.rst @@ -43,6 +43,7 @@ cunumeric.ndarray ~ndarray.nonzero ~ndarray.partition ~ndarray.prod + ~ndarray.put ~ndarray.ravel ~ndarray.reshape ~ndarray.searchsorted diff --git a/docs/cunumeric/source/api/ndarray.rst b/docs/cunumeric/source/api/ndarray.rst index 1b2c2107c..afdd1406f 100644 --- a/docs/cunumeric/source/api/ndarray.rst +++ b/docs/cunumeric/source/api/ndarray.rst @@ -124,7 +124,7 @@ Item selection and manipulation :toctree: generated/ ndarray.take - .. ndarray.put + ndarray.put .. ndarray.repeat ndarray.choose ndarray.sort From 6b835d6df4d693937344688fb33cecb46ef41e5d Mon Sep 17 00:00:00 2001 From: robinw0928 <104830875+robinw0928@users.noreply.github.com> Date: Fri, 11 Nov 2022 11:05:11 +0800 Subject: [PATCH 36/89] Enhance test_linspace.py and test_swapaxes.py (#680) * Enhance test_linspace.py and test_swapaxes.py * Address comments. * Address comments part-2. --- tests/integration/test_linspace.py | 305 +++++++++++++++++++++++++++-- tests/integration/test_swapaxes.py | 212 +++++++++++++++++--- 2 files changed, 468 insertions(+), 49 deletions(-) diff --git a/tests/integration/test_linspace.py b/tests/integration/test_linspace.py index 170bec59c..4f05a843c 100644 --- a/tests/integration/test_linspace.py +++ b/tests/integration/test_linspace.py @@ -13,48 +13,315 @@ # limitations under the License. # +from itertools import chain + import numpy as np import pytest +from utils.generators import broadcasts_to, mk_seq_array import cunumeric as num -def test_basic(): - x = np.linspace(2.0, 3.0, num=5) - y = num.linspace(2.0, 3.0, num=5) +def equivalent_shapes_gen(shape): + """ + Generate more equivalent shapes by removing + leading singleton dimensions from `shape`. 
+ e.g., shape=(1, 4, 1), yield (1, 4, 1), (4, 1) + shape=(1, 1, 5), yield (1, 1, 5), (1, 5), (5,) + """ + yield shape + for i in range(len(shape) - 1): + if shape[i] == 1: + i += 1 + yield shape[i:] + else: + break + + +@pytest.mark.parametrize( + "endpoint", (True, False), ids=lambda endpoint: f"(endpoint={endpoint})" +) +@pytest.mark.parametrize( + "number", (0, 1, 10), ids=lambda number: f"(num={number})" +) +@pytest.mark.parametrize( + "values", + ((10, -5.5), (2.0, 3.0), (0, 0), (1 + 2.5j, 10 + 5j), (0j, 10)), + ids=lambda values: f"(values={values})", +) +def test_scalar_basic(values, number, endpoint): + start, stop = values + x = np.linspace(start, stop, num=number, endpoint=endpoint) + y = num.linspace(start, stop, num=number, endpoint=endpoint) assert np.array_equal(x, y) -def test_endpoint(): - x = np.linspace(2.0, 3.0, num=5, endpoint=False) - y = num.linspace(2.0, 3.0, num=5, endpoint=False) +@pytest.mark.parametrize( + "endpoint", (True, False), ids=lambda endpoint: f"(endpoint={endpoint})" +) +@pytest.mark.parametrize( + "number", (0, 1, 10), ids=lambda number: f"(num={number})" +) +@pytest.mark.parametrize( + "values", + ((10, -5.5), (2.0, 3.0), (0, 0), (1 + 2.5j, 10 + 5j), (0j, 10)), + ids=lambda values: f"(values={values})", +) +def test_scalar_basic_retstep(values, number, endpoint): + start, stop = values + x = np.linspace(start, stop, num=number, endpoint=endpoint, retstep=True) + y = num.linspace(start, stop, num=number, endpoint=endpoint, retstep=True) + + assert np.array_equal(x[0], y[0]) + if not (np.isnan(x[1]) and np.isnan(y[1])): + assert x[1] == y[1] + + +@pytest.mark.parametrize( + "endpoint", (True, False), ids=lambda endpoint: f"(endpoint={endpoint})" +) +def test_arrays_basic(endpoint): + shape = (2, 2, 3) + np_start = mk_seq_array(np, shape) + num_start = mk_seq_array(num, shape) + np_stop = mk_seq_array(np, shape) + 10 + num_stop = mk_seq_array(num, shape) + 10 + x = np.linspace(np_start, np_stop, num=5, endpoint=endpoint) + y = np.linspace(num_start, num_stop, num=5, endpoint=endpoint) assert np.array_equal(x, y) -def test_retstep(): - x = np.linspace(2.0, 3.0, num=5, retstep=True) - y = np.linspace(2.0, 3.0, num=5, retstep=True) +@pytest.mark.parametrize( + "endpoint", (True, False), ids=lambda endpoint: f"(endpoint={endpoint})" +) +def test_arrays_basic_retstep(endpoint): + shape = (2, 2, 3) + np_start = mk_seq_array(np, shape) + num_start = mk_seq_array(num, shape) + np_stop = mk_seq_array(np, shape) + 10 + num_stop = mk_seq_array(num, shape) + 10 + x = np.linspace(np_start, np_stop, num=5, endpoint=endpoint, retstep=True) + y = np.linspace( + num_start, num_stop, num=5, endpoint=endpoint, retstep=True + ) assert np.array_equal(x[0], y[0]) - assert x[1] == y[1] + assert np.array_equal(x[1], y[1]) + + +shape_start = (2, 2, 3) +shape_stops = (equivalent_shapes_gen(s) for s in broadcasts_to(shape_start)) + + +@pytest.mark.parametrize( + "shape_stop", + chain.from_iterable(shape_stops), + ids=lambda shape_stop: f"(shape_stop={shape_stop})", +) +def test_array_broadcast_stops(shape_stop): + np_start = mk_seq_array(np, shape_start) + num_start = mk_seq_array(num, shape_start) + + np_stop = mk_seq_array(np, shape_stop) + 5 + num_stop = mk_seq_array(num, shape_stop) + 5 + x = np.linspace(np_start, np_stop, num=5) + y = num.linspace(num_start, num_stop, num=5) + assert np.array_equal(x, y) + + +def test_arrays_both_start_and_stop_broadcast(): + shape_start = (1, 3) + np_start = mk_seq_array(np, shape_start) + num_start = mk_seq_array(num, shape_start) + 
shape_stop = (2, 1) + np_stop = mk_seq_array(np, shape_stop) + 5 + num_stop = mk_seq_array(num, shape_stop) + 5 + + x = np.linspace(np_start, np_stop, num=5) + y = num.linspace(num_start, num_stop, num=5) + assert np.array_equal(x, y) + + +@pytest.mark.parametrize( + "shape", ((0,), (3,), (2, 1)), ids=lambda shape: f"(shape={shape})" +) +def test_array_with_scalar(shape): + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + scalar = 10 + + x1 = np.linspace(np_arr, scalar, num=5) + y1 = num.linspace(num_arr, scalar, num=5) + assert np.array_equal(x1, y1) + + x2 = np.linspace(scalar, np_arr, num=5) + y2 = num.linspace(scalar, num_arr, num=5) + assert np.array_equal(x2, y2) + + +@pytest.mark.parametrize( + "endpoint", (True, False), ids=lambda endpoint: f"(endpoint={endpoint})" +) +@pytest.mark.parametrize( + "shape", ((0,), (2, 1)), ids=lambda shape: f"(shape={shape})" +) +def test_empty_array(shape, endpoint): + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + + x1 = np.linspace(np_arr, [], num=5, endpoint=endpoint) + y1 = num.linspace(num_arr, [], num=5, endpoint=endpoint) + assert np.array_equal(x1, y1) + x2 = np.linspace([], np_arr, num=5, endpoint=endpoint) + y2 = num.linspace([], num_arr, num=5, endpoint=endpoint) + assert np.array_equal(x2, y2) -def test_axis(): + +@pytest.mark.parametrize( + "endpoint", (True, False), ids=lambda endpoint: f"(endpoint={endpoint})" +) +@pytest.mark.parametrize( + "shape", ((0,), (2, 1)), ids=lambda shape: f"(shape={shape})" +) +def test_empty_array_retstep(shape, endpoint): + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + + x1 = np.linspace(np_arr, [], num=5, endpoint=endpoint, retstep=True) + y1 = num.linspace(num_arr, [], num=5, endpoint=endpoint, retstep=True) + assert np.array_equal(x1[0], y1[0]) + assert np.array_equal(x1[1], y1[1]) + + x2 = np.linspace([], np_arr, num=5, endpoint=endpoint, retstep=True) + y2 = num.linspace([], num_arr, num=5, endpoint=endpoint, retstep=True) + assert np.array_equal(x2[0], y2[0]) + assert np.array_equal(x2[1], y2[1]) + + +@pytest.mark.xfail +@pytest.mark.parametrize( + "number", (0, 1, 10), ids=lambda number: f"(num={number})" +) +@pytest.mark.parametrize( + "axis", range(-3, 3), ids=lambda axis: f"(axis={axis})" +) +def test_arrays_axis(axis, number): + # In cuNumeric, if axis < -1, raise ValueError + # 'Point cannot exceed 4 dimensions set from LEGATE_MAX_DIM' + # In Numpy, if axis is -2 or -3, also pass + # In cuNumeric, for axis >= -1, if num=0, raise IndexError: + # tuple index out of range + # In Numpy, if num=0, pass and returns empty array x = np.array([[0, 1], [2, 3]]) y = np.array([[4, 5], [6, 7]]) xp = num.array(x) yp = num.array(y) - z = np.linspace(x, y, num=5, axis=0) - w = num.linspace(xp, yp, num=5, axis=0) + z = np.linspace(x, y, num=number, axis=axis) + w = num.linspace(xp, yp, num=number, axis=axis) assert np.array_equal(z, w) - z = np.linspace(x, y, num=5, axis=1) - w = num.linspace(xp, yp, num=5, axis=1) - assert np.array_equal(z, w) - z = np.linspace(x, y, num=5, axis=2) - w = num.linspace(xp, yp, num=5, axis=2) - assert np.array_equal(z, w) +@pytest.mark.parametrize( + "axis", range(-1, 1), ids=lambda axis: f"(axis={axis})" +) +def test_scalar_axis(axis): + start = 2.0 + stop = 3.0 + x = np.linspace(start, stop, num=5, axis=axis) + y = num.linspace(start, stop, num=5, axis=axis) + assert np.array_equal(x, y) + + +@pytest.mark.parametrize( + "dtype", (None, int, float, bool), ids=lambda dtype: f"(dtype={dtype})" +) +def 
test_dtype(dtype): + start = 2.0 + stop = 3.0 + x = np.linspace(start, stop, num=5, dtype=dtype) + y = num.linspace(start, stop, num=5, dtype=dtype) + assert np.array_equal(x, y) + + +class TestLinspaceErrors: + def setup_method(self): + self.start = mk_seq_array(num, (2, 3)) + self.stop = mk_seq_array(num, (2, 3)) + 10 + self.num = 5 + + @pytest.mark.xfail + def test_num_float(self): + # In Numpy, raise TypeError + # In cuNumeric, pass + msg = "cannot be interpreted as an integer" + with pytest.raises(TypeError, match=msg): + num.linspace(0, 10, num=4.5) + + def test_num_negative(self): + msg = "must be non-negative" + with pytest.raises(ValueError, match=msg): + num.linspace(0, 10, num=-1) + + def test_num_none(self): + msg = "not supported between instances of 'NoneType' and 'int'" + with pytest.raises(TypeError, match=msg): + num.linspace(0, 10, num=None) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "axis", (-4, 3), ids=lambda axis: f"(axis={axis})" + ) + def test_axis_out_of_bound_array(self, axis): + # In cuNumeric, if axis < -1, raise ValueError + # 'Point cannot exceed 4 dimensions set from LEGATE_MAX_DIM' + msg = "out of bounds" + # In Numpy, it raises AxisError + with pytest.raises(ValueError, match=msg): + num.linspace(self.start, self.stop, axis=axis) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "axis", (-2, 1), ids=lambda axis: f"(axis={axis})" + ) + def test_axis_out_of_bound_scalar(self, axis): + # In cuNumeric, it pass and the result equals when axis=0 + # In Numpy, it raises AxisError + msg = "out of bounds" + with pytest.raises(ValueError, match=msg): + num.linspace(2.0, 3.0, axis=axis) + + def test_axis_float(self): + axis = 1.0 + msg = "can't multiply sequence by non-int of type 'float'" + with pytest.raises(TypeError, match=msg): + num.linspace(self.start, self.stop, axis=axis) + + @pytest.mark.xfail + def test_axis_none(self): + # In cuNumeric, pass and treat it as axis=0 + # In Numpy, raises TypeError + axis = None + msg = "'NoneType' object is not iterable" + with pytest.raises(TypeError, match=msg): + num.linspace(self.start, self.stop, axis=axis) + + @pytest.mark.parametrize( + "shape", ((0,), (2,), (3, 3)), ids=lambda shape: f"(shape={shape})" + ) + def test_array_bad_shape(self, shape): + msg = "shape mismatch" + stop = mk_seq_array(num, shape) + with pytest.raises(ValueError, match=msg): + num.linspace(self.start, stop) + + def test_start_none(self): + with pytest.raises(TypeError): + num.linspace(None, 10, num=5) + + def test_stop_none(self): + with pytest.raises(TypeError): + num.linspace(0, None, num=5) if __name__ == "__main__": diff --git a/tests/integration/test_swapaxes.py b/tests/integration/test_swapaxes.py index 92dc83e4e..0217019c9 100644 --- a/tests/integration/test_swapaxes.py +++ b/tests/integration/test_swapaxes.py @@ -15,42 +15,194 @@ import numpy as np import pytest +from utils.generators import mk_seq_array import cunumeric as num a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) -def test_small(): - a_num = num.array(a) - b_num = a_num.swapaxes(0, 1) - - assert num.array_equal(a_num.sum(axis=0), b_num.sum(axis=1)) - - -def test_tall(): - a_tall = np.concatenate((a,) * 100) - a_tall_num = num.array(a_tall) - b_tall_num = a_tall_num.swapaxes(0, 1) - - assert num.array_equal(a_tall_num.sum(axis=0), b_tall_num.sum(axis=1)) - - -def test_wide(): - a_wide = np.concatenate((a,) * 100, axis=1) - a_wide_num = num.array(a_wide) - b_wide_num = a_wide_num.swapaxes(0, 1) - - assert num.array_equal(a_wide_num.sum(axis=0), 
b_wide_num.sum(axis=1)) - - -def test_big(): - a_tall = np.concatenate((a,) * 100) - a_big = np.concatenate((a_tall,) * 100, axis=1) - a_big_num = num.array(a_big) - b_big_num = a_big_num.swapaxes(0, 1) - - assert num.array_equal(a_big_num.sum(axis=0), b_big_num.sum(axis=1)) +class TestSwapAxesModule: + def test_small(self): + a_num = num.array(a) + b = np.swapaxes(a, 0, 1) + b_num = num.swapaxes(a_num, 0, 1) + assert np.array_equal(b, b_num) + + def test_tall(self): + a_tall = np.concatenate((a,) * 100) + a_tall_num = num.array(a_tall) + + b_tall = np.swapaxes(a_tall, 0, 1) + b_tall_num = num.swapaxes(a_tall_num, 0, 1) + assert np.array_equal(b_tall, b_tall_num) + + def test_wide(self): + a_wide = np.concatenate((a,) * 100, axis=1) + a_wide_num = num.array(a_wide) + + b_wide = np.swapaxes(a_wide, 0, 1) + b_wide_num = num.swapaxes(a_wide_num, 0, 1) + assert np.array_equal(b_wide, b_wide_num) + + def test_big(self): + a_tall = np.concatenate((a,) * 100) + a_big = np.concatenate((a_tall,) * 100, axis=1) + a_big_num = num.array(a_big) + + b_big = np.swapaxes(a_big, 0, 1) + b_big_num = num.swapaxes(a_big_num, 0, 1) + assert np.array_equal(b_big, b_big_num) + + @pytest.mark.parametrize( + "axes", + ((0, 0), (-3, 1), (0, 2), (-3, -2)), + ids=lambda axes: f"(axes={axes})", + ) + def test_axes(self, axes): + shape = (3, 4, 5) + np_arr = mk_seq_array(np, shape) + num_arr = num.array(np_arr) + axis1, axis2 = axes + + res_np = np.swapaxes(np_arr, axis1, axis2) + res_num = num.swapaxes(num_arr, axis1, axis2) + assert np.array_equal(res_num, res_np) + + def test_emtpy_array(self): + np_arr = np.array([]) + num_arr = num.array([]) + axis1 = 0 + axis2 = 0 + + res_np = np.swapaxes(np_arr, axis1, axis2) + res_num = num.swapaxes(num_arr, axis1, axis2) + assert np.array_equal(res_num, res_np) + + +class TestSwapAxesModuleErrors: + def setup_method(self): + self.a = mk_seq_array(num, (3, 3)) + + def test_a_none(self): + msg = "has no attribute 'swapaxes'" + with pytest.raises(AttributeError, match=msg): + num.swapaxes(None, 0, 0) + + @pytest.mark.parametrize( + "axes", ((3, 0), (0, 3)), ids=lambda axes: f"(axes={axes})" + ) + def test_axes_out_of_bound1(self, axes): + axis1, axis2 = axes + msg = "too large for swapaxes" + with pytest.raises(ValueError, match=msg): + num.swapaxes(self.a, axis1, axis2) + + @pytest.mark.parametrize( + "axes", ((-4, 0), (0, -4)), ids=lambda axes: f"(axes={axes})" + ) + def test_axes_out_of_bound2(self, axes): + axis1, axis2 = axes + with pytest.raises(IndexError): + num.swapaxes(self.a, axis1, axis2) + + @pytest.mark.parametrize( + "axes", ((None, 0), (0, None)), ids=lambda axes: f"(axes={axes})" + ) + def test_axes_none(self, axes): + axis1, axis2 = axes + msg = "not supported between instances of 'NoneType' and 'int'" + with pytest.raises(TypeError, match=msg): + num.swapaxes(self.a, axis1, axis2) + + +class TestSwapAxesArrayMethod: + def test_small(self): + a_num = num.array(a) + b = a.swapaxes(0, 1) + b_num = a_num.swapaxes(0, 1) + assert np.array_equal(b, b_num) + + def test_tall(self): + a_tall = np.concatenate((a,) * 100) + a_tall_num = num.array(a_tall) + + b_tall = a_tall.swapaxes(0, 1) + b_tall_num = a_tall_num.swapaxes(0, 1) + assert np.array_equal(b_tall, b_tall_num) + + def test_wide(self): + a_wide = np.concatenate((a,) * 100, axis=1) + a_wide_num = num.array(a_wide) + + b_wide = a_wide.swapaxes(0, 1) + b_wide_num = a_wide_num.swapaxes(0, 1) + assert np.array_equal(b_wide, b_wide_num) + + def test_big(self): + a_tall = np.concatenate((a,) * 100) + a_big = 
np.concatenate((a_tall,) * 100, axis=1) + a_big_num = num.array(a_big) + + b_big = a_big.swapaxes(0, 1) + b_big_num = a_big_num.swapaxes(0, 1) + assert np.array_equal(b_big, b_big_num) + + @pytest.mark.parametrize( + "axes", + ((0, 0), (-3, 1), (0, 2), (-3, -2)), + ids=lambda axes: f"(axes={axes})", + ) + def test_axes(self, axes): + shape = (3, 4, 5) + np_arr = mk_seq_array(np, shape) + num_arr = num.array(np_arr) + axis1, axis2 = axes + + res_np_arr = np_arr.swapaxes(axis1, axis2) + res_num_arr = num_arr.swapaxes(axis1, axis2) + assert np.array_equal(res_num_arr, res_np_arr) + + def test_emtpy_array(self): + np_arr = np.array([]) + num_arr = num.array([]) + axis1 = 0 + axis2 = 0 + + res_np_arr = np_arr.swapaxes(axis1, axis2) + res_num_arr = num_arr.swapaxes(axis1, axis2) + assert np.array_equal(res_num_arr, res_np_arr) + + +class TestSwapAxesArrayMethodErrors: + def setup_method(self): + self.a = mk_seq_array(num, (3, 3)) + + @pytest.mark.parametrize( + "axes", ((3, 0), (0, 3)), ids=lambda axes: f"(axes={axes})" + ) + def test_axes_out_of_bound1(self, axes): + axis1, axis2 = axes + msg = "too large for swapaxes" + with pytest.raises(ValueError, match=msg): + self.a.swapaxes(axis1, axis2) + + @pytest.mark.parametrize( + "axes", ((-4, 0), (0, -4)), ids=lambda axes: f"(axes={axes})" + ) + def test_axes_out_of_bound2(self, axes): + axis1, axis2 = axes + with pytest.raises(IndexError): + self.a.swapaxes(axis1, axis2) + + @pytest.mark.parametrize( + "axes", ((None, 0), (0, None)), ids=lambda axes: f"(axes={axes})" + ) + def test_axes_none(self, axes): + axis1, axis2 = axes + msg = "not supported between instances of 'NoneType' and 'int'" + with pytest.raises(TypeError, match=msg): + self.a.swapaxes(axis1, axis2) if __name__ == "__main__": From dfbb053deed140e81f0961cd55d3ba6fe5e3b076 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 14 Nov 2022 14:41:23 -0800 Subject: [PATCH 37/89] Handle complex dtypes in __legate_data_interface__ (#690) * Handle complex dtypes in __legate_data_interface__ * Fix a mypy error --- cunumeric/array.py | 18 +++++++++++++++++- tests/integration/test_data_interface.py | 8 ++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 6dc818112..2c7b7f770 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -31,6 +31,7 @@ cast, ) +import legate.core.types as ty import numpy as np import pyarrow # type: ignore from legate.core import Array @@ -187,6 +188,21 @@ def _convert_all_to_numpy(obj: Any) -> Any: return obj +# FIXME: we can't give an accurate return type as mypy thinks +# the pyarrow import can be ignored, and can't override the check +# either, because no-any-unimported needs Python >= 3.10. 
We can +# fix it once we bump up the Python version +def convert_numpy_dtype_to_pyarrow(dtype: np.dtype[Any]) -> Any: + if dtype.kind != "c": + return pyarrow.from_numpy_dtype(dtype) + elif dtype == np.complex64: + return ty.complex64 + elif dtype == np.complex128: + return ty.complex128 + else: + raise ValueError(f"Unsupported NumPy dtype: {dtype}") + + @clone_np_ndarray class ndarray: def __init__( @@ -269,7 +285,7 @@ def __legate_data_interface__(self) -> dict[str, Any]: # All of our thunks implement the Legate Store interface # so we just need to convert our type and stick it in # a Legate Array - arrow_type = pyarrow.from_numpy_dtype(self.dtype) + arrow_type = convert_numpy_dtype_to_pyarrow(self.dtype) # If the thunk is an eager array, we need to convert it to a # deferred array so we can extract a legate store deferred_thunk = runtime.to_deferred_array(self._thunk) diff --git a/tests/integration/test_data_interface.py b/tests/integration/test_data_interface.py index 1421437ea..6c617db43 100644 --- a/tests/integration/test_data_interface.py +++ b/tests/integration/test_data_interface.py @@ -16,6 +16,9 @@ import pytest import cunumeric as num +from cunumeric.runtime import _supported_dtypes + +DTYPES = _supported_dtypes.keys() # A simple wrapper with a legate data interface implementation for testing @@ -28,8 +31,9 @@ def __legate_data_interface__(self): return self.wrapped -def test_roundtrip(): - arr1 = num.array([1, 2, 3, 4], dtype=num.float64) +@pytest.mark.parametrize("dtype", DTYPES) +def test_roundtrip(dtype): + arr1 = num.array([1, 2, 3, 4], dtype=dtype) data = Wrapper(arr1.__legate_data_interface__) arr2 = num.asarray(data) assert num.array_equal(arr1, arr2) From 4bd9762e1e61e2e29212f52b0861c0fcab7112f4 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 14 Nov 2022 23:41:07 -0800 Subject: [PATCH 38/89] Log operator names of unary and binary operations using annotations (#679) --- cunumeric/deferred.py | 88 +++++++++++++++++++++++-------------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 8e6d8cacb..b04b3088c 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -35,7 +35,7 @@ import legate.core.types as ty import numpy as np -from legate.core import Future, ReductionOp, Store +from legate.core import Annotation, Future, ReductionOp, Store from numpy.core.numeric import normalize_axis_tuple # type: ignore from typing_extensions import ParamSpec @@ -2961,20 +2961,21 @@ def unary_op( lhs = self.base rhs = src._broadcast(lhs.shape) - task = self.context.create_auto_task(CuNumericOpCode.UNARY_OP) - task.add_output(lhs) - task.add_input(rhs) - task.add_scalar_arg(op.value, ty.int32) - self.add_arguments(task, args) + with Annotation(self.context, {"OpCode": op.name}): + task = self.context.create_auto_task(CuNumericOpCode.UNARY_OP) + task.add_output(lhs) + task.add_input(rhs) + task.add_scalar_arg(op.value, ty.int32) + self.add_arguments(task, args) - task.add_alignment(lhs, rhs) + task.add_alignment(lhs, rhs) - if multiout is not None: - for out in multiout: - task.add_output(out.base) - task.add_alignment(out.base, rhs) + if multiout is not None: + for out in multiout: + task.add_output(out.base) + task.add_alignment(out.base, rhs) - task.execute() + task.execute() # Perform a unary reduction operation from one set of dimensions down to # fewer @@ -3010,10 +3011,6 @@ def unary_reduction( 0 if keepdims else lhs_array.ndim ) - task = self.context.create_auto_task( - 
CuNumericOpCode.SCALAR_UNARY_RED - ) - if initial is not None: assert not argred fill_value = initial @@ -3026,14 +3023,21 @@ def unary_reduction( while lhs.ndim > 1: lhs = lhs.project(0, 0) - task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op]) - task.add_input(rhs_array.base) - task.add_scalar_arg(op, ty.int32) - task.add_scalar_arg(rhs_array.shape, (ty.int64,)) + with Annotation( + self.context, {"OpCode": op.name, "ArgRed?": str(argred)} + ): + task = self.context.create_auto_task( + CuNumericOpCode.SCALAR_UNARY_RED + ) - self.add_arguments(task, args) + task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op]) + task.add_input(rhs_array.base) + task.add_scalar_arg(op, ty.int32) + task.add_scalar_arg(rhs_array.shape, (ty.int64,)) - task.execute() + self.add_arguments(task, args) + + task.execute() else: # Before we perform region reduction, make sure to have the lhs @@ -3062,18 +3066,21 @@ def unary_reduction( "Need support for reducing multiple dimensions" ) - task = self.context.create_auto_task(CuNumericOpCode.UNARY_RED) + with Annotation( + self.context, {"OpCode": op.name, "ArgRed?": str(argred)} + ): + task = self.context.create_auto_task(CuNumericOpCode.UNARY_RED) - task.add_input(rhs_array.base) - task.add_reduction(result, _UNARY_RED_TO_REDUCTION_OPS[op]) - task.add_scalar_arg(axis, ty.int32) - task.add_scalar_arg(op, ty.int32) + task.add_input(rhs_array.base) + task.add_reduction(result, _UNARY_RED_TO_REDUCTION_OPS[op]) + task.add_scalar_arg(axis, ty.int32) + task.add_scalar_arg(op, ty.int32) - self.add_arguments(task, args) + self.add_arguments(task, args) - task.add_alignment(result, rhs_array.base) + task.add_alignment(result, rhs_array.base) - task.execute() + task.execute() if argred: self.unary_op( @@ -3107,18 +3114,19 @@ def binary_op( rhs1 = src1._broadcast(lhs.shape) rhs2 = src2._broadcast(lhs.shape) - # Populate the Legate launcher - task = self.context.create_auto_task(CuNumericOpCode.BINARY_OP) - task.add_output(lhs) - task.add_input(rhs1) - task.add_input(rhs2) - task.add_scalar_arg(op_code.value, ty.int32) - self.add_arguments(task, args) + with Annotation(self.context, {"OpCode": op_code.name}): + # Populate the Legate launcher + task = self.context.create_auto_task(CuNumericOpCode.BINARY_OP) + task.add_output(lhs) + task.add_input(rhs1) + task.add_input(rhs2) + task.add_scalar_arg(op_code.value, ty.int32) + self.add_arguments(task, args) - task.add_alignment(lhs, rhs1) - task.add_alignment(lhs, rhs2) + task.add_alignment(lhs, rhs1) + task.add_alignment(lhs, rhs2) - task.execute() + task.execute() @auto_convert("src1", "src2") def binary_reduction( From 4293e6b562c32c9208d1a006a0d4d7ec27ba3f43 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 15 Nov 2022 08:42:26 -0800 Subject: [PATCH 39/89] mypy pre-commit hook for local but not CI (#695) --- .pre-commit-config.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bdc37baff..798efa23d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,11 @@ repos: + - repo: https://github.com/pre-commit/mirrors-mypy + rev: 'v0.982' + hooks: + - id: mypy + language: system + pass_filenames: false + args: ['cunumeric'] - repo: https://github.com/PyCQA/isort rev: 5.10.1 hooks: @@ -25,5 +32,9 @@ repos: entry: python scripts/hooks/enforce_pytest_main.py language: python pass_filenames: false + +ci: + skip: [mypy] + default_language_version: python: python3 From ba2f73f7842924cb0588dbb9a71bb1e4f4201631 Mon Sep 17 00:00:00 
2001 From: Bryan Van de Ven Date: Tue, 15 Nov 2022 15:00:49 -0800 Subject: [PATCH 40/89] Add CI job to build docs (#697) --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 499dce58a..be0db706c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -80,6 +80,7 @@ jobs: - {name: 2 OpenMPs test, options: --use openmp --omps 2 --ompthreads 2 --debug, log: omps} - {name: Eager execution test, options: --use eager --debug, log: eager} - {name: mypy, options: mypy, log: mypy} + - {name: documentation, options: docs, log: docs} name: ${{ matrix.name }} steps: - name: Dump GitHub context From 7a0e7f784175e8e73c5845759d7ecd84a4b908b4 Mon Sep 17 00:00:00 2001 From: xialu00 <110973296+xialu00@users.noreply.github.com> Date: Wed, 16 Nov 2022 08:58:01 +0800 Subject: [PATCH 41/89] add negative test case for test_convolve.py and test_astype.py (#694) * add negative test case for test_convolve.py * add test case for test_astype.py * add test case for test_astype.py * fix bug --- tests/integration/test_astype.py | 80 +++++++++++++++++++++++++++++- tests/integration/test_convolve.py | 23 +++++++++ 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_astype.py b/tests/integration/test_astype.py index 534496950..5a54a7789 100644 --- a/tests/integration/test_astype.py +++ b/tests/integration/test_astype.py @@ -21,12 +21,51 @@ TEST_VECTOR = [0, 0, 1, 2, 3, 0, 1, 2, 3] ALL_BUT_COMPLEX = ["?", "b", "h", "i", "l", "B", "H", "I", "L", "e", "f", "d"] ALL_TYPES = ALL_BUT_COMPLEX + ["F", "D"] +ORDER = ("C", "F", "A", "K") +CASTING = ("no", "equiv", "safe", "same_kind") +UNSIGNED_TYPE = ["b", "h", "i", "l"] +SIGNED_TYPE = ["B", "H", "I", "L"] +ALL_TYPES_BUT_BOOL = [ + "b", + "h", + "i", + "l", + "B", + "H", + "I", + "L", + "e", + "f", + "d", + "F", + "D", +] def to_dtype(s): return str(np.dtype(s)) +def test_none(): + arr = None + in_np = num.array(arr) + msg = r"NoneType" + with pytest.raises(TypeError, match=msg): + in_np.astype("b") + + +@pytest.mark.parametrize("src_dtype", ALL_TYPES, ids=to_dtype) +def test_empty(src_dtype): + arr = [] + in_np = np.array(arr) + out_np = in_np.astype(src_dtype) + + in_num = np.array(arr) + out_num = in_num.astype(src_dtype) + + assert np.array_equal(out_np, out_num) + + @pytest.mark.parametrize("src_dtype", ALL_BUT_COMPLEX, ids=to_dtype) @pytest.mark.parametrize("dst_dtype", ALL_TYPES, ids=to_dtype) def test_basic(src_dtype, dst_dtype): @@ -39,8 +78,30 @@ def test_basic(src_dtype, dst_dtype): assert np.array_equal(out_num, out_np) -@pytest.mark.parametrize("src_dtype", ("F", "D"), ids=to_dtype) +@pytest.mark.parametrize("src_dtype", ALL_BUT_COMPLEX, ids=to_dtype) @pytest.mark.parametrize("dst_dtype", ALL_TYPES, ids=to_dtype) +@pytest.mark.parametrize("order", ORDER, ids=str) +def test_order(src_dtype, dst_dtype, order): + in_np = np.array(TEST_VECTOR, dtype=src_dtype) + in_num = num.array(in_np) + + out_np = in_np.astype(dst_dtype, order=order) + out_num = in_num.astype(dst_dtype, order=order) + + assert np.array_equal(out_num, out_np) + + +@pytest.mark.parametrize("src_dtype", UNSIGNED_TYPE, ids=to_dtype) +@pytest.mark.parametrize("dst_dtype", SIGNED_TYPE, ids=to_dtype) +@pytest.mark.parametrize("cast", CASTING, ids=str) +def test_casting_negative(src_dtype, dst_dtype, cast): + in_num = num.array(TEST_VECTOR, dtype=src_dtype) + with pytest.raises(TypeError): + in_num.astype(dst_dtype, casting=cast) + + +@pytest.mark.parametrize("src_dtype", ("F", 
"D"), ids=to_dtype) +@pytest.mark.parametrize("dst_dtype", ALL_TYPES_BUT_BOOL, ids=to_dtype) def test_complex(src_dtype, dst_dtype): complex_input = [ complex(v1, v2) for v1, v2 in zip(TEST_VECTOR[:-1], TEST_VECTOR[1:]) @@ -54,6 +115,23 @@ def test_complex(src_dtype, dst_dtype): assert np.array_equal(out_num, out_np) +@pytest.mark.xfail +@pytest.mark.parametrize("src_dtype", ("F", "D"), ids=to_dtype) +def test_complex_negative(src_dtype): + complex_input = [ + complex(v1, v2) for v1, v2 in zip(TEST_VECTOR[:-1], TEST_VECTOR[1:]) + ] + in_np = np.array(complex_input, dtype=src_dtype) + in_num = num.array(in_np) + + out_np = in_np.astype(to_dtype("?")) + out_num = in_num.astype(to_dtype("?")) + + # Numpy and cuNumeric have different performance. + # For complex data 0.+1.j, Numpy set as True, cuNumeric set as False. + assert np.array_equal(out_num, out_np) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_convolve.py b/tests/integration/test_convolve.py index a0beada23..08bbb3af4 100644 --- a/tests/integration/test_convolve.py +++ b/tests/integration/test_convolve.py @@ -25,6 +25,29 @@ FILTER_SHAPES = [(5,), (3, 5), (3, 5, 3)] +@pytest.mark.xfail +def test_none(): + # Numpy raises: + # TypeError: unsupported operand type(s) for *: 'NoneType' and 'NoneType' + with pytest.raises(AttributeError): + num.convolve(None, None, mode="same") + + +def test_empty(): + msg = r"empty" + with pytest.raises(ValueError, match=msg): + num.convolve([], [], mode="same") + + +def test_diff_dims(): + shape1 = (5,) * 3 + shape2 = (5,) * 2 + arr1 = num.random.random(shape1) + arr2 = num.random.random(shape2) + with pytest.raises(RuntimeError): + num.convolve(arr1, arr2, mode="same") + + def check_convolve(a, v): anp = a.__array__() vnp = v.__array__() From b4656bca1a32535b3ec24e8d9b3460d4229753dd Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Tue, 15 Nov 2022 21:58:29 -0800 Subject: [PATCH 42/89] Print build start and end times (#687) Co-authored-by: Marcin Zalewski --- conda/conda-build/build.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh index b1d79b52b..d0df68008 100644 --- a/conda/conda-build/build.sh +++ b/conda/conda-build/build.sh @@ -1,7 +1,5 @@ #!/bin/bash - -set -x; - + # Rewrite conda's -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY to # -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH CMAKE_ARGS="$(echo "$CMAKE_ARGS" | sed -r "s@_INCLUDE=ONLY@_INCLUDE=BOTH@g")" @@ -32,6 +30,8 @@ export CUDAFLAGS="-UNDEBUG" export CMAKE_GENERATOR=Ninja export CUDAHOSTCXX=${CXX} +echo "Build starting on $(date)" + cmake -S . -B build ${CMAKE_ARGS} cmake --build build -j$CPU_COUNT cmake --install build @@ -51,6 +51,8 @@ $PYTHON -m pip install \ --disable-pip-version-check \ . 
-vv +echo "Build ending on $(date)" + # Legion leaves an egg-info file which will confuse conda trying to pick up the information # Remove it so the legate-core is the only egg-info file added rm -rf $SP_DIR/legion*egg-info From d1e95bc27c35292e0fa51e523697eaab0714f338 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 16 Nov 2022 12:11:00 -0700 Subject: [PATCH 43/89] Fixing logic for putmask with axis = None * put_along_axis: fixing logic for the case when axis=None * putmask: removing test that produces different error message from numpy --- cunumeric/module.py | 10 +++++- tests/integration/test_put_along_axis.py | 41 ------------------------ 2 files changed, 9 insertions(+), 42 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index 0a4e97a5a..d8a8f2a2e 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -3211,7 +3211,7 @@ def put_along_axis( Parameters ---------- - arr : ndarray (Ni..., M, Nk...) + a : ndarray (Ni..., M, Nk...) Destination array. indices : ndarray (Ni..., J, Nk...) Indices to change along each 1d slice of `arr`. This must match the @@ -3239,6 +3239,10 @@ def put_along_axis( Multiple GPUs, Multiple CPUs """ + + if a.size == 0: + return + if not np.issubdtype(indices.dtype, np.integer): raise TypeError("`indices` must be an integer array") @@ -3249,6 +3253,10 @@ def put_along_axis( if a.ndim > 1: # TODO call a=a.flat when flat is implemented raise ValueError("a.ndim>1 case is not supported when axis=None") + if (indices.size == 0) or (values.size == 0): + return + if values.shape != indices.shape: + values = values._wrap(indices.size) else: computed_axis = normalize_axis_index(axis, a.ndim) diff --git a/tests/integration/test_put_along_axis.py b/tests/integration/test_put_along_axis.py index a289ffc2a..8b46ff0e3 100644 --- a/tests/integration/test_put_along_axis.py +++ b/tests/integration/test_put_along_axis.py @@ -132,18 +132,6 @@ def test_indices_bad_type(self, dtype): with pytest.raises(TypeError, match=msg): num.put_along_axis(self.a, ai, 100, axis=0) - @pytest.mark.xfail - @pytest.mark.parametrize( - "shape", ((3, 2), (3, 0)), ids=lambda shape: f"(shape={shape})" - ) - def test_indices_bad_shape(self, shape): - # In Numpy, it raises IndexError. - # In cuNumeric, it raises ValueError.
- ai = num.ones(shape, dtype=int) - msg = "shape mismatch: indexing arrays could not be broadcast" - with pytest.raises(IndexError, match=msg): - num.put_along_axis(self.a, ai, 100, axis=0) - @pytest.mark.parametrize( "shape", ((1,), (3, 3, 1)), ids=lambda shape: f"(shape={shape})" ) @@ -210,19 +198,6 @@ def test_values_bad_shape2(self): "shape", ((0,), (5,), (4, 5)), ids=lambda shape: f"(shape={shape})" ) def test_values_axis_none(self, shape): - a = mk_seq_array(num, (10,)) - ai = mk_seq_array(num, (7,)) - values = mk_seq_array(num, shape) - with pytest.raises(ValueError): - num.put_along_axis(a, ai, values, None) - - @pytest.mark.xfail - @pytest.mark.parametrize( - "shape", ((0,), (5,), (4, 5)), ids=lambda shape: f"(shape={shape})" - ) - def test_values_axis_none_DIVERGENC(self, shape): - # In Numpy, all 3 cases pass - # In cuNumeric, all 3 cases raise ValueError "Shape did not match" np_arr = mk_seq_array(np, (10,)) num_arr = mk_seq_array(num, (10,)) @@ -236,22 +211,6 @@ def test_values_axis_none_DIVERGENC(self, shape): num.put_along_axis(num_arr, indices_num, values_num, None) assert np.array_equal(np_arr, num_arr) - def test_a_none(self): - ai = num.array([1, 1, 1]) - msg = "object has no attribute 'ndim'" - with pytest.raises(AttributeError, match=msg): - num.put_along_axis(None, ai, 100, axis=0) - - def test_indice_none(self): - msg = "'NoneType' object has no attribute 'dtype'" - with pytest.raises(AttributeError, match=msg): - num.put_along_axis(self.a, None, 100, axis=0) - - def test_values_none(self): - msg = "'NoneType' object has no attribute 'dtype'" - with pytest.raises(AttributeError, match=msg): - num.put_along_axis(self.a, self.ai, None, axis=0) - if __name__ == "__main__": import sys From db2a4f80994a7a2e311798088a752dcb5ff9b768 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Wed, 16 Nov 2022 12:27:04 -0700 Subject: [PATCH 44/89] Implementing Putmask (#667) implementing putmask --- cunumeric/config.py | 2 + cunumeric/deferred.py | 147 +++++++++++------ cunumeric/eager.py | 7 + cunumeric/module.py | 44 +++++ cunumeric/thunk.py | 4 + cunumeric_cpp.cmake | 3 + docs/cunumeric/source/api/indexing.rst | 1 + src/cunumeric/cunumeric_c.h | 1 + .../indexing/parallel_loop.cuh | 51 ++++++ .../execution_policy/indexing/parallel_loop.h | 37 +++++ .../indexing/parallel_loop_omp.h | 38 +++++ src/cunumeric/index/putmask.cc | 32 ++++ src/cunumeric/index/putmask.cu | 28 ++++ src/cunumeric/index/putmask.h | 43 +++++ src/cunumeric/index/putmask_omp.cc | 28 ++++ src/cunumeric/index/putmask_template.inl | 116 +++++++++++++ tests/integration/test_putmask.py | 154 ++++++++++++++++++ tests/unit/cunumeric/test_config.py | 1 + 18 files changed, 684 insertions(+), 53 deletions(-) create mode 100644 src/cunumeric/execution_policy/indexing/parallel_loop.cuh create mode 100644 src/cunumeric/execution_policy/indexing/parallel_loop.h create mode 100644 src/cunumeric/execution_policy/indexing/parallel_loop_omp.h create mode 100644 src/cunumeric/index/putmask.cc create mode 100644 src/cunumeric/index/putmask.cu create mode 100644 src/cunumeric/index/putmask.h create mode 100644 src/cunumeric/index/putmask_omp.cc create mode 100644 src/cunumeric/index/putmask_template.inl create mode 100644 tests/integration/test_putmask.py diff --git a/cunumeric/config.py b/cunumeric/config.py index c45ae6313..9195022d6 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -167,6 +167,7 @@ class _CunumericSharedLib: CUNUMERIC_NONZERO: int CUNUMERIC_PACKBITS: int CUNUMERIC_POTRF: int + CUNUMERIC_PUTMASK: int 
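+    # (CUNUMERIC_PUTMASK mirrors the enum entry this patch adds to
+    # src/cunumeric/cunumeric_c.h; see the file list above)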
CUNUMERIC_RAND: int CUNUMERIC_READ: int CUNUMERIC_RED_ALL: int @@ -358,6 +359,7 @@ class CuNumericOpCode(IntEnum): NONZERO = _cunumeric.CUNUMERIC_NONZERO PACKBITS = _cunumeric.CUNUMERIC_PACKBITS POTRF = _cunumeric.CUNUMERIC_POTRF + PUTMASK = _cunumeric.CUNUMERIC_PUTMASK RAND = _cunumeric.CUNUMERIC_RAND READ = _cunumeric.CUNUMERIC_READ REPEAT = _cunumeric.CUNUMERIC_REPEAT diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index b04b3088c..5c9330ee4 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -533,7 +533,10 @@ def _slice_store(k: slice, store: Store, dim: int) -> tuple[slice, Store]: return k, store def _create_indexing_array( - self, key: Any, is_set: bool = False + self, + key: Any, + is_set: bool = False, + set_value: Optional[Any] = None, ) -> tuple[bool, Any, Any, Any]: store = self.base rhs = self @@ -583,61 +586,79 @@ def _create_indexing_array( for i in range(key_store.ndim, rhs.ndim): key_store = key_store.promote(i, rhs.shape[i]) - out_dtype = rhs.dtype - # in the case this operation is called for the set_item, we - # return Point type field that is later used for - # indirect copy operation - if is_set: - N = rhs.ndim - out_dtype = rhs.runtime.get_point_type(N) - - # TODO : current implementation of the ND output regions - # requires out.ndim == rhs.ndim. This will be fixed in the - # future - out = rhs.runtime.create_unbound_thunk(out_dtype, ndim=rhs.ndim) - key_dims = key.ndim # dimension of the original key + # has_set_value && set_value.size==1 corresponds to the case + # when a[bool_indices]=scalar + # then we can call "putmask" to modify input array + # and avoid calling Copy + has_set_value = set_value is not None and set_value.size == 1 + if has_set_value: + mask = DeferredArray( + self.runtime, + base=key_store, + dtype=self.dtype, + ) + rhs.putmask(mask, set_value) + return False, rhs, rhs, self + else: + out_dtype = rhs.dtype + # in the case this operation is called for the set_item, we + # return Point type field that is later used for + # indirect copy operation + if is_set: + N = rhs.ndim + out_dtype = rhs.runtime.get_point_type(N) + + # TODO : current implementation of the ND output regions + # requires out.ndim == rhs.ndim. This will be fixed in the + # future + out = rhs.runtime.create_unbound_thunk( + out_dtype, ndim=rhs.ndim + ) + key_dims = key.ndim # dimension of the original key - task = rhs.context.create_auto_task( - CuNumericOpCode.ADVANCED_INDEXING - ) - task.add_output(out.base) - task.add_input(rhs.base) - task.add_input(key_store) - task.add_scalar_arg(is_set, bool) - task.add_scalar_arg(key_dims, ty.int64) - task.add_alignment(rhs.base, key_store) - task.add_broadcast( - rhs.base, axes=tuple(range(1, len(rhs.base.shape))) - ) - task.execute() + task = rhs.context.create_auto_task( + CuNumericOpCode.ADVANCED_INDEXING + ) + task.add_output(out.base) + task.add_input(rhs.base) + task.add_input(key_store) + task.add_scalar_arg(is_set, bool) + task.add_scalar_arg(key_dims, ty.int64) + task.add_alignment(rhs.base, key_store) + task.add_broadcast( + rhs.base, axes=tuple(range(1, len(rhs.base.shape))) + ) + task.execute() - # TODO : current implementation of the ND output regions - # requires out.ndim == rhs.ndim. 
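# ---------------------------------------------------------------------------
# A hedged aside on the has_set_value fast path above: when a boolean-mask
# assignment carries a single (size-1) value, the deferred array now calls
# putmask directly instead of building an indirect copy. A minimal sketch of
# the user-visible behavior, with made-up values:
import cunumeric as num

a = num.arange(6)
mask = a % 2 == 0               # boolean key
a[mask] = -1                    # scalar RHS -> lowered to the putmask task
# expected: a == [-1, 1, -1, 3, -1, 5]
# ---------------------------------------------------------------------------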
- # The logic below will be removed in the future - out_dim = rhs.ndim - key_dims + 1 - - if out_dim != rhs.ndim: - out_tmp = out.base - - if out.size == 0: - out_shape = tuple(out.shape[i] for i in range(0, out_dim)) - out = cast( - DeferredArray, - self.runtime.create_empty_thunk( - out_shape, - out_dtype, - inputs=[out], - ), - ) - if not is_set: - out.fill(np.array(0, dtype=out_dtype)) - else: - for dim in range(rhs.ndim - out_dim): - out_tmp = out_tmp.project(rhs.ndim - dim - 1, 0) + # TODO : current implementation of the ND output regions + # requires out.ndim == rhs.ndim. + # The logic below will be removed in the future + out_dim = rhs.ndim - key_dims + 1 + + if out_dim != rhs.ndim: + out_tmp = out.base + + if out.size == 0: + out_shape = tuple( + out.shape[i] for i in range(0, out_dim) + ) + out = cast( + DeferredArray, + self.runtime.create_empty_thunk( + out_shape, + out_dtype, + inputs=[out], + ), + ) + if not is_set: + out.fill(np.array(0, dtype=out_dtype)) + else: + for dim in range(rhs.ndim - out_dim): + out_tmp = out_tmp.project(rhs.ndim - dim - 1, 0) - out = out._copy_store(out_tmp) + out = out._copy_store(out_tmp) - return False, rhs, out, self + return is_set, rhs, out, self if isinstance(key, NumPyThunk): key = (key,) @@ -890,7 +911,10 @@ def set_item(self, key: Any, rhs: Any) -> None: lhs, index_array, self, - ) = self._create_indexing_array(key, True) + ) = self._create_indexing_array(key, True, rhs) + + if not copy_needed: + return if rhs.shape != index_array.shape: rhs_tmp = rhs._broadcast(index_array.base.shape) @@ -917,6 +941,8 @@ def set_item(self, key: Any, rhs: Any) -> None: index_array = index_array._convert_future_to_regionfield() if lhs.base.kind == Future: lhs = lhs._convert_future_to_regionfield() + if lhs.base.transformed: + lhs = lhs._copy_store(lhs.base) if index_array.size != 0: copy = self.context.create_copy() @@ -1740,6 +1766,21 @@ def put(self, indices: Any, values: Any, check_bounds: bool) -> None: if self_tmp is not self: self.copy(self_tmp, deep=True) + @auto_convert("mask", "values") + def putmask(self, mask: Any, values: Any) -> None: + if values.shape != self.shape: + values_new = values._broadcast(self.shape) + else: + values_new = values.base + task = self.context.create_task(CuNumericOpCode.PUTMASK) + task.add_input(self.base) + task.add_input(mask.base) + task.add_input(values_new) + task.add_output(self.base) + task.add_alignment(self.base, mask.base) + task.add_alignment(self.base, values_new) + task.execute() + # Create an identity array with the ones offset from the diagonal by k def eye(self, k: int) -> None: assert self.ndim == 2 # Only 2-D arrays should be here diff --git a/cunumeric/eager.py b/cunumeric/eager.py index cef2b7b49..530b805c5 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -627,6 +627,13 @@ def put(self, indices: Any, values: Any, check_bounds: bool) -> None: else: np.put(self.array, indices.array, values.array) + def putmask(self, mask: Any, values: Any) -> None: + self.check_eager_args(mask, values) + if self.deferred is not None: + self.deferred.putmask(mask, values) + else: + np.putmask(self.array, mask.array, values.array) + def eye(self, k: int) -> None: if self.deferred is not None: self.deferred.eye(k) diff --git a/cunumeric/module.py b/cunumeric/module.py index d8a8f2a2e..390f8d755 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -3514,6 +3514,50 @@ def put( a.put(indices=indices, values=values, mode=mode) +@add_boilerplate("a", "mask", "values") +def putmask(a: ndarray, mask: 
ndarray, values: ndarray) -> None: + """ + putmask(a, mask, values) + Changes elements of an array based on conditional and input values. + Sets ``a.flat[n] = values[n]`` for each n where ``mask.flat[n]==True``. + If `values` is not the same size as `a` and `mask` then it will repeat. + This gives behavior different from ``a[mask] = values``. + + Parameters + ---------- + a : ndarray + Target array. + mask : array_like + Boolean mask array. It has to be the same shape as `a`. + values : array_like + Values to put into `a` where `mask` is True. If `values` is smaller + than `a` it will be repeated. + + See Also + -------- + numpy.putmask + + Availability + ------------ + Multiple GPUs, Multiple CPUs + """ + if not a.shape == mask.shape: + raise ValueError("mask and data must be the same size") + + mask = mask._warn_and_convert(np.dtype(bool)) + + if a.dtype != values.dtype: + values = values._warn_and_convert(a.dtype) + + try: + np.broadcast_shapes(values.shape, a.shape) + except ValueError: + values = values._wrap(a.size) + values = values.reshape(a.shape) + + a._thunk.putmask(mask._thunk, values._thunk) + + @add_boilerplate("a", "val") def fill_diagonal(a: ndarray, val: ndarray, wrap: bool = False) -> None: """ diff --git a/cunumeric/thunk.py b/cunumeric/thunk.py index 7ade503d0..e80941d4e 100644 --- a/cunumeric/thunk.py +++ b/cunumeric/thunk.py @@ -201,6 +201,10 @@ def _diag_helper( def put(self, indices: Any, values: Any, check_bounds: bool) -> None: ... + @abstractmethod + def putmask(self, mask: Any, values: Any) -> None: + ... + @abstractmethod def eye(self, k: int) -> None: ... diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index a47038a3b..9ab2741b3 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -131,6 +131,7 @@ list(APPEND cunumeric_SOURCES src/cunumeric/index/repeat.cc src/cunumeric/index/wrap.cc src/cunumeric/index/zip.cc + src/cunumeric/index/putmask.cc src/cunumeric/item/read.cc src/cunumeric/item/write.cc src/cunumeric/matrix/contract.cc @@ -180,6 +181,7 @@ if(Legion_USE_OpenMP) src/cunumeric/nullary/window_omp.cc src/cunumeric/index/advanced_indexing_omp.cc src/cunumeric/index/choose_omp.cc + src/cunumeric/index/putmask_omp.cc src/cunumeric/index/repeat_omp.cc src/cunumeric/index/wrap_omp.cc src/cunumeric/index/zip_omp.cc @@ -229,6 +231,7 @@ if(Legion_USE_CUDA) src/cunumeric/index/repeat.cu src/cunumeric/index/wrap.cu src/cunumeric/index/zip.cu + src/cunumeric/index/putmask.cu src/cunumeric/item/read.cu src/cunumeric/item/write.cu src/cunumeric/matrix/contract.cu diff --git a/docs/cunumeric/source/api/indexing.rst b/docs/cunumeric/source/api/indexing.rst index 1ace111d4..ab02bbcc4 100644 --- a/docs/cunumeric/source/api/indexing.rst +++ b/docs/cunumeric/source/api/indexing.rst @@ -44,5 +44,6 @@ Inserting data into arrays fill_diagonal put + putmask put_along_axis place diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h index 60d6e108d..462214782 100644 --- a/src/cunumeric/cunumeric_c.h +++ b/src/cunumeric/cunumeric_c.h @@ -52,6 +52,7 @@ enum CuNumericOpCode { CUNUMERIC_NONZERO, CUNUMERIC_PACKBITS, CUNUMERIC_POTRF, + CUNUMERIC_PUTMASK, CUNUMERIC_RAND, CUNUMERIC_READ, CUNUMERIC_REPEAT, diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop.cuh b/src/cunumeric/execution_policy/indexing/parallel_loop.cuh new file mode 100644 index 000000000..81788908f --- /dev/null +++ b/src/cunumeric/execution_policy/indexing/parallel_loop.cuh @@ -0,0 +1,51 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, 
Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" +#include "cunumeric/execution_policy/indexing/parallel_loop.h" +#include "cunumeric/cuda_help.h" + +namespace cunumeric { + +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + parallel_loop_kernel(const size_t volume, KERNEL kernel, Tag tag) +{ + const size_t idx = global_tid_1d(); + if (idx >= volume) return; + kernel(idx, tag); +} + +template +struct ParallelLoopPolicy { + template + void operator()(const RECT& rect, KERNEL&& kernel) + { + const size_t volume = rect.volume(); + if (0 == volume) return; + auto stream = get_cached_stream(); + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + + parallel_loop_kernel<<>>( + volume, std::forward(kernel), Tag{}); + + CHECK_CUDA_STREAM(stream); + } +}; + +} // namespace cunumeric diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop.h b/src/cunumeric/execution_policy/indexing/parallel_loop.h new file mode 100644 index 000000000..31adf811f --- /dev/null +++ b/src/cunumeric/execution_policy/indexing/parallel_loop.h @@ -0,0 +1,37 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +template +struct ParallelLoopPolicy { +}; + +template +struct ParallelLoopPolicy { + template + void operator()(const RECT& rect, KERNEL&& kernel) + { + const size_t volume = rect.volume(); + for (size_t idx = 0; idx < volume; ++idx) { kernel(idx, Tag{}); } + } +}; + +} // namespace cunumeric diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop_omp.h b/src/cunumeric/execution_policy/indexing/parallel_loop_omp.h new file mode 100644 index 000000000..a89702fe3 --- /dev/null +++ b/src/cunumeric/execution_policy/indexing/parallel_loop_omp.h @@ -0,0 +1,38 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" +#include "cunumeric/execution_policy/indexing/parallel_loop.h" +#include "cunumeric/omp_help.h" + +#include + +namespace cunumeric { + +template +struct ParallelLoopPolicy { + template + void operator()(const RECT& rect, KERNEL&& kernel) + { + const size_t volume = rect.volume(); +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { kernel(idx, Tag{}); } + } +}; + +} // namespace cunumeric diff --git a/src/cunumeric/index/putmask.cc b/src/cunumeric/index/putmask.cc new file mode 100644 index 000000000..595329f13 --- /dev/null +++ b/src/cunumeric/index/putmask.cc @@ -0,0 +1,32 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/index/putmask.h" +#include "cunumeric/index/putmask_template.inl" + +namespace cunumeric { + +/*static*/ void PutmaskTask::cpu_variant(TaskContext& context) +{ + putmask_template(context); +} + +namespace // unnamed +{ +static void __attribute__((constructor)) register_tasks(void) { PutmaskTask::register_variants(); } +} // namespace + +} // namespace cunumeric diff --git a/src/cunumeric/index/putmask.cu b/src/cunumeric/index/putmask.cu new file mode 100644 index 000000000..abe94d82f --- /dev/null +++ b/src/cunumeric/index/putmask.cu @@ -0,0 +1,28 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/execution_policy/indexing/parallel_loop.cuh" +#include "cunumeric/index/putmask.h" +#include "cunumeric/index/putmask_template.inl" + +namespace cunumeric { + +/*static*/ void PutmaskTask::gpu_variant(TaskContext& context) +{ + putmask_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/index/putmask.h b/src/cunumeric/index/putmask.h new file mode 100644 index 000000000..07a418d19 --- /dev/null +++ b/src/cunumeric/index/putmask.h @@ -0,0 +1,43 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#pragma once + +#include "cunumeric/cunumeric.h" + +namespace cunumeric { + +struct PutmaskArgs { + const Array& input; + const Array& mask; + const Array& values; +}; + +class PutmaskTask : public CuNumericTask { + public: + static const int TASK_ID = CUNUMERIC_PUTMASK; + + public: + static void cpu_variant(legate::TaskContext& context); +#ifdef LEGATE_USE_OPENMP + static void omp_variant(legate::TaskContext& context); +#endif +#ifdef LEGATE_USE_CUDA + static void gpu_variant(legate::TaskContext& context); +#endif +}; + +} // namespace cunumeric diff --git a/src/cunumeric/index/putmask_omp.cc b/src/cunumeric/index/putmask_omp.cc new file mode 100644 index 000000000..8550b41cd --- /dev/null +++ b/src/cunumeric/index/putmask_omp.cc @@ -0,0 +1,28 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include "cunumeric/execution_policy/indexing/parallel_loop_omp.h" +#include "cunumeric/index/putmask.h" +#include "cunumeric/index/putmask_template.inl" + +namespace cunumeric { + +/*static*/ void PutmaskTask::omp_variant(TaskContext& context) +{ + putmask_template(context); +} + +} // namespace cunumeric diff --git a/src/cunumeric/index/putmask_template.inl b/src/cunumeric/index/putmask_template.inl new file mode 100644 index 000000000..6f55c34e0 --- /dev/null +++ b/src/cunumeric/index/putmask_template.inl @@ -0,0 +1,116 @@ +/* Copyright 2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#pragma once + +// Useful for IDEs +#include +#include "cunumeric/index/putmask.h" +#include "cunumeric/pitches.h" +#include "cunumeric/execution_policy/indexing/parallel_loop.h" + +namespace cunumeric { + +using namespace Legion; +using namespace legate; + +template +struct Putmask { + using T = legate_type_of; + using IN = AccessorRW; + using MASK = AccessorRO; + using VALUES = AccessorRO; + + IN input; + T* inputptr; + MASK mask; + const bool* maskptr; + VALUES values; + const T* valptr; + Pitches pitches; + Rect rect; + bool dense; + size_t volume; + + struct DenseTag {}; + struct SparseTag {}; + + // constructor: + Putmask(PutmaskArgs& args) : dense(false) + { + rect = args.input.shape(); +#ifdef DEBUG_CUNUMERIC + assert(rect == args.mask.shape()); +#endif + + input = args.input.read_write_accessor(rect); + mask = args.mask.read_accessor(rect); + values = args.values.read_accessor(rect); + volume = pitches.flatten(rect); + if (volume == 0) return; +#ifndef LEGION_BOUNDS_CHECKS + dense = input.accessor.is_dense_row_major(rect) && mask.accessor.is_dense_row_major(rect); + dense = dense && values.accessor.is_dense_row_major(rect); + if (dense) { + inputptr = input.ptr(rect); + maskptr = mask.ptr(rect); + valptr = values.ptr(rect); + } +#endif + } // constructor + + __CUDA_HD__ void operator()(const size_t idx, DenseTag) const noexcept + { + if (maskptr[idx]) inputptr[idx] = valptr[idx]; + } + + __CUDA_HD__ void operator()(const size_t idx, SparseTag) const noexcept + { + auto p = pitches.unflatten(idx, rect.lo); + if (mask[p]) input[p] = values[p]; + } + + void execute() const noexcept + { +#ifndef LEGION_BOUNDS_CHECKS + if (dense) { return ParallelLoopPolicy()(rect, *this); } +#endif + return ParallelLoopPolicy()(rect, *this); + } +}; + +using namespace Legion; +using namespace legate; + +template +struct PutmaskImpl { + template + void operator()(PutmaskArgs& args) const + { + Putmask putmask(args); + putmask.execute(); + } +}; + +template +static void putmask_template(TaskContext& context) +{ + auto& inputs = context.inputs(); + PutmaskArgs args{context.outputs()[0], inputs[1], inputs[2]}; + double_dispatch(args.input.dim(), args.input.code(), PutmaskImpl{}, args); +} + +} // namespace cunumeric diff --git a/tests/integration/test_putmask.py b/tests/integration/test_putmask.py new file mode 100644 index 000000000..b484cbad7 --- /dev/null +++ b/tests/integration/test_putmask.py @@ -0,0 +1,154 @@ +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import numpy as np +import pytest +from legate.core import LEGATE_MAX_DIM +from utils.generators import mk_seq_array + +import cunumeric as num + + +def test_scalar(): + x = mk_seq_array(np, (3,)) + x_num = mk_seq_array(num, (3,)) + values = np.zeros((), dtype=np.int32) + values_num = num.zeros((), dtype=np.int32) + mask = (x % 2).astype(bool) + mask_num = num.array(mask) + np.putmask(x[:1], mask[2:], values) + num.putmask(x_num[:1], mask_num[2:], values_num) + assert np.array_equal(x_num, x) + + x = mk_seq_array(np, (3, 4, 5)) + x_num = mk_seq_array(num, (3, 4, 5)) + mask = (x % 2).astype(bool) + mask_num = num.array(mask) + np.putmask(x, mask, 100) + num.putmask(x_num, mask_num, 100) + assert np.array_equal(x_num, x) + + x = np.zeros((), dtype=np.int32) + x_num = num.zeros((), dtype=np.int32) + mask = False + mask_num = False + np.putmask(x, mask, -1) + num.putmask(x_num, mask_num, -1) + assert np.array_equal(x_num, x) + + x = np.zeros((), dtype=np.int32) + x_num = num.zeros((), dtype=np.int32) + mask = True + mask_num = True + np.putmask(x, mask, -1) + num.putmask(x_num, mask_num, -1) + assert np.array_equal(x_num, x) + + # testing the case when indices is a scalar + x = mk_seq_array(np, (3, 4, 5)) + x_num = mk_seq_array(num, (3, 4, 5)) + values = mk_seq_array(np, (6,)) * 10 + values_num = num.array(values) + mask = (x % 2).astype(bool) + mask_num = num.array(mask) + np.putmask(x, mask, values[:1]) + num.putmask(x_num, mask_num, values_num[:1]) + assert np.array_equal(x_num, x) + + +def test_type_convert(): + x = mk_seq_array(np, (3, 4, 5)) + x_num = mk_seq_array(num, (3, 4, 5)) + values = mk_seq_array(np, (6,)) * 10 + values_num = num.array(values) + mask = x % 2 + mask_num = x_num % 2 + np.putmask(x, mask, values) + num.putmask(x_num, mask_num, values_num) + assert np.array_equal(x_num, x) + + x = mk_seq_array(np, (3, 4, 5)) + x_num = mk_seq_array(num, (3, 4, 5)) + values = mk_seq_array(np, (6,)) * 10 + values_num = num.array(values) + mask = np.zeros( + ( + 3, + 4, + 5, + ) + ) + mask_num = num.zeros((3, 4, 5)) + np.putmask(x, mask, values) + num.putmask(x_num, mask_num, values_num) + assert np.array_equal(x_num, x) + + x = mk_seq_array(np, (3, 4, 5)) + x_num = mk_seq_array(num, (3, 4, 5)) + x = x.astype(np.int32) + x_num = x_num.astype(np.int32) + mask = np.zeros( + ( + 3, + 4, + 5, + ) + ) + mask_num = num.zeros((3, 4, 5)) + np.putmask(x, mask, 11) + num.putmask(x_num, mask_num, 11) + assert np.array_equal(x_num, x) + + +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +def test_ndim(ndim): + shape = (5,) * ndim + np_arr = mk_seq_array(np, shape) + num_arr = mk_seq_array(num, shape) + np_mask = (np_arr % 2).astype(bool) + num_mask = (num_arr % 2).astype(bool) + # scalar_val + np.putmask(np_arr, np_mask, -10) + num.putmask(num_arr, num_mask, -10) + assert np.array_equal(np_arr, num_arr) + + # val is the same shape: + np_val = np_arr * 10 + num_val = num_arr * 10 + np.putmask(np_arr, np_mask, np_val) + num.putmask(num_arr, num_mask, num_val) + assert np.array_equal(np_arr, num_arr) + + # val is different shape, but the same size + shape_val = (np_arr.size,) + np_values = mk_seq_array(np, shape_val) * 10 + num_values = mk_seq_array(num, shape_val) * 10 + np.putmask(np_arr, np_mask, np_values) + num.putmask(num_arr, num_mask, num_values) + assert np.array_equal(np_arr, num_arr) + + # val is different shape and different size for vals and array + shape_val = (2,) * ndim + np_values = mk_seq_array(np, shape_val) * 10 + num_values = mk_seq_array(num, shape_val) * 10 + 
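# ---------------------------------------------------------------------------
# A hedged usage sketch for the new num.putmask, mirroring numpy.putmask; the
# values are illustrative assumptions, not taken from this test file:
import cunumeric as num

x = num.array([1, 2, 3, 4])
num.putmask(x, x > 2, num.array([-3, -4, -5, -6]))
# expected: x == [1, 2, -5, -6]; where the mask is True, elements are taken
# from the corresponding positions of the values array
# ---------------------------------------------------------------------------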
np.putmask(np_arr, np_mask, np_values) + num.putmask(num_arr, num_mask, num_values) + assert np.array_equal(np_arr, num_arr) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/unit/cunumeric/test_config.py b/tests/unit/cunumeric/test_config.py index db486df5d..ddede6241 100644 --- a/tests/unit/cunumeric/test_config.py +++ b/tests/unit/cunumeric/test_config.py @@ -154,6 +154,7 @@ def test_CuNumericOpCode() -> None: "NONZERO", "PACKBITS", "POTRF", + "PUTMASK", "RAND", "READ", "REPEAT", From 0297bf6eb7b62430612193e6981dd2d2a088d03f Mon Sep 17 00:00:00 2001 From: robinw0928 <104830875+robinw0928@users.noreply.github.com> Date: Fri, 18 Nov 2022 06:52:02 +0800 Subject: [PATCH 45/89] Enhance test_index_routines.py and test_reshape.py (#696) * Enhance test_index_routines.py and test_reshape.py * Address comments. --- tests/integration/test_index_routines.py | 444 +++++++++++++++++++---- tests/integration/test_reshape.py | 117 +++++- 2 files changed, 487 insertions(+), 74 deletions(-) diff --git a/tests/integration/test_index_routines.py b/tests/integration/test_index_routines.py index c5ff7869c..f01c3fa7b 100644 --- a/tests/integration/test_index_routines.py +++ b/tests/integration/test_index_routines.py @@ -25,7 +25,8 @@ from cunumeric.eager import diagonal_reference -def test_choose_1d(): +class TestChoose1d: + choices1 = [ [0, 1, 2, 3], [10, 11, 12, 13], @@ -35,26 +36,36 @@ def test_choose_1d(): a1 = [2, 3, 1, 0] num_a1 = num.array(a1) num_choices1 = num.array(choices1) + b = [2, 4, 1, 0] + num_b = num.array(b) - aout = np.array([2.3, 3.0, 1.2, 0.3]) - num_aout = num.array(aout) + def test_basic(self): + assert np.array_equal( + num.choose(self.num_a1, self.num_choices1), + np.choose(self.a1, self.choices1), + ) - assert np.array_equal( - np.choose(a1, choices1, out=aout), - num.choose(num_a1, num_choices1, out=num_aout), - ) - assert np.array_equal(aout, num_aout) + def test_out_none(self): + assert np.array_equal( + num.choose(self.num_a1, self.num_choices1, out=None), + np.choose(self.a1, self.choices1, out=None), + ) - b = [2, 4, 1, 0] - num_b = num.array(b) - assert np.array_equal( - np.choose(b, choices1, mode="clip"), - num.choose(num_b, num_choices1, mode="clip"), - ) - assert np.array_equal( - np.choose(b, choices1, mode="wrap"), - num.choose(num_b, num_choices1, mode="wrap"), - ) + def test_out(self): + aout = np.array([2.3, 3.0, 1.2, 0.3]) + num_aout = num.array(aout) + assert np.array_equal( + np.choose(self.a1, self.choices1, out=aout), + num.choose(self.num_a1, self.num_choices1, out=num_aout), + ) + assert np.array_equal(aout, num_aout) + + @pytest.mark.parametrize("mode", ("wrap", "clip"), ids=str) + def test_mode(self, mode): + assert np.array_equal( + np.choose(self.b, self.choices1, mode=mode), + num.choose(self.num_b, self.num_choices1, mode=mode), + ) def test_choose_2d(): @@ -102,6 +113,163 @@ def test_choose_target_ndim(ndim): assert np.array_equal(np_res, num_res) +SHAPES_A = ( + (2, 4), + (2, 1), + (1, 4), + (1, 1), + (4,), + (1,), + (3, 2, 4), + (2, 3, 2, 4), + (1, 3, 1, 1), +) + + +@pytest.mark.parametrize( + "shape_a", SHAPES_A, ids=lambda shape_a: f"(shape_a={shape_a})" +) +def test_choose_a_array(shape_a): + shape_choices = (3, 2, 4) + np_a = mk_seq_array(np, shape_a) % shape_choices[0] + num_a = mk_seq_array(num, shape_a) % shape_choices[0] + np_choices = mk_seq_array(np, shape_choices) + num_choices = mk_seq_array(num, shape_choices) + + np_res = np.choose(np_a, np_choices) + num_res = num.choose(num_a, 
num_choices) + assert np.array_equal(np_res, num_res) + + +def test_choose_a_scalar(): + shape_choices = (3, 2, 4) + a = 1 + np_choices = mk_seq_array(np, shape_choices) + num_choices = mk_seq_array(num, shape_choices) + + np_res = np.choose(a, np_choices) + num_res = num.choose(a, num_choices) + assert np.array_equal(np_res, num_res) + + +@pytest.mark.parametrize("mode", ("wrap", "clip"), ids=str) +@pytest.mark.parametrize( + "shape_a", ((3, 2, 4), (4,)), ids=lambda shape_a: f"(shape_a={shape_a})" +) +def test_choose_mode(shape_a, mode): + shape_choices = (3, 2, 4) + np_a = mk_seq_array(np, shape_a) - 10 + num_a = mk_seq_array(num, shape_a) - 10 + np_choices = mk_seq_array(np, shape_choices) + num_choices = mk_seq_array(num, shape_choices) + + np_res = np.choose(np_a, np_choices, mode=mode) + num_res = num.choose(num_a, num_choices, mode=mode) + assert np.array_equal(np_res, num_res) + + +def test_choose_out(): + shape_choices = (3, 2, 4) + shape_a = (2, 4) + shape_a_out = (2, 4) + np_a = mk_seq_array(np, shape_a) % shape_choices[0] + np_a = np_a.astype(np.int32) + num_a = mk_seq_array(num, shape_a) % shape_choices[0] + num_a = num_a.astype( + np.int32 + ) # cuNumeric would convert np.int32 to default type np.int64 + np_choices = mk_seq_array(np, shape_choices) + num_choices = mk_seq_array(num, shape_choices) + np_aout = mk_seq_array(np, shape_a_out) - 10 + num_aout = mk_seq_array(num, shape_a_out) - 10 + + np_res = np.choose(np_a, np_choices, out=np_aout) + num_res = num.choose(num_a, num_choices, out=num_aout) + assert np.array_equal(np_res, num_res) + assert np.array_equal(np_aout, num_aout) + + +@pytest.mark.xfail +def test_choose_mode_none(): + # In Numpy, pass and returns array equals default mode + # In cuNumeric, raises ValueError: mode=None not understood. 
+ # Must be 'raise', 'wrap', or 'clip' + shape_choices = (3, 2, 4) + shape_a = (2, 4) + np_a = mk_seq_array(np, shape_a) % shape_choices[0] + num_a = mk_seq_array(num, shape_a) % shape_choices[0] + np_choices = mk_seq_array(np, shape_choices) + num_choices = mk_seq_array(num, shape_choices) + + np_res = np.choose(np_a, np_choices, mode=None) + num_res = num.choose(num_a, num_choices, mode=None) + assert np.array_equal(np_res, num_res) + + +class TestChooseErrors: + def setup_method(self): + self.shape_choices = (3, 2, 4) + self.choices = mk_seq_array(num, self.shape_choices) + self.shape_a = (2, 4) + self.a = mk_seq_array(num, self.shape_a) % self.shape_choices[0] + + @pytest.mark.parametrize( + "value", (-1, 3), ids=lambda value: f"(value={value})" + ) + def test_a_value_out_of_bound(self, value): + shape_a = (2, 4) + a = num.full(shape_a, value) + msg = "invalid entry in choice array" + with pytest.raises(ValueError, match=msg): + num.choose(a, self.choices) + + def test_a_value_float(self): + shape_a = (2, 4) + a = num.full(shape_a, 1.0) + with pytest.raises(TypeError): + num.choose(a, self.choices) + + @pytest.mark.parametrize( + "shape_a", + ((3, 4), (2, 2), (2,), (0,)), + ids=lambda shape_a: f"(shape_a={shape_a})", + ) + def test_a_invalid_shape(self, shape_a): + a = mk_seq_array(num, shape_a) % self.shape_choices[0] + msg = "shape mismatch" + with pytest.raises(ValueError, match=msg): + num.choose(a, self.choices) + + @pytest.mark.xfail + def test_a_none(self): + # In Numpy, it raises TypeError + # In cuNumeric, it raises AttributeError: + # 'NoneType' object has no attribute 'choose' + with pytest.raises(TypeError): + num.choose(None, self.choices) + + def test_empty_choices(self): + msg = "invalid entry in choice array" + with pytest.raises(ValueError, match=msg): + num.choose(self.a, []) + + @pytest.mark.xfail + def test_choices_none(self): + # In Numpy, it raises TypeError + # In cuNumeric, it raises IndexError: tuple index out of range + with pytest.raises(TypeError): + num.choose(self.a, None) + + def test_invalid_mode(self): + with pytest.raises(ValueError): + num.choose(self.a, self.choices, mode="InvalidValue") + + def test_out_invalid_shape(self): + aout = mk_seq_array(num, (1, 4)) + with pytest.raises(ValueError): + num.choose(self.a, self.choices, out=aout) + + def test_diagonal(): ad = np.arange(24).reshape(4, 3, 2) num_ad = num.array(ad) @@ -129,7 +297,6 @@ def test_diagonal(): for ndim in range(3, LEGATE_MAX_DIM + 1): a_shape = tuple(random.randint(1, 9) for i in range(ndim)) np_array = mk_seq_array(np, a_shape) - np_array = mk_seq_array(np, a_shape) num_array = mk_seq_array(num, a_shape) for num_axes in range(3, ndim + 1): for axes in permutations(range(ndim), num_axes): @@ -140,65 +307,212 @@ def test_diagonal(): assert np.array_equal(res_num, res_ref) -KS = [0, -1, 1, -2, 2] +KS = (0, -1, 1, -2, 2) +@pytest.mark.xfail @pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})") -def test_diag(k): - print(f"diag(k={k})") - a = num.array( - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - [17, 18, 19, 20], - ] +@pytest.mark.parametrize( + "shape", ((5, 1), (1, 5)), ids=lambda shape: f"(shape={shape})" +) +def test_diagonal_offset(shape, k): + # for shape=(5, 1) and k=1, 2, + # for shape=(1, 5) and k=-1, -2, + # In cuNumeric, raise ValueError: 'offset' + # for diag or diagonal must be in range + # In Numpy, pass and returns empty array + a = mk_seq_array(num, shape) + an = mk_seq_array(np, shape) + + b = num.diagonal(a, k) + bn = 
np.diagonal(an, k) + assert np.array_equal(b, bn) + + +@pytest.mark.parametrize( + "shape", + (pytest.param((3, 0), marks=pytest.mark.xfail), (0, 3)), + ids=lambda shape: f"(shape={shape})", +) +def test_diagonal_empty_array(shape): + # for shape=(3, 0) and k=0, + # In cuNumeric, raise ValueError: 'offset' + # for diag or diagonal must be in range + # In Numpy, pass and returns empty array + a = mk_seq_array(num, shape) + an = mk_seq_array(np, shape) + + b = num.diagonal(a) + bn = np.diagonal(an) + assert np.array_equal(b, bn) + + +class TestDiagonalErrors: + def setup_method(self): + shape = (3, 4, 5) + self.a = mk_seq_array(num, shape) + + def test_0d_array(self): + a = num.array(3) + with pytest.raises(ValueError): + num.diagonal(a) + + def test_1d_array(self): + shape = (3,) + a = mk_seq_array(num, shape) + with pytest.raises(ValueError): + num.diagonal(a) + + @pytest.mark.xfail + def test_array_none(self): + # In cuNumeric, it raises AttributeError: + # 'NoneType' object has no attribute 'diagonal' + # In Numpy, it raises ValueError: + # diag requires an array of at least two dimensions. + with pytest.raises(ValueError): + num.diagonal(None) + + @pytest.mark.parametrize( + "axes", + ((0, 0), pytest.param((0, -3), marks=pytest.mark.xfail)), + ids=lambda axes: f"(axes={axes})", ) - an = np.array( - [ - [1, 2, 3, 4], - [5, 6, 7, 8], - [9, 10, 11, 12], - [13, 14, 15, 16], - [17, 18, 19, 20], - ] + def test_axes_same(self, axes): + # For axes = (0, -3), + # In cuNumeric, it raises ValueError: + # axes must be the same size as ndim for transpose + # In Numpy, it raises ValueError: axis1 and axis2 cannot be the same + axis1, axis2 = axes + msg = "axes passed to _diag_helper should be all different" + with pytest.raises(ValueError, match=msg): + num.diagonal(self.a, 0, axis1, axis2) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "axes", ((0, -4), (3, 0)), ids=lambda axes: f"(axes={axes})" + ) + def test_axes_out_of_bound(self, axes): + # In Numpy, it raises numpy.AxisError: is out of bounds + # In cuNumeric, it raises ValueError: + # axes must be the same size as ndim for transpose + axis1, axis2 = axes + with pytest.raises(np.AxisError): + num.diagonal(self.a, 0, axis1, axis2) + + @pytest.mark.xfail + def test_axes_float(self): + # In Numpy, it raise TypeError + # In cuNumeric, it raises AssertionError + with pytest.raises(TypeError): + num.diagonal(self.a, 0, 0.0, 1) + + @pytest.mark.xfail + def test_axes_none(self): + # In Numpy, it raise TypeError + # In cuNumeric, it raises AssertionError + with pytest.raises(TypeError): + num.diagonal(self.a, 0, None, 0) + + @pytest.mark.parametrize( + "k", + (pytest.param(0.0, marks=pytest.mark.xfail), -1.5, 1.5), + ids=lambda k: f"(k={k})", ) + def test_k_float(self, k): + # for k=0.0, + # In cuNumeric, pass + # In Numpy, raises TypeError: integer argument expected, got float + with pytest.raises(TypeError): + num.diagonal(self.a, k) + + def test_k_none(self): + with pytest.raises(TypeError): + num.diagonal(self.a, None) + + +@pytest.mark.parametrize("k", KS, ids=lambda k: f"(k={k})") +@pytest.mark.parametrize( + "shape", + ( + (5,), + (3, 3), + pytest.param((5, 1), marks=pytest.mark.xfail), + pytest.param((1, 5), marks=pytest.mark.xfail), + ), + ids=lambda shape: f"(shape={shape})", +) +def test_diag(shape, k): + # for shape=(5, 1) and k=1, 2, + # for shape=(1, 5) and k=-1, -2, + # In cuNumeric, raise ValueError: + # 'offset' for diag or diagonal must be in range + # In Numpy, pass and returns empty array + a = mk_seq_array(num, shape) + an = 
mk_seq_array(np, shape) b = num.diag(a, k=k) bn = np.diag(an, k=k) assert np.array_equal(b, bn) - c = num.diag(b, k=k) - cn = np.diag(bn, k=k) - assert np.array_equal(c, cn) - - d = num.array( - [ - [1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15], - [16, 17, 18, 19, 20], - ] - ) - dn = np.array( - [ - [1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15], - [16, 17, 18, 19, 20], - ] - ) - e = num.diag(d, k=k) - en = np.diag(dn, k=k) - assert np.array_equal(e, en) +@pytest.mark.parametrize( + "shape", + ((0,), pytest.param((3, 0), marks=pytest.mark.xfail), (0, 3)), + ids=lambda shape: f"(shape={shape})", +) +def test_diag_empty_array(shape): + # for shape=(3, 0) and k=0, + # In cuNumeric, raise ValueError: + # 'offset' for diag or diagonal must be in range + # In Numpy, pass and returns empty array + a = mk_seq_array(num, shape) + an = mk_seq_array(np, shape) + + b = num.diag(a) + bn = np.diag(an) + assert np.array_equal(b, bn) + - f = num.diag(e, k=k) - fn = np.diag(en, k=k) - assert np.array_equal(f, fn) +class TestDiagErrors: + def test_0d_array(self): + a = num.array(3) + msg = "Input must be 1- or 2-d" + with pytest.raises(ValueError, match=msg): + num.diag(a) + + def test_3d_array(self): + shape = (3, 4, 5) + a = mk_seq_array(num, shape) + with pytest.raises(ValueError): + num.diag(a) + + @pytest.mark.xfail + def test_array_none(self): + # In cuNumeric, it raises AttributeError, + # 'NoneType' object has no attribute 'ndim' + # In Numpy, it raises ValueError, Input must be 1- or 2-d. + with pytest.raises(ValueError): + num.diag(None) + + @pytest.mark.parametrize( + "k", + (pytest.param(0.0, marks=pytest.mark.xfail), -1.5, 1.5), + ids=lambda k: f"(k={k})", + ) + def test_k_float(self, k): + # for k=0.0, + # In cuNumeric, pass + # In Numpy, raises TypeError: integer argument expected, got float + shape = (3, 3) + a = mk_seq_array(num, shape) + with pytest.raises(TypeError): + num.diag(a, k=k) - return + def test_k_none(self): + shape = (3, 3) + a = mk_seq_array(num, shape) + with pytest.raises(TypeError): + num.diag(a, k=None) if __name__ == "__main__": diff --git a/tests/integration/test_reshape.py b/tests/integration/test_reshape.py index 1e4c7f8c5..4fe6814cb 100644 --- a/tests/integration/test_reshape.py +++ b/tests/integration/test_reshape.py @@ -37,12 +37,13 @@ def test_basic(self): a = num.arange(100).reshape(10, 10) assert np.array_equal(self.anp, a) + @pytest.mark.parametrize("order", ("C", "F", "A", None), ids=str) @pytest.mark.parametrize("shape", SQUARE_CASES, ids=str) - def test_shape(self, shape): + def test_shape(self, shape, order): a = num.arange(100).reshape(10, 10) assert np.array_equal( - num.reshape(a, shape), - np.reshape(self.anp, shape), + num.reshape(a, shape, order=order), + np.reshape(self.anp, shape, order=order), ) def test_1d(self): @@ -102,7 +103,7 @@ def test_ravel(self): assert np.array_equal(a.ravel(), anp.ravel()) -RECT_CASES = [ +RECT_CASES = ( (10, 2, 10), (20, 10), (20, -5), @@ -110,15 +111,31 @@ def test_ravel(self): (200, 1), (1, 200), (10, 20), -] +) class TestRect: anp = np.random.rand(5, 4, 10) + @pytest.mark.parametrize("order", ("C", "F", "A", None), ids=str) @pytest.mark.parametrize("shape", RECT_CASES, ids=str) - def test_shape(self, shape): + def test_shape(self, shape, order): + a = num.array(self.anp) + assert np.array_equal( + num.reshape(a, shape, order=order), + np.reshape(self.anp, shape, order=order), + ) + + @pytest.mark.parametrize( + "shape", + (200, -1, -2, pytest.param(None, marks=pytest.mark.xfail)), + 
ids=str, + ) + def test_0d(self, shape): + # for shape=None, + # In Numpy, pass, returns the flattened 1-D array + # In cuNumeric, raises TypeError: 'NoneType' object is not iterable a = num.array(self.anp) assert np.array_equal( num.reshape(a, shape), @@ -132,13 +149,95 @@ def test_1d(self): np.reshape(self.anp, (200,)), ) - def test_ravel(self): + @pytest.mark.parametrize( + "order", + ("C", "F", "A", pytest.param("K", marks=pytest.mark.xfail), None), + ids=str, + ) + def test_ravel(self, order): + # In Numpy, pass with 'K' + # In cuNumeric, when order is 'K', raise ValueError: + # order 'K' is not permitted for reshaping a = num.array(self.anp) assert np.array_equal( - num.ravel(a), - np.ravel(self.anp), + num.ravel(a, order=order), + np.ravel(self.anp, order=order), ) + @pytest.mark.xfail + def test_ravel_a_none(self): + # In Numpy, pass and returns [None] + # In cuNumeric, raises AttributeError: + # 'NoneType' object has no attribute 'ravel' + assert np.array_equal( + num.ravel(None), + np.ravel(None), + ) + + +@pytest.mark.parametrize("shape", (0, (0,), (1, 0), (0, 1, 1)), ids=str) +def test_reshape_empty_array(shape): + a = num.arange(0).reshape(0, 1) + anp = np.arange(0).reshape(0, 1) + assert np.array_equal( + num.reshape(a, shape), + np.reshape(anp, shape), + ) + + +class TestReshapeErrors: + def setup_method(self): + self.a = num.arange(24) + self.shape = (4, 3, 2) + + @pytest.mark.xfail + def test_a_none(self): + # In Numpy, it raises ValueError: cannot reshape array + # In cuNumeric, it raises AttributeError: + # 'NoneType' object has no attribute + with pytest.raises(ValueError): + num.reshape(None, self.shape) + + def test_empty_array_shape_invalid_size(self): + a = num.arange(0).reshape(0, 1, 1) + shape = (1, 1) + with pytest.raises(ValueError): + num.reshape(a, shape) + + @pytest.mark.parametrize( + "shape", + ((-1, 0, 2), (4, 3, 4), (4, 3, 0), (4, 3), (4,), (0,), 4), + ids=str, + ) + def test_shape_invalid_size(self, shape): + msg = "cannot reshape array" + with pytest.raises(ValueError, match=msg): + num.reshape(self.a, shape) + + def test_shape_unknown_dimensions(self): + shape = (-5, -1, 2) + msg = "can only specify one unknown dimension" + with pytest.raises(ValueError, match=msg): + num.reshape(self.a, shape) + + @pytest.mark.parametrize("shape", ((4, 3, 2.0), 24.0), ids=str) + def test_shape_float(self, shape): + with pytest.raises(TypeError): + num.reshape(self.a, shape) + + def test_invalid_order(self): + with pytest.raises(ValueError): + num.reshape(self.a, self.shape, order="Z") + + +class TestRavelErrors: + def setup_method(self): + self.a = num.arange(24).reshape(4, 3, 2) + + def test_invalid_order(self): + with pytest.raises(ValueError): + num.ravel(self.a, order="Z") + if __name__ == "__main__": import sys From 560372098500c4c6e8265d1593f05dfd3595feda Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 17 Nov 2022 15:44:05 -0800 Subject: [PATCH 46/89] Minor test QOL improvements: make cunumeric imports consistent (#701) * make cunumeric test imports consistent * more prefix/suffix renames and vertical space to emphasize AAA --- tests/integration/test_array_dunders.py | 95 +++++++++++-------- tests/integration/test_diag_indices.py | 52 +++++----- tests/integration/test_einsum.py | 28 +++--- tests/integration/test_einsum_path.py | 14 +-- tests/integration/test_indices.py | 40 ++++---- tests/integration/test_ingest.py | 12 +-- tests/integration/test_lstm_simple_forward.py | 30 +++--- tests/integration/test_matrix_power.py | 12 +-- 
tests/integration/test_min_on_gpu.py | 6 +- tests/integration/test_moveaxis.py | 56 +++++------ tests/integration/test_multi_dot.py | 22 ++--- tests/integration/test_norm.py | 30 +++--- tests/integration/test_outer.py | 6 +- tests/integration/test_random_creation.py | 12 ++- tests/integration/test_reduction_axis.py | 6 +- tests/integration/test_vdot.py | 4 +- tests/integration/test_window.py | 10 +- tests/integration/utils/contractions.py | 26 ++--- 18 files changed, 237 insertions(+), 224 deletions(-) diff --git a/tests/integration/test_array_dunders.py b/tests/integration/test_array_dunders.py index 42b2a6ec2..626df5dee 100644 --- a/tests/integration/test_array_dunders.py +++ b/tests/integration/test_array_dunders.py @@ -16,77 +16,88 @@ import numpy as np import pytest -import cunumeric as cn +import cunumeric as num + +arr_np = np.eye(4) +vec_np = np.arange(4).astype(np.float64) + +arr_num = num.array(arr_np) +vec_num = num.array(vec_np) -np_arr = np.eye(4) -np_vec = np.arange(4).astype(np.float64) -cn_arr = cn.array(np_arr) -cn_vec = cn.array(np_vec) indices = [0, 3, 1, 2] def test_array_function_implemented(): - np_res = np.dot(np_arr, np_vec) - cn_res = np.dot(cn_arr, cn_vec) - assert np.array_equal(np_res, cn_res) - assert isinstance(cn_res, cn.ndarray) # implemented + res_np = np.dot(arr_np, vec_np) + res_num = np.dot(arr_num, vec_num) + + assert np.array_equal(res_np, res_num) + assert isinstance(res_num, num.ndarray) # implemented def test_array_function_unimplemented(): - np_res = np.linalg.tensorsolve(np_arr, np_vec) - cn_res = np.linalg.tensorsolve(cn_arr, cn_vec) - assert np.array_equal(np_res, cn_res) - assert isinstance(cn_res, np.ndarray) # unimplemented + res_np = np.linalg.tensorsolve(arr_np, vec_np) + res_num = np.linalg.tensorsolve(arr_num, vec_num) + + assert np.array_equal(res_np, res_num) + assert isinstance(res_num, np.ndarray) # unimplemented def test_array_ufunc_through_array_op(): - assert np.array_equal(cn_vec + cn_vec, np_vec + np_vec) - assert isinstance(cn_vec + np_vec, cn.ndarray) - assert isinstance(np_vec + cn_vec, cn.ndarray) + assert np.array_equal(vec_num + vec_num, vec_np + vec_np) + assert isinstance(vec_num + vec_np, num.ndarray) + assert isinstance(vec_np + vec_num, num.ndarray) def test_array_ufunc_call(): - np_res = np.add(np_vec, np_vec) - cn_res = np.add(cn_vec, cn_vec) - assert np.array_equal(np_res, cn_res) - assert isinstance(cn_res, cn.ndarray) # implemented + res_np = np.add(vec_np, vec_np) + res_num = np.add(vec_num, vec_num) + + assert np.array_equal(res_np, res_num) + assert isinstance(res_num, num.ndarray) # implemented def test_array_ufunc_reduce(): - np_res = np.add.reduce(np_vec) - cn_res = np.add.reduce(cn_vec) - assert np.array_equal(np_res, cn_res) - assert isinstance(cn_res, cn.ndarray) # implemented + res_np = np.add.reduce(vec_np) + res_num = np.add.reduce(vec_num) + + assert np.array_equal(res_np, res_num) + assert isinstance(res_num, num.ndarray) # implemented def test_array_ufunc_accumulate(): - np_res = np.add.accumulate(np_vec) - cn_res = np.add.accumulate(cn_vec) - assert np.array_equal(np_res, cn_res) - assert isinstance(cn_res, np.ndarray) # unimplemented + res_np = np.add.accumulate(vec_np) + res_num = np.add.accumulate(vec_num) + + assert np.array_equal(res_np, res_num) + assert isinstance(res_num, np.ndarray) # unimplemented def test_array_ufunc_reduceat(): - np_res = np.add.reduceat(np_vec, indices) - cn_res = np.add.reduceat(cn_vec, indices) - assert np.array_equal(np_res, cn_res) - assert isinstance(cn_res, 
np.ndarray) # unimplemented + res_np = np.add.reduceat(vec_np, indices) + res_num = np.add.reduceat(vec_num, indices) + + assert np.array_equal(res_np, res_num) + assert isinstance(res_num, np.ndarray) # unimplemented def test_array_ufunc_outer(): - np_res = np.add.outer(np_vec, np_vec) - cn_res = np.add.outer(cn_vec, cn_vec) - assert np.array_equal(np_res, cn_res) - assert isinstance(cn_res, np.ndarray) # unimplemented + res_np = np.add.outer(vec_np, vec_np) + res_num = np.add.outer(vec_num, vec_num) + + assert np.array_equal(res_np, res_num) + assert isinstance(res_num, np.ndarray) # unimplemented def test_array_ufunc_at(): - np_res = np.full((4,), 42) - cn_res = cn.full((4,), 42) - np.add.at(np_res, indices, np_vec) - np.add.at(cn_res, indices, cn_vec) - assert np.array_equal(np_res, cn_res) - assert isinstance(cn_res, cn.ndarray) + res_np = np.full((4,), 42) + res_num = num.full((4,), 42) + + np.add.at(res_np, indices, vec_np) + np.add.at(res_num, indices, vec_num) + + assert np.array_equal(res_np, res_num) + assert isinstance(res_num, num.ndarray) if __name__ == "__main__": diff --git a/tests/integration/test_diag_indices.py b/tests/integration/test_diag_indices.py index e848b2dbe..38c421477 100644 --- a/tests/integration/test_diag_indices.py +++ b/tests/integration/test_diag_indices.py @@ -17,62 +17,62 @@ import pytest from legate.core import LEGATE_MAX_DIM -import cunumeric as cn +import cunumeric as num def test_diag_indices_default_ndim(): - np_res = np.diag_indices(10) - cn_res = cn.diag_indices(10) - assert np.array_equal(np_res, cn_res) + a_np = np.diag_indices(10) + a_num = num.diag_indices(10) + assert np.array_equal(a_np, a_num) @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM + 1)) def test_diag_indices_basic(ndim): - np_res = np.diag_indices(10, ndim) - cn_res = cn.diag_indices(10, ndim) - assert np.array_equal(np_res, cn_res) + a_np = np.diag_indices(10, ndim) + a_num = num.diag_indices(10, ndim) + assert np.array_equal(a_np, a_num) @pytest.mark.parametrize("n", [0, 0.0, 1, 10.5]) @pytest.mark.parametrize("ndim", [-4, 0, 1]) def test_diag_indices(n, ndim): - np_res = np.diag_indices(n, ndim) - cn_res = cn.diag_indices(n, ndim) - assert np.array_equal(np_res, cn_res) + a_np = np.diag_indices(n, ndim) + a_num = num.diag_indices(n, ndim) + assert np.array_equal(a_np, a_num) class TestDiagIndicesErrors: @pytest.mark.parametrize("n", [-10.5, -1]) def test_negative_n(self, n): with pytest.raises(ValueError): - cn.diag_indices(n) + num.diag_indices(n) @pytest.mark.xfail @pytest.mark.parametrize("n", [-10.5, -1]) def test_negative_n_DIVERGENCE(self, n): # np.diag_indices(-10.5) returns empty 2-D array, dtype=float64 # np.diag_indices(-1) returns empty 2-D array, dtype=int32 - # cn.diag_indices(-10.5) raises ValueError - # cn.diag_indices(-1) raises ValueError - np_res = np.diag_indices(n) - cn_res = cn.diag_indices(n) - assert np.array_equal(np_res, cn_res) + # num.diag_indices(-10.5) raises ValueError + # num.diag_indices(-1) raises ValueError + a_np = np.diag_indices(n) + a_num = num.diag_indices(n) + assert np.array_equal(a_np, a_num) def test_none_n(self): msg = "unsupported operand type" with pytest.raises(TypeError, match=msg): - cn.diag_indices(None) + num.diag_indices(None) @pytest.mark.parametrize("ndim", [-1.5, 0.0, 1.5]) def test_float_ndim(self, ndim): msg = "can't multiply sequence by non-int of type 'float'" with pytest.raises(TypeError, match=msg): - cn.diag_indices(10, ndim) + num.diag_indices(10, ndim) def test_none_ndim(self): msg = "can't multiply 
sequence by non-int of type 'NoneType'" with pytest.raises(TypeError, match=msg): - cn.diag_indices(10, None) + num.diag_indices(10, None) @pytest.mark.parametrize("size", [(5,), (0,)], ids=str) @@ -80,10 +80,10 @@ def test_none_ndim(self): def test_diag_indices_from_basic(size, ndim): shape = size * ndim a = np.ones(shape, dtype=int) - a_cn = cn.array(a) - np_res = np.diag_indices_from(a) - cn_res = cn.diag_indices_from(a_cn) - assert np.array_equal(np_res, cn_res) + a_num = num.array(a) + a_np = np.diag_indices_from(a) + a_num = num.diag_indices_from(a_num) + assert np.array_equal(a_np, a_num) class TestDiagIndicesFromErrors: @@ -92,13 +92,13 @@ def test_1d(self, size): a = np.ones(size, dtype=int) msg = "input array must be at least 2-d" with pytest.raises(ValueError, match=msg): - cn.diag_indices_from(a) + num.diag_indices_from(a) def test_0d(self): a = np.array(3) msg = "input array must be at least 2-d" with pytest.raises(ValueError, match=msg): - cn.diag_indices_from(a) + num.diag_indices_from(a) @pytest.mark.parametrize( "size", @@ -115,7 +115,7 @@ def test_unequal_length(self, size): a = np.ones(size, dtype=int) msg = "All dimensions of input must be of equal length" with pytest.raises(ValueError, match=msg): - cn.diag_indices_from(a) + num.diag_indices_from(a) if __name__ == "__main__": diff --git a/tests/integration/test_einsum.py b/tests/integration/test_einsum.py index 68ca4b949..fd34ef088 100644 --- a/tests/integration/test_einsum.py +++ b/tests/integration/test_einsum.py @@ -22,7 +22,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array, permutes_to -import cunumeric as cn +import cunumeric as num # Limits for exhaustive expression generation routines MAX_MODES = 3 @@ -213,48 +213,48 @@ def mk_typed_output(lib, shape): ] -def check_np_vs_cn(expr, mk_input, mk_output=None, **kwargs): +def check_np_vs_num(expr, mk_input, mk_output=None, **kwargs): lhs, rhs = expr.split("->") opers = lhs.split(",") in_shapes = [ tuple(BASE_DIM_LEN + ord(m) - ord("a") for m in op) for op in opers ] out_shape = tuple(BASE_DIM_LEN + ord(m) - ord("a") for m in rhs) - for (np_inputs, cn_inputs) in zip( + for (np_inputs, num_inputs) in zip( product(*(mk_input(np, sh) for sh in in_shapes)), - product(*(mk_input(cn, sh) for sh in in_shapes)), + product(*(mk_input(num, sh) for sh in in_shapes)), ): np_res = np.einsum(expr, *np_inputs, **kwargs) - cn_res = cn.einsum(expr, *cn_inputs, **kwargs) + num_res = num.einsum(expr, *num_inputs, **kwargs) rtol = ( 1e-02 if any(x.dtype == np.float16 for x in np_inputs) or kwargs.get("dtype") == np.float16 else 1e-05 ) - assert allclose(np_res, cn_res, rtol=rtol) + assert allclose(np_res, num_res, rtol=rtol) if mk_output is not None: - for cn_out in mk_output(cn, out_shape): - cn.einsum(expr, *cn_inputs, out=cn_out, **kwargs) - rtol_out = 1e-02 if cn_out.dtype == np.float16 else rtol - assert allclose(cn_out, cn_res, rtol=rtol_out) + for num_out in mk_output(num, out_shape): + num.einsum(expr, *num_inputs, out=num_out, **kwargs) + rtol_out = 1e-02 if num_out.dtype == np.float16 else rtol + assert allclose(num_out, num_res, rtol=rtol_out) @pytest.mark.parametrize("expr", gen_expr()) def test_small(expr): - check_np_vs_cn(expr, mk_input_that_permutes_to) - check_np_vs_cn(expr, mk_input_that_broadcasts_to) + check_np_vs_num(expr, mk_input_that_permutes_to) + check_np_vs_num(expr, mk_input_that_broadcasts_to) @pytest.mark.parametrize("expr", LARGE_EXPRS) def test_large(expr): - check_np_vs_cn(expr, mk_input_default) + 
check_np_vs_num(expr, mk_input_default) @pytest.mark.parametrize("expr", SMALL_EXPRS) @pytest.mark.parametrize("dtype", [None, np.float32]) def test_cast(expr, dtype): - check_np_vs_cn( + check_np_vs_num( expr, mk_typed_input, mk_typed_output, dtype=dtype, casting="unsafe" ) diff --git a/tests/integration/test_einsum_path.py b/tests/integration/test_einsum_path.py index e54fcd1d7..db6370257 100644 --- a/tests/integration/test_einsum_path.py +++ b/tests/integration/test_einsum_path.py @@ -16,15 +16,15 @@ import numpy as np import pytest -import cunumeric as cn +import cunumeric as num expr = "ij,jk,kl->il" np_a = np.empty((2, 2)) np_b = np.empty((2, 5)) np_c = np.empty((5, 2)) -cn_a = cn.empty((2, 2)) -cn_b = cn.empty((2, 5)) -cn_c = cn.empty((5, 2)) +num_a = num.empty((2, 2)) +num_b = num.empty((2, 5)) +num_c = num.empty((5, 2)) OPTIMIZE = [ True, @@ -38,9 +38,9 @@ @pytest.mark.parametrize("optimize", OPTIMIZE) def test_einsum_path(optimize): - np_path, _ = np.einsum_path(expr, np_a, np_b, np_c, optimize=optimize) - cn_path, _ = cn.einsum_path(expr, cn_a, cn_b, cn_c, optimize=optimize) - assert np_path == cn_path + path_np, _ = np.einsum_path(expr, np_a, np_b, np_c, optimize=optimize) + path_num, _ = num.einsum_path(expr, num_a, num_b, num_c, optimize=optimize) + assert path_np == path_num if __name__ == "__main__": diff --git a/tests/integration/test_indices.py b/tests/integration/test_indices.py index 2ebbe9c2a..08b6042a0 100644 --- a/tests/integration/test_indices.py +++ b/tests/integration/test_indices.py @@ -19,7 +19,7 @@ import pytest from legate.core import LEGATE_MAX_DIM -import cunumeric as cn +import cunumeric as num class TestIndicesErrors: @@ -32,19 +32,19 @@ def test_int_dimensions(self): dimensions = 3 msg = r"'int' object is not iterable" with pytest.raises(TypeError, match=msg): - cn.indices(dimensions) + num.indices(dimensions) def test_negative_dimensions(self): dimensions = -3 msg = r"'int' object is not iterable" with pytest.raises(TypeError, match=msg): - cn.indices(dimensions) + num.indices(dimensions) def test_float_dimensions(self): dimensions = 3.2 msg = r"'float' object is not iterable" with pytest.raises(TypeError, match=msg): - cn.indices(dimensions) + num.indices(dimensions) def test_negative_tuple_dimensions(self): dimensions = (1, -1) @@ -54,7 +54,7 @@ def test_negative_tuple_dimensions(self): # in other conditions, it raises # "ValueError: Invalid shape: Shape((2, 1, -1))" with pytest.raises(ValueError): - cn.indices(dimensions) + num.indices(dimensions) def test_float_tuple_dimensions(self): dimensions = (3.5, 2.5) @@ -62,7 +62,7 @@ def test_float_tuple_dimensions(self): # "TypeError: 'float' object cannot be interpreted as an integer" msg = r"expected a sequence of integers or a single integer" with pytest.raises(TypeError, match=msg): - cn.indices(dimensions) + num.indices(dimensions) class TestIndices: @@ -73,40 +73,40 @@ class TestIndices: @pytest.mark.parametrize("dimensions", [(0,), (0, 0), (0, 1), (1, 1)]) def test_indices_zero(self, dimensions): np_res = np.indices(dimensions) - cn_res = cn.indices(dimensions) + num_res = num.indices(dimensions) - assert np.array_equal(np_res, cn_res) + assert np.array_equal(np_res, num_res) @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM)) def test_indices_basic(self, ndim): - dimensions = tuple(random.randint(1, 5) for i in range(ndim)) + dimensions = tuple(random.randint(1, 5) for _ in range(ndim)) np_res = np.indices(dimensions) - cn_res = cn.indices(dimensions) - assert np.array_equal(np_res, cn_res) + 
num_res = num.indices(dimensions) + assert np.array_equal(np_res, num_res) @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM)) def test_indices_dtype_none(self, ndim): - dimensions = tuple(random.randint(1, 5) for i in range(ndim)) + dimensions = tuple(random.randint(1, 5) for _ in range(ndim)) np_res = np.indices(dimensions, dtype=None) - cn_res = cn.indices(dimensions, dtype=None) - assert np.array_equal(np_res, cn_res) + num_res = num.indices(dimensions, dtype=None) + assert np.array_equal(np_res, num_res) @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM)) def test_indices_dtype_float(self, ndim): - dimensions = tuple(random.randint(1, 5) for i in range(ndim)) + dimensions = tuple(random.randint(1, 5) for _ in range(ndim)) np_res = np.indices(dimensions, dtype=float) - cn_res = cn.indices(dimensions, dtype=float) - assert np.array_equal(np_res, cn_res) + num_res = num.indices(dimensions, dtype=float) + assert np.array_equal(np_res, num_res) @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM)) def test_indices_sparse(self, ndim): - dimensions = tuple(random.randint(1, 5) for i in range(ndim)) + dimensions = tuple(random.randint(1, 5) for _ in range(ndim)) np_res = np.indices(dimensions, sparse=True) - cn_res = cn.indices(dimensions, sparse=True) + num_res = num.indices(dimensions, sparse=True) for i in range(len(np_res)): - assert np.array_equal(np_res[i], cn_res[i]) + assert np.array_equal(np_res[i], num_res[i]) if __name__ == "__main__": diff --git a/tests/integration/test_ingest.py b/tests/integration/test_ingest.py index 0860e7e03..0db699b1c 100644 --- a/tests/integration/test_ingest.py +++ b/tests/integration/test_ingest.py @@ -25,7 +25,7 @@ legion, ) -import cunumeric as lg +import cunumeric as num tile_shape = (4, 7) colors = (5, 3) @@ -80,7 +80,7 @@ def _ingest(custom_partitioning, custom_sharding): get_buffer, get_local_colors if custom_sharding else None, ) - return lg.array(tab) + return num.array(tab) @pytest.mark.parametrize("custom_sharding", [True, False]) @@ -89,10 +89,10 @@ def test(custom_partitioning, custom_sharding): size = 1 for d in shape: size *= d - np_arr = np.arange(size).reshape(shape) - lg_arr = _ingest(custom_partitioning, custom_sharding) - assert np.array_equal(np_arr, lg_arr) - assert np.array_equal(np_arr, lg_arr * 1.0) # force a copy + a_np = np.arange(size).reshape(shape) + a_num = _ingest(custom_partitioning, custom_sharding) + assert np.array_equal(a_np, a_num) + assert np.array_equal(a_np, a_num * 1.0) # force a copy if __name__ == "__main__": diff --git a/tests/integration/test_lstm_simple_forward.py b/tests/integration/test_lstm_simple_forward.py index 07da11ced..56f4a4a44 100644 --- a/tests/integration/test_lstm_simple_forward.py +++ b/tests/integration/test_lstm_simple_forward.py @@ -14,7 +14,7 @@ # import pytest -import cunumeric as np +import cunumeric as num def test_basic(): @@ -22,27 +22,27 @@ def test_basic(): hidden_size = 10 sentence_length = 2 batch_size = 3 - X = np.random.randn(sentence_length, batch_size, hidden_size) - h0 = np.random.randn(1, hidden_size) - WLSTM = np.random.randn( + X = num.random.randn(sentence_length, batch_size, hidden_size) + h0 = num.random.randn(1, hidden_size) + WLSTM = num.random.randn( word_size + hidden_size, 4 * hidden_size - ) / np.sqrt(word_size + hidden_size) + ) / num.sqrt(word_size + hidden_size) xphpb = WLSTM.shape[0] d = hidden_size n = sentence_length b = batch_size - Hin = np.zeros((n, b, xphpb)) - Hout = np.zeros((n, b, d)) - IFOG = np.zeros((n, b, d * 4)) - IFOGf = 
np.zeros((n, b, d * 4)) - C = np.zeros((n, b, d)) - Ct = np.zeros((n, b, d)) + Hin = num.zeros((n, b, xphpb)) + Hout = num.zeros((n, b, d)) + IFOG = num.zeros((n, b, d * 4)) + IFOGf = num.zeros((n, b, d * 4)) + C = num.zeros((n, b, d)) + Ct = num.zeros((n, b, d)) for t in range(0, n): if t == 0: - prev = np.tile(h0, (b, 1)) + prev = num.tile(h0, (b, 1)) else: prev = Hout[t - 1] @@ -52,14 +52,14 @@ def test_basic(): IFOG[t] = Hin[t].dot(WLSTM) # non-linearities IFOGf[t, :, : 3 * d] = 1.0 / ( - 1.0 + np.exp(-IFOG[t, :, : 3 * d]) + 1.0 + num.exp(-IFOG[t, :, : 3 * d]) ) # sigmoids these are the gates - IFOGf[t, :, 3 * d :] = np.tanh(IFOG[t, :, 3 * d :]) # tanh + IFOGf[t, :, 3 * d :] = num.tanh(IFOG[t, :, 3 * d :]) # tanh # compute the cell activation C[t] = IFOGf[t, :, :d] * IFOGf[t, :, 3 * d :] if t > 0: C[t] += IFOGf[t, :, d : 2 * d] * C[t - 1] - Ct[t] = np.tanh(C[t]) + Ct[t] = num.tanh(C[t]) Hout[t] = IFOGf[t, :, 2 * d : 3 * d] * Ct[t] diff --git a/tests/integration/test_matrix_power.py b/tests/integration/test_matrix_power.py index d9a0dfca8..d4cfe4b23 100644 --- a/tests/integration/test_matrix_power.py +++ b/tests/integration/test_matrix_power.py @@ -19,7 +19,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array -import cunumeric as cn +import cunumeric as num # TODO: add negative exponents here, once they become supported EXPONENTS = [0, 1, 3, 5] @@ -29,11 +29,11 @@ @pytest.mark.parametrize("exp", EXPONENTS) def test_matrix_power(ndim, exp): shape = (3,) * ndim + (2, 2) - np_a = mk_0to1_array(np, shape) - cn_a = mk_0to1_array(cn, shape) - np_res = np.linalg.matrix_power(np_a, exp) - cn_res = cn.linalg.matrix_power(cn_a, exp) - assert allclose(np_res, cn_res) + a_np = mk_0to1_array(np, shape) + a_num = mk_0to1_array(num, shape) + res_np = np.linalg.matrix_power(a_np, exp) + res_num = num.linalg.matrix_power(a_num, exp) + assert allclose(res_np, res_num) if __name__ == "__main__": diff --git a/tests/integration/test_min_on_gpu.py b/tests/integration/test_min_on_gpu.py index 7a5fb2bb2..2a5345c4f 100644 --- a/tests/integration/test_min_on_gpu.py +++ b/tests/integration/test_min_on_gpu.py @@ -15,12 +15,12 @@ import pytest -import cunumeric as cn +import cunumeric as num def test_min(): - x = cn.array([1, 2, 3]) - assert cn.min(x) == 1 + x = num.array([1, 2, 3]) + assert num.min(x) == 1 if __name__ == "__main__": diff --git a/tests/integration/test_moveaxis.py b/tests/integration/test_moveaxis.py index 9f12ad6d0..6c5682160 100644 --- a/tests/integration/test_moveaxis.py +++ b/tests/integration/test_moveaxis.py @@ -18,7 +18,7 @@ from legate.core import LEGATE_MAX_DIM from utils.generators import mk_0to1_array -import cunumeric as cn +import cunumeric as num AXES = ( (0, 0), @@ -34,26 +34,26 @@ @pytest.mark.parametrize("axes", AXES) def test_moveaxis(ndim, axes): source, destination = axes - np_a = mk_0to1_array(np, (3,) * ndim) - cn_a = mk_0to1_array(cn, (3,) * ndim) - np_res = np.moveaxis(np_a, source, destination) - cn_res = cn.moveaxis(cn_a, source, destination) - assert np.array_equal(np_res, cn_res) + a_np = mk_0to1_array(np, (3,) * ndim) + a_num = mk_0to1_array(num, (3,) * ndim) + res_np = np.moveaxis(a_np, source, destination) + res_num = num.moveaxis(a_num, source, destination) + assert np.array_equal(res_np, res_num) # Check that the returned array is a view - cn_res[:] = 0 - assert cn_a.sum() == 0 + res_num[:] = 0 + assert a_num.sum() == 0 def test_moveaxis_with_empty_axis(): - np_a = np.ones((3, 4, 5)) - cn_a = cn.ones((3, 4, 5)) + a_np = np.ones((3, 
4, 5)) + a_num = num.ones((3, 4, 5)) axes = ([], []) source, destination = axes - np_res = np.moveaxis(np_a, source, destination) - cn_res = cn.moveaxis(cn_a, source, destination) - assert np.array_equal(np_res, cn_res) + res_np = np.moveaxis(a_np, source, destination) + res_num = num.moveaxis(a_num, source, destination) + assert np.array_equal(res_np, res_num) EMPTY_ARRAYS = ( @@ -68,57 +68,57 @@ def test_moveaxis_with_empty_array(a): axes = (0, -1) source, destination = axes - np_res = np.moveaxis(a, source, destination) - cn_res = cn.moveaxis(a, source, destination) - assert np.array_equal(np_res, cn_res) + res_np = np.moveaxis(a, source, destination) + res_num = num.moveaxis(a, source, destination) + assert np.array_equal(res_np, res_num) class TestMoveAxisErrors: def setup(self): - self.x = cn.ones((3, 4, 5)) + self.x = num.ones((3, 4, 5)) def test_repeated_axis(self): msg = "repeated axis" with pytest.raises(ValueError, match=msg): - cn.moveaxis(self.x, [0, 0], [1, 0]) + num.moveaxis(self.x, [0, 0], [1, 0]) with pytest.raises(ValueError, match=msg): - cn.moveaxis(self.x, [0, 1], [0, -3]) + num.moveaxis(self.x, [0, 1], [0, -3]) def test_axis_out_of_bound(self): msg = "out of bound" with pytest.raises(np.AxisError, match=msg): - cn.moveaxis(self.x, [0, 3], [0, 1]) + num.moveaxis(self.x, [0, 3], [0, 1]) with pytest.raises(np.AxisError, match=msg): - cn.moveaxis(self.x, [0, 1], [0, -4]) + num.moveaxis(self.x, [0, 1], [0, -4]) with pytest.raises(np.AxisError, match=msg): - cn.moveaxis(self.x, 4, 0) + num.moveaxis(self.x, 4, 0) with pytest.raises(np.AxisError, match=msg): - cn.moveaxis(self.x, 0, -4) + num.moveaxis(self.x, 0, -4) def test_axis_with_different_length(self): msg = "arguments must have the same number of elements" with pytest.raises(ValueError, match=msg): - cn.moveaxis(self.x, [0], [1, 0]) + num.moveaxis(self.x, [0], [1, 0]) def test_axis_float(self): msg = "integer argument expected, got float" with pytest.raises(TypeError, match=msg): - cn.moveaxis(self.x, [0.0, 1], [1, 0]) + num.moveaxis(self.x, [0.0, 1], [1, 0]) with pytest.raises(TypeError, match=msg): - cn.moveaxis(self.x, [0, 1], [1, 0.0]) + num.moveaxis(self.x, [0, 1], [1, 0.0]) def test_axis_none(self): msg = "'NoneType' object is not iterable" with pytest.raises(TypeError, match=msg): - cn.moveaxis(self.x, None, 0) + num.moveaxis(self.x, None, 0) with pytest.raises(TypeError, match=msg): - cn.moveaxis(self.x, 0, None) + num.moveaxis(self.x, 0, None) if __name__ == "__main__": diff --git a/tests/integration/test_multi_dot.py b/tests/integration/test_multi_dot.py index 7fc054bc9..ecba326ef 100644 --- a/tests/integration/test_multi_dot.py +++ b/tests/integration/test_multi_dot.py @@ -18,7 +18,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array -import cunumeric as cn +import cunumeric as num SHAPES = [ # 2 arrays @@ -42,28 +42,28 @@ @pytest.mark.parametrize("shapes", SHAPES) def test_multi_dot(shapes): np_arrays = [mk_0to1_array(np, shape) for shape in shapes] - cn_arrays = [mk_0to1_array(cn, shape) for shape in shapes] - np_res = np.linalg.multi_dot(np_arrays) - cn_res = cn.linalg.multi_dot(cn_arrays) - assert allclose(np_res, cn_res) + num_arrays = [mk_0to1_array(num, shape) for shape in shapes] + res_np = np.linalg.multi_dot(np_arrays) + res_num = num.linalg.multi_dot(num_arrays) + assert allclose(res_np, res_num) if len(shapes[0]) == 1: if len(shapes[-1]) == 1: - out = cn.zeros(()) + out = num.zeros(()) else: - out = cn.zeros((shapes[-1][1],)) + out = num.zeros((shapes[-1][1],)) 
else: if len(shapes[-1]) == 1: - out = cn.zeros((shapes[0][0],)) + out = num.zeros((shapes[0][0],)) else: - out = cn.zeros( + out = num.zeros( ( shapes[0][0], shapes[-1][1], ) ) - cn_res = cn.linalg.multi_dot(cn_arrays, out=out) - assert allclose(np_res, out) + res_num = num.linalg.multi_dot(num_arrays, out=out) + assert allclose(res_np, out) if __name__ == "__main__": diff --git a/tests/integration/test_norm.py b/tests/integration/test_norm.py index 7fd6b7461..ca20b8e04 100644 --- a/tests/integration/test_norm.py +++ b/tests/integration/test_norm.py @@ -19,7 +19,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array -import cunumeric as cn +import cunumeric as num VECTOR_ORDS = [None, np.inf, -np.inf, 0, 1, -1, 2, -2] @@ -30,8 +30,8 @@ mk_0to1_array(np, (3,) * ndim) - 0.5 for ndim in range(0, LEGATE_MAX_DIM + 1) ] -cn_arrays = [ - mk_0to1_array(cn, (3,) * ndim) - 0.5 +num_arrays = [ + mk_0to1_array(num, (3,) * ndim) - 0.5 for ndim in range(0, LEGATE_MAX_DIM + 1) ] @@ -40,24 +40,24 @@ @pytest.mark.parametrize("keepdims", [False, True]) def test_noaxis_1d(ord, keepdims): np_res = np.linalg.norm(np_arrays[1], ord=ord, keepdims=keepdims) - cn_res = cn.linalg.norm(cn_arrays[1], ord=ord, keepdims=keepdims) - assert allclose(np_res, cn_res) + num_res = num.linalg.norm(num_arrays[1], ord=ord, keepdims=keepdims) + assert allclose(np_res, num_res) @pytest.mark.parametrize("ord", MATRIX_ORDS) @pytest.mark.parametrize("keepdims", [False, True]) def test_noaxis_2d(ord, keepdims): np_res = np.linalg.norm(np_arrays[2], ord=ord, keepdims=keepdims) - cn_res = cn.linalg.norm(cn_arrays[2], ord=ord, keepdims=keepdims) - assert allclose(np_res, cn_res) + num_res = num.linalg.norm(num_arrays[2], ord=ord, keepdims=keepdims) + assert allclose(np_res, num_res) @pytest.mark.parametrize("ndim", [0] + list(range(3, LEGATE_MAX_DIM + 1))) @pytest.mark.parametrize("keepdims", [False, True]) def test_noaxis_other(ndim, keepdims): np_res = np.linalg.norm(np_arrays[ndim], keepdims=keepdims) - cn_res = cn.linalg.norm(cn_arrays[ndim], keepdims=keepdims) - assert allclose(np_res, cn_res) + num_res = num.linalg.norm(num_arrays[ndim], keepdims=keepdims) + assert allclose(np_res, num_res) @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) @@ -67,10 +67,10 @@ def test_axis_1d(ndim, ord, keepdims): np_res = np.linalg.norm( np_arrays[ndim], ord=ord, axis=0, keepdims=keepdims ) - cn_res = cn.linalg.norm( - cn_arrays[ndim], ord=ord, axis=0, keepdims=keepdims + num_res = num.linalg.norm( + num_arrays[ndim], ord=ord, axis=0, keepdims=keepdims ) - assert allclose(np_res, cn_res) + assert allclose(np_res, num_res) @pytest.mark.parametrize("ndim", range(2, LEGATE_MAX_DIM + 1)) @@ -80,10 +80,10 @@ def test_axis_2d(ndim, ord, keepdims): np_res = np.linalg.norm( np_arrays[ndim], ord=ord, axis=(0, 1), keepdims=keepdims ) - cn_res = cn.linalg.norm( - cn_arrays[ndim], ord=ord, axis=(0, 1), keepdims=keepdims + num_res = num.linalg.norm( + num_arrays[ndim], ord=ord, axis=(0, 1), keepdims=keepdims ) - assert allclose(np_res, cn_res) + assert allclose(np_res, num_res) if __name__ == "__main__": diff --git a/tests/integration/test_outer.py b/tests/integration/test_outer.py index bbae647db..67b444348 100644 --- a/tests/integration/test_outer.py +++ b/tests/integration/test_outer.py @@ -18,7 +18,7 @@ from legate.core import LEGATE_MAX_DIM from utils.generators import mk_0to1_array -import cunumeric as cn +import cunumeric as num def _outer(a_ndim, b_ndim, lib): @@ -31,12 +31,12 @@ def _outer(a_ndim, b_ndim, 
lib): @pytest.mark.parametrize("b_ndim", range(1, LEGATE_MAX_DIM + 1)) def test_basic(a_ndim, b_ndim): assert np.array_equal( - _outer(a_ndim, b_ndim, np), _outer(a_ndim, b_ndim, cn) + _outer(a_ndim, b_ndim, np), _outer(a_ndim, b_ndim, num) ) def test_empty(): - assert np.array_equal(_outer(0, 0, np), _outer(0, 0, cn)) + assert np.array_equal(_outer(0, 0, np), _outer(0, 0, num)) if __name__ == "__main__": diff --git a/tests/integration/test_random_creation.py b/tests/integration/test_random_creation.py index 3f3c6cd12..122b77baa 100644 --- a/tests/integration/test_random_creation.py +++ b/tests/integration/test_random_creation.py @@ -17,16 +17,18 @@ import pytest from utils.comparisons import allclose -import cunumeric as cn +import cunumeric as num @pytest.mark.xfail def test_randn(): - cn.random.seed(42) - x = cn.random.randn(10) np.random.seed(42) - xn = np.random.randn(10) - assert allclose(x, xn) + num.random.seed(42) + + a_np = np.random.randn(10) + a_num = num.random.randn(10) + + assert allclose(a_num, a_np) if __name__ == "__main__": diff --git a/tests/integration/test_reduction_axis.py b/tests/integration/test_reduction_axis.py index 7b6ff0555..6ae89f07c 100644 --- a/tests/integration/test_reduction_axis.py +++ b/tests/integration/test_reduction_axis.py @@ -18,7 +18,7 @@ import numpy as np import pytest -import cunumeric as cn +import cunumeric as num def _sum(shape, axis, lib, dtype=None): @@ -30,9 +30,9 @@ def _sum(shape, axis, lib, dtype=None): @pytest.mark.parametrize("axis", range(3), ids=str) @pytest.mark.parametrize("shape", permutations((3, 4, 5)), ids=str) def test_3d(shape, axis): - assert np.array_equal(_sum(shape, axis, np), _sum(shape, axis, cn)) + assert np.array_equal(_sum(shape, axis, np), _sum(shape, axis, num)) assert np.array_equal( - _sum(shape, axis, np, dtype="D"), _sum(shape, axis, cn, dtype="D") + _sum(shape, axis, np, dtype="D"), _sum(shape, axis, num, dtype="D") ) diff --git a/tests/integration/test_vdot.py b/tests/integration/test_vdot.py index 06d2e25f4..52497f0db 100644 --- a/tests/integration/test_vdot.py +++ b/tests/integration/test_vdot.py @@ -18,7 +18,7 @@ from utils.comparisons import allclose from utils.generators import mk_0to1_array -import cunumeric as cn +import cunumeric as num DTYPES = [np.float32, np.complex64] @@ -33,7 +33,7 @@ def _vdot(a_dtype, b_dtype, lib): @pytest.mark.parametrize("a_dtype", DTYPES) @pytest.mark.parametrize("b_dtype", DTYPES) def test(a_dtype, b_dtype): - assert allclose(_vdot(a_dtype, b_dtype, np), _vdot(a_dtype, b_dtype, cn)) + assert allclose(_vdot(a_dtype, b_dtype, np), _vdot(a_dtype, b_dtype, num)) if __name__ == "__main__": diff --git a/tests/integration/test_window.py b/tests/integration/test_window.py index a10f02319..2a2d9790c 100644 --- a/tests/integration/test_window.py +++ b/tests/integration/test_window.py @@ -18,7 +18,7 @@ import pytest from utils.comparisons import allclose -import cunumeric as cn +import cunumeric as num window_functions = ("bartlett", "blackman", "hamming", "hanning") @@ -27,18 +27,18 @@ @pytest.mark.parametrize("fn", window_functions) def test_basic_window(fn, M): out_np = getattr(np, fn)(M) - out_cn = getattr(cn, fn)(M) + out_num = getattr(num, fn)(M) - assert allclose(out_np, out_cn) + assert allclose(out_np, out_num) @pytest.mark.parametrize("beta", (0, 6)) @pytest.mark.parametrize("M", (0, 1, 10, 100)) def test_kaiser_window(M, beta): out_np = np.kaiser(M, beta) - out_cn = cn.kaiser(M, beta) + out_num = num.kaiser(M, beta) - assert allclose(out_np, out_cn) + assert 
allclose(out_np, out_num) if __name__ == "__main__": diff --git a/tests/integration/utils/contractions.py b/tests/integration/utils/contractions.py index 487cbfac6..e5530b982 100644 --- a/tests/integration/utils/contractions.py +++ b/tests/integration/utils/contractions.py @@ -16,7 +16,7 @@ import numpy as np from legate.core import LEGATE_MAX_DIM -import cunumeric as cn +import cunumeric as num from .comparisons import allclose from .generators import mk_0to1_array @@ -49,7 +49,7 @@ def gen_inputs_of_various_shapes(lib, modes): # making sure common modes appear with the same extent on both arrays (a_modes, b_modes, out_modes) = modes for (a_shape, b_shape) in gen_shapes(a_modes, b_modes): - if lib == cn: + if lib == num: print(f" {a_shape} x {b_shape}") yield (mk_0to1_array(lib, a_shape), mk_0to1_array(lib, b_shape)) @@ -70,7 +70,7 @@ def gen_permuted_inputs(lib, modes): b = mk_0to1_array(lib, (5,) * len(b_modes)) for a_axes in gen_permutations(len(a_modes)): for b_axes in gen_permutations(len(b_modes)): - if lib == cn: + if lib == num: print(f" transpose{a_axes} x transpose{b_axes}") yield (a.transpose(a_axes), b.transpose(b_axes)) @@ -85,7 +85,7 @@ def gen_inputs_of_various_types(lib, modes): (np.float32, np.float32), (np.complex64, np.complex64), ]: - if lib == cn: + if lib == num: print(f" {a_dtype} x {b_dtype}") yield ( mk_0to1_array(lib, a_shape, a_dtype), @@ -97,7 +97,7 @@ def gen_output_of_various_types(lib, modes, a, b): (a_modes, b_modes, out_modes) = modes out_shape = (5,) * len(out_modes) for out_dtype in [np.float16, np.complex64]: - if lib == cn: + if lib == num: print(f" -> {out_dtype}") yield lib.zeros(out_shape, out_dtype) @@ -109,23 +109,23 @@ def _test(name, modes, operation, gen_inputs, gen_output=None, **kwargs): # because we may need to promote arrays so that one includes all modes. 
return print(name) - for (np_inputs, cn_inputs) in zip( - gen_inputs(np, modes), gen_inputs(cn, modes) + for (np_inputs, num_inputs) in zip( + gen_inputs(np, modes), gen_inputs(num, modes) ): np_res = operation(np, *np_inputs, **kwargs) - cn_res = operation(cn, *cn_inputs, **kwargs) + num_res = operation(num, *num_inputs, **kwargs) rtol = ( 1e-02 if any(x.dtype == np.float16 for x in np_inputs) or kwargs.get("dtype") == np.float16 else 1e-05 ) - assert allclose(np_res, cn_res, rtol=rtol) + assert allclose(np_res, num_res, rtol=rtol) if gen_output is not None: - for cn_out in gen_output(cn, modes, *cn_inputs): - operation(cn, *cn_inputs, out=cn_out, **kwargs) - rtol_out = 1e-02 if cn_out.dtype == np.float16 else rtol - assert allclose(cn_out, cn_res, rtol=rtol_out) + for num_out in gen_output(num, modes, *num_inputs): + operation(num, *num_inputs, out=num_out, **kwargs) + rtol_out = 1e-02 if num_out.dtype == np.float16 else rtol + assert allclose(num_out, num_res, rtol=rtol_out) def check_default(name, modes, operation): From ae7610261001015a81c6149ec05472fd1e97bb90 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 17 Nov 2022 15:59:26 -0800 Subject: [PATCH 47/89] Sync supported-dtype check between utils.py and runtime.py (#698) * Sync supported-dtype check between utils.py and runtime.py * Make supported datatypes dict public * Fix tests Co-authored-by: Manolis Papadakis --- cunumeric/runtime.py | 29 ++++------------ cunumeric/utils.py | 37 ++++++++++++--------- tests/integration/test_data_interface.py | 4 +-- tests/unit/cunumeric/test_utils.py | 42 +++++++++++++----------- 4 files changed, 53 insertions(+), 59 deletions(-) diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py index 49e36abb8..2fdb97822 100644 --- a/cunumeric/runtime.py +++ b/cunumeric/runtime.py @@ -39,33 +39,18 @@ from .eager import EagerArray from .thunk import NumPyThunk from .types import NdShape -from .utils import calculate_volume, find_last_user_stacklevel, get_arg_dtype +from .utils import ( + SUPPORTED_DTYPES, + calculate_volume, + find_last_user_stacklevel, + get_arg_dtype, +) if TYPE_CHECKING: import numpy.typing as npt from legate.core._legion.future import Future from legate.core.operation import AutoTask, ManualTask -_supported_dtypes = { - np.bool_: ty.bool_, - np.int8: ty.int8, - np.int16: ty.int16, - np.int32: ty.int32, - int: ty.int64, - np.int64: ty.int64, - np.uint8: ty.uint8, - np.uint16: ty.uint16, - np.uint32: ty.uint32, - np.uint: ty.uint64, - np.uint64: ty.uint64, - np.float16: ty.float16, - np.float32: ty.float32, - float: ty.float64, - np.float64: ty.float64, - np.complex64: ty.complex64, - np.complex128: ty.complex128, -} - ARGS = [ Argument( "test", @@ -170,7 +155,7 @@ def __init__(self, legate_context: LegateContext) -> None: def _register_dtypes(self) -> None: type_system = self.legate_context.type_system - for numpy_type, core_type in _supported_dtypes.items(): + for numpy_type, core_type in SUPPORTED_DTYPES.items(): type_system.make_alias(np.dtype(numpy_type), core_type) for dtype in _CUNUMERIC_DTYPES: diff --git a/cunumeric/utils.py b/cunumeric/utils.py index 5bfd0b54e..fa5b4462d 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -20,25 +20,30 @@ from types import FrameType from typing import Any, List, Sequence, Tuple, Union, cast +import legate.core.types as ty import numpy as np from .types import NdShape -_SUPPORTED_DTYPES = [ - np.float16, - np.float32, - np.float64, - float, - np.int16, - np.int32, - np.int64, - int, - np.uint16, - np.uint32, - np.uint64, - 
np.bool_, - bool, -] +SUPPORTED_DTYPES = { + bool: ty.bool_, + np.bool_: ty.bool_, + np.int8: ty.int8, + np.int16: ty.int16, + np.int32: ty.int32, + int: ty.int64, # np.int is int + np.int64: ty.int64, + np.uint8: ty.uint8, + np.uint16: ty.uint16, + np.uint32: ty.uint32, + np.uint64: ty.uint64, # np.uint is np.uint64 + np.float16: ty.float16, + np.float32: ty.float32, + float: ty.float64, + np.float64: ty.float64, + np.complex64: ty.complex64, + np.complex128: ty.complex128, +} def is_advanced_indexing(key: Any) -> bool: @@ -91,7 +96,7 @@ def find_last_user_frames(top_only: bool = True) -> str: def is_supported_dtype(dtype: Any) -> bool: if not isinstance(dtype, np.dtype): raise TypeError("expected a NumPy dtype") - return dtype.type in _SUPPORTED_DTYPES + return dtype.type in SUPPORTED_DTYPES def calculate_volume(shape: NdShape) -> int: diff --git a/tests/integration/test_data_interface.py b/tests/integration/test_data_interface.py index 6c617db43..a3329a1b6 100644 --- a/tests/integration/test_data_interface.py +++ b/tests/integration/test_data_interface.py @@ -16,9 +16,9 @@ import pytest import cunumeric as num -from cunumeric.runtime import _supported_dtypes +from cunumeric.utils import SUPPORTED_DTYPES -DTYPES = _supported_dtypes.keys() +DTYPES = SUPPORTED_DTYPES.keys() # A simple wrapper with a legate data interface implementation for testing diff --git a/tests/unit/cunumeric/test_utils.py b/tests/unit/cunumeric/test_utils.py index 01a12961a..fa2880bed 100644 --- a/tests/unit/cunumeric/test_utils.py +++ b/tests/unit/cunumeric/test_utils.py @@ -21,21 +21,27 @@ import cunumeric.utils as m # module under test -EXPECTED_SUPPORTED_DTYPES = [ - np.float16, - np.float32, - np.float64, - float, - np.int16, - np.int32, - np.int64, - int, - np.uint16, - np.uint32, - np.uint64, - np.bool_, - bool, -] +EXPECTED_SUPPORTED_DTYPES = set( + [ + bool, + np.bool_, + np.int8, + np.int16, + np.int32, + int, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float16, + np.float32, + float, + np.float64, + np.complex64, + np.complex128, + ] +) class Test_is_advanced_indexing: @@ -110,7 +116,7 @@ def test_top_only_False(self) -> None: def test__SUPPORTED_DTYPES(): - assert m._SUPPORTED_DTYPES == EXPECTED_SUPPORTED_DTYPES + assert set(m.SUPPORTED_DTYPES.keys()) == EXPECTED_SUPPORTED_DTYPES class Test_is_supported_dtype: @@ -126,9 +132,7 @@ def test_supported(self, value) -> None: assert m.is_supported_dtype(np.dtype(value)) # This is just a representative sample, not exhasutive - @pytest.mark.parametrize( - "value", [np.float128, np.complex64, np.datetime64] - ) + @pytest.mark.parametrize("value", [np.float128, np.datetime64]) def test_unsupported(self, value) -> None: assert not m.is_supported_dtype(np.dtype(value)) From 90dbecf9dab5b5cd23efe5ecedf4d01b0289dce4 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Fri, 18 Nov 2022 11:06:06 -0800 Subject: [PATCH 48/89] Also check shape and dtype in allclose (#699) * Also check shape and dtype in allclose * Fix test failures * Don't check types in solver tests Co-authored-by: Manolis Papadakis --- cunumeric/linalg/linalg.py | 2 +- cunumeric/module.py | 9 +++++++-- tests/integration/test_einsum.py | 4 +++- tests/integration/test_solve.py | 8 ++++++-- tests/integration/utils/comparisons.py | 9 +++++++++ tests/integration/utils/contractions.py | 4 +++- 6 files changed, 29 insertions(+), 7 deletions(-) diff --git a/cunumeric/linalg/linalg.py b/cunumeric/linalg/linalg.py index 88e457194..1eb8454c6 100644 --- a/cunumeric/linalg/linalg.py +++ 
b/cunumeric/linalg/linalg.py @@ -521,7 +521,7 @@ def norm( # Zero norm return ( (x != 0) - .astype(np.int64) + .astype(x.dtype) .sum(axis=computed_axis, keepdims=keepdims) ) elif ord == 1: diff --git a/cunumeric/module.py b/cunumeric/module.py index 390f8d755..f60021ae8 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -4065,7 +4065,8 @@ def _contract( raise ValueError("Unknown mode labels on output") # Handle types - if dtype is not None: + makes_view = b is None and len(a_modes) == len(out_modes) + if dtype is not None and not makes_view: c_dtype = dtype elif out is not None: c_dtype = out.dtype @@ -5867,8 +5868,12 @@ def sort_complex(a: ndarray) -> ndarray: # force complex result upon return if np.issubdtype(result.dtype, np.complexfloating): return result - else: + elif ( + np.issubdtype(result.dtype, np.integer) and result.dtype.itemsize <= 2 + ): return result.astype(np.complex64, copy=True) + else: + return result.astype(np.complex128, copy=True) # partition diff --git a/tests/integration/test_einsum.py b/tests/integration/test_einsum.py index fd34ef088..c4014b6fa 100644 --- a/tests/integration/test_einsum.py +++ b/tests/integration/test_einsum.py @@ -237,7 +237,9 @@ def check_np_vs_num(expr, mk_input, mk_output=None, **kwargs): for num_out in mk_output(num, out_shape): num.einsum(expr, *num_inputs, out=num_out, **kwargs) rtol_out = 1e-02 if num_out.dtype == np.float16 else rtol - assert allclose(num_out, num_res, rtol=rtol_out) + assert allclose( + num_out, num_res, rtol=rtol_out, check_dtype=False + ) @pytest.mark.parametrize("expr", gen_expr()) diff --git a/tests/integration/test_solve.py b/tests/integration/test_solve.py index 7a8bc3770..30b569401 100644 --- a/tests/integration/test_solve.py +++ b/tests/integration/test_solve.py @@ -47,7 +47,9 @@ def test_solve_1d(n, a_dtype, b_dtype): rtol = RTOL[out.dtype] atol = ATOL[out.dtype] - assert allclose(b, num.matmul(a, out), rtol=rtol, atol=atol) + assert allclose( + b, num.matmul(a, out), rtol=rtol, atol=atol, check_dtype=False + ) @pytest.mark.parametrize("n", SIZES) @@ -61,7 +63,9 @@ def test_solve_2d(n, a_dtype, b_dtype): rtol = RTOL[out.dtype] atol = ATOL[out.dtype] - assert allclose(b, num.matmul(a, out), rtol=rtol, atol=atol) + assert allclose( + b, num.matmul(a, out), rtol=rtol, atol=atol, check_dtype=False + ) def test_solve_corner_cases(): diff --git a/tests/integration/utils/comparisons.py b/tests/integration/utils/comparisons.py index dde1011b6..65571b38c 100644 --- a/tests/integration/utils/comparisons.py +++ b/tests/integration/utils/comparisons.py @@ -27,10 +27,19 @@ def allclose( equal_nan: bool = False, *, diff_limit: Union[int, None] = 5, # None means no limit at all + check_dtype: bool = True, ) -> bool: + if np.shape(a) != np.shape(b): + print(f"allclose: different shape: {np.shape(a)} vs {np.shape(b)}") + return False + # simplify handling of scalar values a, b = np.atleast_1d(a), np.atleast_1d(b) + if check_dtype and a.dtype != b.dtype: + print(f"allclose: different dtype: {a.dtype} vs {b.dtype}") + return False + close = np.isclose(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan) all_close = np.all(close) diff --git a/tests/integration/utils/contractions.py b/tests/integration/utils/contractions.py index e5530b982..cc719f696 100644 --- a/tests/integration/utils/contractions.py +++ b/tests/integration/utils/contractions.py @@ -125,7 +125,9 @@ def _test(name, modes, operation, gen_inputs, gen_output=None, **kwargs): for num_out in gen_output(num, modes, *num_inputs): operation(num, *num_inputs, 
out=num_out, **kwargs) rtol_out = 1e-02 if num_out.dtype == np.float16 else rtol - assert allclose(num_out, num_res, rtol=rtol_out) + assert allclose( + num_out, num_res, rtol=rtol_out, check_dtype=False + ) def check_default(name, modes, operation): From b30e08a77ca17c6f16d6fa18b95791cbe16f922b Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Mon, 21 Nov 2022 12:01:13 -0800 Subject: [PATCH 49/89] Regenerate `install_info.py` on every build (#705) * regenerate install_info.py on every build * specify custom target dependencies correctly * fix typo --- CMakeLists.txt | 13 ++++---- cmake/generate_install_info_py.cmake | 31 +++++++++++++++++++ cunumeric_python.cmake | 20 ++++-------- scripts/build-install.sh | 2 +- scripts/build-no-install.sh | 2 +- scripts/build-separately-no-install.sh | 2 +- scripts/build-with-legate-no-install.sh | 2 +- ...build-with-legate-separately-no-install.sh | 2 +- 8 files changed, 48 insertions(+), 26 deletions(-) create mode 100644 cmake/generate_install_info_py.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index ee10d8337..417bb9aa4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,24 +79,23 @@ endif() if(CMAKE_GENERATOR STREQUAL "Ninja") function(add_touch_cunumeric_ninja_build_target) set(_suf ) - set(_depends ) if(SKBUILD) set(_suf "_python") endif() + add_custom_target("touch_cunumeric${_suf}_ninja_build" ALL + COMMAND ${CMAKE_COMMAND} -E touch_nocreate "${CMAKE_CURRENT_BINARY_DIR}/build.ninja" + COMMENT "touch build.ninja so ninja doesn't re-run CMake on rebuild" + VERBATIM + ) foreach(_dep IN ITEMS cunumeric cunumeric_python legion_core legion_core_python Legion LegionRuntime Realm RealmRuntime Regent) if(TARGET ${_dep}) - list(APPEND _depends ${_dep}) + add_dependencies("touch_cunumeric${_suf}_ninja_build" ${_dep}) endif() endforeach() - add_custom_target("touch_cunumeric${_suf}_ninja_build" ALL - COMMAND ${CMAKE_COMMAND} -E touch_nocreate "${CMAKE_CURRENT_BINARY_DIR}/build.ninja" - COMMENT "touch build.ninja so ninja doesn't re-run CMake on rebuild" - VERBATIM DEPENDS ${_depends} - ) endfunction() add_touch_cunumeric_ninja_build_target() endif() diff --git a/cmake/generate_install_info_py.cmake b/cmake/generate_install_info_py.cmake new file mode 100644 index 000000000..2fb14cbcb --- /dev/null +++ b/cmake/generate_install_info_py.cmake @@ -0,0 +1,31 @@ +#============================================================================= +# Copyright 2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#============================================================================= + +execute_process( + COMMAND ${CMAKE_C_COMPILER} + -E -DLEGATE_USE_PYTHON_CFFI + -I "${CMAKE_CURRENT_LIST_DIR}/../src/cunumeric" + -P "${CMAKE_CURRENT_LIST_DIR}/../src/cunumeric/cunumeric_c.h" + ECHO_ERROR_VARIABLE + OUTPUT_VARIABLE header + COMMAND_ERROR_IS_FATAL ANY +) + +set(libpath "") +configure_file( + "${CMAKE_CURRENT_LIST_DIR}/../cunumeric/install_info.py.in" + "${CMAKE_CURRENT_LIST_DIR}/../cunumeric/install_info.py" +@ONLY) diff --git a/cunumeric_python.cmake b/cunumeric_python.cmake index 3430b5828..c1ca06015 100644 --- a/cunumeric_python.cmake +++ b/cunumeric_python.cmake @@ -43,22 +43,14 @@ if(NOT cunumeric_FOUND) set(SKBUILD ON) endif() -execute_process( - COMMAND ${CMAKE_C_COMPILER} - -E -DLEGATE_USE_PYTHON_CFFI - -I "${CMAKE_CURRENT_SOURCE_DIR}/src/cunumeric" - -P "${CMAKE_CURRENT_SOURCE_DIR}/src/cunumeric/cunumeric_c.h" - ECHO_ERROR_VARIABLE - OUTPUT_VARIABLE header - COMMAND_ERROR_IS_FATAL ANY +add_custom_target("generate_install_info_py" ALL + COMMAND ${CMAKE_COMMAND} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate_install_info_py.cmake" + COMMENT "Generate install_info.py" + VERBATIM ) -set(libpath "") -configure_file( - "${CMAKE_CURRENT_SOURCE_DIR}/cunumeric/install_info.py.in" - "${CMAKE_CURRENT_SOURCE_DIR}/cunumeric/install_info.py" -@ONLY) - add_library(cunumeric_python INTERFACE) add_library(cunumeric::cunumeric_python ALIAS cunumeric_python) target_link_libraries(cunumeric_python INTERFACE legate::core) diff --git a/scripts/build-install.sh b/scripts/build-install.sh index 8adb472d2..af0f8429d 100755 --- a/scripts/build-install.sh +++ b/scripts/build-install.sh @@ -16,7 +16,7 @@ rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" diff --git a/scripts/build-no-install.sh b/scripts/build-no-install.sh index c398eda58..1237d1a5a 100755 --- a/scripts/build-no-install.sh +++ b/scripts/build-no-install.sh @@ -14,7 +14,7 @@ rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" diff --git a/scripts/build-separately-no-install.sh b/scripts/build-separately-no-install.sh index 8d8078723..be31507ee 100644 --- a/scripts/build-separately-no-install.sh +++ b/scripts/build-separately-no-install.sh @@ -14,7 +14,7 @@ rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" diff --git a/scripts/build-with-legate-no-install.sh b/scripts/build-with-legate-no-install.sh index 498745e31..9d83010b7 100644 --- a/scripts/build-with-legate-no-install.sh +++ b/scripts/build-with-legate-no-install.sh @@ -16,7 +16,7 @@ rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options 
here as desired cmake_args+=" diff --git a/scripts/build-with-legate-separately-no-install.sh b/scripts/build-with-legate-separately-no-install.sh index fa9e97d05..74cc277a0 100755 --- a/scripts/build-with-legate-separately-no-install.sh +++ b/scripts/build-with-legate-separately-no-install.sh @@ -16,7 +16,7 @@ rm -rf ./{build,_skbuild,dist,cunumeric.egg-info} cmake_args="${CMAKE_ARGS:-}" # Use ninja-build if installed -if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi +if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi # Add other build options here as desired cmake_args+=" From aeeb82bbeac2d92a901378b0ed903d66005260fd Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 21 Nov 2022 14:19:33 -0800 Subject: [PATCH 50/89] More argument checks for `bincount` (#711) --- cunumeric/module.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cunumeric/module.py b/cunumeric/module.py index f60021ae8..95b5350cf 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -6246,6 +6246,8 @@ def bincount( -------- Multiple GPUs, Multiple CPUs """ + if x.ndim != 1: + raise ValueError("the input array must be 1-dimensional") if weights is not None: if weights.shape != x.shape: raise ValueError("weights array must be same shape for bincount") @@ -6253,11 +6255,16 @@ def bincount( raise ValueError("weights must be convertible to float64") # Make sure the weights are float64 weights = weights.astype(np.float64) - if x.dtype.kind != "i" and x.dtype.kind != "u": + if x.dtype.kind != "i": raise TypeError("input array for bincount must be integer type") if minlength < 0: raise ValueError("'minlength' must not be negative") - minlength = _builtin_max(minlength, int(amax(x)) + 1) + # Note that the following are non-blocking operations, + # though passing their results to `int` is blocking + max_val, min_val = amax(x), amin(x) + if int(min_val) < 0: + raise ValueError("the input array must have no negative elements") + minlength = _builtin_max(minlength, int(max_val) + 1) if x.size == 1: # Handle the special case of 0-D array if weights is None: From c8f0e750f8905b87ef4e78bfa0780876994c532c Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 21 Nov 2022 17:59:22 -0800 Subject: [PATCH 51/89] Fix a typo in unique.cu indexing (#713) Co-authored-by: Manolis Papadakis --- src/cunumeric/set/unique.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cunumeric/set/unique.cu b/src/cunumeric/set/unique.cu index 908a87664..2cc4e6363 100644 --- a/src/cunumeric/set/unique.cu +++ b/src/cunumeric/set/unique.cu @@ -40,7 +40,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) size_t offset = blockIdx.x * blockDim.x + threadIdx.x; if (offset >= volume) return; auto point = pitches.unflatten(offset, lo); - out[offset] = accessor[lo + point]; + out[offset] = accessor[point]; } template From b8ad06f5e4b0ed0492f0066ec324762dd9958397 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Mon, 21 Nov 2022 21:10:19 -0800 Subject: [PATCH 52/89] Fixes for buffer allocations (#706) * Two updates to buffer allocations: * Remove the obsolete has_numamem flag, as create_buffer now uses socket memories. 
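
  As an illustrative sketch of that change (with VAL and size standing in
  for the concrete element types and extents at each call site), code that
  used to pick a memory kind by hand:

      auto kind = CuNumeric::has_numamem ? Memory::Kind::SOCKET_MEM
                                         : Memory::Kind::SYSTEM_MEM;
      auto buf = legate::create_buffer<VAL>(size, kind);

  can now simply be:

      auto buf = legate::create_buffer<VAL>(size);
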
* Use layout-aware create_output_buffer to create output buffers for unbound output stores * Start running OpenMP tests with socket memories * Unify temporary buffer allocations for matrix tasks on CPUs * Remove references to an obsolete tunable name * Another place where the obsolete tunable was being used --- .github/workflows/ci.yml | 2 +- cunumeric/config.py | 2 -- src/cunumeric/cunumeric.cc | 10 ------ src/cunumeric/cunumeric.h | 4 --- src/cunumeric/cunumeric_c.h | 1 - src/cunumeric/index/repeat_omp.cc | 3 +- src/cunumeric/index/repeat_template.inl | 4 +-- src/cunumeric/mapper.cc | 9 ------ src/cunumeric/matrix/contract_omp.cc | 6 ++-- src/cunumeric/matrix/matmul_omp.cc | 5 +-- src/cunumeric/matrix/matvecmul_omp.cc | 5 +-- src/cunumeric/matrix/solve_cpu.inl | 17 +++------- src/cunumeric/matrix/util.cc | 3 +- src/cunumeric/matrix/util_omp.cc | 8 ----- src/cunumeric/matrix/util_omp.h | 2 -- src/cunumeric/search/argwhere_template.inl | 5 +-- src/cunumeric/search/nonzero.cc | 16 +++++----- src/cunumeric/search/nonzero.cu | 16 +++++----- src/cunumeric/search/nonzero_omp.cc | 17 +++++----- src/cunumeric/search/nonzero_template.inl | 9 ++---- src/cunumeric/set/unique.cc | 21 ++++++------- src/cunumeric/set/unique.cu | 33 +++++++++++--------- src/cunumeric/set/unique_omp.cc | 22 ++++++------- src/cunumeric/set/unique_reduce.cc | 7 ++--- src/cunumeric/set/unique_reduce_template.inl | 6 +--- src/cunumeric/set/unique_template.inl | 8 ++--- src/cunumeric/sort/sort.cu | 1 - tests/unit/cunumeric/test_config.py | 1 - 28 files changed, 86 insertions(+), 157 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index be0db706c..b9b3bc526 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -77,7 +77,7 @@ jobs: - {name: GPU test, options: --use cuda --gpus 1 --debug, log: gpu} - {name: 2 GPUs test, options: --use cuda --gpus 2 --debug, log: gpus} - {name: OpenMP test, options: --use openmp --omps 1 --ompthreads 2 --debug, log: omp} - - {name: 2 OpenMPs test, options: --use openmp --omps 2 --ompthreads 2 --debug, log: omps} + - {name: 2 NUMA OpenMPs test, options: --use openmp --omps 2 --ompthreads 2 --numamem 2048 --debug, log: omps} - {name: Eager execution test, options: --use eager --debug, log: eager} - {name: mypy, options: mypy, log: mypy} - {name: documentation, options: docs, log: docs} diff --git a/cunumeric/config.py b/cunumeric/config.py index 9195022d6..cad52e77f 100644 --- a/cunumeric/config.py +++ b/cunumeric/config.py @@ -194,7 +194,6 @@ class _CunumericSharedLib: CUNUMERIC_TRANSPOSE_COPY_2D: int CUNUMERIC_TRILU: int CUNUMERIC_TRSM: int - CUNUMERIC_TUNABLE_HAS_NUMAMEM: int CUNUMERIC_TUNABLE_MAX_EAGER_VOLUME: int CUNUMERIC_TUNABLE_NUM_GPUS: int CUNUMERIC_TUNABLE_NUM_PROCS: int @@ -524,7 +523,6 @@ class CuNumericTunable(IntEnum): NUM_GPUS = _cunumeric.CUNUMERIC_TUNABLE_NUM_GPUS NUM_PROCS = _cunumeric.CUNUMERIC_TUNABLE_NUM_PROCS MAX_EAGER_VOLUME = _cunumeric.CUNUMERIC_TUNABLE_MAX_EAGER_VOLUME - HAS_NUMAMEM = _cunumeric.CUNUMERIC_TUNABLE_HAS_NUMAMEM # Match these to CuNumericScanCode in cunumeric_c.h diff --git a/src/cunumeric/cunumeric.cc b/src/cunumeric/cunumeric.cc index bf1ef7657..e8f87bbf6 100644 --- a/src/cunumeric/cunumeric.cc +++ b/src/cunumeric/cunumeric.cc @@ -25,9 +25,6 @@ namespace cunumeric { static const char* const cunumeric_library_name = "cunumeric"; -/*static*/ bool CuNumeric::has_numamem = false; -/*static*/ MapperID CuNumeric::mapper_id = -1; - /*static*/ LegateTaskRegistrar& CuNumeric::get_registrar() { static 
LegateTaskRegistrar registrar;
@@ -60,7 +57,6 @@ void registration_callback(Machine machine,
 #endif
 
   // Now we can register our mapper with the runtime
-  CuNumeric::mapper_id = context.get_mapper_id(0);
   context.register_mapper(new CuNumericMapper(runtime, machine, context), 0);
 }
 
@@ -74,12 +70,6 @@ void cunumeric_perform_registration(void)
   // in before the runtime starts and make it global so that we know
   // that this call back is invoked everywhere across all nodes
   Runtime::perform_registration_callback(cunumeric::registration_callback, true /*global*/);
-
-  Runtime* runtime = Runtime::get_runtime();
-  Context ctx = Runtime::get_context();
-  Future fut = runtime->select_tunable_value(
-    ctx, CUNUMERIC_TUNABLE_HAS_NUMAMEM, cunumeric::CuNumeric::mapper_id);
-  if (fut.get_result<int32_t>() != 0) cunumeric::CuNumeric::has_numamem = true;
 }
 
 bool cunumeric_has_curand()
diff --git a/src/cunumeric/cunumeric.h b/src/cunumeric/cunumeric.h
index 32af7e6b7..11c4cd990 100644
--- a/src/cunumeric/cunumeric.h
+++ b/src/cunumeric/cunumeric.h
@@ -37,10 +37,6 @@ struct CuNumeric {
     get_registrar().record_variant(std::forward<Args>(args)...);
   }
   static legate::LegateTaskRegistrar& get_registrar();
-
- public:
-  static bool has_numamem;
-  static Legion::MapperID mapper_id;
 };
 
 template <typename T>
diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h
index 462214782..724db0013 100644
--- a/src/cunumeric/cunumeric_c.h
+++ b/src/cunumeric/cunumeric_c.h
@@ -206,7 +206,6 @@ enum CuNumericTunable {
   CUNUMERIC_TUNABLE_NUM_GPUS = 1,
   CUNUMERIC_TUNABLE_NUM_PROCS = 2,
   CUNUMERIC_TUNABLE_MAX_EAGER_VOLUME = 3,
-  CUNUMERIC_TUNABLE_HAS_NUMAMEM = 4,
 };
 
 enum CuNumericBounds {
diff --git a/src/cunumeric/index/repeat_omp.cc b/src/cunumeric/index/repeat_omp.cc
index 823a1a16a..9344452d1 100644
--- a/src/cunumeric/index/repeat_omp.cc
+++ b/src/cunumeric/index/repeat_omp.cc
@@ -62,9 +62,8 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
                   const int32_t axis,
                   const Rect<DIM>& in_rect) const
   {
-    auto kind = CuNumeric::has_numamem ? Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM;
     int64_t axis_extent = in_rect.hi[axis] - in_rect.lo[axis] + 1;
-    auto offsets = create_buffer<int64_t>(axis_extent, kind);
+    auto offsets = create_buffer<int64_t>(axis_extent);
 
     const auto max_threads = omp_get_max_threads();
     ThreadLocalStorage<int64_t> local_sums(max_threads);
diff --git a/src/cunumeric/index/repeat_template.inl b/src/cunumeric/index/repeat_template.inl
index c47603916..30b3249cf 100644
--- a/src/cunumeric/index/repeat_template.inl
+++ b/src/cunumeric/index/repeat_template.inl
@@ -38,9 +38,7 @@ struct RepeatImpl {
     auto input_arr = args.input.read_accessor<VAL, DIM>(input_rect);
 
     if (input_rect.empty()) {
-      auto extents = Point<DIM>::ZEROES();
-      auto buffer = create_buffer<VAL>(extents);
-      args.output.return_data(buffer, extents);
+      args.output.make_empty();
       return;
     }
diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc
index ada6ca268..51797acfe 100644
--- a/src/cunumeric/mapper.cc
+++ b/src/cunumeric/mapper.cc
@@ -65,15 +65,6 @@ Scalar CuNumericMapper::tunable_value(TunableID tunable_id)
     }
     return Scalar(eager_volume);
   }
-    case CUNUMERIC_TUNABLE_HAS_NUMAMEM: {
-      // TODO: This assumes that either all OpenMP processors across the machine have a NUMA
-      // memory or none does.
- Legion::Machine::MemoryQuery query(machine); - query.local_address_space(); - query.only_kind(Legion::Memory::SOCKET_MEM); - int32_t has_numamem = query.count() > 0; - return Scalar(has_numamem); - } default: break; } LEGATE_ABORT; // unknown tunable value diff --git a/src/cunumeric/matrix/contract_omp.cc b/src/cunumeric/matrix/contract_omp.cc index 4a1dd27b2..659db3f0a 100644 --- a/src/cunumeric/matrix/contract_omp.cc +++ b/src/cunumeric/matrix/contract_omp.cc @@ -112,17 +112,17 @@ struct ContractImplBody { std::vector lhs_copy_strides(lhs_ndim); int64_t lhs_size = calculate_volume(lhs_ndim, lhs_shape, lhs_copy_strides.data()); - float* lhs_copy_data = allocate_buffer_omp(lhs_size); + float* lhs_copy_data = allocate_buffer(lhs_size); half_tensor_to_float_omp(lhs_copy_data, lhs_data, lhs_ndim, lhs_shape, lhs_strides); std::vector rhs1_copy_strides(rhs1_ndim); int64_t rhs1_size = calculate_volume(rhs1_ndim, rhs1_shape, rhs1_copy_strides.data()); - float* rhs1_copy_data = allocate_buffer_omp(rhs1_size); + float* rhs1_copy_data = allocate_buffer(rhs1_size); half_tensor_to_float_omp(rhs1_copy_data, rhs1_data, rhs1_ndim, rhs1_shape, rhs1_strides); std::vector rhs2_copy_strides(rhs2_ndim); int64_t rhs2_size = calculate_volume(rhs2_ndim, rhs2_shape, rhs2_copy_strides.data()); - float* rhs2_copy_data = allocate_buffer_omp(rhs2_size); + float* rhs2_copy_data = allocate_buffer(rhs2_size); half_tensor_to_float_omp(rhs2_copy_data, rhs2_data, rhs2_ndim, rhs2_shape, rhs2_strides); ContractImplBody{}(lhs_copy_data, diff --git a/src/cunumeric/matrix/matmul_omp.cc b/src/cunumeric/matrix/matmul_omp.cc index 72b7add85..dd8ea9910 100644 --- a/src/cunumeric/matrix/matmul_omp.cc +++ b/src/cunumeric/matrix/matmul_omp.cc @@ -16,6 +16,7 @@ #include "cunumeric/matrix/matmul.h" #include "cunumeric/matrix/matmul_template.inl" +#include "cunumeric/matrix/util.h" #include "cunumeric/matrix/util_omp.h" #include @@ -102,8 +103,8 @@ struct MatMulImplBody { bool rhs1_transposed, bool rhs2_transposed) { - auto rhs1_copy = allocate_buffer_omp(m * k); - auto rhs2_copy = allocate_buffer_omp(k * n); + auto rhs1_copy = allocate_buffer(m * k); + auto rhs2_copy = allocate_buffer(k * n); if (rhs1_transposed) half_matrix_to_float_omp(rhs1_copy, rhs1, k, m, rhs1_stride); diff --git a/src/cunumeric/matrix/matvecmul_omp.cc b/src/cunumeric/matrix/matvecmul_omp.cc index 33a59052c..4166098be 100644 --- a/src/cunumeric/matrix/matvecmul_omp.cc +++ b/src/cunumeric/matrix/matvecmul_omp.cc @@ -16,6 +16,7 @@ #include "cunumeric/matrix/matvecmul.h" #include "cunumeric/matrix/matvecmul_template.inl" +#include "cunumeric/matrix/util.h" #include "cunumeric/matrix/util_omp.h" #include @@ -68,8 +69,8 @@ struct MatVecMulImplBody { { auto vec_size = transpose_mat ? m : n; - auto mat_copy = allocate_buffer_omp(m * n); - auto vec_copy = allocate_buffer_omp(vec_size); + auto mat_copy = allocate_buffer(m * n); + auto vec_copy = allocate_buffer(vec_size); half_matrix_to_float_omp(mat_copy, mat, m, n, mat_stride); half_vector_to_float_omp(vec_copy, vec, vec_size); diff --git a/src/cunumeric/matrix/solve_cpu.inl b/src/cunumeric/matrix/solve_cpu.inl index 98cba89aa..83dae8063 100644 --- a/src/cunumeric/matrix/solve_cpu.inl +++ b/src/cunumeric/matrix/solve_cpu.inl @@ -24,20 +24,11 @@ namespace cunumeric { using namespace Legion; using namespace legate; -template -Memory::Kind get_memory_kind() -{ - if constexpr (KIND == VariantKind::OMP) - return CuNumeric::has_numamem ? 
Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM;
-  else
-    return Memory::Kind::SYSTEM_MEM;
-}
-
 template <VariantKind KIND>
 struct SolveImplBody<KIND, LegateTypeCode::FLOAT_LT> {
   void operator()(int32_t m, int32_t n, int32_t nrhs, float* a, float* b)
   {
-    auto ipiv = create_buffer<int32_t>(std::min(m, n), get_memory_kind<KIND>());
+    auto ipiv = create_buffer<int32_t>(std::min(m, n));
 
     int32_t info = 0;
     LAPACK_sgesv(&n, &nrhs, a, &m, ipiv.ptr(0), b, &n, &info);
@@ -50,7 +41,7 @@ template <VariantKind KIND>
 struct SolveImplBody<KIND, LegateTypeCode::DOUBLE_LT> {
   void operator()(int32_t m, int32_t n, int32_t nrhs, double* a, double* b)
   {
-    auto ipiv = create_buffer<int32_t>(std::min(m, n), get_memory_kind<KIND>());
+    auto ipiv = create_buffer<int32_t>(std::min(m, n));
 
     int32_t info = 0;
     LAPACK_dgesv(&n, &nrhs, a, &m, ipiv.ptr(0), b, &n, &info);
@@ -63,7 +54,7 @@ template <VariantKind KIND>
 struct SolveImplBody<KIND, LegateTypeCode::COMPLEX64_LT> {
   void operator()(int32_t m, int32_t n, int32_t nrhs, complex<float>* a_, complex<float>* b_)
   {
-    auto ipiv = create_buffer<int32_t>(std::min(m, n), get_memory_kind<KIND>());
+    auto ipiv = create_buffer<int32_t>(std::min(m, n));
 
     auto a = reinterpret_cast<__complex__ float*>(a_);
     auto b = reinterpret_cast<__complex__ float*>(b_);
@@ -79,7 +70,7 @@ template <VariantKind KIND>
 struct SolveImplBody<KIND, LegateTypeCode::COMPLEX128_LT> {
   void operator()(int32_t m, int32_t n, int32_t nrhs, complex<double>* a_, complex<double>* b_)
   {
-    auto ipiv = create_buffer<int32_t>(std::min(m, n), get_memory_kind<KIND>());
+    auto ipiv = create_buffer<int32_t>(std::min(m, n));
 
     auto a = reinterpret_cast<__complex__ double*>(a_);
     auto b = reinterpret_cast<__complex__ double*>(b_);
diff --git a/src/cunumeric/matrix/util.cc b/src/cunumeric/matrix/util.cc
index f2bbb88ee..67010f062 100644
--- a/src/cunumeric/matrix/util.cc
+++ b/src/cunumeric/matrix/util.cc
@@ -74,8 +74,7 @@ int64_t calculate_volume(size_t ndim, const int64_t* shape, int64_t* strides)
 
 float* allocate_buffer(size_t size)
 {
-  // We will not call this function on GPUs
-  auto buffer = legate::create_buffer<float>(size, Memory::Kind::SYSTEM_MEM);
+  auto buffer = legate::create_buffer<float>(size);
   return buffer.ptr(0);
 }
diff --git a/src/cunumeric/matrix/util_omp.cc b/src/cunumeric/matrix/util_omp.cc
index c847ce6cf..af157e285 100644
--- a/src/cunumeric/matrix/util_omp.cc
+++ b/src/cunumeric/matrix/util_omp.cc
@@ -24,14 +24,6 @@ namespace cunumeric {
 
 using namespace Legion;
 
-float* allocate_buffer_omp(size_t size)
-{
-  Memory::Kind kind = CuNumeric::has_numamem ?
Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM; - // We will not call this function on GPUs - auto buffer = legate::create_buffer(size, kind); - return buffer.ptr(0); -} - void half_vector_to_float_omp(float* out, const __half* ptr, size_t n) { #pragma omp parallel for schedule(static) diff --git a/src/cunumeric/matrix/util_omp.h b/src/cunumeric/matrix/util_omp.h index b17072b3c..805c622e7 100644 --- a/src/cunumeric/matrix/util_omp.h +++ b/src/cunumeric/matrix/util_omp.h @@ -20,8 +20,6 @@ namespace cunumeric { -float* allocate_buffer_omp(size_t size); - // The following assume that the float array was created using allocate_buffer void half_vector_to_float_omp(float* out, const __half* ptr, size_t n); diff --git a/src/cunumeric/search/argwhere_template.inl b/src/cunumeric/search/argwhere_template.inl index f609eaef1..da9224bea 100644 --- a/src/cunumeric/search/argwhere_template.inl +++ b/src/cunumeric/search/argwhere_template.inl @@ -41,10 +41,7 @@ struct ArgWhereImpl { size_t volume = pitches.flatten(rect_in); if (volume == 0) { - auto extents = Point<2>::ZEROES(); - // auto extents = Point<2>(0,DIM); - auto buffer = create_buffer(extents); - args.out.return_data(buffer, extents); + args.out.make_empty(); return; } diff --git a/src/cunumeric/search/nonzero.cc b/src/cunumeric/search/nonzero.cc index 76a0dd8ea..0ccea91f6 100644 --- a/src/cunumeric/search/nonzero.cc +++ b/src/cunumeric/search/nonzero.cc @@ -26,11 +26,11 @@ template struct NonzeroImplBody { using VAL = legate_type_of; - size_t operator()(const AccessorRO& in, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - std::vector>& results) + void operator()(std::vector& outputs, + const AccessorRO& in, + const Pitches& pitches, + const Rect& rect, + const size_t volume) { int64_t size = 0; @@ -39,7 +39,9 @@ struct NonzeroImplBody { size += in[point] != VAL(0); } - for (auto& result : results) result = create_buffer(size, Memory::Kind::SYSTEM_MEM); + std::vector> results; + for (auto& output : outputs) + results.push_back(output.create_output_buffer(Point<1>(size), true)); int64_t out_idx = 0; for (size_t idx = 0; idx < volume; ++idx) { @@ -49,8 +51,6 @@ struct NonzeroImplBody { ++out_idx; } assert(size == out_idx); - - return size; } }; diff --git a/src/cunumeric/search/nonzero.cu b/src/cunumeric/search/nonzero.cu index a542b1bac..6356d1076 100644 --- a/src/cunumeric/search/nonzero.cu +++ b/src/cunumeric/search/nonzero.cu @@ -62,23 +62,23 @@ struct NonzeroImplBody { volume, in, pitches, rect.lo, offsets, p_results); } - size_t operator()(const AccessorRO& in, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - std::vector>& results) + void operator()(std::vector& outputs, + const AccessorRO& in, + const Pitches& pitches, + const Rect& rect, + const size_t volume) { auto stream = get_cached_stream(); auto offsets = create_buffer(volume, Memory::Kind::GPU_FB_MEM); auto size = compute_offsets(in, pitches, rect, volume, offsets, stream); - for (auto& result : results) result = create_buffer(size, Memory::Kind::GPU_FB_MEM); + std::vector> results; + for (auto& output : outputs) + results.push_back(output.create_output_buffer(Point<1>(size), true)); if (size > 0) populate_nonzeros(in, pitches, rect, volume, results, offsets, stream); CHECK_CUDA_STREAM(stream); - - return size; } }; diff --git a/src/cunumeric/search/nonzero_omp.cc b/src/cunumeric/search/nonzero_omp.cc index 178956bd6..b294567c7 100644 --- a/src/cunumeric/search/nonzero_omp.cc +++ b/src/cunumeric/search/nonzero_omp.cc @@ -29,11 
+29,11 @@ template struct NonzeroImplBody { using VAL = legate_type_of; - size_t operator()(const AccessorRO& in, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - std::vector>& results) + void operator()(std::vector& outputs, + const AccessorRO& in, + const Pitches& pitches, + const Rect& rect, + const size_t volume) { const auto max_threads = omp_get_max_threads(); @@ -59,8 +59,9 @@ struct NonzeroImplBody { for (auto idx = 1; idx < max_threads; ++idx) offsets[idx] = offsets[idx - 1] + sizes[idx - 1]; } - auto kind = CuNumeric::has_numamem ? Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM; - for (auto& result : results) result = create_buffer(size, kind); + std::vector> results; + for (auto& output : outputs) + results.push_back(output.create_output_buffer(Point<1>(size), true)); #pragma omp parallel { @@ -74,8 +75,6 @@ struct NonzeroImplBody { ++out_idx; } } - - return size; } }; diff --git a/src/cunumeric/search/nonzero_template.inl b/src/cunumeric/search/nonzero_template.inl index cfeaaefaf..0d5227a87 100644 --- a/src/cunumeric/search/nonzero_template.inl +++ b/src/cunumeric/search/nonzero_template.inl @@ -41,17 +41,12 @@ struct NonzeroImpl { size_t volume = pitches.flatten(rect); if (volume == 0) { - auto empty = create_buffer(0); - for (auto& store : args.results) store.return_data(empty, Point<1>(0)); + for (auto& store : args.results) store.make_empty(); return; } auto in = args.input.read_accessor(rect); - std::vector> results(DIM); - auto size = NonzeroImplBody()(in, pitches, rect, volume, results); - - for (int32_t idx = 0; idx < DIM; ++idx) - args.results[idx].return_data(results[idx], Point<1>(size)); + NonzeroImplBody()(args.results, in, pitches, rect, volume); } }; diff --git a/src/cunumeric/set/unique.cc b/src/cunumeric/set/unique.cc index 997d99cd6..b7ff2f25f 100644 --- a/src/cunumeric/set/unique.cc +++ b/src/cunumeric/set/unique.cc @@ -26,13 +26,14 @@ template struct UniqueImplBody { using VAL = legate_type_of; - std::pair, size_t> operator()(const AccessorRO& in, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - const std::vector& comms, - const DomainPoint& point, - const Domain& launch_domain) + void operator()(Array& output, + const AccessorRO& in, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + const std::vector& comms, + const DomainPoint& point, + const Domain& launch_domain) { std::set dedup_set; @@ -41,13 +42,9 @@ struct UniqueImplBody { dedup_set.insert(in[p]); } - size_t size = dedup_set.size(); + auto result = output.create_output_buffer(dedup_set.size(), true); size_t pos = 0; - auto result = create_buffer(size); - for (auto e : dedup_set) result[pos++] = e; - - return std::make_pair(result, size); } }; diff --git a/src/cunumeric/set/unique.cu b/src/cunumeric/set/unique.cu index 2cc4e6363..9104474ef 100644 --- a/src/cunumeric/set/unique.cu +++ b/src/cunumeric/set/unique.cu @@ -49,8 +49,12 @@ using Piece = std::pair, size_t>; auto get_aligned_size = [](auto size) { return std::max(16, (size + 15) / 16 * 16); }; template -static Piece tree_reduce( - Piece my_piece, size_t my_id, size_t num_ranks, cudaStream_t stream, ncclComm_t* comm) +static Piece tree_reduce(Array& output, + Piece my_piece, + size_t my_id, + size_t num_ranks, + cudaStream_t stream, + ncclComm_t* comm) { size_t remaining = num_ranks; size_t radix = 2; @@ -114,7 +118,7 @@ static Piece tree_reduce( auto buf_size = (get_aligned_size(my_piece.second * sizeof(VAL)) + sizeof(VAL) - 1) / sizeof(VAL); assert(my_piece.second <= 
buf_size); - my_piece.first = create_buffer(buf_size); + my_piece.first = output.create_output_buffer(buf_size); CHECK_CUDA(cudaMemcpyAsync(my_piece.first.ptr(0), p_merged, @@ -130,7 +134,7 @@ static Piece tree_reduce( if (my_id != 0) { my_piece.second = 0; - my_piece.first = create_buffer(0); + my_piece.first = output.create_output_buffer(0); } return my_piece; @@ -140,13 +144,14 @@ template struct UniqueImplBody { using VAL = legate_type_of; - Piece operator()(const AccessorRO& in, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - const std::vector& comms, - const DomainPoint& point, - const Domain& launch_domain) + void operator()(Array& output, + const AccessorRO& in, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + const std::vector& comms, + const DomainPoint& point, + const Domain& launch_domain) { auto stream = get_cached_stream(); @@ -175,7 +180,7 @@ struct UniqueImplBody { result.second = end - ptr; auto buf_size = (get_aligned_size(result.second * sizeof(VAL)) + sizeof(VAL) - 1) / sizeof(VAL); assert(end - ptr <= buf_size); - result.first = create_buffer(buf_size); + result.first = output.create_output_buffer(buf_size); if (result.second > 0) CHECK_CUDA(cudaMemcpyAsync( result.first.ptr(0), ptr, sizeof(VAL) * result.second, cudaMemcpyDeviceToDevice, stream)); @@ -184,12 +189,12 @@ struct UniqueImplBody { // The launch domain is 1D because of the output region assert(point.dim == 1); auto comm = comms[0].get(); - result = tree_reduce(result, point[0], launch_domain.get_volume(), stream, comm); + result = tree_reduce(output, result, point[0], launch_domain.get_volume(), stream, comm); } CHECK_CUDA_STREAM(stream); // Finally we pack the result - return result; + output.return_data(result.first, Point<1>(result.second)); } }; diff --git a/src/cunumeric/set/unique_omp.cc b/src/cunumeric/set/unique_omp.cc index 42914d853..656da03cc 100644 --- a/src/cunumeric/set/unique_omp.cc +++ b/src/cunumeric/set/unique_omp.cc @@ -28,13 +28,14 @@ template struct UniqueImplBody { using VAL = legate_type_of; - std::pair, size_t> operator()(const AccessorRO& in, - const Pitches& pitches, - const Rect& rect, - const size_t volume, - const std::vector& comms, - const DomainPoint& point, - const Domain& launch_domain) + void operator()(Array& output, + const AccessorRO& in, + const Pitches& pitches, + const Rect& rect, + const size_t volume, + const std::vector& comms, + const DomainPoint& point, + const Domain& launch_domain) { const auto max_threads = omp_get_max_threads(); std::vector> dedup_set(max_threads); @@ -66,14 +67,9 @@ struct UniqueImplBody { } auto& final_dedup_set = dedup_set[0]; - size_t size = final_dedup_set.size(); + auto result = output.create_output_buffer(final_dedup_set.size(), true); size_t pos = 0; - auto kind = CuNumeric::has_numamem ? 
Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM; - auto result = create_buffer(size, kind); - for (auto e : final_dedup_set) result[pos++] = e; - - return std::make_pair(result, size); } }; diff --git a/src/cunumeric/set/unique_reduce.cc b/src/cunumeric/set/unique_reduce.cc index 4f8a4d29e..129d9903f 100644 --- a/src/cunumeric/set/unique_reduce.cc +++ b/src/cunumeric/set/unique_reduce.cc @@ -26,8 +26,7 @@ template struct UniqueReduceImplBody { using VAL = legate_type_of; - std::pair, size_t> operator()( - const std::vector, Rect<1>>>& inputs) + void operator()(Array& output, const std::vector, Rect<1>>>& inputs) { std::set dedup_set; @@ -39,11 +38,9 @@ struct UniqueReduceImplBody { size_t size = dedup_set.size(); size_t pos = 0; - auto result = create_buffer(size); + auto result = output.create_output_buffer(Point<1>(size), true); for (auto e : dedup_set) result[pos++] = e; - - return std::make_pair(result, size); } }; diff --git a/src/cunumeric/set/unique_reduce_template.inl b/src/cunumeric/set/unique_reduce_template.inl index 58d810405..c51976e0a 100644 --- a/src/cunumeric/set/unique_reduce_template.inl +++ b/src/cunumeric/set/unique_reduce_template.inl @@ -43,11 +43,7 @@ struct UniqueReduceImpl { inputs.push_back(std::make_pair(acc, shape)); } - size_t size; - Buffer result; - std::tie(result, size) = UniqueReduceImplBody()(inputs); - - output.return_data(result, Point<1>(size)); + UniqueReduceImplBody()(output, inputs); } }; diff --git a/src/cunumeric/set/unique_template.inl b/src/cunumeric/set/unique_template.inl index e1fbf076c..04f458331 100644 --- a/src/cunumeric/set/unique_template.inl +++ b/src/cunumeric/set/unique_template.inl @@ -44,12 +44,8 @@ struct UniqueImpl { size_t volume = pitches.flatten(rect); auto in = input.read_accessor(rect); - size_t size; - Buffer result; - std::tie(result, size) = - UniqueImplBody()(in, pitches, rect, volume, comms, point, launch_domain); - - output.return_data(result, Point<1>(size)); + UniqueImplBody()( + output, in, pitches, rect, volume, comms, point, launch_domain); } }; diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 3d76f37e7..0297056d1 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -1426,7 +1426,6 @@ void sample_sort_nccl_nd(SortPiece> local_sorted, CHECK_NCCL(ncclGroupEnd()); // we need the amount of data to transfer on the host --> get it - // FIXME auto kind = CuNumeric::has_numamem ? 
Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM; Buffer size_send_total = create_buffer(num_sort_ranks, Memory::Z_COPY_MEM); Buffer size_recv_total = create_buffer(num_sort_ranks, Memory::Z_COPY_MEM); { diff --git a/tests/unit/cunumeric/test_config.py b/tests/unit/cunumeric/test_config.py index ddede6241..a3cbd1529 100644 --- a/tests/unit/cunumeric/test_config.py +++ b/tests/unit/cunumeric/test_config.py @@ -248,7 +248,6 @@ def test_CuNumericTunable() -> None: "NUM_GPUS", "NUM_PROCS", "MAX_EAGER_VOLUME", - "HAS_NUMAMEM", } From d870e9a8ecfebf8f50de7583a27a34ea90d6c10c Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Tue, 22 Nov 2022 14:36:23 -0800 Subject: [PATCH 53/89] Don't use cmake 3.25.0 in build-isolation mode (#714) Co-authored-by: Manolis Papadakis --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 22727eb03..5ac994ab9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ requires = [ "ninja", "setuptools", "scikit-build>=0.13.1", - "cmake>=3.22.1,!=3.23.0", + "cmake>=3.22.1,!=3.23.0,!=3.25.0", ] [tool.pytest.ini_options] From eb46e15eb2c39001c52e50d6f3d4c2904658c3ef Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 23 Nov 2022 09:02:47 -0800 Subject: [PATCH 54/89] Minor type improvements (#716) * remove obsolete show_none_errors * tighten up runtime.py * tighten up coverage.py * tighten up linalg.py * tighten up __init__.py * tighten up sort.py * tighten up array.py * tighten up module.py * tighten up eager.py * tighten up deferred.py * tighten up utils.py * fix initial param bug * remove spurious assert --- cunumeric/__init__.py | 2 +- cunumeric/array.py | 10 +++++++--- cunumeric/coverage.py | 2 +- cunumeric/deferred.py | 16 +++++++++------- cunumeric/eager.py | 26 ++++++++++++++------------ cunumeric/linalg/linalg.py | 8 ++++++-- cunumeric/module.py | 14 ++++++++++---- cunumeric/runtime.py | 13 ++++++++----- cunumeric/sort.py | 4 +++- cunumeric/utils.py | 2 +- pyproject.toml | 1 - 11 files changed, 60 insertions(+), 38 deletions(-) diff --git a/cunumeric/__init__.py b/cunumeric/__init__.py index b8b028f9f..7c9e122aa 100644 --- a/cunumeric/__init__.py +++ b/cunumeric/__init__.py @@ -42,4 +42,4 @@ from . 
import _version -__version__ = _version.get_versions()["version"] # type: ignore +__version__ = _version.get_versions()["version"] # type: ignore [no-untyped-call] diff --git a/cunumeric/array.py b/cunumeric/array.py index 2c7b7f770..f40444d58 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -33,10 +33,14 @@ import legate.core.types as ty import numpy as np -import pyarrow # type: ignore +import pyarrow # type: ignore [import] from legate.core import Array -from numpy.core.multiarray import normalize_axis_index # type: ignore -from numpy.core.numeric import normalize_axis_tuple # type: ignore +from numpy.core.multiarray import ( # type: ignore [attr-defined] + normalize_axis_index, +) +from numpy.core.numeric import ( # type: ignore [attr-defined] + normalize_axis_tuple, +) from typing_extensions import ParamSpec from .config import ( diff --git a/cunumeric/coverage.py b/cunumeric/coverage.py index f4d2e0128..f8f4446ae 100644 --- a/cunumeric/coverage.py +++ b/cunumeric/coverage.py @@ -78,7 +78,7 @@ class CuWrapperMetadata: class CuWrapped(AnyCallable, Protocol): _cunumeric: CuWrapperMetadata - __wrapped__: Any + __wrapped__: AnyCallable __name__: str __qualname__: str diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 5c9330ee4..a7ba5d6c0 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -36,7 +36,9 @@ import legate.core.types as ty import numpy as np from legate.core import Annotation, Future, ReductionOp, Store -from numpy.core.numeric import normalize_axis_tuple # type: ignore +from numpy.core.numeric import ( # type: ignore [attr-defined] + normalize_axis_tuple, +) from typing_extensions import ParamSpec from .config import ( @@ -710,7 +712,7 @@ def _create_indexing_array( shift = 0 for dim, k in enumerate(key): if np.isscalar(k): - if k < 0: # type: ignore + if k < 0: # type: ignore [operator] k += store.shape[dim + shift] store = store.project(dim + shift, k) shift -= 1 @@ -787,7 +789,7 @@ def _get_view(self, key: Any) -> DeferredArray: elif isinstance(k, slice): k, store = self._slice_store(k, store, dim + shift) elif np.isscalar(k): - if k < 0: # type: ignore + if k < 0: # type: ignore [operator] k += store.shape[dim + shift] store = store.project(dim + shift, k) shift -= 1 @@ -3032,7 +3034,7 @@ def unary_reduction( args: Any, initial: Any, ) -> None: - lhs_array = self + lhs_array: Union[NumPyThunk, DeferredArray] = self rhs_array = src assert lhs_array.ndim <= rhs_array.ndim @@ -3040,7 +3042,7 @@ def unary_reduction( if argred: argred_dtype = self.runtime.get_arg_dtype(rhs_array.dtype) - lhs_array = self.runtime.create_empty_thunk( # type: ignore + lhs_array = self.runtime.create_empty_thunk( lhs_array.shape, dtype=argred_dtype, inputs=[self], @@ -3060,7 +3062,7 @@ def unary_reduction( lhs_array.fill(np.array(fill_value, dtype=lhs_array.dtype)) - lhs = lhs_array.base + lhs = lhs_array.base # type: ignore while lhs.ndim > 1: lhs = lhs.project(0, 0) @@ -3094,7 +3096,7 @@ def unary_reduction( # If output dims is not 0, then we must have axes assert axes is not None # Reduction to a smaller array - result = lhs_array.base + result = lhs_array.base # type: ignore if keepdims: for axis in axes: result = result.project(axis, 0) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 530b805c5..0c792fbae 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -215,14 +215,17 @@ def __init__( self.key: Optional[tuple[Any, ...]] = key #: if this ever becomes set (to a DeferredArray), we forward all #: operations to it - self.deferred: 
Optional[DeferredArray] = None + self.deferred: Optional[Union[DeferredArray, NumPyThunk]] = None self.escaped = False @property def storage(self) -> Union[Future, tuple[Region, FieldID]]: if self.deferred is None: self.to_deferred_array() - return self.deferred.storage # type: ignore + + assert self.deferred is not None + + return self.deferred.storage @property def shape(self) -> NdShape: @@ -265,10 +268,9 @@ def _convert_children(self) -> None: assert self.runtime.is_deferred_array(self.deferred) for child in self.children: if child.deferred is None: - # mypy can't deduce that children nodes will always have - # their .key attribute set. - func = getattr(self.deferred, child.key[0]) # type: ignore - args = child.key[1:] # type: ignore + assert child.key is not None + func = getattr(self.deferred, child.key[0]) + args = child.key[1:] child.deferred = func(*args) # After we've made all the deferred views for each child then # we can traverse down. Do it this way so we can get partition @@ -298,7 +300,7 @@ def to_deferred_array(self) -> DeferredArray: shape=self.shape, ) else: - self.deferred = self.runtime.find_or_create_array_thunk( # type: ignore # noqa E501 + self.deferred = self.runtime.find_or_create_array_thunk( self.array, share=self.escaped, defer=True, @@ -334,7 +336,7 @@ def convolve(self, v: Any, out: Any, mode: ConvolveMode) -> None: if self.ndim == 1: out.array = np.convolve(self.array, v.array, mode) else: - from scipy.signal import convolve # type: ignore + from scipy.signal import convolve # type: ignore [import] out.array = convolve(self.array, v.array, mode) @@ -1468,10 +1470,9 @@ def unary_reduction( return if op in _UNARY_RED_OPS: fn = _UNARY_RED_OPS[op] - if initial is None: - # NumPy starts using this predefined constant, instead of None, - # to mean no value was given by the caller - initial = np._NoValue # type: ignore + # Need to be more careful here, Numpy does not use None to mean + # "was not passed in" in this instance + kws = {"initial": initial} if initial is not None else {} fn( rhs.array, out=self.array, @@ -1480,6 +1481,7 @@ def unary_reduction( where=where if not isinstance(where, EagerArray) else where.array, + **kws, ) elif op == UnaryRedCode.ARGMAX: np.argmax( diff --git a/cunumeric/linalg/linalg.py b/cunumeric/linalg/linalg.py index 1eb8454c6..6474f56f3 100644 --- a/cunumeric/linalg/linalg.py +++ b/cunumeric/linalg/linalg.py @@ -17,8 +17,12 @@ from typing import TYPE_CHECKING, Sequence, Union import numpy as np -from numpy.core.multiarray import normalize_axis_index # type: ignore -from numpy.core.numeric import normalize_axis_tuple # type: ignore +from numpy.core.multiarray import ( # type: ignore [attr-defined] + normalize_axis_index, +) +from numpy.core.numeric import ( # type: ignore [attr-defined] + normalize_axis_tuple, +) from cunumeric._ufunc.math import add, sqrt as _sqrt from cunumeric.array import add_boilerplate, convert_to_cunumeric_ndarray diff --git a/cunumeric/module.py b/cunumeric/module.py index 95b5350cf..a2a972087 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -23,7 +23,9 @@ import numpy as np import opt_einsum as oe # type: ignore [import] -from numpy.core.multiarray import normalize_axis_index # type: ignore +from numpy.core.multiarray import ( # type: ignore [attr-defined] + normalize_axis_index, +) from numpy.core.numeric import ( # type: ignore [attr-defined] normalize_axis_tuple, ) @@ -4012,9 +4014,13 @@ def tensordot( # Trivial multi-tensor contraction strategy: contract in input order -class 
NullOptimizer(oe.paths.PathOptimizer): # type: ignore - def __call__( # type: ignore [no-untyped-def] - self, inputs, output, size_dict, memory_limit=None +class NullOptimizer(oe.paths.PathOptimizer): # type: ignore [misc,no-any-unimported] # noqa + def __call__( + self, + inputs: list[set[str]], + outputs: set[str], + size_dict: dict[str, int], + memory_limit: Union[int, None] = None, ) -> list[tuple[int, int]]: return [(0, 1)] + [(0, -1)] * (len(inputs) - 2) diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py index 2fdb97822..26d8ab207 100644 --- a/cunumeric/runtime.py +++ b/cunumeric/runtime.py @@ -51,6 +51,8 @@ from legate.core._legion.future import Future from legate.core.operation import AutoTask, ManualTask + from .array import ndarray + ARGS = [ Argument( "test", @@ -351,7 +353,7 @@ def is_supported_type(self, dtype: Union[str, np.dtype[Any]]) -> bool: def get_numpy_thunk( self, - obj: Any, + obj: Union[ndarray, npt.NDArray[Any]], share: bool = False, dtype: Optional[np.dtype[Any]] = None, ) -> NumPyThunk: @@ -403,11 +405,12 @@ def compute_parent_child_mapping( # slice object that was used to generate a child array from # a parent array so we can build the same mapping from a # logical region to a subregion - parent_ptr = int(array.base.ctypes.data) # type: ignore + assert array.base is not None + parent_ptr = int(array.base.ctypes.data) child_ptr = int(array.ctypes.data) assert child_ptr >= parent_ptr ptr_diff = child_ptr - parent_ptr - parent_shape = array.base.shape # type: ignore + parent_shape = array.base.shape div = ( reduce(lambda x, y: x * y, parent_shape) if len(parent_shape) > 1 @@ -425,8 +428,8 @@ def compute_parent_child_mapping( key: tuple[Union[slice, None], ...] = () child_idx = 0 child_strides = tuple(array.strides) - parent_strides = tuple(array.base.strides) # type: ignore - for idx in range(array.base.ndim): # type: ignore + parent_strides = tuple(array.base.strides) + for idx in range(array.base.ndim): # Handle the adding and removing dimension cases if parent_strides[idx] == 0: # This was an added dimension in the parent diff --git a/cunumeric/sort.py b/cunumeric/sort.py index 86fa1177e..fbca9146a 100644 --- a/cunumeric/sort.py +++ b/cunumeric/sort.py @@ -17,7 +17,9 @@ from typing import TYPE_CHECKING, Union, cast from legate.core import types as ty -from numpy.core.multiarray import normalize_axis_index # type: ignore +from numpy.core.multiarray import ( # type: ignore [attr-defined] + normalize_axis_index, +) from .config import CuNumericOpCode diff --git a/cunumeric/utils.py b/cunumeric/utils.py index fa5b4462d..25f0f19f1 100644 --- a/cunumeric/utils.py +++ b/cunumeric/utils.py @@ -114,7 +114,7 @@ def get_arg_dtype(dtype: np.dtype[Any]) -> np.dtype[Any]: def get_arg_value_dtype(dtype: np.dtype[Any]) -> np.dtype[Any]: dt = dtype.fields["arg_value"][0].type # type: ignore [index] - return cast(Any, dt) + return cast(np.dtype[Any], dt) Modes = Tuple[List[str], List[str], List[str]] diff --git a/pyproject.toml b/pyproject.toml index 5ac994ab9..73ebc13c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,6 @@ warn_no_return = true warn_return_any = false warn_unreachable = true -show_none_errors = true ignore_errors = false allow_untyped_globals = false From 3077504ca122a1ff8ab520c94b1d852680435f7a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Nov 2022 13:25:42 -0800 Subject: [PATCH 55/89] [pre-commit.ci] pre-commit autoupdate (#712) MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/pre-commit/mirrors-mypy: v0.982 → v0.991](https://github.com/pre-commit/mirrors-mypy/compare/v0.982...v0.991) - [github.com/pre-commit/mirrors-clang-format: v14.0.6 → v15.0.4](https://github.com/pre-commit/mirrors-clang-format/compare/v14.0.6...v15.0.4) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- .../execution_policy/indexing/parallel_loop.h | 3 +-- src/cunumeric/matrix/contract.cu | 3 +-- src/cunumeric/matrix/contract_template.inl | 18 ++++++------------ src/cunumeric/matrix/gemm_template.inl | 15 +++++---------- src/cunumeric/matrix/matmul_template.inl | 3 +-- src/cunumeric/matrix/matvecmul_template.inl | 3 +-- src/cunumeric/matrix/potrf_template.inl | 15 +++++---------- src/cunumeric/matrix/solve_template.inl | 15 +++++---------- src/cunumeric/matrix/syrk_template.inl | 15 +++++---------- src/cunumeric/matrix/trsm_template.inl | 15 +++++---------- src/cunumeric/scan/scan_global_util.h | 3 +-- src/cunumeric/scan/scan_local_util.h | 3 +-- src/cunumeric/sort/sort.cu | 9 +++------ src/cunumeric/unary/convert_util.h | 3 +-- src/cunumeric/unary/unary_red_util.h | 9 +++------ 16 files changed, 46 insertions(+), 90 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 798efa23d..dc8cecafe 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.982' + rev: 'v0.991' hooks: - id: mypy language: system @@ -19,7 +19,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v14.0.6' # Use the sha / tag you want to point at + rev: 'v15.0.4' # Use the sha / tag you want to point at hooks: - id: clang-format files: \.(cu|cuh|h|cc|inl)$ diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop.h b/src/cunumeric/execution_policy/indexing/parallel_loop.h index 31adf811f..609ed04ca 100644 --- a/src/cunumeric/execution_policy/indexing/parallel_loop.h +++ b/src/cunumeric/execution_policy/indexing/parallel_loop.h @@ -21,8 +21,7 @@ namespace cunumeric { template -struct ParallelLoopPolicy { -}; +struct ParallelLoopPolicy {}; template struct ParallelLoopPolicy { diff --git a/src/cunumeric/matrix/contract.cu b/src/cunumeric/matrix/contract.cu index 722916043..7a66e9ba8 100644 --- a/src/cunumeric/matrix/contract.cu +++ b/src/cunumeric/matrix/contract.cu @@ -26,8 +26,7 @@ using namespace Legion; namespace { // anonymous template -struct contract_helper { -}; +struct contract_helper {}; template <> struct contract_helper<__half> { diff --git a/src/cunumeric/matrix/contract_template.inl b/src/cunumeric/matrix/contract_template.inl index 6bd375e5e..d067cafd3 100644 --- a/src/cunumeric/matrix/contract_template.inl +++ b/src/cunumeric/matrix/contract_template.inl @@ -33,23 +33,17 @@ template struct ContractImplBody; template -struct support_contract : std::false_type { -}; +struct support_contract : std::false_type {}; template <> -struct support_contract : std::true_type { -}; +struct support_contract : std::true_type {}; template <> -struct support_contract : std::true_type { -}; +struct support_contract : std::true_type {}; template <> -struct support_contract : std::true_type { -}; +struct support_contract : std::true_type {}; template <> 
-struct support_contract : std::true_type { -}; +struct support_contract : std::true_type {}; template <> -struct support_contract : std::true_type { -}; +struct support_contract : std::true_type {}; #if 0 // debugging output diff --git a/src/cunumeric/matrix/gemm_template.inl b/src/cunumeric/matrix/gemm_template.inl index 15d2c8f27..4ccf089b7 100644 --- a/src/cunumeric/matrix/gemm_template.inl +++ b/src/cunumeric/matrix/gemm_template.inl @@ -28,20 +28,15 @@ template struct GemmImplBody; template -struct support_gemm : std::false_type { -}; +struct support_gemm : std::false_type {}; template <> -struct support_gemm : std::true_type { -}; +struct support_gemm : std::true_type {}; template <> -struct support_gemm : std::true_type { -}; +struct support_gemm : std::true_type {}; template <> -struct support_gemm : std::true_type { -}; +struct support_gemm : std::true_type {}; template <> -struct support_gemm : std::true_type { -}; +struct support_gemm : std::true_type {}; template struct GemmImpl { diff --git a/src/cunumeric/matrix/matmul_template.inl b/src/cunumeric/matrix/matmul_template.inl index 285c6ceec..5ee01a23c 100644 --- a/src/cunumeric/matrix/matmul_template.inl +++ b/src/cunumeric/matrix/matmul_template.inl @@ -29,8 +29,7 @@ template struct MatMulImplBody; template -struct support_matmul : std::false_type { -}; +struct support_matmul : std::false_type {}; template <> struct support_matmul : std::true_type { using ACC_TYPE = double; diff --git a/src/cunumeric/matrix/matvecmul_template.inl b/src/cunumeric/matrix/matvecmul_template.inl index 57e3970e4..7ccb73b6d 100644 --- a/src/cunumeric/matrix/matvecmul_template.inl +++ b/src/cunumeric/matrix/matvecmul_template.inl @@ -29,8 +29,7 @@ template struct MatVecMulImplBody; template -struct support_matvecmul : std::false_type { -}; +struct support_matvecmul : std::false_type {}; template <> struct support_matvecmul : std::true_type { using ACC_TYPE = double; diff --git a/src/cunumeric/matrix/potrf_template.inl b/src/cunumeric/matrix/potrf_template.inl index 05ff60b46..eea21d8cc 100644 --- a/src/cunumeric/matrix/potrf_template.inl +++ b/src/cunumeric/matrix/potrf_template.inl @@ -28,20 +28,15 @@ template struct PotrfImplBody; template -struct support_potrf : std::false_type { -}; +struct support_potrf : std::false_type {}; template <> -struct support_potrf : std::true_type { -}; +struct support_potrf : std::true_type {}; template <> -struct support_potrf : std::true_type { -}; +struct support_potrf : std::true_type {}; template <> -struct support_potrf : std::true_type { -}; +struct support_potrf : std::true_type {}; template <> -struct support_potrf : std::true_type { -}; +struct support_potrf : std::true_type {}; template struct PotrfImpl { diff --git a/src/cunumeric/matrix/solve_template.inl b/src/cunumeric/matrix/solve_template.inl index bff40ad9c..12fdf3f10 100644 --- a/src/cunumeric/matrix/solve_template.inl +++ b/src/cunumeric/matrix/solve_template.inl @@ -30,20 +30,15 @@ template struct SolveImplBody; template -struct support_solve : std::false_type { -}; +struct support_solve : std::false_type {}; template <> -struct support_solve : std::true_type { -}; +struct support_solve : std::true_type {}; template <> -struct support_solve : std::true_type { -}; +struct support_solve : std::true_type {}; template <> -struct support_solve : std::true_type { -}; +struct support_solve : std::true_type {}; template <> -struct support_solve : std::true_type { -}; +struct support_solve : std::true_type {}; template struct SolveImpl { diff --git 
a/src/cunumeric/matrix/syrk_template.inl b/src/cunumeric/matrix/syrk_template.inl index 739581bcb..9b1184eef 100644 --- a/src/cunumeric/matrix/syrk_template.inl +++ b/src/cunumeric/matrix/syrk_template.inl @@ -28,20 +28,15 @@ template struct SyrkImplBody; template -struct support_syrk : std::false_type { -}; +struct support_syrk : std::false_type {}; template <> -struct support_syrk : std::true_type { -}; +struct support_syrk : std::true_type {}; template <> -struct support_syrk : std::true_type { -}; +struct support_syrk : std::true_type {}; template <> -struct support_syrk : std::true_type { -}; +struct support_syrk : std::true_type {}; template <> -struct support_syrk : std::true_type { -}; +struct support_syrk : std::true_type {}; template struct SyrkImpl { diff --git a/src/cunumeric/matrix/trsm_template.inl b/src/cunumeric/matrix/trsm_template.inl index 40dd2ca17..ae2b7b840 100644 --- a/src/cunumeric/matrix/trsm_template.inl +++ b/src/cunumeric/matrix/trsm_template.inl @@ -28,20 +28,15 @@ template struct TrsmImplBody; template -struct support_trsm : std::false_type { -}; +struct support_trsm : std::false_type {}; template <> -struct support_trsm : std::true_type { -}; +struct support_trsm : std::true_type {}; template <> -struct support_trsm : std::true_type { -}; +struct support_trsm : std::true_type {}; template <> -struct support_trsm : std::true_type { -}; +struct support_trsm : std::true_type {}; template <> -struct support_trsm : std::true_type { -}; +struct support_trsm : std::true_type {}; template struct TrsmImpl { diff --git a/src/cunumeric/scan/scan_global_util.h b/src/cunumeric/scan/scan_global_util.h index b53ada288..502b9720c 100644 --- a/src/cunumeric/scan/scan_global_util.h +++ b/src/cunumeric/scan/scan_global_util.h @@ -41,8 +41,7 @@ constexpr decltype(auto) op_dispatch(ScanCode op_code, Functor f, Fnargs&&... ar } template -struct ScanOp { -}; +struct ScanOp {}; template struct ScanOp : thrust::plus> { diff --git a/src/cunumeric/scan/scan_local_util.h b/src/cunumeric/scan/scan_local_util.h index 7f1eefc7d..0cfbacb00 100644 --- a/src/cunumeric/scan/scan_local_util.h +++ b/src/cunumeric/scan/scan_local_util.h @@ -53,8 +53,7 @@ constexpr decltype(auto) op_dispatch(ScanCode op_code, } template -struct ScanOp { -}; +struct ScanOp {}; template struct ScanOp : thrust::plus> { diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index 0297056d1..af931c807 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -42,14 +42,11 @@ namespace cunumeric { template -struct support_cub : std::true_type { -}; +struct support_cub : std::true_type {}; template <> -struct support_cub : std::false_type { -}; +struct support_cub : std::false_type {}; template <> -struct support_cub : std::false_type { -}; +struct support_cub : std::false_type {}; template ::value>* = nullptr> void local_sort(const legate_type_of* values_in, diff --git a/src/cunumeric/unary/convert_util.h b/src/cunumeric/unary/convert_util.h index 03e3692c8..f58c0265c 100644 --- a/src/cunumeric/unary/convert_util.h +++ b/src/cunumeric/unary/convert_util.h @@ -44,8 +44,7 @@ constexpr decltype(auto) op_dispatch(ConvertCode nan_op, Functor f, Fnargs&&... 
} template -struct ConvertOp { -}; +struct ConvertOp {}; template struct ConvertOp { diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index ab193a7df..94536cd04 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -36,14 +36,11 @@ enum class UnaryRedCode : int { }; template -struct is_arg_reduce : std::false_type { -}; +struct is_arg_reduce : std::false_type {}; template <> -struct is_arg_reduce : std::true_type { -}; +struct is_arg_reduce : std::true_type {}; template <> -struct is_arg_reduce : std::true_type { -}; +struct is_arg_reduce : std::true_type {}; template constexpr decltype(auto) op_dispatch(UnaryRedCode op_code, Functor f, Fnargs&&... args) From e07c26d4be0a73a54f2889045c2867a98bf22680 Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 28 Nov 2022 12:04:26 -0800 Subject: [PATCH 56/89] Update Availability annotations (#715) * Update Availability annotations according to proposed policy * Missing documentation and case for ndarray.view --- cunumeric/array.py | 44 +++++++++++++++++++++++++++++++++++--- cunumeric/logic.py | 6 +++--- cunumeric/random/random.py | 4 ++++ 3 files changed, 48 insertions(+), 6 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index f40444d58..d163e4ca4 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -541,6 +541,10 @@ def flat(self) -> np.flatiter[npt.NDArray[Any]]: -------- flatten : Return a copy of the array collapsed into one dimension. + Availability + -------- + Single CPU + """ return self.__array__().flat @@ -2645,7 +2649,7 @@ def dump(self, file: Union[str, Path]) -> None: Availability -------- - Multiple GPUs, Multiple CPUs + Single CPU """ self.__array__().dump(file=file) @@ -3645,7 +3649,7 @@ def tofile(self, fid: Any, sep: str = "", format: str = "%s") -> None: Availability -------- - Multiple GPUs, Multiple CPUs + Single CPU """ return self.__array__().tofile(fid=fid, sep=sep, format=format) @@ -3817,12 +3821,46 @@ def flip(self, axis: Any = None) -> ndarray: def view( self, dtype: Union[npt.DTypeLike, None] = None, - type: Union[Any, None] = None, + type: Union[type, None] = None, ) -> ndarray: + """ + New view of array with the same data. + + Parameters + ---------- + dtype : data-type or ndarray sub-class, optional + Data-type descriptor of the returned view, e.g., float32 or int16. + Omitting it results in the view having the same data-type as the + input array. This argument can also be specified as an ndarray + sub-class, which then specifies the type of the returned object + (this is equivalent to setting the ``type`` parameter). + type : ndarray sub-class, optional + Type of the returned view, e.g., ndarray or matrix. Again, omission + of the parameter results in type preservation. + + Notes + ----- + cuNumeric does not currently support type reinterpretation, or + conversion to ndarray sub-classes; use :func:`ndarray.__array__()` to + convert to `numpy.ndarray`. 
+ + See Also + -------- + numpy.ndarray.view + + Availability + -------- + Multiple GPUs, Multiple CPUs + """ if dtype is not None and dtype != self.dtype: raise NotImplementedError( "cuNumeric does not currently support type reinterpretation" ) + if type is not None: + raise NotImplementedError( + "cuNumeric does not currently support conversion to ndarray " + "sub-classes; use __array__() to convert to numpy.ndarray" + ) return ndarray(shape=self.shape, dtype=self.dtype, thunk=self._thunk) def unique(self) -> ndarray: diff --git a/cunumeric/logic.py b/cunumeric/logic.py index 5cafffdc5..667ae1d13 100644 --- a/cunumeric/logic.py +++ b/cunumeric/logic.py @@ -176,7 +176,7 @@ def iscomplexobj(x: Union[ndarray, npt.NDArray[Any]]) -> bool: Availability -------- - Single CPU + Multiple GPUs, Multiple CPUs """ if isinstance(x, ndarray): return x.dtype.kind == "c" @@ -244,7 +244,7 @@ def isrealobj(x: ndarray) -> bool: Availability -------- - Single CPU + Multiple GPUs, Multiple CPUs """ return not iscomplexobj(x) @@ -275,7 +275,7 @@ def isscalar(x: Union[ndarray, npt.NDArray[Any]]) -> bool: Availability -------- - Single CPU + Multiple GPUs, Multiple CPUs """ # Since the input can be any value, we can't just convert it to cunumeric diff --git a/cunumeric/random/random.py b/cunumeric/random/random.py index 7a036a86e..7f37e5651 100644 --- a/cunumeric/random/random.py +++ b/cunumeric/random/random.py @@ -37,6 +37,10 @@ def seed(init: Union[int, None] = None) -> None: This function is effective only when cuRAND is NOT used in the build and is a no-op otherwise. + + Availability + -------- + Multiple GPUs, Multiple CPUs """ if init is None: init = 0 From ae1faf4d913dd088add4396711fa53c64cd47c7d Mon Sep 17 00:00:00 2001 From: robinw0928 <104830875+robinw0928@users.noreply.github.com> Date: Tue, 29 Nov 2022 09:05:15 +0800 Subject: [PATCH 57/89] Enhance test_where.py and test_atleast_nd.py (#717) * Enhance test_where.py and test_atleast_nd.py * Fix naming. 
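The new tests route all NumPy-vs-cuNumeric comparisons through the shared `check_module_function` helper (imported from the integration-test `utils` package) instead of the local `_check` helper removed below. As a minimal sketch, the helper is assumed to behave roughly like the `_check` it replaces; the body below is illustrative only, modeled on the removed code, and not the exact implementation from the test utilities:

    import numpy as np
    import cunumeric as num

    def check_module_function(fn_name, args, kwargs, print_msg):
        # Run the same module-level routine under NumPy and cuNumeric
        # and assert the results match elementwise.
        res_np = getattr(np, fn_name)(*args, **kwargs)
        res_num = getattr(num, fn_name)(*args, **kwargs)
        is_equal = len(res_np) == len(res_num) and all(
            np.array_equal(x, y) for x, y in zip(res_np, res_num)
        )
        assert is_equal, (
            f"Failed, {print_msg}\n"
            f"numpy result: {res_np}\n"
            f"cunumeric result: {res_num}"
        )
        print(f"Passed, {print_msg}")
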
--- tests/integration/test_atleast_nd.py | 71 +++++++----- tests/integration/test_where.py | 157 ++++++++++++++++++++++----- 2 files changed, 170 insertions(+), 58 deletions(-) diff --git a/tests/integration/test_atleast_nd.py b/tests/integration/test_atleast_nd.py index 62bc80d5e..3946cb92f 100644 --- a/tests/integration/test_atleast_nd.py +++ b/tests/integration/test_atleast_nd.py @@ -16,35 +16,10 @@ import numpy as np import pytest from legate.core import LEGATE_MAX_DIM +from utils.utils import check_module_function import cunumeric as num - -def _check(a, routine, sizes): - b = getattr(np, routine)(*a) - c = getattr(num, routine)(*a) - is_equal = True - err_arr = [b, c] - - if len(b) != len(c): - is_equal = False - err_arr = [b, c] - else: - for each in zip(b, c): - if not np.array_equal(*each): - err_arr = each - is_equal = False - break - print_msg = f"np.{routine}({sizes})" - assert is_equal, ( - f"Failed, {print_msg}\n" - f"numpy result: {err_arr[0]}\n" - f"cunumeric_result: {err_arr[1]}\n" - f"cunumeric and numpy shows different result\n" - ) - print(f"Passed, {print_msg}, np: {b}, cunumeric: {c}") - - DIM = 10 SIZE_CASES = list((DIM,) * ndim for ndim in range(LEGATE_MAX_DIM + 1)) @@ -59,26 +34,62 @@ def _check(a, routine, sizes): @pytest.mark.parametrize("size", SIZE_CASES, ids=str) def test_atleast_1d(size): a = [np.arange(np.prod(size)).reshape(size)] - _check(a, "atleast_1d", size) + print_msg = f"np & cunumeric.atleast_1d(size={size})" + check_module_function("atleast_1d", a, {}, print_msg) + + +def test_atleast_1d_scalar(): + a = 1.0 + assert np.array_equal(np.atleast_1d(a), num.atleast_1d(a)) + + +def test_atleast_1d_none(): + a = None + assert np.array_equal(np.atleast_1d(a), num.atleast_1d(a)) @pytest.mark.parametrize("size", SIZE_CASES, ids=str) def test_atleast_2d(size): a = [np.arange(np.prod(size)).reshape(size)] - _check(a, "atleast_2d", size) + print_msg = f"np & cunumeric.atleast_2d(size={size})" + check_module_function("atleast_2d", a, {}, print_msg) + + +def test_atleast_2d_scalar(): + a = 1.0 + assert np.array_equal(np.atleast_2d(a), num.atleast_2d(a)) + + +def test_atleast_2d_none(): + a = None + assert np.array_equal(np.atleast_2d(a), num.atleast_2d(a)) @pytest.mark.parametrize("size", SIZE_CASES, ids=str) def test_atleast_3d(size): a = [np.arange(np.prod(size)).reshape(size)] - _check(a, "atleast_3d", size) + print_msg = f"np & cunumeric.atleast_3d(size={size})" + check_module_function("atleast_3d", a, {}, print_msg) + + +def test_atleast_3d_scalar(): + a = 1.0 + assert np.array_equal(np.atleast_3d(a), num.atleast_3d(a)) + + +def test_atleast_3d_none(): + a = None + assert np.array_equal(np.atleast_3d(a), num.atleast_3d(a)) # test to run atleast_nd w/ list of arrays @pytest.mark.parametrize("dim", range(1, 4)) def test_atleast_nd(dim): a = list(np.arange(np.prod(size)).reshape(size) for size in SIZE_CASES) - _check(a, f"atleast_{dim}d", SIZE_CASES) + scalar = 10.0 + a.append(scalar) + print_msg = f"np & cunumeric.atleast_{dim}d(size={SIZE_CASES})" + check_module_function(f"atleast_{dim}d", a, {}, print_msg) if __name__ == "__main__": diff --git a/tests/integration/test_where.py b/tests/integration/test_where.py index 20d813bd9..cd66c0ce7 100644 --- a/tests/integration/test_where.py +++ b/tests/integration/test_where.py @@ -15,34 +15,130 @@ import numpy as np import pytest +from utils.generators import mk_seq_array import cunumeric as num -np.random.seed(42) - CONDITIONS = [ [[True, False], [True, True]], [[True, False]], [True, False], False, + [[0.0, 1.0], 
[0, -2]], ] def test_basic(): - anp = np.array([1, 54, 4, 4, 0, 45, 5, 58, 0, 9, 0, 4, 0, 0, 0, 5, 0]) - a = num.array(anp) - assert num.array_equal(np.where(anp), num.where(a)) + a_np = np.array([1, 54, 4, 4, 0, 45, 5, 58, 0, 9, 0, 4, 0, 0, 0, 5, 0]) + a_num = num.array(a_np) + assert num.array_equal(np.where(a_np), num.where(a_num)) @pytest.mark.parametrize("cond", CONDITIONS, ids=str) def test_condition(cond): - anp = np.array(cond) - xnp = np.array([[1, 2], [3, 4]]) - ynp = np.array([[9, 8], [7, 6]]) - a = num.array(anp) - x = num.array(xnp) - y = num.array(ynp) - assert np.array_equal(np.where(anp, xnp, ynp), num.where(a, x, y)) + a_np = np.array(cond) + x_np = np.array([[1, 2], [3, 4]]) + y_np = np.array([[9, 8], [7, 6]]) + a_num = num.array(a_np) + x_num = num.array(x_np) + y_num = num.array(y_np) + assert np.array_equal( + np.where(a_np, x_np, y_np), num.where(a_num, x_num, y_num) + ) + + +@pytest.mark.parametrize( + "shape_a", + ((1,), (3,), (1, 3), (3, 3), (2, 3, 3)), + ids=lambda shape_a: f"(shape_a={shape_a})", +) +def test_broadcast(shape_a): + a_num = mk_seq_array(num, shape_a) + a_np = mk_seq_array(np, shape_a) + cond_num = a_num > 5 + cond_np = a_np > 5 + + shape_x = (3, 3) + x_num = mk_seq_array(num, shape_x) + x_np = mk_seq_array(np, shape_x) + shape_y = (1, 3) + y_num = mk_seq_array(num, shape_y) * 10 + y_np = mk_seq_array(np, shape_y) * 10 + + assert np.array_equal( + np.where(cond_np, x_np, y_np), num.where(cond_num, x_num, y_num) + ) + + +@pytest.mark.xfail +def test_condition_none(): + # In Numpy, pass and returns [1, 2] + # In cuNumeric, raises AttributeError: + # 'NoneType' object has no attribute '_maybe_convert' + x = 0 + y_np = np.array([1, 2]) + y_num = num.array(y_np) + assert np.array_equal(np.where(None, x, y_np), num.where(None, x, y_num)) + + +@pytest.mark.xfail +@pytest.mark.parametrize( + "values", + ((None, None), (None, 1), (1, None)), + ids=lambda values: f"(values={values})", +) +def test_x_y_none(values): + # For x=None and y=None, + # In Numpy, pass and returns [None, None] + # In cuNumeric, pass and returns (array([0]),) + # For x=None and y=1 + # In Numpy, pass and returns [None, 1] + # In cuNumeric, raises ValueError: both 'x' and 'y' parameters + # must be specified together for where + cond = [True, False] + a_np = np.array(cond) + a_num = num.array(a_np) + x, y = values + assert np.array_equal(np.where(a_np, x, y), num.where(a_num, x, y)) + + +def test_x_y_type(): + x_np = np.arange(4, dtype=np.int32) + y_np = np.arange(4, dtype=np.float32) * 2.2 + x_num = num.array(x_np) + y_num = num.array(y_np) + + res_np = np.where(x_np > 2.0, x_np, y_np) + res_num = num.where(x_num > 2.0, x_num, y_num) + + assert np.array_equal(res_np, res_num) + assert res_np.dtype == res_num.dtype + + +def test_condition_empty(): + cond_num = num.array([]) + cond_np = np.array([]) + x = 0 + y = 1 + assert np.array_equal(np.where(cond_np, x, y), num.where(cond_num, x, y)) + + +class TestWhereErrors: + @pytest.mark.parametrize( + "shape_y", + ((0,), (2,), (1, 2), (4, 1)), + ids=lambda shape_y: f"(shape_y={shape_y})", + ) + def test_x_y_bad_shape(self, shape_y): + shape_a = (3, 3) + a = mk_seq_array(num, shape_a) + cond = a > 5 + x = 1 + y = mk_seq_array(num, shape_y) + + msg = "shape mismatch" + with pytest.raises(ValueError, match=msg): + num.where(cond, x, y) INPUT = [ @@ -60,22 +156,27 @@ def test_condition(cond): @pytest.mark.parametrize("input", INPUT, ids=str) def test_argwhere(input): - anp = np.array(input) - a = num.array(anp) - assert 
np.array_equal(np.argwhere(anp), num.argwhere(a)) - - -@pytest.mark.skip -def test_extract(): - cnp = np.array( - [1, 54, 4, 4, 0, 45, 5, 58, 0, 9, 0, 4, 0, 0, 0, 5, 0, 1] - ).reshape( - (6, 3) - ) # noqa E501 - c = num.array(cnp) - bnp = np.random.randn(6, 3) - b = num.array(bnp) - assert num.array_equal(num.extract(c, b), np.extract(cnp, bnp)) + a_np = np.array(input) + a_num = num.array(a_np) + assert np.array_equal(np.argwhere(a_np), num.argwhere(a_num)) + + +@pytest.mark.xfail +def test_argwhere_none(): + # In Numpy, it pass and returns [] + # In cuNumeric, it raises AttributeError: + # 'NoneType' object has no attribute '_thunk' + assert np.array_equal(np.argwhere(None), num.argwhere(None)) + + +def test_argwhere_empty(): + a_np = np.array([]) + a_num = num.array(a_np) + assert np.array_equal(np.argwhere(a_np), num.argwhere(a_num)) + + +def test_argwhere_scalar(): + assert np.array_equal(np.argwhere(1), num.argwhere(1)) if __name__ == "__main__": From 4b4b924c13e71c49646c6a321468398d72c0fa43 Mon Sep 17 00:00:00 2001 From: xialu00 <110973296+xialu00@users.noreply.github.com> Date: Tue, 29 Nov 2022 12:28:03 +0800 Subject: [PATCH 58/89] Testcase enhance for test_bincount.py (#708) * add negative test case for test_convolve.py * add test case for test_astype.py * add test case for test_astype.py * fix bug * enhance test_bincount.py * enhance test_bincount.py * enhance test_cholesky.py --- tests/integration/test_bincount.py | 30 ++++++++++++++++++++++++++++++ tests/integration/test_cholesky.py | 19 +++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/tests/integration/test_bincount.py b/tests/integration/test_bincount.py index b1d9fd4c5..f137e9151 100644 --- a/tests/integration/test_bincount.py +++ b/tests/integration/test_bincount.py @@ -27,6 +27,36 @@ MINLENGTHS = [0, 5, 15] +def test_dtype_negative(): + arr = num.arange(5, dtype=float) + msg = r"integer type" + with pytest.raises(TypeError, match=msg): + num.bincount(arr) + + +def test_weight_mismatch(): + v_num = num.random.randint(0, 9, size=N) + w_num = num.random.randn(N + 1) + msg = r"same shape" + with pytest.raises(ValueError, match=msg): + num.bincount(v_num, weights=w_num) + + +def test_out_size(): + arr = num.array([0, 1, 1, 3, 2, 1, 7, 23]) + assert num.bincount(arr).size == num.amax(arr) + 1 + + +@pytest.mark.skip() +def test_array_ndim(): + size = (2,) * 3 + arr = num.random.randint(0, high=9, size=size) + # Numpy raises : ValueError: object too deep for desired array + # cuNumeric run aborted + with pytest.raises(ValueError): + num.bincount(arr) + + @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("minlength", MINLENGTHS) def test_bincount_basic(dtype, minlength): diff --git a/tests/integration/test_cholesky.py b/tests/integration/test_cholesky.py index b630aba70..6ed8e35f8 100644 --- a/tests/integration/test_cholesky.py +++ b/tests/integration/test_cholesky.py @@ -22,6 +22,25 @@ SIZES = [8, 9, 255, 512] +def test_matrix(): + arr = [[1, -2j], [2j, 5]] + np_out = np.linalg.cholesky(arr) + num_out = num.linalg.cholesky(arr) + assert np.array_equal(np_out, num_out) + + +def test_array_negative_1dim(): + arr = num.random.randint(0, 9, size=(3,)) + with pytest.raises(ValueError): + num.linalg.cholesky(arr) + + +def test_array_negative_3dim(): + arr = num.random.randint(0, 9, size=(3, 3, 3)) + with pytest.raises(NotImplementedError): + num.linalg.cholesky(arr) + + def test_diagonal(): a = num.eye(10) * 10.0 b = num.linalg.cholesky(a) From 5b043932e940e9927fdebf8ae89a6b62fc5b8fff Mon Sep 17 
00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 29 Nov 2022 17:50:43 -0800 Subject: [PATCH 59/89] [pre-commit.ci] pre-commit autoupdate (#718) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/PyCQA/flake8: 5.0.4 → 6.0.0](https://github.com/PyCQA/flake8/compare/5.0.4...6.0.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dc8cecafe..e929c0833 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,7 +15,7 @@ repos: hooks: - id: black - repo: https://github.com/PyCQA/flake8 - rev: 5.0.4 + rev: 6.0.0 hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format From 2067484fb568a64e457faa137efd4c997bd07abf Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Thu, 1 Dec 2022 15:57:43 -0800 Subject: [PATCH 60/89] Fix build under CUDA 11.8 (#723) --- src/cunumeric/random/randutil/generator.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cunumeric/random/randutil/generator.cuh b/src/cunumeric/random/randutil/generator.cuh index 1b2bb33df..023de9496 100644 --- a/src/cunumeric/random/randutil/generator.cuh +++ b/src/cunumeric/random/randutil/generator.cuh @@ -18,6 +18,8 @@ #include "generator.h" +#include + namespace randutilimpl { static constexpr int blocksPerMultiProcessor = 2; // TODO: refine => number of blocks per mp static constexpr int blockDimX = 256; // TODO: refine ? From d9dd7b33c900cef14c863580cf17930a0498886d Mon Sep 17 00:00:00 2001 From: robinw0928 <104830875+robinw0928@users.noreply.github.com> Date: Tue, 6 Dec 2022 09:15:15 +0800 Subject: [PATCH 61/89] Enhance test_solve.py and test_matrix_power.py (#719) * Enhance test_solve.py and test_matrix_power.py * Address comments * Replace one-letter dtype with full name. 
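One clarifying note on the dtype cleanup: the one-letter character codes used previously in the RTOL/ATOL tables are NumPy shorthand for the explicit scalar types the tests now name, so that part of the change is purely cosmetic. A quick illustration of the mapping seen in the diff:

    import numpy as np

    # The character codes on the left are aliases for the explicit
    # types on the right; both sides construct the same dtype.
    assert np.dtype("f") == np.dtype(np.float32)
    assert np.dtype("d") == np.dtype(np.float64)
    assert np.dtype("F") == np.dtype(np.complex64)
    assert np.dtype("D") == np.dtype(np.complex128)
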
--- cunumeric/linalg/linalg.py | 4 +- tests/integration/test_matrix_power.py | 74 +++++++++++- tests/integration/test_solve.py | 154 +++++++++++++++++++++++-- 3 files changed, 215 insertions(+), 17 deletions(-) diff --git a/cunumeric/linalg/linalg.py b/cunumeric/linalg/linalg.py index 6474f56f3..18ecfa140 100644 --- a/cunumeric/linalg/linalg.py +++ b/cunumeric/linalg/linalg.py @@ -196,9 +196,9 @@ def matrix_power(a: ndarray, n: int) -> ndarray: """ # Process inputs if a.ndim < 2: - raise ValueError(f"Expected at least 2d array, but got {a.ndim}d") + raise LinAlgError(f"Expected at least 2d array, but got {a.ndim}d") if a.shape[-2] != a.shape[-1]: - raise ValueError("Last 2 dimensions of the array must be square") + raise LinAlgError("Last 2 dimensions of the array must be square") if not isinstance(n, int): raise TypeError("exponent must be an integer") diff --git a/tests/integration/test_matrix_power.py b/tests/integration/test_matrix_power.py index d4cfe4b23..de4838798 100644 --- a/tests/integration/test_matrix_power.py +++ b/tests/integration/test_matrix_power.py @@ -22,18 +22,86 @@ import cunumeric as num # TODO: add negative exponents here, once they become supported -EXPONENTS = [0, 1, 3, 5] +EXPONENTS = (0, 1, 2, 3, 5) +@pytest.mark.parametrize( + "dtype", + ( + np.float64, + np.complex128, + pytest.param(np.int32, marks=pytest.mark.xfail), + ), +) @pytest.mark.parametrize("ndim", range(0, LEGATE_MAX_DIM - 2)) @pytest.mark.parametrize("exp", EXPONENTS) -def test_matrix_power(ndim, exp): +def test_matrix_power(ndim, exp, dtype): + # If dtype=np.int32 and exp greater than 1, + # In Numpy, pass + # In cuNumeric, raises TypeError: Unsupported type: int32 shape = (3,) * ndim + (2, 2) + a_np = mk_0to1_array(np, shape, dtype=dtype) + a_num = mk_0to1_array(num, shape, dtype=dtype) + res_np = np.linalg.matrix_power(a_np, exp) + res_num = num.linalg.matrix_power(a_num, exp) + assert allclose(res_np, res_num) + + +@pytest.mark.parametrize( + "exp", + ( + 0, + 1, + pytest.param(2, marks=pytest.mark.xfail), + pytest.param(3, marks=pytest.mark.xfail), + ), +) +def test_matrix_power_empty_matrix(exp): + # If exp =2 or 3, + # In Numpy, pass and returns empty array + # In cuNumeric, raise AssertionError in _contract + shape = (0, 0) a_np = mk_0to1_array(np, shape) a_num = mk_0to1_array(num, shape) res_np = np.linalg.matrix_power(a_np, exp) res_num = num.linalg.matrix_power(a_num, exp) - assert allclose(res_np, res_num) + assert np.array_equal(res_np, res_num) + + +class TestMatrixPowerErrors: + @pytest.mark.parametrize("ndim", (0, 1), ids=lambda ndim: f"(ndim={ndim})") + def test_matrix_ndim_smaller_than_two(self, ndim): + shape = (3,) * ndim + a_num = mk_0to1_array(num, shape) + msg = "Expected at least 2d array" + with pytest.raises(num.linalg.LinAlgError, match=msg): + num.linalg.matrix_power(a_num, 1) + + @pytest.mark.parametrize( + "shape", ((2, 1), (2, 2, 1)), ids=lambda shape: f"(shape={shape})" + ) + def test_matrix_not_square(self, shape): + a_num = mk_0to1_array(num, shape) + msg = "Last 2 dimensions of the array must be square" + with pytest.raises(num.linalg.LinAlgError, match=msg): + num.linalg.matrix_power(a_num, 1) + + @pytest.mark.parametrize( + "n", (-1.0, 1.0, [1], None), ids=lambda n: f"(n={n})" + ) + def test_n_not_int(self, n): + shape = (2, 2) + a_num = mk_0to1_array(num, shape) + msg = "exponent must be an integer" + with pytest.raises(TypeError, match=msg): + num.linalg.matrix_power(a_num, n) + + def test_n_negative_int(self): + shape = (2, 2) + n = -1 + a_num = 
mk_0to1_array(num, shape) + with pytest.raises(NotImplementedError): + num.linalg.matrix_power(a_num, n) if __name__ == "__main__": diff --git a/tests/integration/test_solve.py b/tests/integration/test_solve.py index 30b569401..82a204889 100644 --- a/tests/integration/test_solve.py +++ b/tests/integration/test_solve.py @@ -22,23 +22,27 @@ SIZES = (8, 9, 255) RTOL = { - np.dtype("f"): 1e-1, - np.dtype("F"): 1e-1, - np.dtype("d"): 1e-5, - np.dtype("D"): 1e-5, + np.dtype(np.float32): 1e-1, + np.dtype(np.complex64): 1e-1, + np.dtype(np.float64): 1e-5, + np.dtype(np.complex128): 1e-5, } ATOL = { - np.dtype("f"): 1e-3, - np.dtype("F"): 1e-3, - np.dtype("d"): 1e-8, - np.dtype("D"): 1e-8, + np.dtype(np.float32): 1e-3, + np.dtype(np.complex64): 1e-3, + np.dtype(np.float64): 1e-8, + np.dtype(np.complex128): 1e-8, } @pytest.mark.parametrize("n", SIZES) -@pytest.mark.parametrize("a_dtype", ("f", "d", "F", "D")) -@pytest.mark.parametrize("b_dtype", ("f", "d", "F", "D")) +@pytest.mark.parametrize( + "a_dtype", (np.float32, np.float64, np.complex64, np.complex128) +) +@pytest.mark.parametrize( + "b_dtype", (np.float32, np.float64, np.complex64, np.complex128) +) def test_solve_1d(n, a_dtype, b_dtype): a = np.random.rand(n, n).astype(a_dtype) b = np.random.rand(n).astype(b_dtype) @@ -53,8 +57,12 @@ def test_solve_1d(n, a_dtype, b_dtype): @pytest.mark.parametrize("n", SIZES) -@pytest.mark.parametrize("a_dtype", ("f", "d", "F", "D")) -@pytest.mark.parametrize("b_dtype", ("f", "d", "F", "D")) +@pytest.mark.parametrize( + "a_dtype", (np.float32, np.float64, np.complex64, np.complex128) +) +@pytest.mark.parametrize( + "b_dtype", (np.float32, np.float64, np.complex64, np.complex128) +) def test_solve_2d(n, a_dtype, b_dtype): a = np.random.rand(n, n).astype(a_dtype) b = np.random.rand(n, n + 2).astype(b_dtype) @@ -80,6 +88,128 @@ def test_solve_corner_cases(): assert allclose(b, num.matmul(a, out)) +def test_solve_b_is_empty(): + a = num.random.rand(1, 1) + b = num.atleast_2d([]) + + out = num.linalg.solve(a, b) + assert np.array_equal(b, out) + + +@pytest.mark.parametrize("dtype", (np.int32, np.int64)) +def test_solve_dtype_int(dtype): + a = [[1, 4, 5], [2, 3, 1], [9, 5, 2]] + b = [1, 2, 3] + a_num = num.array(a).astype(dtype) + b_num = num.array(b).astype(dtype) + out = num.linalg.solve(a_num, b_num) + + rtol = RTOL[out.dtype] + atol = ATOL[out.dtype] + assert allclose( + b_num, num.matmul(a_num, out), rtol=rtol, atol=atol, check_dtype=False + ) + + +def test_solve_with_output(): + n = 8 + a = np.random.rand(n, n).astype(np.float32) + b = np.random.rand(n).astype(np.float32) + output = np.zeros((n,)).astype(np.float32) + + out = num.linalg.solve(a, b, out=output) + + rtol = RTOL[out.dtype] + atol = ATOL[out.dtype] + assert allclose( + b, num.matmul(a, out), rtol=rtol, atol=atol, check_dtype=False + ) + assert allclose( + b, num.matmul(a, output), rtol=rtol, atol=atol, check_dtype=False + ) + + +class TestSolveErrors: + def setup_method(self): + self.n = 3 + self.a = num.random.rand(self.n, self.n).astype(np.float64) + self.b = num.random.rand(self.n).astype(np.float64) + + def test_a_bad_dim(self): + a = num.random.rand(self.n).astype(np.float64) + msg = "Array must be at least two-dimensional" + with pytest.raises(num.linalg.LinAlgError, match=msg): + num.linalg.solve(a, self.b) + + a = 10 + msg = "Array must be at least two-dimensional" + with pytest.raises(num.linalg.LinAlgError, match=msg): + num.linalg.solve(a, self.b) + + def test_b_bad_dim(self): + b = 10 + msg = "Array must be at least one-dimensional" 
+ with pytest.raises(num.linalg.LinAlgError, match=msg): + num.linalg.solve(self.a, b) + + def test_a_dim_greater_than_two(self): + a = num.random.rand(self.n, self.n, self.n).astype(np.float64) + b = num.random.rand(self.n, self.n).astype(np.float64) + with pytest.raises(NotImplementedError): + num.linalg.solve(a, b) + + def test_b_dim_greater_than_two(self): + a = num.random.rand(self.n, self.n).astype(np.float64) + b = num.random.rand(self.n, self.n, self.n).astype(np.float64) + with pytest.raises(NotImplementedError): + num.linalg.solve(a, b) + + def test_a_bad_dtype_float16(self): + a = self.a.astype(np.float16) + msg = "array type float16 is unsupported in linalg" + with pytest.raises(TypeError, match=msg): + num.linalg.solve(a, self.b) + + def test_b_bad_dtype_float16(self): + b = self.b.astype(np.float16) + msg = "array type float16 is unsupported in linalg" + with pytest.raises(TypeError, match=msg): + num.linalg.solve(self.a, b) + + def test_a_last_2_dims_not_square(self): + a = num.random.rand(self.n, self.n + 1).astype(np.float64) + msg = "Last 2 dimensions of the array must be square" + with pytest.raises(num.linalg.LinAlgError, match=msg): + num.linalg.solve(a, self.b) + + def test_a_b_mismatched_shape(self): + b = num.random.rand(self.n + 1).astype(np.float64) + with pytest.raises(ValueError): + num.linalg.solve(self.a, b) + + b = num.random.rand(self.n + 1, self.n).astype(np.float64) + with pytest.raises(ValueError): + num.linalg.solve(self.a, b) + + def test_output_mismatched_shape(self): + output = num.zeros((self.n + 1,)).astype(np.float64) + msg = "Output shape mismatch" + with pytest.raises(ValueError, match=msg): + num.linalg.solve(self.a, self.b, out=output) + + def test_output_mismatched_dtype(self): + output = num.zeros((self.n,)).astype(np.float32) + msg = "Output type mismatch" + with pytest.raises(TypeError, match=msg): + num.linalg.solve(self.a, self.b, out=output) + + def test_a_singular_matrix(self): + a = num.zeros((self.n, self.n)).astype(np.float64) + msg = "Singular matrix" + with pytest.raises(num.linalg.LinAlgError, match=msg): + num.linalg.solve(a, self.b) + + if __name__ == "__main__": import sys From 6cf5032faa39a463dd2f15b698403fcb9152b345 Mon Sep 17 00:00:00 2001 From: robinw0928 <104830875+robinw0928@users.noreply.github.com> Date: Wed, 7 Dec 2022 10:05:44 +0800 Subject: [PATCH 62/89] Enhance test_dot.py and test_multi_dot.py (#730) --- tests/integration/test_dot.py | 41 +++++++++++++++ tests/integration/test_multi_dot.py | 77 +++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/tests/integration/test_dot.py b/tests/integration/test_dot.py index 912cd9ed4..d0157cbf1 100644 --- a/tests/integration/test_dot.py +++ b/tests/integration/test_dot.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 #
+import numpy as np
 import pytest
 
 from legate.core import LEGATE_MAX_DIM
 from utils.contractions import check_default
+from utils.generators import mk_0to1_array
 
+import cunumeric as num
 from cunumeric.utils import dot_modes
 
 
@@ -31,6 +34,44 @@ def operation(lib, *args, **kwargs):
     check_default(name, modes, operation)
 
 
+class TestDotErrors:
+    def setup_method(self):
+        self.A = mk_0to1_array(num, (5, 3))
+        self.B = mk_0to1_array(num, (3, 2))
+
+    @pytest.mark.parametrize(
+        "shapeA",
+        ((3,), (4, 3), (5, 4, 3)),
+        ids=lambda shapeA: f"(shapeA={shapeA})",
+    )
+    def test_a_b_invalid_shape(self, shapeA):
+        A = mk_0to1_array(num, shapeA)
+        B = mk_0to1_array(num, (2, 2))
+        with pytest.raises(ValueError):
+            num.dot(A, B)
+
+    @pytest.mark.parametrize(
+        "shape", ((5,), (2,), (5, 3)), ids=lambda shape: f"(shape={shape})"
+    )
+    def test_out_invalid_shape(self, shape):
+        out = num.zeros(shape)
+        with pytest.raises(ValueError):
+            num.dot(self.A, self.B, out=out)
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize(
+        "dtype", (np.float32, np.int64), ids=lambda dtype: f"(dtype={dtype})"
+    )
+    def test_out_invalid_dtype(self, dtype):
+        # In NumPy, both np.float32 and np.int64 raise ValueError.
+        # In cuNumeric,
+        # np.float32 passes, and
+        # np.int64 raises TypeError: Unsupported type: int64
+        out = np.zeros((5, 2), dtype=dtype)
+        with pytest.raises(ValueError):
+            np.dot(self.A, self.B, out=out)
+
+
 if __name__ == "__main__":
     import sys
 
diff --git a/tests/integration/test_multi_dot.py b/tests/integration/test_multi_dot.py
index ecba326ef..1c4ca3d05 100644
--- a/tests/integration/test_multi_dot.py
+++ b/tests/integration/test_multi_dot.py
@@ -66,6 +66,83 @@ def test_multi_dot(shapes):
     assert allclose(res_np, out)
 
 
+class TestMultiDotErrors:
+    def setup_method(self):
+        A = mk_0to1_array(num, (2, 2))
+        B = mk_0to1_array(num, (2, 2))
+        C = mk_0to1_array(num, (2, 2))
+        self.arrays = [A, B, C]
+
+    def test_zero_array(self):
+        arrays = []
+        msg = "at least two arrays"
+        with pytest.raises(ValueError, match=msg):
+            num.linalg.multi_dot(arrays)
+
+    def test_one_array(self):
+        arrays = [num.array([[1, 2], [3, 4]])]
+        msg = "at least two arrays"
+        with pytest.raises(ValueError, match=msg):
+            num.linalg.multi_dot(arrays)
+
+    def test_invalid_array_dim_zero(self):
+        A = num.array(3)
+        B = mk_0to1_array(num, (2, 2))
+        C = mk_0to1_array(num, (2, 2))
+        arrays = [A, B, C]
+        with pytest.raises(ValueError):
+            num.linalg.multi_dot(arrays)
+
+    def test_invalid_array_dim_one(self):
+        A = mk_0to1_array(num, (2, 2))
+        B = mk_0to1_array(num, (2,))
+        C = mk_0to1_array(num, (2, 2))
+        arrays = [A, B, C]
+        with pytest.raises(ValueError):
+            num.linalg.multi_dot(arrays)
+
+    def test_invalid_array_dim_three(self):
+        A = mk_0to1_array(num, (2, 2, 2))
+        B = mk_0to1_array(num, (2, 2, 2))
+        C = mk_0to1_array(num, (2, 2, 2))
+        arrays = [A, B, C]
+        with pytest.raises(ValueError):
+            num.linalg.multi_dot(arrays)
+
+    def test_invalid_array_shape(self):
+        A = mk_0to1_array(num, (2, 2))
+        B = mk_0to1_array(num, (3, 2))
+        C = mk_0to1_array(num, (2, 2))
+        arrays = [A, B, C]
+        with pytest.raises(ValueError):
+            num.linalg.multi_dot(arrays)
+
+    def test_out_invalid_dim(self):
+        out = num.zeros((2,))
+        with pytest.raises(ValueError):
+            num.linalg.multi_dot(self.arrays, out=out)
+
+    @pytest.mark.xfail
+    def test_out_invalid_shape(self):
+        # In cuNumeric, it raises AssertionError
+        out = num.zeros((2, 1))
+        with pytest.raises(ValueError):
+            num.linalg.multi_dot(self.arrays, out=out)
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize(
+        "dtype", (np.float32, np.int64), ids=lambda dtype: f"(dtype={dtype})"
+    )
+    def test_out_invalid_dtype(self, dtype):
+        # In NumPy, both np.float32 and np.int64 raise ValueError.
+        # In cuNumeric,
+        # np.float32 passes, and
+        # np.int64 raises TypeError: Unsupported type: int64
+        out = num.zeros((2, 2), dtype=dtype)
+        with pytest.raises(ValueError):
+            num.linalg.multi_dot(self.arrays, out=out)
+
+
 if __name__ == "__main__":
     import sys
 
From f1fe90aa351f140bf2fdd6b38f0be1bebcc8c2a9 Mon Sep 17 00:00:00 2001
From: xialu00 <110973296+xialu00@users.noreply.github.com>
Date: Tue, 13 Dec 2022 09:26:43 +0800
Subject: [PATCH 63/89] Enhance test cases in test_reduction.py and
 test_prod.py (#726)

* add negative test case for test_convolve.py
* add test cases for test_astype.py
* enhance test_bincount.py
* enhance test_cholesky.py
* enhance test_reduction.py
* enhance test_prod.py
* fix bug in tests/integration/test_prod.py

---
 tests/integration/test_2d_reduction.py      |  49 ---
 tests/integration/test_3d_reduction.py      |  38 ---
 tests/integration/test_prod.py              | 332 ++++++++++++++++++++
 tests/integration/test_reduction.py         | 312 ++++++++++++++----
 tests/integration/test_reduction_axis.py    |  42 ---
 tests/integration/test_reduction_complex.py |  37 ---
 6 files changed, 586 insertions(+), 224 deletions(-)
 delete mode 100644 tests/integration/test_2d_reduction.py
 delete mode 100644 tests/integration/test_3d_reduction.py
 create mode 100644 tests/integration/test_prod.py
 delete mode 100644 tests/integration/test_reduction_axis.py
 delete mode 100644 tests/integration/test_reduction_complex.py

diff --git a/tests/integration/test_2d_reduction.py b/tests/integration/test_2d_reduction.py
deleted file mode 100644
index 86a88d2f7..000000000
--- a/tests/integration/test_2d_reduction.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright 2021-2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# - -import numpy as np -import pytest -from utils.comparisons import allclose - -import cunumeric as num - - -def test_sum(): - anp = np.array([[1, 2, 3], [4, 5, 6]]) - a = num.array(anp) - r = a.sum(0) - assert np.array_equal(r, [5, 7, 9]) - - r = a.sum(1) - assert np.array_equal(r, [6, 15]) - - -def test_random(): - bnp = np.random.random((2, 3)) - b = num.array(bnp) - assert allclose(num.sum(b), np.sum(bnp)) - - -def test_randn(): - af = np.random.randn(4, 5) - bf = num.array(af) - assert allclose(af.mean(0), bf.mean(0)) - assert allclose(af.mean(), bf.mean()) - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_3d_reduction.py b/tests/integration/test_3d_reduction.py deleted file mode 100644 index 79b04d614..000000000 --- a/tests/integration/test_3d_reduction.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import numpy as np -import pytest -from utils.comparisons import allclose - -import cunumeric as num - -np.random.seed(42) - - -def test_sum(): - b = np.random.random((10, 12, 13)) - a = num.array(b) - assert allclose(a, b) - - lg_sum = num.sum(a) - np_sum = np.sum(b) - assert allclose(np_sum, lg_sum) - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_prod.py b/tests/integration/test_prod.py new file mode 100644 index 000000000..01b95cadc --- /dev/null +++ b/tests/integration/test_prod.py @@ -0,0 +1,332 @@ +# Copyright 2021-2022 NVIDIA Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+import random
+
+import numpy as np
+import pytest
+from utils.comparisons import allclose
+
+import cunumeric as num
+
+# numpy.prod(a, axis=None, dtype=None, out=None, keepdims=<no value>,
+# initial=<no value>, where=<no value>)
+
+DIM = 5
+SIZES = [
+    (0,),
+    (1),
+    (DIM),
+    (0, 1),
+    (1, 0),
+    (1, 1),
+    (1, DIM),
+    (DIM, 1),
+    (DIM, DIM),
+    (1, 0, 0),
+    (1, 1, 0),
+    (1, 0, 1),
+    (1, 1, 1),
+    (DIM, 1, 1),
+    (1, DIM, 1),
+    (1, 1, DIM),
+    (DIM, DIM, DIM),
+]
+SIZES_E2 = [
+    (DIM),
+    (1, DIM),
+    (DIM, 1),
+    (DIM, DIM),
+    (DIM, 1, 1),
+    (1, DIM, 1),
+    (1, 1, DIM),
+    (DIM, DIM, DIM),
+]
+SIZE_E = [
+    (1, 1),
+    (1, DIM),
+    (DIM, 1),
+    (1, 1, 1),
+    (DIM, 1, 1),
+    (1, DIM, 1),
+    (1, 1, DIM),
+]
+NO_EMPTY_SIZE = [
+    (1),
+    (DIM),
+    (1, 1),
+    (1, DIM),
+    (DIM, 1),
+    (DIM, DIM),
+    (1, 1, 1),
+    (DIM, 1, 1),
+    (1, DIM, 1),
+    (1, 1, DIM),
+    (DIM, DIM, DIM),
+]
+
+ARR = ([], [[]], [[], []], np.inf, np.Inf, -10.3, 0, 200, 5 + 8j)
+
+DTYPE = ["l", "L", "f", "e", "d"]
+COMPLEX_TYPE = ["F"]
+NEGATIVE_COMPLEX_TYPE = ["D"]
+NEGATIVE_DTYPE = ["h", "i", "H", "I", "?", "b", "B"]
+
+
+def to_dtype(s):
+    return str(np.dtype(s))
+
+
+class TestProdNegative(object):
+    """
+    Negative cases: errors and behaviors that currently diverge from NumPy.
+    """
+
+    @pytest.mark.parametrize("arr", ARR)
+    def test_array(self, arr):
+        assert np.array_equal(np.prod(arr), num.prod(arr))
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize("dtype", NEGATIVE_DTYPE, ids=to_dtype)
+    def test_dtype_negative(self, dtype):
+        size = (5, 5, 5)
+        arr = np.random.random(size) * 10 + 2
+        arr_np = np.array(arr, dtype=dtype)
+        arr_num = num.array(arr_np)
+        out_np = np.prod(arr_np)  # NumPy returns the product of all elements
+        out_num = num.prod(arr_num)
+        # cuNumeric returns an array with different data
+        assert allclose(out_np, out_num)
+
+    @pytest.mark.skip
+    @pytest.mark.parametrize("dtype", NEGATIVE_COMPLEX_TYPE, ids=to_dtype)
+    def test_dtype_complex_negative(self, dtype):
+        arr = (num.random.rand(5, 5) * 10 + 2) + (
+            num.random.rand(5, 5) * 10 * 1.0j + 0.2j
+        )
+        arr_np = np.array(arr, dtype=dtype)
+        arr_num = num.array(arr_np)
+        out_np = np.prod(arr_np)
+        out_num = num.prod(arr_num)
+        assert allclose(out_np, out_num)
+
+    def test_axis_out_bound(self):
+        arr = [-1, 0, 1, 2, 10]
+        msg = r"bounds"
+        with pytest.raises(np.AxisError, match=msg):
+            num.prod(arr, axis=2)
+
+    @pytest.mark.xfail
+    @pytest.mark.parametrize("axis", ((-1, 1), (0, 1), (1, 2), (0, 2)))
+    def test_axis_tuple(self, axis):
+        size = (5, 5, 5)
+        arr_np = np.random.random(size) * 10
+        arr_num = num.array(arr_np)
+        out_np = np.prod(arr_np, axis=axis)
+        # cuNumeric raises NotImplementedError:
+        # "Need support for reducing multiple dimensions".
+        # NumPy computes a result.
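+        # (An illustrative note, assuming NumPy semantics: for this
+        # (5, 5, 5) input, np.prod(arr_np, axis=(0, 2)) reduces over both
+        # axes at once and returns a shape-(5,) array, which is the
+        # behavior cuNumeric has to match for this xfail to be lifted.)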
+ out_num = num.prod(arr_num, axis=axis) + assert allclose(out_np, out_num) + + def test_out_negative(self): + in_shape = (2, 3, 4) + out_shape = (2, 3, 3) + arr_num = num.random.random(in_shape) * 10 + arr_out = num.random.random(out_shape) * 10 + msg = r"shapes do not match" + with pytest.raises(ValueError, match=msg): + num.prod(arr_num, out=arr_out, axis=2) + + def test_keepdims(self): + in_shape = (2, 3, 4) + arr_num = num.random.random(in_shape) * 10 + arr_np = np.array(arr_num) + out_np = np.prod(arr_np, axis=2, keepdims=True) + out_num = num.prod(arr_num, axis=2, keepdims=True) + assert np.array_equal(out_np, out_num) + + @pytest.mark.xfail + def test_initial_scalar_list(self): + arr = [[1, 2], [3, 4]] + initial_value = [3] + + out_num = num.prod(arr, initial=initial_value) # array(72) + # Numpy raises ValueError: + # Input object to FillWithScalar is not a scalar + out_np = np.prod(arr, initial=initial_value) + + assert np.array_equal(out_np, out_num) + + def test_initial_list(self): + arr = [[1, 2], [3, 4]] + initial_value = [2, 3] + with pytest.raises(ValueError): + num.prod(arr, initial=initial_value) + + def test_initial_empty_array(self): + size = (1, 0) + arr_np = np.random.random(size) * 10 + arr_num = num.array(arr_np) + initial_value = random.uniform(-20.0, 20.0) + out_num = num.prod(arr_num, initial=initial_value) + out_np = np.prod(arr_np, initial=initial_value) + assert allclose(out_np, out_num) + + @pytest.mark.xfail + def test_where(self): + arr = [[1, 2], [3, 4]] + out_np = np.prod(arr, where=[False, True]) # return 8 + # cuNumeric raises NotImplementedError: + # the `where` parameter is currently not supported + out_num = num.prod(arr, where=[False, True]) + assert np.array_equal(out_np, out_num) + + +class TestProdPositive(object): + """ + this class is to test positive cases + """ + + @pytest.mark.parametrize("size", SIZES) + def test_basic(self, size): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + out_np = np.prod(arr_np) + out_num = np.prod(arr_num) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("dtype", DTYPE, ids=to_dtype) + def test_dtype(self, dtype): + size = (5, 5, 5) + arr = np.random.random(size) * 10 + arr_np = np.array(arr, dtype=dtype) + arr_num = num.array(arr_np) + out_np = np.prod(arr_np) + out_num = num.prod(arr_num) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("dtype", COMPLEX_TYPE, ids=to_dtype) + def test_dtype_complex(self, dtype): + arr = (num.random.rand(5, 5) * 10 + 2) + ( + num.random.rand(5, 5) * 10 * 1.0j + 0.2j + ) + arr_np = np.array(arr, dtype=dtype) + arr_num = num.array(arr_np) + out_np = np.prod(arr_np) + out_num = num.prod(arr_num) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("axis", (_ for _ in range(-2, 3, 1))) + def test_axis_basic(self, axis): + size = (5, 5, 5) + arr_np = np.random.random(size) * 10 + arr_num = num.array(arr_np) + out_num = num.prod(arr_num, axis=axis) + out_np = np.prod(arr_np, axis=axis) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("size", SIZES) + def test_out_basic(self, size): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + out_np = np.random.random(()) + out_num = num.random.random(()) + np.prod(arr_np, out=out_np) + num.prod(arr_num, out=out_num) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("size", SIZES) + def test_out_axis(self, size): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + ndim = arr_np.ndim + for axis in range(-ndim + 1, ndim, 1): + out_shape 
= () + if type(size) == tuple: + out_shape_list = list(size) + del out_shape_list[axis] + out_shape = tuple(out_shape_list) + out_np = np.random.random(out_shape) + out_num = num.random.random(out_shape) + np.prod(arr_np, out=out_np, axis=axis) + num.prod(arr_num, out=out_num, axis=axis) + assert allclose(out_np, out_num) + + @pytest.mark.xfail + @pytest.mark.parametrize("size", SIZES_E2) + def test_out_axis_dtype(self, size): + arr = np.random.random(size) * 10 + arr_np = np.array(arr, dtype=to_dtype("f")) + arr_num = num.array(arr, dtype=to_dtype("f")) + + ndim = arr_np.ndim + for axis in range(-ndim + 1, ndim, 1): + out_shape = () + if type(size) == tuple: + out_shape_list = list(size) + del out_shape_list[axis] + out_shape = tuple(out_shape_list) + out = np.random.random(out_shape) + + out_np = np.array(out, dtype=to_dtype("i")) + out_num = num.array(out, dtype=to_dtype("i")) + + np.prod(arr_np, out=out_np, axis=axis) + num.prod(arr_num, out=out_num, axis=axis) + + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("size", SIZES) + def test_axis_keepdims_false(self, size): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + ndim = arr_np.ndim + for axis in range(-ndim + 1, ndim, 1): + out_np = np.prod(arr_np, axis=axis, keepdims=False) + out_num = num.prod(arr_num, axis=axis, keepdims=False) + assert allclose(out_np, out_num) + + @pytest.mark.xfail + @pytest.mark.parametrize("size", SIZE_E) + def test_axis_keepdims_true(self, size): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + ndim = arr_np.ndim + for axis in range(-ndim + 1, ndim, 1): + out_np = np.prod(arr_np, axis=axis, keepdims=True) + out_num = num.prod(arr_num, axis=axis, keepdims=True) + # in cunumeric/deferred/unary_reduction: + # if lhs_array.size == 1: + # > assert axes is None or len(axes) == rhs_array.ndim - ( + # 0 if keepdims else lhs_array.ndim + # ) + # E AssertionError + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("size", NO_EMPTY_SIZE) + def test_initial(self, size): + arr_np = np.random.random(size) * 10 + arr_num = num.array(arr_np) + initial_value = random.uniform(-20.0, 20.0) + out_num = num.prod(arr_num, initial=initial_value) + out_np = np.prod(arr_np, initial=initial_value) + + assert allclose(out_np, out_num) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(sys.argv)) diff --git a/tests/integration/test_reduction.py b/tests/integration/test_reduction.py index a5c0ddad3..b406a10d7 100644 --- a/tests/integration/test_reduction.py +++ b/tests/integration/test_reduction.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import random import numpy as np import pytest @@ -19,85 +20,280 @@ import cunumeric as num +# numpy.sum(a, axis=None, dtype=None, out=None, keepdims=, +# initial=, where=) -def test_basic(): - x = num.array([]) - r = num.sum(x) - assert r == 0 +DIM = 5 +SIZES = [ + (0,), + (1), + (DIM), + (0, 1), + (1, 0), + (1, 1), + (1, DIM), + (DIM, 1), + (DIM, DIM), + (1, 0, 0), + (1, 1, 0), + (1, 0, 1), + (1, 1, 1), + (DIM, 1, 1), + (1, DIM, 1), + (1, 1, DIM), + (DIM, DIM, DIM), +] - x = num.array([1]) - r = num.sum(x) +NO_EMPTY_SIZE = [ + (1), + (DIM), + (1, 1), + (1, DIM), + (DIM, 1), + (DIM, DIM), + (1, 1, 1), + (DIM, 1, 1), + (1, DIM, 1), + (1, 1, DIM), + (DIM, DIM, DIM), +] - assert r == 1 +ARR = ([], [[]], [[], []], np.inf, np.Inf, -10.3, 0, 200, 5 + 8j) - x = num.eye(3) - r = num.sum(x) +DTYPE = ["l", "L", "f", "d"] +COMPLEX_TYPE = ["F", "D"] +NEGATIVE_DTYPE = ["h", "i", "H", "I", "e", "?", "b", "B"] - assert r == 3 - x = num.array([1, 2, 3, 4.0]) - r = num.sum(x) +def to_dtype(s): + return str(np.dtype(s)) - assert r == 10 - x = num.array([1, 2, 3, 4.0, 5.0]) - r = num.prod(x) - assert r == 120 +class TestSumNegative(object): + """ + this class is to test negative cases + """ + @pytest.mark.parametrize("arr", ARR) + def test_array(self, arr): + assert np.array_equal(np.sum(arr), num.sum(arr)) -def test_empty(): - assert np.array_equal(num.sum([]), np.sum([])) - assert np.array_equal(num.sum([[], []]), np.sum([[], []])) + @pytest.mark.xfail + @pytest.mark.parametrize("dtype", NEGATIVE_DTYPE, ids=to_dtype) + def test_dtype_negative(self, dtype): + size = (5, 5, 5) + arr = np.random.random(size) * 10 + arr_np = np.array(arr, dtype=dtype) + arr_num = num.array(arr_np) + out_np = np.sum(arr_np) # Numpy return sum of all datas + out_num = num.sum( + arr_num + ) # cuNumeric return an array with different data + assert allclose(out_np, out_num) + def test_axis_out_bound(self): + arr = [-1, 0, 1, 2, 10] + msg = r"bounds" + with pytest.raises(np.AxisError, match=msg): + num.sum(arr, axis=2) -def test_scalar(): - assert np.array_equal(num.sum(0), np.sum(0)) - assert np.array_equal(num.sum(1), np.sum(1)) + @pytest.mark.xfail + @pytest.mark.parametrize("axis", ((-1, 1), (0, 1), (1, 2), (0, 2))) + def test_axis_tuple(self, axis): + size = (5, 5, 5) + arr_np = np.random.random(size) * 10 + arr_num = num.array(arr_np) + out_np = np.sum(arr_np, axis=axis) + # cuNumeric raises NotImplementedError: + # 'Need support for reducing multiple dimensions' + # Numpy get results + out_num = num.sum(arr_num, axis=axis) + assert allclose(out_np, out_num) + def test_out_negative(self): + in_shape = (2, 3, 4) + out_shape = (2, 3, 3) + arr_num = num.random.random(in_shape) * 10 + arr_out = num.random.random(out_shape) * 10 + msg = r"shapes do not match" + with pytest.raises(ValueError, match=msg): + num.sum(arr_num, out=arr_out, axis=2) -def test_1d(): - assert np.array_equal(num.sum(num.array([0])), np.sum(np.array([0]))) - assert np.array_equal(num.sum([1]), np.sum([1])) + def test_keepdims(self): + in_shape = (2, 3, 4) + arr_num = num.random.random(in_shape) * 10 + arr_np = np.array(arr_num) + out_np = np.sum(arr_np, axis=2, keepdims=True) + out_num = num.sum(arr_num, axis=2, keepdims=True) + assert np.array_equal(out_np, out_num) - x = num.array([1, 0, 2, -1, 0, 0, 8]) - x_np = np.array([1, 0, 2, -1, 0, 0, 8]) - assert np.array_equal(num.sum(x), np.sum(x_np)) + @pytest.mark.xfail + def test_initial_scalar_list(self): + arr = [[1, 2], [3, 4]] + initial_value = [3] + out_num = num.sum(arr, initial=initial_value) # 
array(13) + out_np = np.sum( + arr, initial=initial_value + ) # ValueError: Input object to FillWithScalar is not a scalar + assert np.array_equal(out_np, out_num) + def test_initial_list(self): + arr = [[1, 2], [3, 4]] + initial_value = [2, 3] + with pytest.raises(ValueError): + num.sum(arr, initial=initial_value) -def test_2d(): - x = num.array([[0, 1, 0], [2, 0, 3]]) - x_np = np.array([[0, 1, 0], [2, 0, 3]]) - assert np.array_equal(num.sum(x), np.sum(x_np)) + @pytest.mark.xfail + def test_initial_empty_array(self): + size = (1, 0) + arr_np = np.random.random(size) * 10 + arr_num = num.array(arr_np) + initial_value = random.uniform(-20.0, 20.0) + out_num = num.sum(arr_num, initial=initial_value) # return 0.0 + out_np = np.sum(arr_np, initial=initial_value) # return initial_value + assert allclose(out_np, out_num) - x = num.eye(3) - x_np = np.eye(3) - assert np.array_equal(num.sum(x), np.sum(x_np)) + @pytest.mark.xfail + def test_where(self): + arr = [[1, 2], [3, 4]] + out_np = np.sum(arr, where=[False, True]) # return 6 + # cuNumeric raises NotImplementedError: + # "the `where` parameter is currently not supported" + out_num = num.sum(arr, where=[False, True]) + assert np.array_equal(out_np, out_num) -def test_3d(): - x = num.array( - [ - [[0, 1], [1, 1], [7, 0], [1, 0], [0, 1]], - [[3, 0], [0, 3], [0, 0], [2, 2], [0, 19]], - ] - ) - x_np = np.array( - [ - [[0, 1], [1, 1], [7, 0], [1, 0], [0, 1]], - [[3, 0], [0, 3], [0, 0], [2, 2], [0, 19]], - ] - ) - assert np.array_equal(num.sum(x, axis=0), np.sum(x_np, axis=0)) - assert np.array_equal(num.sum(x, axis=1), np.sum(x_np, axis=1)) - assert np.array_equal(num.sum(x, axis=2), np.sum(x_np, axis=2)) - assert np.array_equal(num.sum(x), np.sum(x_np)) +class TestSumPositive(object): + """ + this class is to test positive cases + """ - x_np = np.concatenate((x_np,) * 2000, axis=1) - x = num.array(x_np) - assert np.array_equal(num.sum(x, axis=0), np.sum(x_np, axis=0)) - assert np.array_equal(num.sum(x, axis=1), np.sum(x_np, axis=1)) - assert np.array_equal(num.sum(x, axis=2), np.sum(x_np, axis=2)) - assert np.array_equal(num.sum(x), np.sum(x_np)) + @pytest.mark.parametrize("size", SIZES) + def test_basic(self, size): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + out_np = np.sum(arr_np) + out_num = np.sum(arr_num) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("dtype", DTYPE, ids=to_dtype) + def test_dtype(self, dtype): + size = (5, 5, 5) + arr = np.random.random(size) * 10 + arr_np = np.array(arr, dtype=dtype) + arr_num = num.array(arr_np) + out_np = np.sum(arr_np) + out_num = num.sum(arr_num) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("dtype", COMPLEX_TYPE, ids=to_dtype) + def test_dtype_complex(self, dtype): + arr = num.random.rand(5, 5) * 10 + num.random.rand(5, 5) * 10 * 1.0j + arr_np = np.array(arr, dtype=dtype) + arr_num = num.array(arr_np) + out_np = np.sum(arr_np) + out_num = num.sum(arr_num) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("axis", (_ for _ in range(-2, 3, 1))) + def test_axis_basic(self, axis): + size = (5, 5, 5) + arr_np = np.random.random(size) * 10 + arr_num = num.array(arr_np) + out_num = num.sum(arr_num, axis=axis) + out_np = np.sum(arr_np, axis=axis) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("size", SIZES) + def test_out_basic(self, size): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + out_np = np.random.random(()) + out_num = num.random.random(()) + np.sum(arr_np, out=out_np) + num.sum(arr_num, 
out=out_num) + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("size", SIZES) + def test_out_axis(self, size): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + ndim = arr_np.ndim + for axis in range(-ndim + 1, ndim, 1): + out_shape = () + if type(size) == tuple: + out_shape_list = list(size) + del out_shape_list[axis] + out_shape = tuple(out_shape_list) + out_np = np.random.random(out_shape) + out_num = num.random.random(out_shape) + np.sum(arr_np, out=out_np, axis=axis) + num.sum(arr_num, out=out_num, axis=axis) + assert allclose(out_np, out_num) + + @pytest.mark.xfail + @pytest.mark.parametrize("size", SIZES) + def test_out_axis_dtype(self, size): + arr = np.random.random(size) * 10 + arr_np = np.array(arr, dtype=to_dtype("f")) + arr_num = num.array(arr, dtype=to_dtype("f")) + + ndim = arr_np.ndim + for axis in range(-ndim + 1, ndim, 1): + out_shape = () + if type(size) == tuple: + out_shape_list = list(size) + del out_shape_list[axis] + out_shape = tuple(out_shape_list) + out = np.random.random(out_shape) + + out_np = np.array(out, dtype=to_dtype("i")) + out_num = num.array(out, dtype=to_dtype("i")) + + np.sum(arr_np, out=out_np, axis=axis) + num.sum(arr_num, out=out_num, axis=axis) + + # some data in the out_result are different + # out_np = array([[39, 23, 22, 37, 19], + # [21, 28, 29, 38, 24], + # [29, 25, 30, 27, 23], + # [24, 30, 22, 29, 22], + # [16, 15, 29, 22, 13]], dtype=int32) + # out_num = array([[38, 21, 20, 35, 17], + # [19, 25, 27, 37, 22], + # [27, 24, 29, 24, 22], + # [21, 27, 20, 26, 19], + # [13, 14, 26, 20, 12]], dtype=int32) + + assert allclose(out_np, out_num) + + @pytest.mark.xfail + @pytest.mark.parametrize("size", SIZES) + @pytest.mark.parametrize("keepdims", [False, True]) + def test_axis_keepdims(self, size, keepdims): + arr_np = np.random.random(size) + arr_num = num.array(arr_np) + ndim = arr_np.ndim + for axis in range(-ndim + 1, ndim, 1): + out_np = np.sum(arr_np, axis=axis, keepdims=keepdims) + out_num = num.sum(arr_num, axis=axis, keepdims=keepdims) + # in cunumeric/deferred/unary_reduction: + # if lhs_array.size == 1: + # > assert axes is None or len(axes) == rhs_array.ndim - ( + # 0 if keepdims else lhs_array.ndim + # ) + # E AssertionError + assert allclose(out_np, out_num) + + @pytest.mark.parametrize("size", NO_EMPTY_SIZE) + def test_initial(self, size): + arr_np = np.random.random(size) * 10 + arr_num = num.array(arr_np) + initial_value = random.uniform(-20.0, 20.0) + out_num = num.sum(arr_num, initial=initial_value) + out_np = np.sum(arr_np, initial=initial_value) + + assert allclose(out_np, out_num) def test_indexed(): diff --git a/tests/integration/test_reduction_axis.py b/tests/integration/test_reduction_axis.py deleted file mode 100644 index 6ae89f07c..000000000 --- a/tests/integration/test_reduction_axis.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2021-2022 NVIDIA Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#
-
-from itertools import permutations
-
-import numpy as np
-import pytest
-
-import cunumeric as num
-
-
-def _sum(shape, axis, lib, dtype=None):
-    return lib.ones(shape).sum(axis=axis, dtype=dtype)
-
-
-# Try various non-square shapes, to nudge the core towards trying many
-# different partitionings.
-@pytest.mark.parametrize("axis", range(3), ids=str)
-@pytest.mark.parametrize("shape", permutations((3, 4, 5)), ids=str)
-def test_3d(shape, axis):
-    assert np.array_equal(_sum(shape, axis, np), _sum(shape, axis, num))
-    assert np.array_equal(
-        _sum(shape, axis, np, dtype="D"), _sum(shape, axis, num, dtype="D")
-    )
-
-
-if __name__ == "__main__":
-    import sys
-
-    sys.exit(pytest.main(sys.argv))
diff --git a/tests/integration/test_reduction_complex.py b/tests/integration/test_reduction_complex.py
deleted file mode 100644
index 0800246d0..000000000
--- a/tests/integration/test_reduction_complex.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2021-2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import numpy as np
-import pytest
-
-import cunumeric as num
-
-x = np.array([1 + 4j, 2 + 5j, 3 + 6j], np.complex64)
-
-
-def test_sum():
-    cx = num.array(x)
-    assert num.all(num.abs(num.sum(cx) - np.sum(x)) < 1e-5)
-
-
-def test_prod():
-    cx = num.array(x)
-    assert num.all(num.abs(num.prod(cx) - np.prod(x)) < 1e-5)
-
-
-if __name__ == "__main__":
-    import sys
-
-    sys.exit(pytest.main(sys.argv))

From a927aa9e55a68b5a01e0112fc52ead45950c4595 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 12 Dec 2022 22:04:23 -0800
Subject: [PATCH 64/89] [pre-commit.ci] pre-commit autoupdate (#738)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

updates:
- [github.com/PyCQA/isort: 5.10.1 → 5.11.1](https://github.com/PyCQA/isort/compare/5.10.1...5.11.1)
- [github.com/psf/black: 22.10.0 → 22.12.0](https://github.com/psf/black/compare/22.10.0...22.12.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e929c0833..71a89944f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,11 +7,11 @@ repos:
         pass_filenames: false
         args: ['cunumeric']
   - repo: https://github.com/PyCQA/isort
-    rev: 5.10.1
+    rev: 5.11.1
     hooks:
       - id: isort
   - repo: https://github.com/psf/black
-    rev: 22.10.0
+    rev: 22.12.0
     hooks:
       - id: black
   - repo: https://github.com/PyCQA/flake8

From 69dfb0fb315b0277f13bab677c2bb0bd98c69d65 Mon Sep 17 00:00:00 2001
From: Marcin Zalewski
Date: Tue, 13 Dec 2022 00:17:45 -0800
Subject: [PATCH 65/89] Pin conda packages to older versions as a
 workaround (#737)

* Pin curand
* Pin cusolver

Co-authored-by: Marcin Zalewski
---
 conda/conda-build/meta.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
index 7a274a1d3..b1f2de956 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -104,7 +104,10 @@ requirements: # the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported. - cuda-nvcc ={{ cuda_version }} # libcurand is used both in CPU and GPU builds - - libcurand-dev + # temporarily pin curand until problems are resolved + - libcurand-dev =10.3.0.86 + # the following line is only necessary for pinning curand + - libcurand =10.3.0.86 # cudart needed for CPU and GPU builds because of curand - cuda-cudart-dev ={{ cuda_version }} - python @@ -140,7 +143,7 @@ requirements: # - libcutensor >=1.3 - cutensor >=1.3 - libcublas - - libcusolver + - libcusolver =11.4.1.48-0 - libcufft {% endif %} - opt_einsum >=3.3 From 53778f348b89dae571e98e44345a61cf4e26b21f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20F=C3=B6rster?= <97973773+mfoerste4@users.noreply.github.com> Date: Wed, 14 Dec 2022 19:38:04 +0100 Subject: [PATCH 66/89] guard all2all from empty transfer (#727) --- src/cunumeric/sort/sort.cu | 44 +++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/src/cunumeric/sort/sort.cu b/src/cunumeric/sort/sort.cu index af931c807..c303eb1ab 100644 --- a/src/cunumeric/sort/sort.cu +++ b/src/cunumeric/sort/sort.cu @@ -1557,32 +1557,36 @@ void sample_sort_nccl_nd(SortPiece> local_sorted, // communicate all2all (in sort dimension) CHECK_NCCL(ncclGroupStart()); for (size_t r = 0; r < num_sort_ranks; r++) { - CHECK_NCCL(ncclSend(val_send_buffers[r].ptr(0), - size_send_total[r] * sizeof(VAL), - ncclInt8, - sort_ranks[r], - *comm, - stream)); - CHECK_NCCL(ncclRecv(merge_buffers[r].values.ptr(0), - merge_buffers[r].size * sizeof(VAL), - ncclInt8, - sort_ranks[r], - *comm, - stream)); + if (size_send_total[r] > 0) + CHECK_NCCL(ncclSend(val_send_buffers[r].ptr(0), + size_send_total[r] * sizeof(VAL), + ncclInt8, + sort_ranks[r], + *comm, + stream)); + if (merge_buffers[r].size > 0) + CHECK_NCCL(ncclRecv(merge_buffers[r].values.ptr(0), + merge_buffers[r].size * sizeof(VAL), + ncclInt8, + sort_ranks[r], + *comm, + stream)); } CHECK_NCCL(ncclGroupEnd()); if (argsort) { CHECK_NCCL(ncclGroupStart()); for (size_t r = 0; r < num_sort_ranks; r++) { - CHECK_NCCL(ncclSend( - idc_send_buffers[r].ptr(0), size_send_total[r], ncclInt64, sort_ranks[r], *comm, stream)); - CHECK_NCCL(ncclRecv(merge_buffers[r].indices.ptr(0), - merge_buffers[r].size, - ncclInt64, - sort_ranks[r], - *comm, - stream)); + if (size_send_total[r] > 0) + CHECK_NCCL(ncclSend( + idc_send_buffers[r].ptr(0), size_send_total[r], ncclInt64, sort_ranks[r], *comm, stream)); + if (merge_buffers[r].size > 0) + CHECK_NCCL(ncclRecv(merge_buffers[r].indices.ptr(0), + merge_buffers[r].size, + ncclInt64, + sort_ranks[r], + *comm, + stream)); } CHECK_NCCL(ncclGroupEnd()); } From 3177e5aecacba15f032b5500cf0bc369cdb0100c Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Sat, 17 Dec 2022 02:26:04 +0200 Subject: [PATCH 67/89] Clean up the basic build instructions (#741) --- BUILD.md | 24 +++++++++--------------- README.md | 4 ---- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/BUILD.md b/BUILD.md index 589d7c016..e7183437c 100644 --- a/BUILD.md +++ b/BUILD.md @@ -15,36 +15,30 @@ limitations under the License. --> -# Dependencies +# Basic build Users must have a working installation of the [Legate Core](https://github.com/nv-legate/legate.core) -library prior to installing cuNumeric. 
+library prior to installing cuNumeric. **Installing cuNumeric by itself will not +automatically install Legate Core.** As for other dependencies, the Dependencies section on the [Legate Core build instructions](https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md) -also covers cuNumeric. +also covers cuNumeric, so no additional packages are required. -# Building for Users +Once Legate Core is installed, you can simply invoke `./install.py` from the +cuNumeric top-level directory. The build will automatically pick up the +configuration used when building Legate Core (e.g. the CUDA Toolkit directory). -cuNumeric provides the same source-based installation scripts as Legate Core (a -custom `install.py` script, that is backed by `pip install`). See the -[Legate Core build instructions](https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md) -for help on using these. - -Note: Installing cuNumeric by itself will *not* automatically install Legate Core. +# Advanced topics -# Building for Developers - -## Overview +## Building through pip & cmake cuNumeric uses the same cmake/scikit-build-based build workflow as Legate Core. See the [Legate Core build instructions](https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md) for an overview. -## Example - There are several examples in the `scripts` folder. We walk through the steps in `build-with-legate-separately-no-install.sh` here. diff --git a/README.md b/README.md index 62eecb153..d3973a60c 100644 --- a/README.md +++ b/README.md @@ -55,10 +55,6 @@ conda install -c nvidia -c conda-forge -c legate cunumeric The conda package is compatible with CUDA >= 11.4 (CUDA driver version >= r470), and Volta or later GPU architectures. -Docker image build scripts, as well as specialized install scripts for -supported clusters are available on the -[quickstart](https://github.com/nv-legate/quickstart) repo. - See [BUILD.md](BUILD.md) for instructions on building cuNumeric from source. 
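As a quick smoke test of an installation built either way (a minimal sketch;
it assumes nothing beyond a working `cunumeric` install), cuNumeric is used
through the same API surface as NumPy:

```python
# Smoke test: the customary alias lets existing NumPy code run unchanged.
import cunumeric as np

x = np.arange(10)
assert x.sum() == 45  # 0 + 1 + ... + 9
print("cuNumeric is working")
```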
## Usage and Execution From a79079b43ca18e9a1f32b46f597e3ef0948852ac Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Mon, 19 Dec 2022 15:14:15 +0200 Subject: [PATCH 68/89] Refactor benchmarks (#567) * Centralize time() function in benchmark.py * Remove python2-isms * Centralize some argument parsing in benchmark.py * Use legate.timing in all benchmarks * Add warmup iterations to some more benchmarks * Missing argument * Put back samples argument in run_benchmark * Fix #iterations in einsum * Port added except from branch-22.12 * Cover new solve example * Avoid mixup of --centers with -cunumeric:test Co-authored-by: Manolis Papadakis --- examples/benchmark.py | 54 +++++++++++ examples/black_scholes.py | 32 ++---- examples/cg.py | 178 +++++++++++++++------------------- examples/einsum.py | 72 ++++---------- examples/gemm.py | 64 ++++++------ examples/indexing_routines.py | 21 +--- examples/jacobi.py | 101 ++++++------------- examples/kmeans.py | 26 ++--- examples/kmeans_slow.py | 26 ++--- examples/kmeans_sort.py | 32 ++---- examples/linreg.py | 74 ++++++-------- examples/logreg.py | 107 ++++++-------------- examples/lstm_backward.py | 31 ++---- examples/lstm_forward.py | 31 ++---- examples/lstm_full.py | 25 ++--- examples/richardson_lucy.py | 18 +--- examples/scan.py | 49 +--------- examples/solve.py | 16 ++- examples/sort.py | 53 +--------- examples/stencil.py | 89 +++++------------ examples/wgrad.py | 21 ++-- 21 files changed, 386 insertions(+), 734 deletions(-) diff --git a/examples/benchmark.py b/examples/benchmark.py index ec107f24f..1d0a6f73e 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -18,6 +18,60 @@ import math from functools import reduce +try: + from legate.timing import time +except (ImportError, RuntimeError): + from time import perf_counter_ns + + def time(): + return perf_counter_ns() / 1000.0 + + +# Add common arguments and parse +def parse_args(parser): + parser.add_argument( + "-b", + "--benchmark", + type=int, + default=1, + dest="benchmark", + help="number of times to benchmark this application (default 1 - " + "normal execution)", + ) + parser.add_argument( + "--package", + dest="package", + choices=["legate", "numpy", "cupy"], + type=str, + default="legate", + help="NumPy package to use", + ) + parser.add_argument( + "--cupy-allocator", + dest="cupy_allocator", + choices=["default", "off", "managed"], + type=str, + default="default", + help="cupy allocator to use", + ) + args, _ = parser.parse_known_args() + if args.package == "legate": + import cunumeric as np + elif args.package == "cupy": + import cupy as np + + if args.cupy_allocator == "off": + np.cuda.set_allocator(None) + print("Turning off memory pool") + elif args.cupy_allocator == "managed": + np.cuda.set_allocator( + np.cuda.MemoryPool(np.cuda.malloc_managed).malloc + ) + print("Using managed memory pool") + elif args.package == "numpy": + import numpy as np + return args, np + # A helper method for benchmarking applications def run_benchmark(f, samples, name, args): diff --git a/examples/black_scholes.py b/examples/black_scholes.py index aadcef456..d64e032d5 100644 --- a/examples/black_scholes.py +++ b/examples/black_scholes.py @@ -16,12 +16,8 @@ # import argparse -import datetime -import math -from benchmark import run_benchmark - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def generate_random(N, min, max, D): @@ -75,16 +71,11 @@ def black_scholes(S, X, T, R, V): def run_black_scholes(N, D): print("Running black scholes on %dK options..." 
% N) N *= 1000 - start = datetime.datetime.now() + start = time() S, X, T, R, V = initialize(N, D) - call, put = black_scholes(S, X, T, R, V) - # Check the result for NaNs to synchronize before stopping timing - call_sum = np.sum(call) - put_sum = np.sum(put) - assert not math.isnan(call_sum) and not math.isnan(put_sum) - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + _, _ = black_scholes(S, X, T, R, V) + stop = time() + total = (stop - start) / 1000.0 print("Elapsed Time: " + str(total) + " ms") return total @@ -107,16 +98,9 @@ def run_black_scholes(N, D): dest="P", help="precision of the computation in bits", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + if args.P == 16: run_benchmark( run_black_scholes, diff --git a/examples/cg.py b/examples/cg.py index 50d7d1964..79721f7b0 100644 --- a/examples/cg.py +++ b/examples/cg.py @@ -17,15 +17,7 @@ import argparse -from benchmark import run_benchmark - -try: - from legate.timing import time -except (ImportError, RuntimeError): - from time import perf_counter_ns - - def time(): - return perf_counter_ns() / 1000.0 +from benchmark import parse_args, run_benchmark, time # This is technically dead code right now, but we'll keep it around in @@ -75,7 +67,28 @@ def generate_2D(N, corners): return A, b -def solve(A, b, conv_iters, max_iters, conv_threshold, verbose): +def check(A, x, b): + print("Checking result...") + if np.allclose(A.dot(x), b): + print("PASS!") + else: + print("FAIL!") + + +def run_cg( + N, + corners, + conv_iters, + max_iters, + warmup, + conv_threshold, + perform_check, + timing, + verbose, +): + # A, b = generate_random(N) + A, b = generate_2D(N, corners) + print("Solving system...") x = np.zeros(A.shape[1]) r = b - A.dot(x) @@ -86,7 +99,11 @@ def solve(A, b, conv_iters, max_iters, conv_threshold, verbose): max_iters = ( min(max_iters, b.shape[0]) if max_iters is not None else b.shape[0] ) - for i in range(max_iters): + + start = time() + for i in range(-warmup, max_iters): + if i == 0: + start = time() Ap = A.dot(p) alpha = rsold / (p.dot(Ap)) x = x + alpha * p @@ -94,9 +111,11 @@ def solve(A, b, conv_iters, max_iters, conv_threshold, verbose): rsnew = r.dot(r) # We only do the convergence test every conv_iters or on the last # iteration - if (i % conv_iters == 0 or i == (max_iters - 1)) and np.sqrt( - rsnew - ) < conv_threshold: + if ( + i >= 0 + and (i % conv_iters == 0 or i == (max_iters - 1)) + and np.sqrt(rsnew) < conv_threshold + ): converged = i break if verbose: @@ -104,11 +123,19 @@ def solve(A, b, conv_iters, max_iters, conv_threshold, verbose): beta = rsnew / rsold p = r + beta * p rsold = rsnew + stop = time() + if converged < 0: print("Convergence FAILURE!") else: print("Converged in %d iterations" % (converged)) - return x + if perform_check: + check(A, x, b) + + total = (stop - start) / 1000.0 + if timing: + print(f"Elapsed Time: {total} ms") + return total def precondition(A, N, corners): @@ -120,10 +147,22 @@ def precondition(A, N, corners): return M -def preconditioned_solve( - A, M, b, conv_iters, max_iters, conv_threshold, verbose +def run_preconditioned_cg( + N, + corners, + conv_iters, + max_iters, + warmup, + conv_threshold, + perform_check, + timing, + verbose, ): print("Solving system with preconditioner...") + # A, b = generate_random(N) + A, b = 
generate_2D(N, corners) + M = precondition(A, N, corners) + x = np.zeros(A.shape[1]) r = b - A.dot(x) z = M.dot(r) @@ -134,7 +173,11 @@ def preconditioned_solve( max_iters = ( min(max_iters, b.shape[0]) if max_iters is not None else b.shape[0] ) - for i in range(max_iters): + + start = time() + for i in range(-warmup, max_iters): + if i == 0: + start = time() Ap = A.dot(p) alpha = rzold / (p.dot(Ap)) x = x + alpha * p @@ -142,9 +185,11 @@ def preconditioned_solve( rznew = r.dot(r) # We only do the convergence test every conv_iters or on the # last iteration - if (i % conv_iters == 0 or i == (max_iters - 1)) and np.sqrt( - rznew - ) < conv_threshold: + if ( + i >= 0 + and (i % conv_iters == 0 or i == (max_iters - 1)) + and np.sqrt(rznew) < conv_threshold + ): converged = i break if verbose: @@ -154,45 +199,15 @@ def preconditioned_solve( beta = rznew / rzold p = z + beta * p rzold = rznew + stop = time() + if converged < 0: print("Convergence FAILURE!") else: print("Converged in %d iterations" % (converged)) - return x - - -def check(A, x, b): - print("Checking result...") - if np.allclose(A.dot(x), b): - print("PASS!") - else: - print("FAIL!") - - -def run_cg( - N, - corners, - preconditioner, - conv_iters, - max_iters, - conv_threshold, - perform_check, - timing, - verbose, -): - # A, b = generate_random(N) - A, b = generate_2D(N, corners) - start = time() - if preconditioner: - M = precondition(A, N, corners) - x = preconditioned_solve( - A, M, b, conv_iters, max_iters, conv_threshold, verbose - ) - else: - x = solve(A, b, conv_iters, max_iters, conv_threshold, verbose) if perform_check: check(A, x, b) - stop = time() + total = (stop - start) / 1000.0 if timing: print(f"Elapsed Time: {total} ms") @@ -237,6 +252,14 @@ def run_cg( dest="max_iters", help="bound the maximum number of iterations", ) + parser.add_argument( + "-w", + "--warmup", + type=int, + default=5, + dest="warmup", + help="warm-up iterations", + ) parser.add_argument( "-n", "--num", @@ -259,15 +282,6 @@ def run_cg( action="store_true", help="print verbose output", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) parser.add_argument( "--threshold", type=float, @@ -275,51 +289,19 @@ def run_cg( dest="conv_threshold", help="convergence check threshold", ) - parser.add_argument( - "--package", - dest="package", - choices=["legate", "numpy", "cupy"], - type=str, - default="legate", - help="NumPy package to use (legate, numpy, or cupy)", - ) - parser.add_argument( - "--cupy-allocator", - dest="cupy_allocator", - choices=["default", "off", "managed"], - type=str, - default="default", - help="cupy allocator to use (default, off, or managed)", - ) - - args, _ = parser.parse_known_args() - - if args.package == "legate": - import cunumeric as np - elif args.package == "cupy": - import cupy as np - if args.cupy_allocator == "off": - np.cuda.set_allocator(None) - print("Turning off memory pool") - elif args.cupy_allocator == "managed": - np.cuda.set_allocator( - np.cuda.MemoryPool(np.cuda.malloc_managed).malloc - ) - print("Using managed memory pool") - elif args.package == "numpy": - import numpy as np + args, np = parse_args(parser) run_benchmark( - run_cg, + run_preconditioned_cg if args.precondition else run_cg, args.benchmark, "PreCG" if args.precondition else "CG", ( args.N, args.corners, - args.precondition, args.conv_iters, args.max_iters, + args.warmup, args.conv_threshold, args.check, 
args.timing, diff --git a/examples/einsum.py b/examples/einsum.py index 9990c46d7..aac1ec995 100644 --- a/examples/einsum.py +++ b/examples/einsum.py @@ -18,18 +18,10 @@ import argparse import re -from benchmark import run_benchmark +from benchmark import parse_args, run_benchmark, time -try: - from legate.timing import time -except (ImportError, RuntimeError): - from time import perf_counter_ns - def time(): - return perf_counter_ns() / 1000.0 - - -def run_einsum(expr, N, iters, dtype, cupy_compatibility): +def run_einsum(expr, N, iters, warmup, dtype, cupy_compatibility): # Parse contraction expression m = re.match(r"([a-zA-Z]*),([a-zA-Z]*)->([a-zA-Z]*)", expr) assert m is not None @@ -91,7 +83,9 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility): # Run contraction start = time() - for _ in range(iters): + for idx in range(iters + warmup): + if idx == warmup: + start = time() if cupy_compatibility: C = np.einsum(expr, A, B) else: @@ -144,6 +138,14 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility): dest="iters", help="number of iterations to run", ) + parser.add_argument( + "-w", + "--warmup", + type=int, + default=5, + dest="warmup", + help="warm-up iterations", + ) parser.add_argument( "-t", "--dtype", @@ -152,31 +154,6 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility): dest="dtype", help="dtype for array elements", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - parser.add_argument( - "--package", - dest="package", - choices=["legate", "numpy", "cupy"], - type=str, - default="legate", - help="NumPy package to use (legate, numpy, or cupy)", - ) - parser.add_argument( - "--cupy-allocator", - dest="cupy_allocator", - choices=["default", "off", "managed"], - type=str, - default="default", - help="cupy allocator to use (default, off, or managed)", - ) parser.add_argument( "--cupy-compatibility", action="store_true", @@ -185,25 +162,9 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility): else, use einsum(expr, A, B, out=C)""", ) - args, _ = parser.parse_known_args() - - cupy_compatibility = args.cupy_compatibility - if args.package == "legate": - import cunumeric as np - elif args.package == "cupy": - import cupy as np - - if args.cupy_allocator == "off": - np.cuda.set_allocator(None) - print("Turning off memory pool") - elif args.cupy_allocator == "managed": - np.cuda.set_allocator( - np.cuda.MemoryPool(np.cuda.malloc_managed).malloc - ) - print("Using managed memory pool") - cupy_compatibility = True - elif args.package == "numpy": - import numpy as np + args, np = parse_args(parser) + + cupy_compatibility = args.cupy_compatibility or args.package == "cupy" if cupy_compatibility: print("Use C = np.einsum(expr, A, B) for cupy compatibility") @@ -222,6 +183,7 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility): args.expr, args.N, args.iters, + args.warmup, dtypes[args.dtype], cupy_compatibility, ), diff --git a/examples/gemm.py b/examples/gemm.py index 409d43ece..2fe8aafc3 100644 --- a/examples/gemm.py +++ b/examples/gemm.py @@ -16,12 +16,8 @@ # import argparse -import datetime -import math -from benchmark import run_benchmark - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def initialize(M, N, K, ft): @@ -39,7 +35,7 @@ def total_space(M, N, K, ft): return (M * N + M * K + K * N) * np.dtype(ft).itemsize -def run_gemm(N, I, ft): # noqa: E741 +def run_gemm(N, I, warmup, 
ft): # noqa: E741 print("Problem Size: M=" + str(N) + " N=" + str(N) + " K=" + str(N)) print("Total Iterations: " + str(I)) flops = total_flops(N, N, N) @@ -47,25 +43,21 @@ def run_gemm(N, I, ft): # noqa: E741 space = total_space(N, N, N, ft) print("Total Size: " + str(space / 1e6) + " MB") A, B, C = initialize(N, N, N, ft) - # Compute some sums and check for NaNs to force synchronization - # before we start the timing - assert not math.isnan(np.sum(A)) - assert not math.isnan(np.sum(B)) - assert not math.isnan(np.sum(C)) - start = datetime.datetime.now() + + start = time() # Run for as many iterations as was requested - for idx in range(I): + for idx in range(I + warmup): + if idx == warmup: + start = time() np.dot(A, B, out=C) # We need to rotate the matrices to keep Legate honest # about moving data so it can't just duplicate A and B # on the first iteration and reuse them, this means # that A, B, C all need to be square A, B, C = B, C, A - # Do another sum to synchronize for timings, B is last output - assert not math.isnan(np.sum(B)) - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + stop = time() + + total = (stop - start) / 1000.0 print("Elapsed Time: " + str(total) + " ms") average = total / I print("Average GEMM: " + str(average) + " ms") @@ -83,6 +75,14 @@ def run_gemm(N, I, ft): # noqa: E741 dest="I", help="number of iterations to run", ) + parser.add_argument( + "-w", + "--warmup", + type=int, + default=5, + dest="warmup", + help="warm-up iterations", + ) parser.add_argument( "-n", "--num", @@ -100,27 +100,29 @@ def run_gemm(N, I, ft): # noqa: E741 help="number of bits of precision to use for the gemm computation " "(16,32,64)", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + if args.P == 16: run_benchmark( - run_gemm, args.benchmark, "HGEMM", (args.N, args.I, np.float16) + run_gemm, + args.benchmark, + "HGEMM", + (args.N, args.I, args.warmup, np.float16), ) elif args.P == 32: run_benchmark( - run_gemm, args.benchmark, "SGEMM", (args.N, args.I, np.float32) + run_gemm, + args.benchmark, + "SGEMM", + (args.N, args.I, args.warmup, np.float32), ) elif args.P == 64: run_benchmark( - run_gemm, args.benchmark, "DGEMM", (args.N, args.I, np.float64) + run_gemm, + args.benchmark, + "DGEMM", + (args.N, args.I, args.warmup, np.float64), ) else: raise TypeError("Precision must be one of 16, 32, or 64") diff --git a/examples/indexing_routines.py b/examples/indexing_routines.py index 3d275e49f..2e7f40301 100644 --- a/examples/indexing_routines.py +++ b/examples/indexing_routines.py @@ -15,16 +15,11 @@ # limitations under the License. 
# -from __future__ import print_function - import argparse import gc import math -from benchmark import run_benchmark -from legate.timing import time - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def compute_diagonal(steps, N, timing, warmup): @@ -264,15 +259,6 @@ def run_indexing_routines( action="store_true", help="print verbose output", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) parser.add_argument( "-r", "--routine", @@ -281,8 +267,9 @@ def run_indexing_routines( choices=["diagonal", "choose", "repeat", "ai1", "ai2", "ai3", "all"], help="name of the index routine to test", ) - args, unknown = parser.parse_known_args() - print("Warning, unrecognized arguments: ", unknown) + + args, np = parse_args(parser) + run_benchmark( run_indexing_routines, args.benchmark, diff --git a/examples/jacobi.py b/examples/jacobi.py index 56cf8aa90..f4e42081d 100644 --- a/examples/jacobi.py +++ b/examples/jacobi.py @@ -16,17 +16,8 @@ # import argparse -import math -from benchmark import run_benchmark - -try: - from legate.timing import time -except (ImportError, RuntimeError): - from time import perf_counter_ns - - def time(): - return perf_counter_ns() / 1000.0 +from benchmark import parse_args, run_benchmark, time def generate_random(N): @@ -40,16 +31,6 @@ def generate_random(N): return A, b -def solve(A, b, iters, verbose): - print("Solving system...") - x = np.zeros(A.shape[1]) - d = np.diag(A) - R = A - np.diag(d) - for i in range(iters): - x = (b - np.dot(R, x)) / d - return x - - def check(A, x, b): print("Checking result...") if np.allclose(A.dot(x), b): @@ -58,16 +39,24 @@ def check(A, x, b): print("FAIL!") -def run_jacobi(N, iters, perform_check, timing, verbose): +def run_jacobi(N, iters, warmup, perform_check, timing, verbose): A, b = generate_random(N) + + print("Solving system...") + x = np.zeros(A.shape[1]) + d = np.diag(A) + R = A - np.diag(d) + start = time() - x = solve(A, b, iters, verbose) + for i in range(iters + warmup): + if i == warmup: + start = time() + x = (b - np.dot(R, x)) / d + stop = time() + if perform_check: check(A, x, b) - else: - # Need a synchronization here for timing - assert not math.isnan(np.sum(x)) - stop = time() + total = (stop - start) / 1000.0 if timing: print(f"Elapsed Time: {total} ms") @@ -90,6 +79,14 @@ def run_jacobi(N, iters, perform_check, timing, verbose): dest="iters", help="number of iterations to run", ) + parser.add_argument( + "-w", + "--warmup", + type=int, + default=5, + dest="warmup", + help="warm-up iterations", + ) parser.add_argument( "-n", "--num", @@ -112,53 +109,19 @@ def run_jacobi(N, iters, perform_check, timing, verbose): action="store_true", help="print verbose output", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - parser.add_argument( - "--package", - dest="package", - choices=["legate", "numpy", "cupy"], - type=str, - default="legate", - help="NumPy package to use (legate, numpy, or cupy)", - ) - parser.add_argument( - "--cupy-allocator", - dest="cupy_allocator", - choices=["default", "off", "managed"], - type=str, - default="default", - help="cupy allocator to use (default, off, or managed)", - ) - - args, _ = parser.parse_known_args() - - if args.package == "legate": - import cunumeric as np - elif 
args.package == "cupy": - import cupy as np - if args.cupy_allocator == "off": - np.cuda.set_allocator(None) - print("Turning off memory pool") - elif args.cupy_allocator == "managed": - np.cuda.set_allocator( - np.cuda.MemoryPool(np.cuda.malloc_managed).malloc - ) - print("Using managed memory pool") - elif args.package == "numpy": - import numpy as np + args, np = parse_args(parser) run_benchmark( run_jacobi, args.benchmark, "Jacobi", - (args.N, args.iters, args.check, args.timing, args.verbose), + ( + args.N, + args.iters, + args.warmup, + args.check, + args.timing, + args.verbose, + ), ) diff --git a/examples/kmeans.py b/examples/kmeans.py index 736b7af58..a64495e7e 100644 --- a/examples/kmeans.py +++ b/examples/kmeans.py @@ -18,11 +18,8 @@ # Derived from https://github.com/bryancatanzaro/kmeans import argparse -import datetime -from benchmark import run_benchmark - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def initialize(N, D, C, T): @@ -80,7 +77,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 print("Number of dimensions: " + str(D)) print("Number of centroids: " + str(C)) print("Max iterations: " + str(I)) - start = datetime.datetime.now() + start = time() data, centroids = initialize(N, D, C, T) data_dots = np.square(np.linalg.norm(data, ord=2, axis=1)) @@ -128,9 +125,8 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 + ": " + str(prior_distance_sum) ) - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + stop = time() + total = (stop - start) / 1000.0 print("Elapsed Time: " + str(total) + " ms") return total @@ -138,7 +134,6 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-c", "--centers", type=int, default=10, @@ -185,16 +180,9 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 dest="S", help="number of iterations between sampling the log likelihood", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application" - " (default 1 - normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + if args.P == 16: run_benchmark( run_kmeans, diff --git a/examples/kmeans_slow.py b/examples/kmeans_slow.py index 8727fa7d2..83f226af3 100644 --- a/examples/kmeans_slow.py +++ b/examples/kmeans_slow.py @@ -18,11 +18,8 @@ # Derived from https://github.com/bryancatanzaro/kmeans import argparse -import datetime -from benchmark import run_benchmark - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def initialize(N, D, C, T): @@ -81,7 +78,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 print("Number of dimensions: " + str(D)) print("Number of centroids: " + str(C)) print("Max iterations: " + str(I)) - start = datetime.datetime.now() + start = time() data, centroids = initialize(N, D, C, T) data_dots = np.square(np.linalg.norm(data, ord=2, axis=1)) @@ -129,9 +126,8 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 + ": " + str(prior_distance_sum) ) - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + stop = time() + total = (stop - start) / 1000.0 print("Elapsed Time: " + str(total) + " ms") return total @@ -139,7 +135,6 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 if __name__ == "__main__": parser = argparse.ArgumentParser() 
parser.add_argument( - "-c", "--centers", type=int, default=10, @@ -186,16 +181,9 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 dest="S", help="number of iterations between sampling the log likelihood", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + if args.P == 16: run_benchmark( run_kmeans, diff --git a/examples/kmeans_sort.py b/examples/kmeans_sort.py index b848b54e0..406b02833 100644 --- a/examples/kmeans_sort.py +++ b/examples/kmeans_sort.py @@ -18,16 +18,8 @@ # Derived from https://github.com/bryancatanzaro/kmeans import argparse -import datetime -from benchmark import run_benchmark - -import cunumeric as np - -try: - xrange -except NameError: - xrange = range +from benchmark import parse_args, run_benchmark, time def initialize(N, D, C, T): @@ -68,7 +60,7 @@ def find_centroids(data, labels, C, D): # sum across them to create the centroids centroids = np.empty((C, D), dtype=data.dtype) ragged_arrays = np.split(sorted_points, indexes) - for idx in xrange(C): + for idx in range(C): centroids[idx, :] = np.sum(ragged_arrays[idx], axis=0) # To avoid introducing divide by zero errors # If a centroid has no weight, we'll do no normalization @@ -83,7 +75,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 print("Number of dimensions: " + str(D)) print("Number of centroids: " + str(C)) print("Max iterations: " + str(I)) - start = datetime.datetime.now() + start = time() data, centroids = initialize(N, D, C, T) data_dots = np.square(np.linalg.norm(data, ord=2, axis=1)) @@ -130,9 +122,8 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 + ": " + str(prior_distance_sum) ) - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + stop = time() + total = (stop - start) / 1000.0 print("Elapsed Time: " + str(total) + " ms") return total @@ -187,16 +178,9 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 dest="S", help="number of iterations between sampling the log likelihood", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + if args.P == 16: run_benchmark( run_kmeans, diff --git a/examples/linreg.py b/examples/linreg.py index bce2fff58..a8e684e32 100644 --- a/examples/linreg.py +++ b/examples/linreg.py @@ -16,12 +16,8 @@ # import argparse -import datetime -import math -from benchmark import run_benchmark - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def initialize(N, F, T): @@ -32,45 +28,37 @@ def initialize(N, F, T): return x, y -def linear_regression( - T, features, target, steps, learning_rate, sample, add_intercept=False -): - if add_intercept: +def run_linear_regression(N, F, T, I, warmup, S, B): # noqa: E741 + print("Running linear regression...") + print("Number of data points: " + str(N) + "K") + print("Number of features: " + str(F)) + print("Number of iterations: " + str(I)) + + learning_rate = 1e-5 + features, target = initialize(N * 1000, F, T) + if B: intercept = np.ones((features.shape[0], 1), dtype=T) features = np.hstack((intercept, features)) - weights = np.zeros(features.shape[1], dtype=T) - for step in range(steps): + start = time() + for 
step in range(-warmup, I): + if step == 0: + start = time() scores = np.dot(features, weights) error = scores - target gradient = -(1.0 / len(features)) * error.dot(features) weights += learning_rate * gradient - - if step % sample == 0: + if step >= 0 and step % S == 0: print( "Error of step " + str(step) + ": " + str(np.sum(np.power(error, 2))) ) + stop = time() - return weights - - -def run_linear_regression(N, F, T, I, S, B): # noqa: E741 - print("Running linear regression...") - print("Number of data points: " + str(N) + "K") - print("Number of features: " + str(F)) - print("Number of iterations: " + str(I)) - start = datetime.datetime.now() - features, target = initialize(N * 1000, F, T) - weights = linear_regression(T, features, target, I, 1e-5, S, B) - # Check the weights for NaNs to synchronize before stopping timing - assert not math.isnan(np.sum(weights)) - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + total = (stop - start) / 1000.0 print("Elapsed Time: " + str(total) + " ms") return total @@ -78,7 +66,7 @@ def run_linear_regression(N, F, T, I, S, B): # noqa: E741 if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-b", + "-B", "--intercept", dest="B", action="store_true", @@ -100,6 +88,14 @@ def run_linear_regression(N, F, T, I, S, B): # noqa: E741 dest="I", help="number of iterations to run the algorithm for", ) + parser.add_argument( + "-w", + "--warmup", + type=int, + default=5, + dest="warmup", + help="warm-up iterations", + ) parser.add_argument( "-n", "--num", @@ -124,35 +120,29 @@ def run_linear_regression(N, F, T, I, S, B): # noqa: E741 dest="S", help="number of iterations between sampling the log likelihood", ) - parser.add_argument( - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + if args.P == 16: run_benchmark( run_linear_regression, args.benchmark, "LINREG(H)", - (args.N, args.F, np.float16, args.I, args.S, args.B), + (args.N, args.F, np.float16, args.I, args.warmup, args.S, args.B), ) elif args.P == 32: run_benchmark( run_linear_regression, args.benchmark, "LINREG(S)", - (args.N, args.F, np.float32, args.I, args.S, args.B), + (args.N, args.F, np.float32, args.I, args.warmup, args.S, args.B), ) elif args.P == 64: run_benchmark( run_linear_regression, args.benchmark, "LINREG(D)", - (args.N, args.F, np.float64, args.I, args.S, args.B), + (args.N, args.F, np.float64, args.I, args.warmup, args.S, args.B), ) else: raise TypeError("Precision must be one of 16, 32, or 64") diff --git a/examples/logreg.py b/examples/logreg.py index 4e1abb209..43b0e62b0 100644 --- a/examples/logreg.py +++ b/examples/logreg.py @@ -16,17 +16,8 @@ # import argparse -import math -from benchmark import run_benchmark - -try: - from legate.timing import time -except (ImportError, RuntimeError): - from time import perf_counter_ns - - def time(): - return perf_counter_ns() / 1000.0 +from benchmark import parse_args, run_benchmark, time def initialize(N, F, T): @@ -47,45 +38,37 @@ def log_likelihood(features, target, weights): return np.sum(target * scores - np.log(1.0 + np.exp(scores))) -def logistic_regression( - T, features, target, steps, learning_rate, sample, add_intercept=False -): - if add_intercept: +def run_logistic_regression(N, F, T, I, warmup, S, B): # noqa: E741 + print("Running logistic regression...") + print("Number of data 
points: " + str(N) + "K") + print("Number of features: " + str(F)) + print("Number of iterations: " + str(I)) + + learning_rate = 1e-5 + features, target = initialize(N * 1000, F, T) + if B: intercept = np.ones((features.shape[0], 1), dtype=T) features = np.hstack((intercept, features)) - weights = np.zeros(features.shape[1], dtype=T) - for step in range(steps): + start = time() + for step in range(-warmup, I): + if step == 0: + start = time() scores = np.dot(features, weights) predictions = sigmoid(scores) - error = target - predictions gradient = np.dot(error, features) weights += learning_rate * gradient - - if step % sample == 0: + if step >= 0 and step % S == 0: print( "Log Likelihood of step " + str(step) + ": " + str(log_likelihood(features, target, weights)) ) - - return weights - - -def run_logistic_regression(N, F, T, I, S, B): # noqa: E741 - print("Running logistic regression...") - print("Number of data points: " + str(N) + "K") - print("Number of features: " + str(F)) - print("Number of iterations: " + str(I)) - features, target = initialize(N * 1000, F, T) - start = time() - weights = logistic_regression(T, features, target, I, 1e-5, S, B) stop = time() - # Check the weights for NaNs - assert not math.isnan(np.sum(weights)) + total = (stop - start) / 1000.0 print(f"Elapsed Time: {total} ms") return total @@ -94,7 +77,7 @@ def run_logistic_regression(N, F, T, I, S, B): # noqa: E741 if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-b", + "-B", "--intercept", dest="B", action="store_true", @@ -116,6 +99,14 @@ def run_logistic_regression(N, F, T, I, S, B): # noqa: E741 dest="I", help="number of iterations to run the algorithm for", ) + parser.add_argument( + "-w", + "--warmup", + type=int, + default=5, + dest="warmup", + help="warm-up iterations", + ) parser.add_argument( "-n", "--num", @@ -140,69 +131,29 @@ def run_logistic_regression(N, F, T, I, S, B): # noqa: E741 dest="S", help="number of iterations between sampling the log likelihood", ) - parser.add_argument( - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - parser.add_argument( - "--package", - dest="package", - choices=["legate", "numpy", "cupy"], - type=str, - default="legate", - help="NumPy package to use (legate, numpy, or cupy)", - ) - parser.add_argument( - "--cupy-allocator", - dest="cupy_allocator", - choices=["default", "off", "managed"], - type=str, - default="default", - help="cupy allocator to use (default, off, or managed)", - ) - - args, _ = parser.parse_known_args() - if args.package == "legate": - import cunumeric as np - elif args.package == "cupy": - import cupy as np - - if args.cupy_allocator == "off": - np.cuda.set_allocator(None) - print("Turning off memory pool") - elif args.cupy_allocator == "managed": - np.cuda.set_allocator( - np.cuda.MemoryPool(np.cuda.malloc_managed).malloc - ) - print("Using managed memory pool") - elif args.package == "numpy": - import numpy as np + args, np = parse_args(parser) if args.P == 16: run_benchmark( run_logistic_regression, args.benchmark, "LOGREG(H)", - (args.N, args.F, np.float16, args.I, args.S, args.B), + (args.N, args.F, np.float16, args.I, args.warmup, args.S, args.B), ) elif args.P == 32: run_benchmark( run_logistic_regression, args.benchmark, "LOGREG(S)", - (args.N, args.F, np.float32, args.I, args.S, args.B), + (args.N, args.F, np.float32, args.I, args.warmup, args.S, args.B), ) elif args.P == 64: run_benchmark( 
run_logistic_regression, args.benchmark, "LOGREG(D)", - (args.N, args.F, np.float64, args.I, args.S, args.B), + (args.N, args.F, np.float64, args.I, args.warmup, args.S, args.B), ) else: raise TypeError("Precision must be one of 16, 32, or 64") diff --git a/examples/lstm_backward.py b/examples/lstm_backward.py index 554dd49e8..99e47f8be 100644 --- a/examples/lstm_backward.py +++ b/examples/lstm_backward.py @@ -16,16 +16,12 @@ # import argparse -import datetime -import math -from benchmark import run_benchmark - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): - start = datetime.datetime.now() + start = time() WLSTM = np.random.randn( word_size + hidden_size, 4 * hidden_size @@ -77,13 +73,8 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): else: dh0[0] += np.sum(dHin[t, :, word_size:], 0) - # Do a little sum to synchronize and check for NaNs - total = np.sum(dh0) - assert not math.isnan(total) - - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + stop = time() + total = (stop - start) / 1000.0 if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ -92,7 +83,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-b", "--batch", type=int, default=32, dest="batch", help="batch size" + "-B", "--batch", type=int, default=32, dest="batch", help="batch size" ) parser.add_argument( "--hidden", type=int, default=10, dest="hidden", help="hidden size" @@ -115,15 +106,9 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): action="store_true", help="perform timing", ) - parser.add_argument( - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + run_benchmark( run_lstm, args.benchmark, diff --git a/examples/lstm_forward.py b/examples/lstm_forward.py index dde2e7c76..4f1ab7abf 100644 --- a/examples/lstm_forward.py +++ b/examples/lstm_forward.py @@ -16,16 +16,12 @@ # import argparse -import datetime -import math -from benchmark import run_benchmark - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): - start = datetime.datetime.now() + start = time() X = np.random.randn(sentence_length, batch_size, hidden_size) h0 = np.random.randn(1, hidden_size) @@ -67,13 +63,8 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): Ct[t] = np.tanh(C[t]) Hout[t] = IFOGf[t, :, 2 * d : 3 * d] * Ct[t] - # Do a little sum of the outputs to synchronize and check for NaNs - total = np.sum(Hout) - assert not math.isnan(total) - - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + stop = time() + total = (stop - start) / 1000.0 if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ -82,7 +73,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "-b", "--batch", type=int, default=32, dest="batch", help="batch size" + "-B", "--batch", type=int, default=32, dest="batch", help="batch size" ) parser.add_argument( "--hidden", type=int, default=10, 
dest="hidden", help="hidden size" @@ -105,15 +96,9 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): action="store_true", help="perform timing", ) - parser.add_argument( - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + run_benchmark( run_lstm, args.benchmark, diff --git a/examples/lstm_full.py b/examples/lstm_full.py index 0a56400a1..864773739 100644 --- a/examples/lstm_full.py +++ b/examples/lstm_full.py @@ -16,11 +16,8 @@ # import argparse -import datetime -from benchmark import run_benchmark - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time class Param: @@ -293,7 +290,7 @@ def run_lstm( pointer = 0 - start = datetime.datetime.now() + start = time() for iteration in range(max_iters): # Reset @@ -328,9 +325,8 @@ def run_lstm( pointer += T_steps update_status(max_iters, smooth_loss) - stop = datetime.datetime.now() - delta = stop - start - total = delta.total_seconds() * 1000.0 + stop = time() + total = (stop - start) / 1000.0 if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ -400,16 +396,9 @@ def run_lstm( dest="weight", help="standard deviation of weights for initialization", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + run_benchmark( run_lstm, args.benchmark, diff --git a/examples/richardson_lucy.py b/examples/richardson_lucy.py index db8a06a75..7e5514280 100644 --- a/examples/richardson_lucy.py +++ b/examples/richardson_lucy.py @@ -15,10 +15,7 @@ import argparse -from benchmark import run_benchmark -from legate.timing import time - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time float_type = "float32" @@ -113,16 +110,9 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing): action="store_true", help="perform timing", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 " - "- normal execution)", - ) - args = parser.parse_args() + + args, np = parse_args(parser) + run_benchmark( run_richardson_lucy, args.benchmark, diff --git a/examples/scan.py b/examples/scan.py index 07b3621fd..03d315325 100644 --- a/examples/scan.py +++ b/examples/scan.py @@ -18,8 +18,7 @@ import argparse import numpy as np -from benchmark import run_benchmark -from legate.timing import time +from benchmark import parse_args, run_benchmark, time def initialize(shape, dt, axis): @@ -81,8 +80,7 @@ def run_scan(OP, shape, dt, ax, check): getattr(num, OP)(A, out=B, axis=ax) stop = time() - delta = stop - start - total = delta / 1000.0 + total = (stop - start) / 1000.0 print(f"Elapsed Time: {total}ms") # error checking if check: @@ -131,49 +129,8 @@ def run_scan(OP, shape, dt, ax, check): action="store_true", help="check the result of the solve", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - parser.add_argument( - "--package", - dest="package", - choices=["legate", "numpy", "cupy"], - type=str, - default="legate", - help="NumPy package to use (legate, numpy, or 
cupy)", - ) - parser.add_argument( - "--cupy-allocator", - dest="cupy_allocator", - choices=["default", "off", "managed"], - type=str, - default="default", - help="cupy allocator to use (default, off, or managed)", - ) - args, _ = parser.parse_known_args() - - if args.package == "legate": - import cunumeric as num - elif args.package == "cupy": - import cupy as num - - if args.cupy_allocator == "off": - num.cuda.set_allocator(None) - print("Turning off memory pool") - elif args.cupy_allocator == "managed": - num.cuda.set_allocator( - num.cuda.MemoryPool(num.cuda.malloc_managed).malloc - ) - print("Using managed memory pool") - elif args.package == "numpy": - import numpy as num + args, num = parse_args(parser) run_benchmark( run_scan, diff --git a/examples/solve.py b/examples/solve.py index 5d5082dd4..d07642dba 100644 --- a/examples/solve.py +++ b/examples/solve.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + # Copyright 2022 NVIDIA Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -15,9 +17,7 @@ import argparse -from legate.timing import time - -import cunumeric as np +from benchmark import parse_args, run_benchmark, time def solve(m, n, nrhs, dtype): @@ -66,5 +66,11 @@ def solve(m, n, nrhs, dtype): dest="dtype", help="data type", ) - args = parser.parse_args() - solve(args.m, args.n, args.nrhs, args.dtype) + args, np = parse_args(parser) + + run_benchmark( + solve, + args.benchmark, + "Solve", + (args.m, args.n, args.nrhs, args.dtype), + ) diff --git a/examples/sort.py b/examples/sort.py index fb92d3dfb..cfcf7590a 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -18,15 +18,7 @@ import argparse import numpy as np -from benchmark import run_benchmark - -try: - from legate.timing import time -except (ImportError, RuntimeError): - from time import perf_counter_ns - - def time(): - return perf_counter_ns() / 1000.0 +from benchmark import parse_args, run_benchmark, time def check_sorted(a, a_sorted, package, axis=-1): @@ -162,49 +154,8 @@ def run_sort( action="store_true", help="use argsort", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 - " - "normal execution)", - ) - parser.add_argument( - "--package", - dest="package", - choices=["legate", "numpy", "cupy"], - type=str, - default="legate", - help="NumPy package to use (legate, numpy, or cupy)", - ) - parser.add_argument( - "--cupy-allocator", - dest="cupy_allocator", - choices=["default", "off", "managed"], - type=str, - default="default", - help="cupy allocator to use (default, off, or managed)", - ) - - args, _ = parser.parse_known_args() - - if args.package == "legate": - import cunumeric as num - elif args.package == "cupy": - import cupy as num - if args.cupy_allocator == "off": - num.cuda.set_allocator(None) - print("Turning off memory pool") - elif args.cupy_allocator == "managed": - num.cuda.set_allocator( - num.cuda.MemoryPool(num.cuda.malloc_managed).malloc - ) - print("Using managed memory pool") - elif args.package == "numpy": - import numpy as num + args, num = parse_args(parser) run_benchmark( run_sort, diff --git a/examples/stencil.py b/examples/stencil.py index 460cb7bde..3eae3c0b1 100644 --- a/examples/stencil.py +++ b/examples/stencil.py @@ -16,17 +16,8 @@ # import argparse -import math -from benchmark import run_benchmark - -try: - from legate.timing import time -except (ImportError, RuntimeError): - from time import perf_counter_ns - - def time(): - return perf_counter_ns() 
/ 1000.0 +from benchmark import parse_args, run_benchmark, time def initialize(N): @@ -39,30 +30,26 @@ def initialize(N): return grid -def run(grid, I, N): # noqa: E741 +def run_stencil(N, I, warmup, timing): # noqa: E741 + grid = initialize(N) + print("Running Jacobi stencil...") center = grid[1:-1, 1:-1] north = grid[0:-2, 1:-1] east = grid[1:-1, 2:] west = grid[1:-1, 0:-2] south = grid[2:, 1:-1] - for i in range(I): + + start = time() + for i in range(I + warmup): + if i == warmup: + start = time() average = center + north + east + west + south work = 0.2 * average - # delta = np.sum(np.absolute(work - center)) center[:] = work - total = np.sum(center) - return total / (N**2) - - -def run_stencil(N, I, timing): # noqa: E741 - grid = initialize(N) - start = time() - average = run(grid, I, N) stop = time() - print("Average energy is %.8g" % average) + total = (stop - start) / 1000.0 - assert not math.isnan(average) if timing: print(f"Elapsed Time: {total} ms") return total @@ -78,6 +65,14 @@ def run_stencil(N, I, timing): # noqa: E741 dest="I", help="number of iterations to run", ) + parser.add_argument( + "-w", + "--warmup", + type=int, + default=5, + dest="warmup", + help="warm-up iterations", + ) parser.add_argument( "-n", "--num", @@ -93,50 +88,12 @@ def run_stencil(N, I, timing): # noqa: E741 action="store_true", help="perform timing", ) - parser.add_argument( - "-b", - "--benchmark", - type=int, - default=1, - dest="benchmark", - help="number of times to benchmark this application (default 1 " - "- normal execution)", - ) - parser.add_argument( - "--package", - dest="package", - choices=["legate", "numpy", "cupy"], - type=str, - default="legate", - help="NumPy package to use (legate, numpy, or cupy)", - ) - parser.add_argument( - "--cupy-allocator", - dest="cupy_allocator", - choices=["default", "off", "managed"], - type=str, - default="default", - help="cupy allocator to use (default, off, or managed)", - ) - - args, _ = parser.parse_known_args() - - if args.package == "legate": - import cunumeric as np - elif args.package == "cupy": - import cupy as np - if args.cupy_allocator == "off": - np.cuda.set_allocator(None) - print("Turning off memory pool") - elif args.cupy_allocator == "managed": - np.cuda.set_allocator( - np.cuda.MemoryPool(np.cuda.malloc_managed).malloc - ) - print("Using managed memory pool") - elif args.package == "numpy": - import numpy as np + args, np = parse_args(parser) run_benchmark( - run_stencil, args.benchmark, "Stencil", (args.N, args.I, args.timing) + run_stencil, + args.benchmark, + "Stencil", + (args.N, args.I, args.warmup, args.timing), ) diff --git a/examples/wgrad.py b/examples/wgrad.py index d95c00297..f4767f2b0 100644 --- a/examples/wgrad.py +++ b/examples/wgrad.py @@ -16,8 +16,8 @@ # import argparse -import datetime -import math + +from legate.timing import time import cunumeric as np @@ -45,17 +45,14 @@ def cross_correlate(x, y, C, K, R, S, B, H, W): def run_wgrad(H=256, W=256, B=32, C=256, K=32, R=5, S=5, timing=False): - if timing: - start = datetime.datetime.now() + start = time() x, y = initialize(C, K, B, H, W) - dw = cross_correlate(x, y, C, K, R, S, B, H, W) - # Do a little sum over dw to sync the results - total = np.sum(dw) - assert not math.isnan(total) + _ = cross_correlate(x, y, C, K, R, S, B, H, W) + stop = time() + total = (stop - start) / 1000.0 if timing: - stop = datetime.datetime.now() - delta = stop - start - print("Elapsed Time: " + str(delta.total_seconds() * 1000.0) + " ms") + print("Elapsed Time: " + str(total) + " ms") + 
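+    # time() here is legate.timing.time(), which reports microseconds, so the
+    # division above yields the elapsed interval in milliseconds.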
    return total


 if __name__ == "__main__":
@@ -104,7 +101,7 @@ def run_wgrad(H=256, W=256, B=32, C=256, K=32, R=5, S=5, timing=False):
         dest="W",
         help="width of images in pixels",
     )
     args = parser.parse_args()
     run_wgrad(
         args.H, args.W, args.B, args.C, args.K, args.R, args.R, args.timing
     )

From b7f0881d8d40cf432ffd86efb561f4f7bc909905 Mon Sep 17 00:00:00 2001
From: Rohan Yadav
Date: Tue, 20 Dec 2022 02:46:04 -0700
Subject: [PATCH 69/89] src/cunumeric/item: add openmp variants for write/read
 tasks (#740)

This commit adds OMP variants for the write and read tasks so that they
can be used in resource-scoped settings where OpenMP processors are
desired.

Signed-off-by: Rohan Yadav
Signed-off-by: Rohan Yadav
---
 src/cunumeric/item/read.h  | 3 +++
 src/cunumeric/item/write.h | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/src/cunumeric/item/read.h b/src/cunumeric/item/read.h
index d3bb90774..0606d82e4 100644
--- a/src/cunumeric/item/read.h
+++ b/src/cunumeric/item/read.h
@@ -26,6 +26,9 @@ class ReadTask : public CuNumericTask {
 public:
  static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+  static void omp_variant(legate::TaskContext& context) { ReadTask::cpu_variant(context); }
+#endif
 #ifdef LEGATE_USE_CUDA
  static void gpu_variant(legate::TaskContext& context);
 #endif
diff --git a/src/cunumeric/item/write.h b/src/cunumeric/item/write.h
index c3455b0e0..725918139 100644
--- a/src/cunumeric/item/write.h
+++ b/src/cunumeric/item/write.h
@@ -26,6 +26,9 @@ class WriteTask : public CuNumericTask {
 public:
  static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+  static void omp_variant(legate::TaskContext& context) { WriteTask::cpu_variant(context); }
+#endif
 #ifdef LEGATE_USE_CUDA
  static void gpu_variant(legate::TaskContext& context);
 #endif

From 5617e2c490ebf67bd52f2c6490d99dfaa455fc61 Mon Sep 17 00:00:00 2001
From: Manolis Papadakis
Date: Wed, 21 Dec 2022 01:05:39 +0200
Subject: [PATCH 70/89] Add back NaN checks to some benchmarks (#743)

* Add back NaN checks to some benchmarks

* Add some more debugging info in case of NaNs
---
 examples/jacobi.py | 12 +++++++-----
 examples/logreg.py |  5 +++++
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/examples/jacobi.py b/examples/jacobi.py
index f4e42081d..ef82e76f9 100644
--- a/examples/jacobi.py
+++ b/examples/jacobi.py
@@ -16,6 +16,7 @@
 #

 import argparse
+import math

 from benchmark import parse_args, run_benchmark, time

@@ -33,10 +34,7 @@ def generate_random(N):

 def check(A, x, b):
     print("Checking result...")
-    if np.allclose(A.dot(x), b):
-        print("PASS!")
-    else:
-        print("FAIL!")
+    return np.allclose(A.dot(x), b)


 def run_jacobi(N, iters, warmup, perform_check, timing, verbose):
@@ -55,7 +53,11 @@ def run_jacobi(N, iters, warmup, perform_check, timing, verbose):
     stop = time()

     if perform_check:
-        check(A, x, b)
+        assert check(A, x, b)
+    else:
+        assert not math.isnan(
+            np.sum(x)
+        ), f"{np.count_nonzero(np.isnan(x))} NaNs in x"

     total = (stop - start) / 1000.0
     if timing:
diff --git a/examples/logreg.py b/examples/logreg.py
index 43b0e62b0..88fe7cde9 100644
--- a/examples/logreg.py
+++ b/examples/logreg.py
@@ -16,6 +16,7 @@
 #

 import argparse
+import math

 from benchmark import parse_args, run_benchmark, time

@@ -69,6 +70,10 @@ def run_logistic_regression(N, F, T, I, warmup, S, B):  # noqa: E741
         )
     stop = time()

+    assert not math.isnan(
+        np.sum(weights)
+    ), f"{np.count_nonzero(np.isnan(weights))} NaNs in weights"
+
     total = (stop - start) / 1000.0
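+    # A single NaN makes np.sum(weights) NaN, so the scalar check above
+    # detects divergence cheaply; the assertion message counts the NaN
+    # entries to aid debugging.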
print(f"Elapsed Time: {total} ms") return total From 98d1e822cb037cbda142d52a89df740e5907a2fd Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Sun, 25 Dec 2022 19:51:36 +0200 Subject: [PATCH 71/89] Fix CI failures due to numpy 1.24 upgrade (#745) Co-authored-by: Manolis Papadakis --- cunumeric/eager.py | 2 +- tests/integration/test_ndim.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index 0c792fbae..61e8f5d37 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -504,7 +504,7 @@ def convert( elif nan_op is ConvertCode.PROD and np.isnan(rhs.array.item()): self.array.fill(1) else: - self.array.fill(rhs.array.item()) + self.array.fill(rhs.array.astype(self.array.dtype).item()) else: if nan_op is ConvertCode.SUM: self.array[:] = np.where(np.isnan(rhs.array), 0, rhs.array) diff --git a/tests/integration/test_ndim.py b/tests/integration/test_ndim.py index d6888cb50..c9bba7f07 100644 --- a/tests/integration/test_ndim.py +++ b/tests/integration/test_ndim.py @@ -44,9 +44,7 @@ def test_ndarray_empty(input): assert np.ndim(input) == num.ndim(input) -@pytest.mark.parametrize( - "input", (([0], [1, 2], [3, 4, 5]), ([1, 2], [3.3, 4.4])) -) +@pytest.mark.parametrize("input", [([1, 2], [3.3, 4.4])]) def test_python_values_diff_dim(input): assert np.ndim(input) == num.ndim(input) From fc202143fb1635ce06a54f5611d28440178dfac2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 27 Dec 2022 14:22:57 +0000 Subject: [PATCH 72/89] [pre-commit.ci] pre-commit autoupdate (#744) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/PyCQA/isort: 5.11.1 → 5.11.4](https://github.com/PyCQA/isort/compare/5.11.1...5.11.4) - [github.com/pre-commit/mirrors-clang-format: v15.0.4 → v15.0.6](https://github.com/pre-commit/mirrors-clang-format/compare/v15.0.4...v15.0.6) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 71a89944f..eefd667d3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: pass_filenames: false args: ['cunumeric'] - repo: https://github.com/PyCQA/isort - rev: 5.11.1 + rev: 5.11.4 hooks: - id: isort - repo: https://github.com/psf/black @@ -19,7 +19,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v15.0.4' # Use the sha / tag you want to point at + rev: 'v15.0.6' # Use the sha / tag you want to point at hooks: - id: clang-format files: \.(cu|cuh|h|cc|inl)$ From a2bc300c940ca26a6fb47e080d01dabe4c6419a0 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 27 Dec 2022 10:14:03 -0800 Subject: [PATCH 73/89] Improving performance for some special cases of advanced indexing (#731) * moving boolean case for advanced indexing to a separate function * improving performance for special case of advanced indexing with bool array --- cunumeric/deferred.py | 336 +++++++++++++------- src/cunumeric/index/putmask_template.inl | 3 - tests/integration/test_advanced_indexing.py | 14 + 3 files changed, 231 insertions(+), 122 deletions(-) diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index a7ba5d6c0..baa2cf50d 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -534,142 +534,238 @@ def _slice_store(k: slice, store: Store, dim: 
int) -> tuple[slice, Store]: return k, store - def _create_indexing_array( + def _has_single_boolean_array( + self, key: Any, is_set: bool + ) -> tuple[bool, DeferredArray, Any]: + if isinstance(key, NumPyThunk) and key.dtype == bool: + return True, self, key + else: + # key is a single array of indices + if isinstance(key, NumPyThunk): + return False, self, key + + assert isinstance(key, tuple) + + key = self._unpack_ellipsis(key, self.ndim) + + # loop through all the keys to check if there + # is a single NumPyThunk entry + num_arrays = 0 + transpose_index = 0 + for dim, k in enumerate(key): + if isinstance(k, NumPyThunk): + num_arrays += 1 + transpose_index = dim + + # this is the case when there is a single boolean array passed + # in this case we transpose original array so that the indx + # to which boolean array is passed to goes first + # doing this we can avoid going through Realm Copy which should + # improve performance + if ( + num_arrays == 1 + and key[transpose_index].dtype == bool + and is_set + ): + lhs = self + key_dim = key[transpose_index].ndim + transpose_indices = tuple( + (transpose_index + i) for i in range(0, key_dim) + ) + transpose_indices += tuple( + i for i in range(0, transpose_index) + ) + transpose_indices += tuple( + i for i in range(transpose_index + key_dim, lhs.ndim) + ) + + new_key = tuple(key[i] for i in range(0, transpose_index)) + new_key += tuple( + key[i] for i in range(transpose_index + 1, len(key)) + ) + lhs = lhs.transpose(transpose_indices) + + # transform original array for all other keys in the tuple + if len(new_key) > 0: + shift = 0 + store = lhs.base + for dim, k in enumerate(new_key): + if np.isscalar(k): + if k < 0: # type: ignore [operator] + k += store.shape[dim + key_dim + shift] + store = store.project(dim + key_dim + shift, k) + shift -= 1 + elif k is np.newaxis: + store = store.promote(dim + key_dim + shift, 1) + elif isinstance(k, slice): + k, store = self._slice_store( + k, store, dim + key_dim + shift + ) + else: + raise TypeError( + "Unsupported entry type passed to advanced ", + "indexing operation", + ) + lhs = DeferredArray(self.runtime, store, self.dtype) + + return True, lhs, key[transpose_index] + + # this is a general advanced indexing case + else: + return False, self, key + + def _advanced_indexing_with_boolean_array( self, key: Any, is_set: bool = False, set_value: Optional[Any] = None, ) -> tuple[bool, Any, Any, Any]: - store = self.base rhs = self - # the index where the first index_array is passed to the [] operator - start_index = -1 - if isinstance(key, NumPyThunk) and key.dtype == bool: - if not isinstance(key, DeferredArray): - key = self.runtime.to_deferred_array(key) - - # in case when boolean array is passed as an index, shape for all - # its dimensions should be the same as the shape of - # corresponding dimensions of the input array - for i in range(key.ndim): - if key.shape[i] != rhs.shape[i]: - raise ValueError( - "shape of the index array for " - f"dimension {i} doesn't match to the shape of the" - f"index array which is {rhs.shape[i]}" - ) - - # if key or rhs are empty, return an empty array with correct shape - if key.size == 0 or rhs.size == 0: - if rhs.size == 0 and key.size != 0: - # we need to calculate shape of the 0 dim of output region - # even though the size of it is 0 - # this can potentially be replaced with COUNT_NONZERO - s = key.nonzero()[0].size - else: - s = 0 - - out_shape = (s,) + tuple( - rhs.shape[i] for i in range(key.ndim, rhs.ndim) + if not isinstance(key, DeferredArray): + key = 
self.runtime.to_deferred_array(key) + + # in case when boolean array is passed as an index, shape for all + # its dimensions should be the same as the shape of + # corresponding dimensions of the input array + for i in range(key.ndim): + if key.shape[i] != rhs.shape[i]: + raise ValueError( + "shape of the index array for " + f"dimension {i} doesn't match to the shape of the" + f"index array which is {rhs.shape[i]}" ) - out = cast( - DeferredArray, - self.runtime.create_empty_thunk( - out_shape, - rhs.dtype, - inputs=[rhs], - ), - ) - out.fill(np.zeros((), dtype=out.dtype)) - return False, rhs, out, self - - key_store = key.base - # bring key to the same shape as rhs - for i in range(key_store.ndim, rhs.ndim): - key_store = key_store.promote(i, rhs.shape[i]) - - # has_set_value && set_value.size==1 corresponds to the case - # when a[bool_indices]=scalar - # then we can call "putmask" to modify input array - # and avoid calling Copy - has_set_value = set_value is not None and set_value.size == 1 - if has_set_value: - mask = DeferredArray( - self.runtime, - base=key_store, - dtype=self.dtype, - ) - rhs.putmask(mask, set_value) - return False, rhs, rhs, self + + # if key or rhs are empty, return an empty array with correct shape + if key.size == 0 or rhs.size == 0: + if rhs.size == 0 and key.size != 0: + # we need to calculate shape of the 0 dim of output region + # even though the size of it is 0 + # this can potentially be replaced with COUNT_NONZERO + s = key.nonzero()[0].size else: - out_dtype = rhs.dtype - # in the case this operation is called for the set_item, we - # return Point type field that is later used for - # indirect copy operation - if is_set: - N = rhs.ndim - out_dtype = rhs.runtime.get_point_type(N) - - # TODO : current implementation of the ND output regions - # requires out.ndim == rhs.ndim. This will be fixed in the - # future - out = rhs.runtime.create_unbound_thunk( - out_dtype, ndim=rhs.ndim - ) - key_dims = key.ndim # dimension of the original key + s = 0 - task = rhs.context.create_auto_task( - CuNumericOpCode.ADVANCED_INDEXING - ) - task.add_output(out.base) - task.add_input(rhs.base) - task.add_input(key_store) - task.add_scalar_arg(is_set, bool) - task.add_scalar_arg(key_dims, ty.int64) - task.add_alignment(rhs.base, key_store) - task.add_broadcast( - rhs.base, axes=tuple(range(1, len(rhs.base.shape))) - ) - task.execute() + out_shape = (s,) + tuple( + rhs.shape[i] for i in range(key.ndim, rhs.ndim) + ) - # TODO : current implementation of the ND output regions - # requires out.ndim == rhs.ndim. 
- # The logic below will be removed in the future - out_dim = rhs.ndim - key_dims + 1 - - if out_dim != rhs.ndim: - out_tmp = out.base - - if out.size == 0: - out_shape = tuple( - out.shape[i] for i in range(0, out_dim) - ) - out = cast( - DeferredArray, - self.runtime.create_empty_thunk( - out_shape, - out_dtype, - inputs=[out], - ), - ) - if not is_set: - out.fill(np.array(0, dtype=out_dtype)) - else: - for dim in range(rhs.ndim - out_dim): - out_tmp = out_tmp.project(rhs.ndim - dim - 1, 0) + out = cast( + DeferredArray, + self.runtime.create_empty_thunk( + out_shape, + rhs.dtype, + inputs=[rhs], + ), + ) + out.fill(np.zeros((), dtype=out.dtype)) + return False, rhs, out, self + + key_store = key.base + # bring key to the same shape as rhs + for i in range(key_store.ndim, rhs.ndim): + key_store = key_store.promote(i, rhs.shape[i]) + + # has_set_value && set_value.size==1 corresponds to the case + # when a[bool_indices]=scalar + # then we can call "putmask" to modify input array + # and avoid calling Copy + has_set_value = set_value is not None and set_value.size == 1 + if has_set_value: + + mask = DeferredArray( + self.runtime, + base=key_store, + dtype=self.dtype, + ) + rhs.putmask(mask, set_value) + return False, rhs, rhs, self + else: + out_dtype = rhs.dtype + # in the case this operation is called for the set_item, we + # return Point type field that is later used for + # indirect copy operation + if is_set: + N = rhs.ndim + out_dtype = rhs.runtime.get_point_type(N) - out = out._copy_store(out_tmp) + # TODO : current implementation of the ND output regions + # requires out.ndim == rhs.ndim. This will be fixed in the + # future + out = rhs.runtime.create_unbound_thunk(out_dtype, ndim=rhs.ndim) + key_dims = key.ndim # dimension of the original key - return is_set, rhs, out, self + task = rhs.context.create_auto_task( + CuNumericOpCode.ADVANCED_INDEXING + ) + task.add_output(out.base) + task.add_input(rhs.base) + task.add_input(key_store) + task.add_scalar_arg(is_set, bool) + task.add_scalar_arg(key_dims, ty.int64) + task.add_alignment(rhs.base, key_store) + task.add_broadcast( + rhs.base, axes=tuple(range(1, len(rhs.base.shape))) + ) + task.execute() + + # TODO : current implementation of the ND output regions + # requires out.ndim == rhs.ndim. 
+ # The logic below will be removed in the future + out_dim = rhs.ndim - key_dims + 1 + + if out_dim != rhs.ndim: + out_tmp = out.base + + if out.size == 0: + out_shape = tuple(out.shape[i] for i in range(0, out_dim)) + out = cast( + DeferredArray, + self.runtime.create_empty_thunk( + out_shape, + out_dtype, + inputs=[out], + ), + ) + if not is_set: + out.fill(np.array(0, dtype=out_dtype)) + else: + for dim in range(rhs.ndim - out_dim): + out_tmp = out_tmp.project(rhs.ndim - dim - 1, 0) + + out = out._copy_store(out_tmp) + return is_set, rhs, out, self + def _create_indexing_array( + self, + key: Any, + is_set: bool = False, + set_value: Optional[Any] = None, + ) -> tuple[bool, Any, Any, Any]: + + is_bool_array, lhs, bool_key = self._has_single_boolean_array( + key, is_set + ) + + # the case when single boolean array is passed to the advanced + # indexing operation + if is_bool_array: + return lhs._advanced_indexing_with_boolean_array( + bool_key, is_set, set_value + ) + # general advanced indexing case + + store = self.base + rhs = self if isinstance(key, NumPyThunk): key = (key,) - assert isinstance(key, tuple) key = self._unpack_ellipsis(key, self.ndim) + + # the index where the first index_array is passed to the [] operator + start_index = -1 shift = 0 last_index = self.ndim - # in case when index arrays are passed in the scaterred way, + # in case when index arrays are passed in the scattered way, # we need to transpose original array so all index arrays # are close to each other transpose_needed = False @@ -730,8 +826,8 @@ def _create_indexing_array( "shape of boolean index did not match " "indexed array " ) - # in case of the mixed indises we all nonzero - # for the bool array + # in case of the mixed indices we all nonzero + # for the boolean array k = k.nonzero() shift += len(k) - 1 tuple_of_arrays += k @@ -1770,6 +1866,8 @@ def put(self, indices: Any, values: Any, check_bounds: bool) -> None: @auto_convert("mask", "values") def putmask(self, mask: Any, values: Any) -> None: + assert self.shape == mask.shape + if values.shape != self.shape: values_new = values._broadcast(self.shape) else: diff --git a/src/cunumeric/index/putmask_template.inl b/src/cunumeric/index/putmask_template.inl index 6f55c34e0..f522198b3 100644 --- a/src/cunumeric/index/putmask_template.inl +++ b/src/cunumeric/index/putmask_template.inl @@ -52,9 +52,6 @@ struct Putmask { Putmask(PutmaskArgs& args) : dense(false) { rect = args.input.shape(); -#ifdef DEBUG_CUNUMERIC - assert(rect == args.mask.shape()); -#endif input = args.input.read_write_accessor(rect); mask = args.mask.read_accessor(rect); diff --git a/tests/integration/test_advanced_indexing.py b/tests/integration/test_advanced_indexing.py index 2e3ee475a..4f1d649ac 100644 --- a/tests/integration/test_advanced_indexing.py +++ b/tests/integration/test_advanced_indexing.py @@ -756,6 +756,20 @@ def test(): res_num = x_num[[1, 1], :, [False, True, False, True]] assert np.array_equal(res, res_num) + # set item with mixed indices + x[1, :, [False, True, False, True]] = 129 + x_num[1, :, [False, True, False, True]] = 129 + assert np.array_equal(x, x_num) + + # set item with mixed indices + x[:, [False, True, False], 1] = 111 + x_num[:, [False, True, False], 1] = 111 + assert np.array_equal(x, x_num) + + x[..., [False, True, False, True, False]] = 200 + x_num[..., [False, True, False, True, False]] = 200 + assert np.array_equal(x, x_num) + # b: combining basic and advanced indexing schemes ind0 = np.array([1, 1]) ind0_num = num.array(ind0) From 
2844e59e93404e60daf6de6c656619a44cda4bde Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 4 Jan 2023 15:13:13 -0800 Subject: [PATCH 74/89] Switch docs from recommonmark to myst-parser (#746) --- README.md | 9 --------- docs/cunumeric/source/conf.py | 3 +-- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/README.md b/README.md index d3973a60c..dbe358373 100644 --- a/README.md +++ b/README.md @@ -35,15 +35,6 @@ canonical NumPy implementation. If you have questions, please contact us at legate(at)nvidia.com. -1. [Installation](#installation) -1. [Usage and Execution](#usage-and-execution) -1. [Supported and Planned Features](#supported-and-planned-features) -1. [Supported Types and Dimensions](#supported-types-and-dimensions) -1. [Documentation](#documentation) -1. [Future Directions](#future-directions) -1. [Contributing](#contributing) -1. [Known Bugs](#known-bugs) - ## Installation cuNumeric is available [on conda](https://anaconda.org/legate/cunumeric): diff --git a/docs/cunumeric/source/conf.py b/docs/cunumeric/source/conf.py index 17fd408c1..5d3ce4881 100644 --- a/docs/cunumeric/source/conf.py +++ b/docs/cunumeric/source/conf.py @@ -37,8 +37,7 @@ "sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx_copybutton", - "sphinx_markdown_tables", - "recommonmark", + "myst_parser", "cunumeric._sphinxext.comparison_table", "cunumeric._sphinxext.implemented_index", "cunumeric._sphinxext.missing_refs", From b4a40fedf3a5f4a1ddffeadd281e14eafa46c5ca Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Fri, 6 Jan 2023 11:09:11 -0800 Subject: [PATCH 75/89] Fix timing for CuPy tests (#747) CuPy launches work asynchronously on the GPU, so we need to block until all work finishes before taking a measurement. --- examples/benchmark.py | 76 +++++++++++++++++++++++++++++++---- examples/black_scholes.py | 9 ++--- examples/cg.py | 18 ++++----- examples/einsum.py | 11 +++-- examples/gemm.py | 11 +++-- examples/indexing_routines.py | 34 +++++++--------- examples/jacobi.py | 11 +++-- examples/kmeans.py | 9 ++--- examples/kmeans_slow.py | 9 ++--- examples/kmeans_sort.py | 9 ++--- examples/linreg.py | 11 +++-- examples/logreg.py | 11 +++-- examples/lstm_backward.py | 9 ++--- examples/lstm_forward.py | 9 ++--- examples/lstm_full.py | 9 ++--- examples/richardson_lucy.py | 11 +++-- examples/scan.py | 9 ++--- examples/solve.py | 9 ++--- examples/sort.py | 9 ++--- examples/stencil.py | 11 +++-- 20 files changed, 166 insertions(+), 129 deletions(-) diff --git a/examples/benchmark.py b/examples/benchmark.py index 1d0a6f73e..1d0944e3b 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -18,13 +18,70 @@ import math from functools import reduce -try: - from legate.timing import time -except (ImportError, RuntimeError): - from time import perf_counter_ns +from typing_extensions import Protocol - def time(): - return perf_counter_ns() / 1000.0 + +class Timer(Protocol): + def start(self): + ... + + def stop(self): + """ + Blocks execution until everything before it has completed. Returns the + duration since the last call to start(), in milliseconds. + """ + ... 
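+
+# A minimal usage sketch (hypothetical caller, not part of this module),
+# assuming `timer` is one of the concrete implementations below:
+#
+#     timer.start()
+#     C = np.dot(A, B)           # enqueue the work being measured
+#     elapsed_ms = timer.stop()  # blocks until that work has finished
+#
+# The blocking stop() is the point of this abstraction: CuPy launches
+# kernels asynchronously, so reading a clock without synchronizing first
+# would only measure launch overhead, not the computation.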
+ + +class CuNumericTimer(Timer): + def __init__(self): + self._start_future = None + + def start(self): + from legate.timing import time + + self._start_future = time() + + def stop(self): + from legate.timing import time + + end_future = time() + return (end_future - self._start_future) / 1000.0 + + +class CuPyTimer(Timer): + def __init__(self): + self._start_event = None + + def start(self): + from cupy import cuda + + self._start_event = cuda.Event() + self._start_event.record() + + def stop(self): + from cupy import cuda + + end_event = cuda.Event() + end_event.record() + end_event.synchronize() + return cuda.get_elapsed_time(self._start_event, end_event) + + +class NumPyTimer(Timer): + def __init__(self): + self._start_time = None + + def start(self): + from time import perf_counter_ns + + self._start_time = perf_counter_ns() / 1000.0 + + def stop(self): + from time import perf_counter_ns + + end_time = perf_counter_ns() / 1000.0 + return (end_time - self._start_time) / 1000.0 # Add common arguments and parse @@ -57,6 +114,8 @@ def parse_args(parser): args, _ = parser.parse_known_args() if args.package == "legate": import cunumeric as np + + timer = CuNumericTimer() elif args.package == "cupy": import cupy as np @@ -68,9 +127,12 @@ def parse_args(parser): np.cuda.MemoryPool(np.cuda.malloc_managed).malloc ) print("Using managed memory pool") + timer = CuPyTimer() elif args.package == "numpy": import numpy as np - return args, np + + timer = NumPyTimer() + return args, np, timer # A helper method for benchmarking applications diff --git a/examples/black_scholes.py b/examples/black_scholes.py index d64e032d5..55374ea09 100644 --- a/examples/black_scholes.py +++ b/examples/black_scholes.py @@ -17,7 +17,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def generate_random(N, min, max, D): @@ -71,11 +71,10 @@ def black_scholes(S, X, T, R, V): def run_black_scholes(N, D): print("Running black scholes on %dK options..." 
% N) N *= 1000 - start = time() + timer.start() S, X, T, R, V = initialize(N, D) _, _ = black_scholes(S, X, T, R, V) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() print("Elapsed Time: " + str(total) + " ms") return total @@ -99,7 +98,7 @@ def run_black_scholes(N, D): help="precision of the computation in bits", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) if args.P == 16: run_benchmark( diff --git a/examples/cg.py b/examples/cg.py index 79721f7b0..a0399778e 100644 --- a/examples/cg.py +++ b/examples/cg.py @@ -17,7 +17,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark # This is technically dead code right now, but we'll keep it around in @@ -100,10 +100,10 @@ def run_cg( min(max_iters, b.shape[0]) if max_iters is not None else b.shape[0] ) - start = time() + timer.start() for i in range(-warmup, max_iters): if i == 0: - start = time() + timer.start() Ap = A.dot(p) alpha = rsold / (p.dot(Ap)) x = x + alpha * p @@ -123,7 +123,7 @@ def run_cg( beta = rsnew / rsold p = r + beta * p rsold = rsnew - stop = time() + total = timer.stop() if converged < 0: print("Convergence FAILURE!") @@ -132,7 +132,6 @@ def run_cg( if perform_check: check(A, x, b) - total = (stop - start) / 1000.0 if timing: print(f"Elapsed Time: {total} ms") return total @@ -174,10 +173,10 @@ def run_preconditioned_cg( min(max_iters, b.shape[0]) if max_iters is not None else b.shape[0] ) - start = time() + timer.start() for i in range(-warmup, max_iters): if i == 0: - start = time() + timer.start() Ap = A.dot(p) alpha = rzold / (p.dot(Ap)) x = x + alpha * p @@ -199,7 +198,7 @@ def run_preconditioned_cg( beta = rznew / rzold p = z + beta * p rzold = rznew - stop = time() + total = timer.stop() if converged < 0: print("Convergence FAILURE!") @@ -208,7 +207,6 @@ def run_preconditioned_cg( if perform_check: check(A, x, b) - total = (stop - start) / 1000.0 if timing: print(f"Elapsed Time: {total} ms") return total @@ -290,7 +288,7 @@ def run_preconditioned_cg( help="convergence check threshold", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( run_preconditioned_cg if args.precondition else run_cg, diff --git a/examples/einsum.py b/examples/einsum.py index aac1ec995..090e3385f 100644 --- a/examples/einsum.py +++ b/examples/einsum.py @@ -18,7 +18,7 @@ import argparse import re -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def run_einsum(expr, N, iters, warmup, dtype, cupy_compatibility): @@ -82,10 +82,10 @@ def run_einsum(expr, N, iters, warmup, dtype, cupy_compatibility): C = np.zeros((N,) * len(c_modes), dtype=dtype) # Run contraction - start = time() + timer.start() for idx in range(iters + warmup): if idx == warmup: - start = time() + timer.start() if cupy_compatibility: C = np.einsum(expr, A, B) else: @@ -102,10 +102,9 @@ def run_einsum(expr, N, iters, warmup, dtype, cupy_compatibility): A, C = C, A else: B, C = C, B - stop = time() + total = timer.stop() # Print statistics - total = (stop - start) / 1000.0 average = total / iters print(f"Elapsed Time: {total:.3f} ms") print(f"Average Iteration: {average:.3f} ms") @@ -162,7 +161,7 @@ def run_einsum(expr, N, iters, warmup, dtype, cupy_compatibility): else, use einsum(expr, A, B, out=C)""", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) cupy_compatibility = args.cupy_compatibility or args.package == "cupy" if cupy_compatibility: diff 
--git a/examples/gemm.py b/examples/gemm.py index 2fe8aafc3..c70a666c1 100644 --- a/examples/gemm.py +++ b/examples/gemm.py @@ -17,7 +17,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def initialize(M, N, K, ft): @@ -44,20 +44,19 @@ def run_gemm(N, I, warmup, ft): # noqa: E741 print("Total Size: " + str(space / 1e6) + " MB") A, B, C = initialize(N, N, N, ft) - start = time() + timer.start() # Run for as many iterations as was requested for idx in range(I + warmup): if idx == warmup: - start = time() + timer.start() np.dot(A, B, out=C) # We need to rotate the matrices to keep Legate honest # about moving data so it can't just duplicate A and B # on the first iteration and reuse them, this means # that A, B, C all need to be square A, B, C = B, C, A - stop = time() + total = timer.stop() - total = (stop - start) / 1000.0 print("Elapsed Time: " + str(total) + " ms") average = total / I print("Average GEMM: " + str(average) + " ms") @@ -101,7 +100,7 @@ def run_gemm(N, I, warmup, ft): # noqa: E741 "(16,32,64)", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) if args.P == 16: run_benchmark( diff --git a/examples/indexing_routines.py b/examples/indexing_routines.py index 2e7f40301..a0f15e120 100644 --- a/examples/indexing_routines.py +++ b/examples/indexing_routines.py @@ -19,7 +19,7 @@ import gc import math -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def compute_diagonal(steps, N, timing, warmup): @@ -27,11 +27,10 @@ def compute_diagonal(steps, N, timing, warmup): print("measuring diagonal") for step in range(steps + warmup): if step == warmup: - start = time() + timer.start() A2 = np.diag(A1) A1 = np.diag(A2) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: space = (N * N + N) * np.dtype(int).itemsize / 1073741824 print("Total Size: " + str(space) + " GB") @@ -52,10 +51,9 @@ def compute_choose(steps, N, timing, warmup): C1 = np.arange(N, dtype=int) % 10 for step in range(steps + warmup): if step == warmup: - start = time() + timer.start() C1 = np.choose(C1, A, mode="wrap") - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: space = N * np.dtype(int).itemsize / 1073741824 print("Total Size: " + str(space) + " GB") @@ -82,10 +80,9 @@ def compute_repeat(steps, N, timing, warmup): print("measuring repeat") for step in range(steps + warmup): if step == warmup: - start = time() + timer.start() A2 = np.repeat(A2, R, axis=1) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: space = (N * N) * np.dtype(int).itemsize / 1073741824 print("Total Size: " + str(space) + " GB") @@ -108,11 +105,10 @@ def compute_advanced_indexing_1d(steps, N, timing, warmup): indx_bool = (B % 2).astype(bool) for step in range(steps + warmup): if step == warmup: - start = time() + timer.start() A1[indx] = 10 # 1 copy A1[indx_bool] = 12 # 1 AI and 1 copy - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: space = (3 * N) * np.dtype(int).itemsize / 1073741824 print("Total Size: " + str(space) + " GB") @@ -136,12 +132,11 @@ def compute_advanced_indexing_2d(steps, N, timing, warmup): indx2d_bool = (A2 % 2).astype(bool) for step in range(steps + warmup): if step == warmup: - start = time() + timer.start() A2[indx_bool, indx_bool] = 11 # one ZIP and 1 copy = N+N*N A2[:, indx] = 12 # one ZIP and 3 copies = N+3*N*N A2[indx2d_bool] = 13 # 1 copy 
and one AI task = 2* N*N - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: space = (6 * N * N + 2 * N) * np.dtype(int).itemsize / 1073741824 print("Total Size: " + str(space) + " GB") @@ -171,11 +166,10 @@ def compute_advanced_indexing_3d(steps, N, timing, warmup): indx3d_bool = (A3 % 2).astype(bool) for step in range(steps + warmup): if step == warmup: - start = time() + timer.start() A3[indx, :, indx] = 15 # 1 ZIP and 3 copy = N+3N*N A3[indx3d_bool] = 16 # 1 copy and 1 AI task = 2*N*N - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: space = (5 * N * N + N) * np.dtype(int).itemsize / 1073741824 print("Total Size: " + str(space) + " GB") @@ -268,7 +262,7 @@ def run_indexing_routines( help="name of the index routine to test", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( run_indexing_routines, diff --git a/examples/jacobi.py b/examples/jacobi.py index ef82e76f9..6b9e46968 100644 --- a/examples/jacobi.py +++ b/examples/jacobi.py @@ -18,7 +18,7 @@ import argparse import math -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def generate_random(N): @@ -45,12 +45,12 @@ def run_jacobi(N, iters, warmup, perform_check, timing, verbose): d = np.diag(A) R = A - np.diag(d) - start = time() + timer.start() for i in range(iters + warmup): if i == warmup: - start = time() + timer.start() x = (b - np.dot(R, x)) / d - stop = time() + total = timer.stop() if perform_check: assert check(A, x, b) @@ -59,7 +59,6 @@ def run_jacobi(N, iters, warmup, perform_check, timing, verbose): np.sum(x) ), f"{np.count_nonzero(~np.isnan(x))} NaNs in x" - total = (stop - start) / 1000.0 if timing: print(f"Elapsed Time: {total} ms") return total @@ -112,7 +111,7 @@ def run_jacobi(N, iters, warmup, perform_check, timing, verbose): help="print verbose output", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( run_jacobi, diff --git a/examples/kmeans.py b/examples/kmeans.py index a64495e7e..a12723d94 100644 --- a/examples/kmeans.py +++ b/examples/kmeans.py @@ -19,7 +19,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def initialize(N, D, C, T): @@ -77,7 +77,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 print("Number of dimensions: " + str(D)) print("Number of centroids: " + str(C)) print("Max iterations: " + str(I)) - start = time() + timer.start() data, centroids = initialize(N, D, C, T) data_dots = np.square(np.linalg.norm(data, ord=2, axis=1)) @@ -125,8 +125,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 + ": " + str(prior_distance_sum) ) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() print("Elapsed Time: " + str(total) + " ms") return total @@ -181,7 +180,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 help="number of iterations between sampling the log likelihood", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) if args.P == 16: run_benchmark( diff --git a/examples/kmeans_slow.py b/examples/kmeans_slow.py index 83f226af3..a4d4c7009 100644 --- a/examples/kmeans_slow.py +++ b/examples/kmeans_slow.py @@ -19,7 +19,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def initialize(N, D, C, T): @@ -78,7 +78,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 
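+    # `timer` is the global helper returned by benchmark.parse_args();
+    # timer.stop() synchronizes outstanding device work before reporting
+    # elapsed milliseconds.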
print("Number of dimensions: " + str(D)) print("Number of centroids: " + str(C)) print("Max iterations: " + str(I)) - start = time() + timer.start() data, centroids = initialize(N, D, C, T) data_dots = np.square(np.linalg.norm(data, ord=2, axis=1)) @@ -126,8 +126,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 + ": " + str(prior_distance_sum) ) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() print("Elapsed Time: " + str(total) + " ms") return total @@ -182,7 +181,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 help="number of iterations between sampling the log likelihood", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) if args.P == 16: run_benchmark( diff --git a/examples/kmeans_sort.py b/examples/kmeans_sort.py index 406b02833..ae84ca6da 100644 --- a/examples/kmeans_sort.py +++ b/examples/kmeans_sort.py @@ -19,7 +19,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def initialize(N, D, C, T): @@ -75,7 +75,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 print("Number of dimensions: " + str(D)) print("Number of centroids: " + str(C)) print("Max iterations: " + str(I)) - start = time() + timer.start() data, centroids = initialize(N, D, C, T) data_dots = np.square(np.linalg.norm(data, ord=2, axis=1)) @@ -122,8 +122,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 + ": " + str(prior_distance_sum) ) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() print("Elapsed Time: " + str(total) + " ms") return total @@ -179,7 +178,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741 help="number of iterations between sampling the log likelihood", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) if args.P == 16: run_benchmark( diff --git a/examples/linreg.py b/examples/linreg.py index a8e684e32..7ec3d11ba 100644 --- a/examples/linreg.py +++ b/examples/linreg.py @@ -17,7 +17,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def initialize(N, F, T): @@ -41,10 +41,10 @@ def run_linear_regression(N, F, T, I, warmup, S, B): # noqa: E741 features = np.hstack((intercept, features)) weights = np.zeros(features.shape[1], dtype=T) - start = time() + timer.start() for step in range(-warmup, I): if step == 0: - start = time() + timer.start() scores = np.dot(features, weights) error = scores - target gradient = -(1.0 / len(features)) * error.dot(features) @@ -56,9 +56,8 @@ def run_linear_regression(N, F, T, I, warmup, S, B): # noqa: E741 + ": " + str(np.sum(np.power(error, 2))) ) - stop = time() + total = timer.stop() - total = (stop - start) / 1000.0 print("Elapsed Time: " + str(total) + " ms") return total @@ -121,7 +120,7 @@ def run_linear_regression(N, F, T, I, warmup, S, B): # noqa: E741 help="number of iterations between sampling the log likelihood", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) if args.P == 16: run_benchmark( diff --git a/examples/logreg.py b/examples/logreg.py index 88fe7cde9..d502e35f3 100644 --- a/examples/logreg.py +++ b/examples/logreg.py @@ -18,7 +18,7 @@ import argparse import math -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def initialize(N, F, T): @@ -52,10 +52,10 @@ def run_logistic_regression(N, F, T, I, warmup, S, B): # noqa: E741 features = np.hstack((intercept, 
features)) weights = np.zeros(features.shape[1], dtype=T) - start = time() + timer.start() for step in range(-warmup, I): if step == 0: - start = time() + timer.start() scores = np.dot(features, weights) predictions = sigmoid(scores) error = target - predictions @@ -68,13 +68,12 @@ def run_logistic_regression(N, F, T, I, warmup, S, B): # noqa: E741 + ": " + str(log_likelihood(features, target, weights)) ) - stop = time() + total = timer.stop() assert not math.isnan( np.sum(weights) ), f"{np.count_nonzero(~np.isnan(weights))} NaNs in weights" - total = (stop - start) / 1000.0 print(f"Elapsed Time: {total} ms") return total @@ -137,7 +136,7 @@ def run_logistic_regression(N, F, T, I, warmup, S, B): # noqa: E741 help="number of iterations between sampling the log likelihood", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) if args.P == 16: run_benchmark( diff --git a/examples/lstm_backward.py b/examples/lstm_backward.py index 99e47f8be..2de702700 100644 --- a/examples/lstm_backward.py +++ b/examples/lstm_backward.py @@ -17,11 +17,11 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): - start = time() + timer.start() WLSTM = np.random.randn( word_size + hidden_size, 4 * hidden_size @@ -73,8 +73,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): else: dh0[0] += np.sum(dHin[t, :, word_size:], 0) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ -107,7 +106,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): help="perform timing", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( run_lstm, diff --git a/examples/lstm_forward.py b/examples/lstm_forward.py index 4f1ab7abf..097218eaf 100644 --- a/examples/lstm_forward.py +++ b/examples/lstm_forward.py @@ -17,11 +17,11 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): - start = time() + timer.start() X = np.random.randn(sentence_length, batch_size, hidden_size) h0 = np.random.randn(1, hidden_size) @@ -63,8 +63,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): Ct[t] = np.tanh(C[t]) Hout[t] = IFOGf[t, :, 2 * d : 3 * d] * Ct[t] - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ -97,7 +96,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing): help="perform timing", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( run_lstm, diff --git a/examples/lstm_full.py b/examples/lstm_full.py index 864773739..7bab6c9c7 100644 --- a/examples/lstm_full.py +++ b/examples/lstm_full.py @@ -17,7 +17,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark class Param: @@ -290,7 +290,7 @@ def run_lstm( pointer = 0 - start = time() + timer.start() for iteration in range(max_iters): # Reset @@ -325,8 +325,7 @@ def run_lstm( pointer += T_steps update_status(max_iters, smooth_loss) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ 
-397,7 +396,7 @@ def run_lstm( help="standard deviation of weights for initialization", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( run_lstm, diff --git a/examples/richardson_lucy.py b/examples/richardson_lucy.py index 7e5514280..5ffcdcad8 100644 --- a/examples/richardson_lucy.py +++ b/examples/richardson_lucy.py @@ -15,7 +15,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark float_type = "float32" @@ -28,17 +28,16 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing): im_deconv = np.full(image.shape, 0.5, dtype=float_type) psf_mirror = np.flip(psf) - start = time() + timer.start() for idx in range(num_iter + warmup): if idx == warmup: - start = time() + timer.start() conv = np.convolve(im_deconv, psf, mode="same") relative_blur = image / conv im_deconv *= np.convolve(relative_blur, psf_mirror, mode="same") - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() if timing: print("Elapsed Time: " + str(total) + " ms") @@ -111,7 +110,7 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing): help="perform timing", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( run_richardson_lucy, diff --git a/examples/scan.py b/examples/scan.py index 03d315325..d4737e54b 100644 --- a/examples/scan.py +++ b/examples/scan.py @@ -18,7 +18,7 @@ import argparse import numpy as np -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def initialize(shape, dt, axis): @@ -74,13 +74,12 @@ def run_scan(OP, shape, dt, ax, check): print(f"Axis: axis={ax}") print(f"Data type: dtype={dt}32") A, B = initialize(shape=shape, dt=dt, axis=ax) - start = time() + timer.start() # op handling getattr(num, OP)(A, out=B, axis=ax) - stop = time() - total = (stop - start) / 1000.0 + total = timer.stop() print(f"Elapsed Time: {total}ms") # error checking if check: @@ -130,7 +129,7 @@ def run_scan(OP, shape, dt, ax, check): help="check the result of the solve", ) - args, num = parse_args(parser) + args, num, timer = parse_args(parser) run_benchmark( run_scan, diff --git a/examples/solve.py b/examples/solve.py index d07642dba..91f92c6dd 100644 --- a/examples/solve.py +++ b/examples/solve.py @@ -17,18 +17,17 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def solve(m, n, nrhs, dtype): a = np.random.rand(m, n).astype(dtype=dtype) b = np.random.rand(n, nrhs).astype(dtype=dtype) - start = time() + timer.start() np.linalg.solve(a, b) - stop = time() + total = timer.stop() - total = (stop - start) / 1000.0 print(f"Elapsed Time: {total} ms") @@ -66,7 +65,7 @@ def solve(m, n, nrhs, dtype): dest="dtype", help="data type", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( solve, diff --git a/examples/sort.py b/examples/sort.py index cfcf7590a..5982f91ea 100644 --- a/examples/sort.py +++ b/examples/sort.py @@ -18,7 +18,7 @@ import argparse import numpy as np -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def check_sorted(a, a_sorted, package, axis=-1): @@ -73,19 +73,18 @@ def run_sort( print("UNKNOWN type " + str(newtype)) assert False - start = time() + timer.start() if argsort: a_sorted = num.argsort(a, axis) else: a_sorted = num.sort(a, axis) - stop = time() + total = timer.stop() if perform_check and not argsort: 
check_sorted(a, a_sorted, package, axis) else: # do we need to synchronize? assert True - total = (stop - start) * 1e-3 if timing: print("Elapsed Time: " + str(total) + " ms") return total @@ -155,7 +154,7 @@ def run_sort( help="use argsort", ) - args, num = parse_args(parser) + args, num, timer = parse_args(parser) run_benchmark( run_sort, diff --git a/examples/stencil.py b/examples/stencil.py index 3eae3c0b1..c0d33c90b 100644 --- a/examples/stencil.py +++ b/examples/stencil.py @@ -17,7 +17,7 @@ import argparse -from benchmark import parse_args, run_benchmark, time +from benchmark import parse_args, run_benchmark def initialize(N): @@ -40,16 +40,15 @@ def run_stencil(N, I, warmup, timing): # noqa: E741 west = grid[1:-1, 0:-2] south = grid[2:, 1:-1] - start = time() + timer.start() for i in range(I + warmup): if i == warmup: - start = time() + timer.start() average = center + north + east + west + south work = 0.2 * average center[:] = work - stop = time() + total = timer.stop() - total = (stop - start) / 1000.0 if timing: print(f"Elapsed Time: {total} ms") return total @@ -89,7 +88,7 @@ def run_stencil(N, I, warmup, timing): # noqa: E741 help="perform timing", ) - args, np = parse_args(parser) + args, np, timer = parse_args(parser) run_benchmark( run_stencil, From 56d35204916f26e2672af38a6e01392af677c080 Mon Sep 17 00:00:00 2001 From: robinw0928 <104830875+robinw0928@users.noreply.github.com> Date: Tue, 10 Jan 2023 09:06:33 +0800 Subject: [PATCH 76/89] Enhance test_inner.py and test_tensordot.py (#748) --- tests/integration/test_dot.py | 4 +-- tests/integration/test_inner.py | 27 ++++++++++++++++++ tests/integration/test_tensordot.py | 44 +++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_dot.py b/tests/integration/test_dot.py index d0157cbf1..40769c354 100644 --- a/tests/integration/test_dot.py +++ b/tests/integration/test_dot.py @@ -67,9 +67,9 @@ def test_out_invalid_dtype(self, dtype): # In cuNumeric, # for np.float32, it pass # for np.int64, it raises TypeError: Unsupported type: int64 - out = np.zeros((5, 2), dtype=dtype) + out = num.zeros((5, 2), dtype=dtype) with pytest.raises(ValueError): - np.dot(self.A, self.B, out=out) + num.dot(self.A, self.B, out=out) if __name__ == "__main__": diff --git a/tests/integration/test_inner.py b/tests/integration/test_inner.py index 03259ccae..3bc5f761d 100644 --- a/tests/integration/test_inner.py +++ b/tests/integration/test_inner.py @@ -15,7 +15,9 @@ import pytest from legate.core import LEGATE_MAX_DIM from utils.contractions import check_default +from utils.generators import mk_0to1_array +import cunumeric as num from cunumeric.utils import inner_modes @@ -31,6 +33,31 @@ def operation(lib, *args, **kwargs): check_default(name, modes, operation) +class TestInnerErrors: + def setup_method(self): + self.A = mk_0to1_array(num, (5, 3)) + self.B = mk_0to1_array(num, (2, 3)) + + @pytest.mark.parametrize( + "shapeA", + ((3,), (4, 3), (5, 4, 3)), + ids=lambda shapeA: f"(shapeA={shapeA})", + ) + def test_a_b_invalid_shape(self, shapeA): + A = mk_0to1_array(num, shapeA) + B = mk_0to1_array(num, (3, 2)) + with pytest.raises(ValueError): + num.inner(A, B) + + @pytest.mark.parametrize( + "shape", ((5,), (2,), (5, 3)), ids=lambda shape: f"(shape={shape})" + ) + def test_out_invalid_shape(self, shape): + out = num.zeros(shape) + with pytest.raises(ValueError): + num.inner(self.A, self.B, out=out) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_tensordot.py 
b/tests/integration/test_tensordot.py index 3bea9f522..9f8c902ab 100644 --- a/tests/integration/test_tensordot.py +++ b/tests/integration/test_tensordot.py @@ -15,7 +15,9 @@ import pytest from legate.core import LEGATE_MAX_DIM from utils.contractions import check_default +from utils.generators import mk_0to1_array +import cunumeric as num from cunumeric.utils import tensordot_modes @@ -39,6 +41,48 @@ def operation(lib, *args, **kwargs): check_default(name, modes, operation) +class TestTensorDotErrors: + def setup_method(self): + self.A = mk_0to1_array(num, (2, 3, 4)) + self.B = mk_0to1_array(num, (3, 2, 4)) + + @pytest.mark.parametrize( + "axis", + ( + 1, + 2, + [], + [0], + [0, 0], + ([0, 1], [0, 1]), + ([0, 1], [1, 0], [0, 1]), + ([0, 0], [0, 0]), + ), + ids=lambda axis: f"(axis={axis})", + ) + def test_axis_invalid_value(self, axis): + with pytest.raises(ValueError): + num.tensordot(self.A, self.B, axis) + + @pytest.mark.xfail + @pytest.mark.parametrize( + "axis", (4, ([0, 3], [1, 3])), ids=lambda axis: f"(axis={axis})" + ) + def test_axis_invalid_index(self, axis): + # In Numpy, for both cases, it raises IndexError + # In cuNumeric, for both cases, it raises ValueError + with pytest.raises(IndexError): + num.tensordot(self.A, self.B, axis) + + @pytest.mark.parametrize( + "shape", ((4,), (4, 3)), ids=lambda shape: f"(shape={shape})" + ) + def test_out_invalid_shape(self, shape): + out = num.zeros(shape) + with pytest.raises(ValueError): + num.tensordot(self.A, self.B, out=out) + + if __name__ == "__main__": import sys From 330175735d0fedf831961a06cdfc53eb10ff46c7 Mon Sep 17 00:00:00 2001 From: xialu00 <110973296+xialu00@users.noreply.github.com> Date: Tue, 10 Jan 2023 14:31:02 +0800 Subject: [PATCH 77/89] Testcase enhance test_bits.py and test_contains.py (#742) * add negative test case for test_convolve.py * add test case for test_astype.py * add test case for test_astype.py * fix bug * enhance test_bincount.py * enhance test_bincount.py * enhance test_cholesky.py * enhance test_reduction.py * enhance test_reduction.py * enhance test_reduction.py * enhance test_reduction.py * enhance test_reduction.py * enhance test_prod.py * fix bug tests/integration/test_prod.py * enhance test_bits.py and test_contains.py * fix bugs --- tests/integration/test_bits.py | 176 ++++++++++++++++++++++++----- tests/integration/test_contains.py | 48 ++++++-- 2 files changed, 186 insertions(+), 38 deletions(-) diff --git a/tests/integration/test_bits.py b/tests/integration/test_bits.py index 825c2afd7..08437ea25 100644 --- a/tests/integration/test_bits.py +++ b/tests/integration/test_bits.py @@ -21,18 +21,54 @@ import cunumeric as num -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) -@pytest.mark.parametrize("dtype", ("B", "i", "?")) -@pytest.mark.parametrize("bitorder", ("little", "big")) -def test_packbits(ndim, dtype, bitorder): - in_np = np.array([], dtype=dtype) - in_num = num.array([], dtype=dtype) - out_np = np.packbits(in_np, bitorder=bitorder) - out_num = num.packbits(in_num, bitorder=bitorder) - assert np.array_equal(out_np, out_num) - - for extent in (3, 5, 8, 16): - shape = (extent,) * ndim +class TestPackbits(object): + def test_none_arr(self): + # Numpy raises "TypeError: + # Expected an input array of integer or boolean data type" + # For cuNumeric raises: + # > if a.dtype.kind not in ("u", "i", "b"): + # E AttributeError: 'NoneType' object has no attribute 'dtype' + with pytest.raises(AttributeError): + num.packbits(None) + + def test_dtype(self): + shape = (3, 3) + in_num = 
num.random.random(size=shape) + # TypeError: Expected an input array of integer or boolean data type + with pytest.raises(TypeError): + num.packbits(in_num) + + def test_axis_outbound(self): + shape = (3, 3) + in_num = num.random.randint(low=0, high=2, size=shape) + with pytest.raises(ValueError): + num.packbits(in_num, axis=2) + + @pytest.mark.parametrize("bitorder", (1, True, "True", "BIG", "LITTLE")) + def test_bitorder_negative(self, bitorder): + shape = (3, 3) + in_num = num.random.randint(low=0, high=2, size=shape, dtype="i") + # when bitorder is 1 or True, Numpy raises + # "TypeError: pack() argument 3 must be str". + # while cuNumeric raises valueError. + with pytest.raises(ValueError): + num.packbits(in_num, bitorder=bitorder) + + @pytest.mark.parametrize("arr", ([], [[]])) + @pytest.mark.parametrize("dtype", ("B", "i", "?")) + @pytest.mark.parametrize("bitorder", ("little", "big")) + def test_arr(self, arr, dtype, bitorder): + in_np = np.array(arr, dtype=dtype) + in_num = num.array(arr, dtype=dtype) + out_np = np.packbits(in_np, bitorder=bitorder) + out_num = num.packbits(in_num, bitorder=bitorder) + assert np.array_equal(out_np, out_num) + + @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("dtype", ("B", "i", "?")) + @pytest.mark.parametrize("bitorder", ("little", "big")) + def test_common(self, ndim, dtype, bitorder): + shape = (3,) * ndim in_np = np.random.randint(low=0, high=2, size=shape, dtype=dtype) in_num = num.array(in_np) @@ -40,48 +76,130 @@ def test_packbits(ndim, dtype, bitorder): out_num = num.packbits(in_num, bitorder=bitorder) assert np.array_equal(out_np, out_num) - for axis in range(ndim): + @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("dtype", ("B", "i", "?")) + @pytest.mark.parametrize("bitorder", ("little", "big")) + def test_axis(self, ndim, dtype, bitorder): + shape = (5,) * ndim + in_np = np.random.randint(low=0, high=2, size=shape, dtype=dtype) + in_num = num.array(in_np) + + for axis in range(-ndim + 1, ndim): out_np = np.packbits(in_np, axis=axis, bitorder=bitorder) out_num = num.packbits(in_num, axis=axis, bitorder=bitorder) assert np.array_equal(out_np, out_num) -@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) -@pytest.mark.parametrize("bitorder", ("little", "big")) -def test_unpackbits(ndim, bitorder): - in_np = np.array([], dtype="B") - in_num = num.array([], dtype="B") - out_np = np.unpackbits(in_np, bitorder=bitorder) - out_num = num.unpackbits(in_num, bitorder=bitorder) - assert np.array_equal(out_np, out_num) - - for extent in (3, 5, 8, 16): - shape = (extent,) * ndim +class TestUnpackbits(object): + def test_none_arr(self): + # Numpy raises "TypeError: + # TypeError: Expected an input array of unsigned byte data type + # For cuNumeric raises: + # > if a.dtype != "B": + # E AttributeError: 'NoneType' object has no attribute 'dtype' + with pytest.raises(AttributeError): + num.unpackbits(None) + + def test_dtype(self): + shape = (3, 3) + in_num = num.random.random(size=shape) + # TypeError: Expected an input array of unsigned byte data type + with pytest.raises(TypeError): + num.unpackbits(in_num) + + def test_axis_outbound(self): + shape = (3, 3) in_np = np.random.randint(low=0, high=255, size=shape, dtype="B") in_num = num.array(in_np) + with pytest.raises(ValueError): + num.unpackbits(in_num, axis=2) + @pytest.mark.parametrize("bitorder", (1, True, "True", "BIG", "LITTLE")) + def test_bitorder_negative(self, bitorder): + shape = (3, 3) + in_np = 
np.random.randint(low=0, high=255, size=shape, dtype="B") + in_num = num.array(in_np) + # when bitorder is 1 or True, Numpy raises + # "TypeError: unpack() argument 4 must be str". + # while cuNumeric raises valueError. + with pytest.raises(ValueError): + num.unpackbits(in_num, bitorder=bitorder) + + @pytest.mark.parametrize("arr", ([], [[]])) + @pytest.mark.parametrize("bitorder", ("little", "big")) + def test_arr(self, arr, bitorder): + in_np = np.array(arr, dtype="B") + in_num = num.array(arr, dtype="B") out_np = np.unpackbits(in_np, bitorder=bitorder) out_num = num.unpackbits(in_num, bitorder=bitorder) assert np.array_equal(out_np, out_num) - out_np = np.unpackbits(in_np, count=extent // 2, bitorder=bitorder) - out_num = num.unpackbits(in_num, count=extent // 2, bitorder=bitorder) + @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("bitorder", ("little", "big")) + def test_common(self, ndim, bitorder): + shape = (5,) * ndim + in_np = np.random.randint(low=0, high=255, size=shape, dtype="B") + in_num = num.array(in_np) + + out_np = np.unpackbits(in_np, bitorder=bitorder) + out_num = num.unpackbits(in_num, bitorder=bitorder) assert np.array_equal(out_np, out_num) - for axis in range(ndim): + @pytest.mark.parametrize("count", (-9, 4, -1, 0, 4, 8, 9)) + @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("bitorder", ("little", "big")) + def test_count(self, ndim, count, bitorder): + shape = (5,) * ndim + in_np = np.random.randint(low=0, high=255, size=shape, dtype="B") + in_num = num.array(in_np) + + out_np = np.unpackbits(in_np, count=count, bitorder=bitorder) + out_num = num.unpackbits(in_num, count=count, bitorder=bitorder) + assert np.array_equal(out_np, out_num) + + @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("bitorder", ("little", "big")) + def test_axis(self, ndim, bitorder): + shape = (5,) * ndim + in_np = np.random.randint(low=0, high=255, size=shape, dtype="B") + in_num = num.array(in_np) + + for axis in range(-ndim + 1, ndim): out_np = np.unpackbits(in_np, axis=axis, bitorder=bitorder) out_num = num.unpackbits(in_num, axis=axis, bitorder=bitorder) assert np.array_equal(out_np, out_num) + @pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) + @pytest.mark.parametrize("bitorder", ("little", "big")) + @pytest.mark.parametrize("count", (-2, 0, 2, 5)) + def test_axis_count(self, ndim, bitorder, count): + shape = (5,) * ndim + in_np = np.random.randint(low=0, high=255, size=shape, dtype="B") + in_num = num.array(in_np) + + for axis in range(-ndim + 1, ndim): out_np = np.unpackbits( - in_np, count=extent // 2, axis=axis, bitorder=bitorder + in_np, count=count, axis=axis, bitorder=bitorder ) out_num = num.unpackbits( - in_num, count=extent // 2, axis=axis, bitorder=bitorder + in_num, count=count, axis=axis, bitorder=bitorder ) assert np.array_equal(out_np, out_num) +@pytest.mark.parametrize("ndim", range(1, LEGATE_MAX_DIM + 1)) +@pytest.mark.parametrize("bitorder", ("little", "big")) +@pytest.mark.parametrize("dtype", ("B", "i", "?")) +def test_pack_unpack(ndim, bitorder, dtype): + shape = (8,) * ndim + in_np = np.random.randint(low=0, high=2, size=shape, dtype=dtype) + in_num = num.array(in_np) + for axis in range(ndim): + out_b = num.packbits(in_num, axis=axis) + out_p = num.unpackbits(out_b, count=in_num.shape[0], axis=axis) + assert np.array_equal(in_num, out_p) + + if __name__ == "__main__": import sys diff --git a/tests/integration/test_contains.py 
b/tests/integration/test_contains.py index c97716c3a..23811ba74 100644 --- a/tests/integration/test_contains.py +++ b/tests/integration/test_contains.py @@ -12,19 +12,49 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import pytest - -import cunumeric as num +from functools import reduce -def test_True(): - x = num.array([1, 2, 3, 4, 5]) - assert 4 in x +import pytest +from utils.generators import mk_seq_array +import cunumeric as num -def test_False(): - x = num.array([1, 2, 3, 4, 5]) - assert 6 not in x +DIM = 128 +NO_EMPTY_SIZES = [ + (DIM,), + (1, DIM), + (DIM, 1), + (DIM, DIM), + (DIM, 1, 1), + (1, DIM, 1), + (1, 1, DIM), + (DIM, DIM, DIM), +] + + +@pytest.mark.parametrize("size", NO_EMPTY_SIZES) +def test_int(size): + arr = mk_seq_array(num, shape=size) + max_data = reduce(lambda x, y: x * y, size) + assert -1 not in arr + assert 0 not in arr + assert 1 in arr + assert max_data // 2 in arr + assert max_data in arr + assert max_data + 1 not in arr + + +@pytest.mark.parametrize("size", NO_EMPTY_SIZES) +def test_complex(size): + arr = mk_seq_array(num, shape=size) + mk_seq_array(num, shape=size) * 1.0j + max_data = reduce(lambda x, y: x * y, size) + assert -1 not in arr + assert 0 not in arr + assert 1 + 1.0j in arr + assert (max_data // 2) + (max_data // 2) * 1.0j in arr + assert max_data + max_data * 1.0j in arr + assert (max_data + 1) + (max_data + 1) * 1.0j not in arr if __name__ == "__main__": From c9d44c9f5970a855f384934227877b31929bcff3 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Thu, 12 Jan 2023 14:31:47 -0800 Subject: [PATCH 78/89] fix assertion in config test (#749) --- tests/unit/cunumeric/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/cunumeric/test_config.py b/tests/unit/cunumeric/test_config.py index a3cbd1529..98171b811 100644 --- a/tests/unit/cunumeric/test_config.py +++ b/tests/unit/cunumeric/test_config.py @@ -113,7 +113,7 @@ def test_destroy(self, mock_destroy) -> None: lib.initialize(_FakeSO) lib.set_runtime(runtime) lib.destroy() - assert mock_destroy.called_once_with() + mock_destroy.assert_called_once_with() def test_CUNUMERIC_LIB_NAME() -> None: From be6767e2535dcc50fe35b31094b7be02503678d7 Mon Sep 17 00:00:00 2001 From: Irina Demeshko Date: Tue, 17 Jan 2023 14:42:57 -0800 Subject: [PATCH 79/89] adding new version for documentations (#751) --- docs/cunumeric/source/versions.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/cunumeric/source/versions.rst b/docs/cunumeric/source/versions.rst index c7c1e0ca6..ef6b7a83d 100644 --- a/docs/cunumeric/source/versions.rst +++ b/docs/cunumeric/source/versions.rst @@ -10,3 +10,5 @@ Versions 22.05 22.08 22.10 + 23.01 + From 21879b986bc3e0da2d36175823a983cb1b39c3d4 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Wed, 18 Jan 2023 10:21:40 -0800 Subject: [PATCH 80/89] update legate arg processing apis --- cunumeric/runtime.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py index 26d8ab207..603a69a47 100644 --- a/cunumeric/runtime.py +++ b/cunumeric/runtime.py @@ -23,7 +23,7 @@ import numpy as np from legate.core import LEGATE_MAX_DIM, Rect, get_legate_runtime, legion from legate.core.context import Context as LegateContext -from legate.rc import ArgSpec, Argument, parse_command_args +from legate.util.args import ArgSpec, Argument, parse_library_command_args from typing_extensions import TypeGuard from .config import ( @@ -149,7 
+149,7 @@ def __init__(self, legate_context: LegateContext) -> None: self.has_curand = cunumeric_lib.shared_object.cunumeric_has_curand() self._register_dtypes() - self.args = parse_command_args("cunumeric", ARGS) + self.args = parse_library_command_args("cunumeric", ARGS) self.args.warning = self.args.warning or self.args.test_mode if self.num_gpus > 0 and self.args.preload_cudalibs: From 7e3afb90f1310a0c5d595b8f0f2ba436c7259e0b Mon Sep 17 00:00:00 2001 From: Manolis Papadakis Date: Wed, 18 Jan 2023 15:48:58 -0800 Subject: [PATCH 81/89] Don't turn on cuNumeric debug checks on debug-rel builds (#753) --- cunumeric_cpp.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake index 9ab2741b3..7034bb600 100644 --- a/cunumeric_cpp.cmake +++ b/cunumeric_cpp.cmake @@ -331,7 +331,7 @@ list(APPEND cunumeric_SOURCES src/cunumeric/cunumeric.cc ) -if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") +if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND cunumeric_CXX_DEFS DEBUG_CUNUMERIC) list(APPEND cunumeric_CUDA_DEFS DEBUG_CUNUMERIC) endif() From a45f6801f23c0f05b5644698cc84ac9dcca39c4b Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Fri, 20 Jan 2023 16:54:57 -0800 Subject: [PATCH 82/89] Pass `CMAKE_GENERATOR` to scikit-build (#750) * pass cmake_generator to skbuild as envvar so it overrides skbuild's generator detection * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * change variable name Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- install.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/install.py b/install.py index 8bed64992..c54c1968a 100755 --- a/install.py +++ b/install.py @@ -306,12 +306,6 @@ def validate_path(path): # Also use preexisting CMAKE_ARGS from conda if set cmake_flags = cmd_env.get("CMAKE_ARGS", "").split(" ") - if cmake_generator: - if " " not in cmake_generator: - cmake_flags += [f"-G{cmake_generator}"] - else: - cmake_flags += [f"-G'{cmake_generator}'"] - if debug or verbose: cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")] @@ -356,10 +350,18 @@ def validate_path(path): cmake_flags += ["-Dlegate_core_ROOT=%s" % legate_dir] cmake_flags += extra_flags + build_flags = [f"-j{str(thread_count)}"] + if verbose: + if cmake_generator == "Unix Makefiles": + build_flags += ["VERBOSE=1"] + else: + build_flags += ["--verbose"] + cmd_env.update( { - "SKBUILD_BUILD_OPTIONS": f"-j{str(thread_count)}", "CMAKE_ARGS": " ".join(cmake_flags), + "CMAKE_GENERATOR": cmake_generator, + "SKBUILD_BUILD_OPTIONS": " ".join(build_flags), } ) @@ -488,7 +490,10 @@ def driver(): "--cmake-generator", dest="cmake_generator", required=False, - default=(None if shutil.which("ninja") is None else "Ninja"), + default=os.environ.get( + "CMAKE_GENERATOR", + "Unix Makefiles" if shutil.which("ninja") is None else "Ninja", + ), choices=["Ninja", "Unix Makefiles", None], help="The CMake makefiles generator", ) From 35ebbb741838c34f0f0cb9d96ad0077cbb424d14 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Mon, 23 Jan 2023 14:26:08 -0800 Subject: [PATCH 83/89] Move `pip uninstall` step before CMake is run instead of after. 
(#760) * fix issue of pip removing C++ libs from previous installations * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * skip uninstall if editable Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- install.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/install.py b/install.py index c54c1968a..ce4544edd 100755 --- a/install.py +++ b/install.py @@ -76,10 +76,13 @@ def __call__(self, parser, namespace, values, option_string): setattr(namespace, self.dest, not option_string.startswith("--no")) -def execute_command(args, verbose, **kwargs): +def execute_command(args, verbose, ignore_errors=False, **kwargs): if verbose: print('Executing: "', " ".join(args), '" with ', kwargs) - subprocess.check_call(args, **kwargs) + if ignore_errors: + subprocess.call(args, **kwargs) + else: + subprocess.check_call(args, **kwargs) def scikit_build_cmake_build_dir(skbuild_dir): @@ -254,6 +257,29 @@ def validate_path(path): print("Performing a clean build to accommodate build isolation.") clean_first = True + cmd_env = dict(os.environ.items()) + + # Explicitly uninstall cunumeric if doing a clean/isolated build. + # + # A prior installation may have built and installed cunumeric C++ + # dependencies (like BLAS or tblis). + # + # CMake will find and use them for the current build, which would normally + # be correct, but pip uninstalls files from any existing installation as + # the last step of the install process, including the libraries found by + # CMake during the current build. + # + # Therefore this uninstall step must occur *before* CMake attempts to find + # these dependencies, triggering CMake to build and install them again. + if clean_first or (build_isolation and not editable): + execute_command( + [sys.executable, "-m", "pip", "uninstall", "-y", "cunumeric"], + verbose, + ignore_errors=True, + cwd=cunumeric_dir, + env=cmd_env, + ) + if clean_first: shutil.rmtree(skbuild_dir, ignore_errors=True) shutil.rmtree(join(cunumeric_dir, "dist"), ignore_errors=True) @@ -265,7 +291,6 @@ def validate_path(path): # Configure and build cuNumeric via setup.py pip_install_cmd = [sys.executable, "-m", "pip", "install"] - cmd_env = dict(os.environ.items()) install_dir = None From 5f17dd6af24441484f33564c09a1c78892b94a89 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Mon, 23 Jan 2023 15:35:34 -0800 Subject: [PATCH 84/89] Change march to haswell when on x86 platforms (#762) Use haswell by default on x86 platforms. 
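For context, a minimal sketch of the argument default this patch produces (the wiring is illustrative, install.py defines many options around it, and the rationale in the comment is an inference rather than part of the original message):

    import platform
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--march",
        dest="march",
        required=False,
        # A fixed haswell baseline avoids tuning prebuilt x86_64 binaries
        # to whatever CPU happens to run the build; non-x86 machines keep
        # the old machine-specific default.
        default=("haswell" if platform.machine() == "x86_64" else "native"),
        help="Specify the target CPU architecture.",
    )

Builds that do want machine-specific tuning can still pass --march=native explicitly.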
--- install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.py b/install.py index ce4544edd..96dc13242 100755 --- a/install.py +++ b/install.py @@ -554,7 +554,7 @@ def driver(): "--march", dest="march", required=False, - default="native", + default=("haswell" if platform.machine() == "x86_64" else "native"), help="Specify the target CPU architecture.", ) parser.add_argument( From 638adbc79d6ad6be9b78d9f2095102ee00926450 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Jan 2023 09:44:31 -0800 Subject: [PATCH 85/89] [pre-commit.ci] pre-commit autoupdate (#763) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/mirrors-clang-format: v15.0.6 → v15.0.7](https://github.com/pre-commit/mirrors-clang-format/compare/v15.0.6...v15.0.7) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index eefd667d3..bc47df8a7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: hooks: - id: flake8 - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v15.0.6' # Use the sha / tag you want to point at + rev: 'v15.0.7' # Use the sha / tag you want to point at hooks: - id: clang-format files: \.(cu|cuh|h|cc|inl)$ From 47d65d99da5ab24e2d99f5b1912c55dfebbf4091 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Tue, 24 Jan 2023 11:35:02 -0800 Subject: [PATCH 86/89] Force conda version of cutensor (#765) * Force conda version of cutensor * Change cutensor package spec --- conda/conda-build/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml index b1f2de956..cc352ff62 100644 --- a/conda/conda-build/meta.yaml +++ b/conda/conda-build/meta.yaml @@ -125,7 +125,7 @@ requirements: - cuda-cudart-dev ={{ cuda_version }} - cuda-nvtx ={{ cuda_version }} # - libcutensor-dev >=1.3 - - cutensor >=1.3 + - cutensor >=1.3 =*_* - libcublas-dev - libcusolver-dev - libcufft-dev @@ -141,7 +141,7 @@ requirements: - legate-core ={{ core_version }} - cuda-cudart >={{ cuda_version }} # - libcutensor >=1.3 - - cutensor >=1.3 + - cutensor >=1.3 =*_* - libcublas - libcusolver =11.4.1.48-0 - libcufft From f26a05d85ced680edfe66ea24f9f87d643078cb4 Mon Sep 17 00:00:00 2001 From: Bryan Van de Ven Date: Tue, 24 Jan 2023 14:28:43 -0800 Subject: [PATCH 87/89] handle numpy 'builtins' properly for coverage (#766) --- cunumeric/coverage.py | 22 ++++++++++++++++++---- cunumeric/random/__init__.py | 2 +- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/cunumeric/coverage.py b/cunumeric/coverage.py index f8f4446ae..3efad0342 100644 --- a/cunumeric/coverage.py +++ b/cunumeric/coverage.py @@ -17,7 +17,13 @@ import warnings from dataclasses import dataclass from functools import wraps -from types import FunctionType, MethodDescriptorType, MethodType, ModuleType +from types import ( + BuiltinFunctionType, + FunctionType, + MethodDescriptorType, + MethodType, + ModuleType, +) from typing import Any, Container, Mapping, Optional, cast import numpy as np @@ -194,7 +200,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: def clone_module( - origin_module: ModuleType, new_globals: dict[str, Any] + origin_module: ModuleType, + new_globals: dict[str, Any], + 
include_builtin_function_type: bool = False, ) -> None: """Copy attributes from one module to another, excluding submodules @@ -230,7 +238,10 @@ def clone_module( # Only need to wrap things that are in the origin module to begin with if attr not in origin_module.__dict__: continue - if isinstance(value, (FunctionType, lgufunc)): + if isinstance(value, (FunctionType, lgufunc)) or ( + include_builtin_function_type + and isinstance(value, BuiltinFunctionType) + ): wrapped = implemented( cast(AnyCallable, value), mod_name, attr, reporting=reporting ) @@ -239,7 +250,10 @@ def clone_module( from numpy import ufunc as npufunc for attr, value in missing.items(): - if isinstance(value, (FunctionType, npufunc)): + if isinstance(value, (FunctionType, npufunc)) or ( + include_builtin_function_type + and isinstance(value, BuiltinFunctionType) + ): wrapped = unimplemented(value, mod_name, attr, reporting=reporting) new_globals[attr] = wrapped else: diff --git a/cunumeric/random/__init__.py b/cunumeric/random/__init__.py index a9730d063..2f8a98460 100644 --- a/cunumeric/random/__init__.py +++ b/cunumeric/random/__init__.py @@ -25,7 +25,7 @@ else: from cunumeric.random.legacy import * -clone_module(_nprandom, globals()) +clone_module(_nprandom, globals(), include_builtin_function_type=True) del clone_module del _nprandom From da3319799e5676d6375db8a6b4dd774fc016a5ac Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Fri, 27 Jan 2023 10:32:30 -0800 Subject: [PATCH 88/89] Update the architectures built in conda package (#770) (#771) Co-authored-by: Marcin Zalewski --- conda/conda-build/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh index d0df68008..19b1f1f48 100644 --- a/conda/conda-build/build.sh +++ b/conda/conda-build/build.sh @@ -13,7 +13,7 @@ if [ -z "$CPU_ONLY" ]; then # cutensor, relying on the conda cutensor package CMAKE_ARGS+=" -Dcutensor_DIR=$PREFIX --DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;86 +-DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;90 " else # When we build without cuda, we need to provide the location of curand From 1817dc773dcf95f7214416bab33d8aee3fcf2926 Mon Sep 17 00:00:00 2001 From: Marcin Zalewski Date: Mon, 30 Jan 2023 14:33:03 -0800 Subject: [PATCH 89/89] Revert "Update the architectures built in conda package (#770) (#771)" (#772) This reverts commit da3319799e5676d6375db8a6b4dd774fc016a5ac. Co-authored-by: Marcin Zalewski --- conda/conda-build/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh index 19b1f1f48..d0df68008 100644 --- a/conda/conda-build/build.sh +++ b/conda/conda-build/build.sh @@ -13,7 +13,7 @@ if [ -z "$CPU_ONLY" ]; then # cutensor, relying on the conda cutensor package CMAKE_ARGS+=" -Dcutensor_DIR=$PREFIX --DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;90 +-DCMAKE_CUDA_ARCHITECTURES:LIST=60-real;70-real;75-real;80-real;86 " else # When we build without cuda, we need to provide the location of curand
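Two closing notes on this patch set.

First, the examples/*.py changes near the top of the series replace manual "(stop - start) / 1000.0" arithmetic with a timer object that parse_args now returns alongside the module handle. The updated examples only assume the small start/stop contract sketched below; this stand-in is illustrative and just reads the wall clock, while the real object comes from the shared benchmark helpers and may defer the measurement to the runtime:

    import time

    class Timer:
        # Minimal stand-in for the object returned by parse_args().
        def start(self):
            self._t0 = time.perf_counter()

        def stop(self):
            # Elapsed milliseconds, matching the examples'
            # "Elapsed Time: ... ms" prints.
            return (time.perf_counter() - self._t0) * 1000.0

Second, background for the coverage fix in PATCH 87: most numpy.random entry points are Cython-compiled callables rather than Python functions, so the existing FunctionType check skipped them and they were copied into the cloned module without wrapping. A small demonstration with plain numpy (exact types can vary across numpy versions):

    from types import BuiltinFunctionType, FunctionType

    import numpy as np

    # A regular Python function in the numpy namespace:
    print(isinstance(np.array_equal, FunctionType))         # True
    # numpy.random callables are C-level builtins instead:
    print(isinstance(np.random.seed, FunctionType))         # False
    print(isinstance(np.random.seed, BuiltinFunctionType))  # True

Passing include_builtin_function_type=True, as cunumeric/random/__init__.py now does, lets clone_module give these callables the same implemented/unimplemented reporting as ordinary functions.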