From 89ac0ed230a4f362e52df35e7282ede12865ec73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Mon, 2 Sep 2024 16:35:26 +0200
Subject: [PATCH] GH-25118: [Python] Make NumPy an optional runtime dependency
 (#41904)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

Allow pyarrow to run without requiring NumPy.

### What changes are included in this PR?

If NumPy is not present, pyarrow can still be imported and its functionality used. A new CI job has been created to run some basic tests without NumPy.

### Are these changes tested?

Yes, via CI.

### Are there any user-facing changes?

Yes, NumPy can be removed from the user installation and pyarrow functionality still works.

* GitHub Issue: #25118

Lead-authored-by: Raúl Cumplido
Co-authored-by: Joris Van den Bossche
Co-authored-by: Antoine Pitrou
Signed-off-by: Antoine Pitrou
---
 .github/workflows/python.yml                  |   6 +
 docker-compose.yml                            |  32 ++++
 python/CMakeLists.txt                         |   4 +-
 python/pyarrow/_compute.pyx                   |  16 +-
 python/pyarrow/array.pxi                      |   5 +
 python/pyarrow/builder.pxi                    |  14 +-
 python/pyarrow/conftest.py                    |  13 +-
 python/pyarrow/includes/libarrow_python.pxd   |   2 +-
 python/pyarrow/lib.pyx                        |  12 +-
 python/pyarrow/pandas_compat.py               |  79 +++++----
 python/pyarrow/src/arrow/python/inference.cc  |   4 +-
 python/pyarrow/src/arrow/python/iterators.h   |   6 +-
 .../arrow/python/{init.cc => numpy_init.cc}   |  13 +-
 .../src/arrow/python/{init.h => numpy_init.h} |   5 +-
 .../pyarrow/src/arrow/python/numpy_internal.h |  19 ++-
 .../pyarrow/src/arrow/python/python_test.cc   |   2 +-
 .../src/arrow/python/python_to_arrow.cc       |  11 +-
 python/pyarrow/table.pxi                      |   3 +
 python/pyarrow/tensor.pxi                     |  15 ++
 python/pyarrow/tests/conftest.py              |   1 +
 .../tests/interchange/test_conversion.py      |  35 ++--
 .../interchange/test_interchange_spec.py      |  33 ++--
 python/pyarrow/tests/parquet/common.py        |   5 +-
 python/pyarrow/tests/parquet/test_basic.py    |   5 +-
 .../pyarrow/tests/parquet/test_data_types.py  |  13 +-
 python/pyarrow/tests/parquet/test_dataset.py  |   5 +-
 python/pyarrow/tests/parquet/test_datetime.py |   5 +-
 python/pyarrow/tests/parquet/test_metadata.py |   7 +-
 python/pyarrow/tests/parquet/test_pandas.py   |   5 +-
 python/pyarrow/tests/strategies.py            |  10 +-
 .../pyarrow/tests/test_adhoc_memory_leak.py   |   5 +-
 python/pyarrow/tests/test_array.py            | 100 +++++++++--
 python/pyarrow/tests/test_builder.py          |  11 +-
 python/pyarrow/tests/test_compute.py          |  85 ++++++----
 python/pyarrow/tests/test_convert_builtin.py  | 155 +++++++++++-------
 python/pyarrow/tests/test_cpp_internals.py    |   8 +
 python/pyarrow/tests/test_csv.py              |  44 ++++-
 python/pyarrow/tests/test_cuda.py             |   5 +-
 .../pyarrow/tests/test_cuda_numba_interop.py  |   5 +-
 python/pyarrow/tests/test_cython.py           |   4 +
 python/pyarrow/tests/test_dataset.py          |  55 ++++---
 .../pyarrow/tests/test_dataset_encryption.py  |   7 +-
 python/pyarrow/tests/test_dlpack.py           |  46 +++---
 python/pyarrow/tests/test_extension_type.py   |  77 ++++++---
 python/pyarrow/tests/test_feather.py          |  10 +-
 python/pyarrow/tests/test_flight.py           |   6 +-
 python/pyarrow/tests/test_io.py               |  38 +++--
 python/pyarrow/tests/test_ipc.py              |  10 +-
 python/pyarrow/tests/test_json.py             |   8 +-
 python/pyarrow/tests/test_pandas.py           |  62 +++----
 python/pyarrow/tests/test_scalars.py          |  59 +++++--
 python/pyarrow/tests/test_schema.py           |   6 +-
 python/pyarrow/tests/test_sparse_tensor.py    |   5 +-
 python/pyarrow/tests/test_strategies.py       |   5 +
 python/pyarrow/tests/test_substrait.py        |   2 +
 python/pyarrow/tests/test_table.py            |  29 +++-
 python/pyarrow/tests/test_tensor.py           |   5 +-
 python/pyarrow/tests/test_types.py            |  16 +-
 python/pyarrow/tests/test_udf.py              |  13 +-
 python/pyarrow/tests/test_without_numpy.py    |  58 +++++++
 python/pyarrow/tests/util.py                  |  19 +--
 python/pyarrow/types.pxi                      |  85 +++++----
 62 files changed, 1008 insertions(+), 420 deletions(-)
 rename python/pyarrow/src/arrow/python/{init.cc => numpy_init.cc} (78%)
 rename python/pyarrow/src/arrow/python/{init.h => numpy_init.h} (93%)
 create mode 100644 python/pyarrow/tests/test_without_numpy.py

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 854d792f3100d..90d3a50af3705 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -59,6 +59,7 @@ jobs:
           - conda-python-3.9-nopandas
           - conda-python-3.8-pandas-1.0
           - conda-python-3.10-pandas-latest
+          - conda-python-3.10-no-numpy
         include:
           - name: conda-python-docs
             cache: conda-python-3.9
@@ -83,6 +84,11 @@ jobs:
             title: AMD64 Conda Python 3.10 Pandas latest
             python: "3.10"
             pandas: latest
+          - name: conda-python-3.10-no-numpy
+            cache: conda-python-3.10
+            image: conda-python-no-numpy
+            title: AMD64 Conda Python 3.10 without NumPy
+            python: "3.10"
     env:
       PYTHON: ${{ matrix.python || 3.8 }}
       UBUNTU: ${{ matrix.ubuntu || 20.04 }}
diff --git a/docker-compose.yml b/docker-compose.yml
index 3045cf015bc26..97d6e1158ea03 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -126,6 +126,7 @@ x-hierarchy:
     - conda-python-hdfs
     - conda-python-java-integration
     - conda-python-jpype
+    - conda-python-no-numpy
    - conda-python-spark
    - conda-python-substrait
  - conda-verify-rc
@@ -1258,6 +1259,37 @@ services:
     volumes: *conda-volumes
     command: *python-conda-command

+  conda-python-no-numpy:
+    # Usage:
+    #   docker-compose build conda
+    #   docker-compose build conda-cpp
+    #   docker-compose build conda-python
+    #   docker-compose build conda-python-no-numpy
+    #   docker-compose run --rm conda-python-no-numpy
+    image: ${REPO}:${ARCH}-conda-python-${PYTHON}-no-numpy
+    build:
+      context: .
+      dockerfile: ci/docker/conda-python.dockerfile
+      cache_from:
+        - ${REPO}:${ARCH}-conda-python-${PYTHON}
+      args:
+        repo: ${REPO}
+        arch: ${ARCH}
+        python: ${PYTHON}
+    shm_size: *shm-size
+    environment:
+      <<: [*common, *ccache, *sccache]
+      PARQUET_REQUIRE_ENCRYPTION:  # inherit
+      HYPOTHESIS_PROFILE:  # inherit
+      PYARROW_TEST_HYPOTHESIS:  # inherit
+    volumes: *conda-volumes
+    command:
+      ["
+        /arrow/ci/scripts/cpp_build.sh /arrow /build &&
+        /arrow/ci/scripts/python_build.sh /arrow /build &&
+        mamba uninstall -y numpy &&
+        /arrow/ci/scripts/python_test.sh /arrow"]
+
   conda-python-docs:
     # Usage:
     #   archery docker run conda-python-docs
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 1a18b2b173acb..eda4ff4ca5f07 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -339,17 +339,17 @@ set(PYARROW_CPP_SRCS
     ${PYARROW_CPP_SOURCE_DIR}/gdb.cc
     ${PYARROW_CPP_SOURCE_DIR}/helpers.cc
     ${PYARROW_CPP_SOURCE_DIR}/inference.cc
-    ${PYARROW_CPP_SOURCE_DIR}/init.cc
     ${PYARROW_CPP_SOURCE_DIR}/io.cc
     ${PYARROW_CPP_SOURCE_DIR}/ipc.cc
     ${PYARROW_CPP_SOURCE_DIR}/numpy_convert.cc
+    ${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
     ${PYARROW_CPP_SOURCE_DIR}/numpy_to_arrow.cc
     ${PYARROW_CPP_SOURCE_DIR}/python_test.cc
     ${PYARROW_CPP_SOURCE_DIR}/python_to_arrow.cc
     ${PYARROW_CPP_SOURCE_DIR}/pyarrow.cc
     ${PYARROW_CPP_SOURCE_DIR}/serialize.cc
     ${PYARROW_CPP_SOURCE_DIR}/udf.cc)
-set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/init.cc
+set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
                             PROPERTIES SKIP_PRECOMPILE_HEADERS ON
                                        SKIP_UNITY_BUILD_INCLUSION ON)
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index 0e860eaf4c6b8..d39120934d5fd 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -33,7 +33,10 @@ from pyarrow.util import _DEPR_MSG
 from libcpp cimport bool as c_bool

 import inspect
-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import warnings

@@ -43,6 +46,11 @@ _substrait_msg = (
 )

+SUPPORTED_INPUT_ARR_TYPES = (list, tuple)
+if np is not None:
+    SUPPORTED_INPUT_ARR_TYPES += (np.ndarray, )
+
+
 def _pas():
     global __pas
     if __pas is None:
@@ -473,7 +481,7 @@ cdef class MetaFunction(Function):

 cdef _pack_compute_args(object values, vector[CDatum]* out):
     for val in values:
-        if isinstance(val, (list, np.ndarray)):
+        if isinstance(val, SUPPORTED_INPUT_ARR_TYPES):
             val = lib.asarray(val)

         if isinstance(val, Array):
@@ -2189,7 +2197,7 @@ class QuantileOptions(_QuantileOptions):

     def __init__(self, q=0.5, *, interpolation="linear", skip_nulls=True,
                  min_count=0):
-        if not isinstance(q, (list, tuple, np.ndarray)):
+        if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES):
             q = [q]
         self._set_options(q, interpolation, skip_nulls, min_count)

@@ -2222,7 +2230,7 @@ class TDigestOptions(_TDigestOptions):

     def __init__(self, q=0.5, *, delta=100, buffer_size=500, skip_nulls=True,
                  min_count=0):
-        if not isinstance(q, (list, tuple, np.ndarray)):
+        if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES):
             q = [q]
         self._set_options(q, delta, buffer_size, skip_nulls, min_count)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 1587de0e6b744..93c44297590e8 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -50,6 +50,8 @@ cdef _sequence_to_array(object sequence, object mask, object size,

 cdef inline _is_array_like(obj):
+    if np is None:
+        return False
     if isinstance(obj, np.ndarray):
         return True
     return pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj)
@@ -1608,6 +1610,9 @@ cdef class Array(_PandasConvertible):
         """
         self._assert_cpu()
+        if np is None:
+            raise ImportError(
+                "Cannot return a numpy.ndarray if NumPy is not present")
         cdef:
             PyObject* out
             PandasOptions c_options
diff --git a/python/pyarrow/builder.pxi b/python/pyarrow/builder.pxi
index 2af39e2c589e6..fbab5bbdb5a01 100644
--- a/python/pyarrow/builder.pxi
+++ b/python/pyarrow/builder.pxi
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.

+import math
+

 cdef class StringBuilder(_Weakrefable):
     """
@@ -42,10 +44,10 @@ cdef class StringBuilder(_Weakrefable):
         value : string/bytes or np.nan/None
             The value to append to the string array builder.
         """
-        if value is None or value is np.nan:
-            self.builder.get().AppendNull()
-        elif isinstance(value, (bytes, str)):
+        if isinstance(value, (bytes, str)):
             self.builder.get().Append(tobytes(value))
+        elif value is None or math.isnan(value):
+            self.builder.get().AppendNull()
         else:
             raise TypeError('StringBuilder only accepts string objects')
@@ -108,10 +110,10 @@ cdef class StringViewBuilder(_Weakrefable):
         value : string/bytes or np.nan/None
             The value to append to the string array builder.
         """
-        if value is None or value is np.nan:
-            self.builder.get().AppendNull()
-        elif isinstance(value, (bytes, str)):
+        if isinstance(value, (bytes, str)):
             self.builder.get().Append(tobytes(value))
+        elif value is None or math.isnan(value):
+            self.builder.get().AppendNull()
         else:
             raise TypeError('StringViewBuilder only accepts string objects')
diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py
index 29c850c142da1..10a2e72f923cb 100644
--- a/python/pyarrow/conftest.py
+++ b/python/pyarrow/conftest.py
@@ -25,7 +25,6 @@ from pyarrow.tests.util import windows_has_tzdata
 import sys

-import numpy as np

 groups = [
     'acero',
@@ -46,6 +45,8 @@
     'lz4',
     'memory_leak',
     'nopandas',
+    'nonumpy',
+    'numpy',
     'orc',
     'pandas',
     'parquet',
@@ -81,6 +82,8 @@
     'lz4': Codec.is_available('lz4'),
     'memory_leak': False,
     'nopandas': False,
+    'nonumpy': False,
+    'numpy': False,
     'orc': False,
     'pandas': False,
     'parquet': False,
@@ -158,6 +161,12 @@
 except ImportError:
     defaults['nopandas'] = True

+try:
+    import numpy  # noqa
+    defaults['numpy'] = True
+except ImportError:
+    defaults['nonumpy'] = True
+
 try:
     import pyarrow.parquet  # noqa
     defaults['parquet'] = True
@@ -327,6 +336,7 @@ def unary_agg_func_fixture():
     Register a unary aggregate function (mean)
     """
     from pyarrow import compute as pc
+    import numpy as np

     def func(ctx, x):
         return pa.scalar(np.nanmean(x))
@@ -352,6 +362,7 @@ def varargs_agg_func_fixture():
     Register a unary aggregate function
     """
     from pyarrow import compute as pc
+    import numpy as np

     def func(ctx, *args):
         sum = 0.0
diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd
index 9fcc97aaf0a9c..96725c9c3862b 100644
--- a/python/pyarrow/includes/libarrow_python.pxd
+++ b/python/pyarrow/includes/libarrow_python.pxd
@@ -248,7 +248,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil:
     CResult[PyObject*] StringToTzinfo(c_string)

-cdef extern from "arrow/python/init.h":
+cdef extern from "arrow/python/numpy_init.h" namespace "arrow::py":
     int arrow_init_numpy() except -1

diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index c72841c299566..6b82eb6566896 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -21,7 +21,10 @@

 import datetime
 import decimal as _pydecimal
-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import os
 import sys

@@ -32,8 +35,11 @@ from pyarrow.includes.common cimport PyObject_to_object
 cimport pyarrow.includes.libarrow_python as libarrow_python
 cimport cpython as cp

-# Initialize NumPy C API
-arrow_init_numpy()
+
+# Initialize NumPy C API only if numpy was able to be imported
+if np is not None:
+    arrow_init_numpy()
+
 # Initialize PyArrow C++ API
 # (used from some of our C++ code, see e.g. ARROW-5260)
 import_pyarrow()
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index fcccf564fc619..7fbde36bc23e9 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -30,13 +30,17 @@
 import re
 import warnings

-import numpy as np
-
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import pyarrow as pa
 from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled  # noqa

 _logical_type_map = {}
+_numpy_logical_type_map = {}
+_pandas_logical_type_map = {}

 def get_logical_type_map():
@@ -85,27 +89,32 @@ def get_logical_type(arrow_type):
     return 'object'

-_numpy_logical_type_map = {
-    np.bool_: 'bool',
-    np.int8: 'int8',
-    np.int16: 'int16',
-    np.int32: 'int32',
-    np.int64: 'int64',
-    np.uint8: 'uint8',
-    np.uint16: 'uint16',
-    np.uint32: 'uint32',
-    np.uint64: 'uint64',
-    np.float32: 'float32',
-    np.float64: 'float64',
-    'datetime64[D]': 'date',
-    np.str_: 'string',
-    np.bytes_: 'bytes',
-}
+def get_numpy_logical_type_map():
+    global _numpy_logical_type_map
+    if not _numpy_logical_type_map:
+        _numpy_logical_type_map.update({
+            np.bool_: 'bool',
+            np.int8: 'int8',
+            np.int16: 'int16',
+            np.int32: 'int32',
+            np.int64: 'int64',
+            np.uint8: 'uint8',
+            np.uint16: 'uint16',
+            np.uint32: 'uint32',
+            np.uint64: 'uint64',
+            np.float32: 'float32',
+            np.float64: 'float64',
+            'datetime64[D]': 'date',
+            np.str_: 'string',
+            np.bytes_: 'bytes',
+        })
+    return _numpy_logical_type_map

 def get_logical_type_from_numpy(pandas_collection):
+    numpy_logical_type_map = get_numpy_logical_type_map()
     try:
-        return _numpy_logical_type_map[pandas_collection.dtype.type]
+        return numpy_logical_type_map[pandas_collection.dtype.type]
     except KeyError:
         if hasattr(pandas_collection.dtype, 'tz'):
             return 'datetimetz'
@@ -1023,18 +1032,23 @@ def _is_generated_index_name(name):
     return re.match(pattern, name) is not None

-_pandas_logical_type_map = {
-    'date': 'datetime64[D]',
-    'datetime': 'datetime64[ns]',
-    'datetimetz': 'datetime64[ns]',
-    'unicode': np.str_,
-    'bytes': np.bytes_,
-    'string': np.str_,
-    'integer': np.int64,
-    'floating': np.float64,
-    'decimal': np.object_,
-    'empty': np.object_,
-}
+def get_pandas_logical_type_map():
+    global _pandas_logical_type_map
+
+    if not _pandas_logical_type_map:
+        _pandas_logical_type_map.update({
+            'date': 'datetime64[D]',
+            'datetime': 'datetime64[ns]',
+            'datetimetz': 'datetime64[ns]',
+            'unicode': np.str_,
+            'bytes': np.bytes_,
+            'string': np.str_,
+            'integer': np.int64,
+            'floating': np.float64,
+            'decimal': np.object_,
+            'empty': np.object_,
+        })
+    return _pandas_logical_type_map

 def _pandas_type_to_numpy_type(pandas_type):
@@ -1050,8 +1064,9 @@ def _pandas_type_to_numpy_type(pandas_type):
     dtype : np.dtype
         The dtype that corresponds to `pandas_type`.
""" + pandas_logical_type_map = get_pandas_logical_type_map() try: - return _pandas_logical_type_map[pandas_type] + return pandas_logical_type_map[pandas_type] except KeyError: if 'mixed' in pandas_type: # catching 'mixed', 'mixed-integer' and 'mixed-integer-float' diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index 10116f9afad69..1aa7915ba1e19 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -395,11 +395,11 @@ class TypeInferrer { *keep_going = make_unions_; } else if (arrow::py::is_scalar(obj)) { RETURN_NOT_OK(VisitArrowScalar(obj, keep_going)); - } else if (PyArray_CheckAnyScalarExact(obj)) { + } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) { RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going)); } else if (PySet_Check(obj) || (Py_TYPE(obj) == &PyDictValues_Type)) { RETURN_NOT_OK(VisitSet(obj, keep_going)); - } else if (PyArray_Check(obj)) { + } else if (has_numpy() && PyArray_Check(obj)) { RETURN_NOT_OK(VisitNdarray(obj, keep_going)); } else if (PyDict_Check(obj)) { RETURN_NOT_OK(VisitDict(obj)); diff --git a/python/pyarrow/src/arrow/python/iterators.h b/python/pyarrow/src/arrow/python/iterators.h index 7b31962dac5b8..8512276848272 100644 --- a/python/pyarrow/src/arrow/python/iterators.h +++ b/python/pyarrow/src/arrow/python/iterators.h @@ -22,6 +22,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/python/common.h" +#include "arrow/python/numpy_init.h" #include "arrow/python/numpy_internal.h" namespace arrow { @@ -44,7 +45,7 @@ inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&& // VisitorFunc may set to false to terminate iteration bool keep_going = true; - if (PyArray_Check(obj)) { + if (has_numpy() && PyArray_Check(obj)) { PyArrayObject* arr_obj = reinterpret_cast(obj); if (PyArray_NDIM(arr_obj) != 1) { return Status::Invalid("Only 1D arrays accepted"); @@ -64,6 +65,7 @@ inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&& // This code path is inefficient: callers should implement dedicated // logic for non-object arrays. 
   }
+
   if (PySequence_Check(obj)) {
     if (PyList_Check(obj) || PyTuple_Check(obj)) {
       // Use fast item access
@@ -101,7 +103,7 @@ inline Status VisitSequence(PyObject* obj, int64_t offset, VisitorFunc&& func) {
 template <class VisitorFunc>
 inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
                                   VisitorFunc&& func) {
-  if (PyArray_Check(mo)) {
+  if (has_numpy() && PyArray_Check(mo)) {
     PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
     if (PyArray_NDIM(mask) != 1) {
       return Status::Invalid("Mask must be 1D array");
diff --git a/python/pyarrow/src/arrow/python/init.cc b/python/pyarrow/src/arrow/python/numpy_init.cc
similarity index 78%
rename from python/pyarrow/src/arrow/python/init.cc
rename to python/pyarrow/src/arrow/python/numpy_init.cc
index dba293bbe2366..96e2c7b7ccb5c 100644
--- a/python/pyarrow/src/arrow/python/init.cc
+++ b/python/pyarrow/src/arrow/python/numpy_init.cc
@@ -18,7 +18,16 @@

 // Trigger the array import (inversion of NO_IMPORT_ARRAY)
 #define NUMPY_IMPORT_ARRAY
-#include "arrow/python/init.h"
+#include "arrow/python/numpy_init.h"
 #include "arrow/python/numpy_interop.h"

-int arrow_init_numpy() { return arrow::py::import_numpy(); }
+namespace arrow::py {
+bool numpy_imported = false;
+
+int arrow_init_numpy() {
+  numpy_imported = true;
+  return arrow::py::import_numpy();
+}
+
+bool has_numpy() { return numpy_imported; }
+}  // namespace arrow::py
diff --git a/python/pyarrow/src/arrow/python/init.h b/python/pyarrow/src/arrow/python/numpy_init.h
similarity index 93%
rename from python/pyarrow/src/arrow/python/init.h
rename to python/pyarrow/src/arrow/python/numpy_init.h
index 2e6c954862bd9..36c544c1b51fd 100644
--- a/python/pyarrow/src/arrow/python/init.h
+++ b/python/pyarrow/src/arrow/python/numpy_init.h
@@ -20,7 +20,8 @@

 #include "arrow/python/platform.h"
 #include "arrow/python/visibility.h"

-extern "C" {
+namespace arrow::py {
 ARROW_PYTHON_EXPORT
 int arrow_init_numpy();
-}
+bool has_numpy();
+}  // namespace arrow::py
diff --git a/python/pyarrow/src/arrow/python/numpy_internal.h b/python/pyarrow/src/arrow/python/numpy_internal.h
index b9b632f9f9a12..0b4d0be00e42b 100644
--- a/python/pyarrow/src/arrow/python/numpy_internal.h
+++ b/python/pyarrow/src/arrow/python/numpy_internal.h
@@ -19,6 +19,7 @@

 #pragma once

+#include "arrow/python/numpy_init.h"
 #include "arrow/python/numpy_interop.h"

 #include "arrow/status.h"

@@ -155,15 +156,27 @@ inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
 namespace internal {

 inline bool PyFloatScalar_Check(PyObject* obj) {
-  return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating);
+  if (has_numpy()) {
+    return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating);
+  } else {
+    return PyFloat_Check(obj);
+  }
 }

 inline bool PyIntScalar_Check(PyObject* obj) {
-  return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer);
+  if (has_numpy()) {
+    return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer);
+  } else {
+    return PyLong_Check(obj);
+  }
 }

 inline bool PyBoolScalar_Check(PyObject* obj) {
-  return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool);
+  if (has_numpy()) {
+    return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool);
+  } else {
+    return PyBool_Check(obj);
+  }
 }

 static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
diff --git a/python/pyarrow/src/arrow/python/python_test.cc b/python/pyarrow/src/arrow/python/python_test.cc
index 746bf410911f9..eea6bf9459d1f 100644
--- a/python/pyarrow/src/arrow/python/python_test.cc
+++ b/python/pyarrow/src/arrow/python/python_test.cc
@@ -870,7 +870,7 @@ std::vector<TestCase> GetCppTestCases() {
      {"test_infer_all_leading_zeros_exponential_notation_positive",
       TestInferAllLeadingZerosExponentialNotationPositive},
      {"test_infer_all_leading_zeros_exponential_notation_negative",
       TestInferAllLeadingZerosExponentialNotationNegative},
-     {"test_object_block_write_fails", TestObjectBlockWriteFails},
+     {"test_object_block_write_fails_pandas_convert", TestObjectBlockWriteFails},
      {"test_mixed_type_fails", TestMixedTypeFails},
      {"test_from_python_decimal_rescale_not_truncateable",
       TestFromPythonDecimalRescaleNotTruncateable},
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index ce9e15c894ce3..e7195e99072b0 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -202,7 +202,7 @@ class PyValue {
       return true;
     } else if (obj == Py_False) {
       return false;
-    } else if (PyArray_IsScalar(obj, Bool)) {
+    } else if (has_numpy() && PyArray_IsScalar(obj, Bool)) {
       return reinterpret_cast<PyBoolScalarObject*>(obj)->obval == NPY_TRUE;
     } else {
       return internal::InvalidValue(obj, "tried to convert to boolean");
@@ -385,7 +385,7 @@ class PyValue {
       default:
         return Status::UnknownError("Invalid time unit");
     }
-  } else if (PyArray_CheckAnyScalarExact(obj)) {
+  } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
     // validate that the numpy scalar has np.datetime64 dtype
     ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
     if (!numpy_type->Equals(*type)) {
@@ -464,7 +464,7 @@ class PyValue {
       default:
        return Status::UnknownError("Invalid time unit");
     }
-  } else if (PyArray_CheckAnyScalarExact(obj)) {
+  } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
     // validate that the numpy scalar has np.datetime64 dtype
     ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
     if (!numpy_type->Equals(*type)) {
@@ -664,7 +664,7 @@ class PyPrimitiveConverter<
     ARROW_ASSIGN_OR_RAISE(
         auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
     // Numpy NaT sentinels can be checked after the conversion
-    if (PyArray_CheckAnyScalarExact(value) &&
+    if (has_numpy() && PyArray_CheckAnyScalarExact(value) &&
         PyValue::IsNaT(this->primitive_type_, converted)) {
       this->primitive_builder_->UnsafeAppendNull();
     } else {
@@ -804,8 +804,7 @@ class PyListConverter : public ListConverter {
     if (PyValue::IsNull(this->options_, value)) {
       return this->list_builder_->AppendNull();
     }
-
-    if (PyArray_Check(value)) {
+    if (has_numpy() && PyArray_Check(value)) {
      RETURN_NOT_OK(AppendNdarray(value));
     } else if (PySequence_Check(value)) {
       RETURN_NOT_OK(AppendSequence(value));
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 6d34c71c9df40..fff47373cb991 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -495,6 +495,9 @@ cdef class ChunkedArray(_PandasConvertible):
         >>> n_legs.to_numpy()
         array([  2,   2,   4,   4,   5, 100])
         """
+        if np is None:
+            raise ImportError(
+                "Cannot return a numpy.ndarray if NumPy is not present")
         if zero_copy_only:
             raise ValueError(
                 "zero_copy_only must be False for pyarrow.ChunkedArray.to_numpy"
diff --git a/python/pyarrow/tensor.pxi b/python/pyarrow/tensor.pxi
index 6fb4fc99d7cbc..3e0c63c18fc98 100644
--- a/python/pyarrow/tensor.pxi
+++ b/python/pyarrow/tensor.pxi
@@ -107,6 +107,9 @@ strides: {0.strides}""".format(self)
         array([[  2,   2,   4],
                [  4,   5, 100]], dtype=int32)
         """
+        if np is None:
+            raise ImportError(
+                "Cannot return a numpy.ndarray if NumPy is not present")
         cdef PyObject* out

         check_status(TensorToNdarray(self.sp_tensor, self, &out))
@@ -478,6 +481,9 @@ shape: {0.shape}""".format(self)
         """
         Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy.
         """
+        if np is None:
+            raise ImportError(
+                "Cannot return a numpy.ndarray if NumPy is not present")
         cdef PyObject* out_data
         cdef PyObject* out_coords

@@ -743,6 +749,9 @@ shape: {0.shape}""".format(self)
         """
         Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy.
         """
+        if np is None:
+            raise ImportError(
+                "Cannot return a numpy.ndarray if NumPy is not present")
         cdef PyObject* out_data
         cdef PyObject* out_indptr
         cdef PyObject* out_indices
@@ -981,6 +990,9 @@ shape: {0.shape}""".format(self)
         """
         Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy
         """
+        if np is None:
+            raise ImportError(
+                "Cannot return a numpy.ndarray if NumPy is not present")
         cdef PyObject* out_data
         cdef PyObject* out_indptr
         cdef PyObject* out_indices
@@ -1216,6 +1228,9 @@ shape: {0.shape}""".format(self)
         """
         Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy
         """
+        if np is None:
+            raise ImportError(
+                "Cannot return a numpy.ndarray if NumPy is not present")
         cdef PyObject* out_data
         cdef PyObject* out_indptr
         cdef PyObject* out_indices
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 7a222cec8a7c4..0b82696d0a73f 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -25,6 +25,7 @@
 import pytest
 import hypothesis as h
+
 from ..conftest import groups, defaults

 from pyarrow import set_timezone_db_path
diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py
index 6d91bad57cef4..50da6693afff1 100644
--- a/python/pyarrow/tests/interchange/test_conversion.py
+++ b/python/pyarrow/tests/interchange/test_conversion.py
@@ -16,11 +16,15 @@
 # under the License.
 from datetime import datetime as dt
-import numpy as np
 import pyarrow as pa
 from pyarrow.vendored.version import Version
 import pytest

+try:
+    import numpy as np
+except ImportError:
+    np = None
+
 import pyarrow.interchange as pi
 from pyarrow.interchange.column import (
     _PyArrowColumn,
@@ -107,13 +111,13 @@ def test_offset_of_sliced_array():
     "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
 )
 @pytest.mark.parametrize(
-    "float, np_float", [
+    "float, np_float_str", [
         # (pa.float16(), np.float16), #not supported by pandas
-        (pa.float32(), np.float32),
-        (pa.float64(), np.float64)
+        (pa.float32(), "float32"),
+        (pa.float64(), "float64")
     ]
 )
-def test_pandas_roundtrip(uint, int, float, np_float):
+def test_pandas_roundtrip(uint, int, float, np_float_str):
     if Version(pd.__version__) < Version("1.5.0"):
         pytest.skip("__dataframe__ added to pandas in 1.5.0")

@@ -122,7 +126,7 @@ def test_pandas_roundtrip(uint, int, float, np_float):
         {
             "a": pa.array(arr, type=uint),
             "b": pa.array(arr, type=int),
-            "c": pa.array(np.array(arr, dtype=np_float), type=float),
+            "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)), type=float),
             "d": [True, False, True],
         }
     )
@@ -326,13 +330,13 @@ def test_pandas_roundtrip_datetime(unit):

 @pytest.mark.pandas
 @pytest.mark.parametrize(
-    "np_float", [np.float32, np.float64]
+    "np_float_str", ["float32", "float64"]
 )
-def test_pandas_to_pyarrow_with_missing(np_float):
+def test_pandas_to_pyarrow_with_missing(np_float_str):
     if Version(pd.__version__) < Version("1.5.0"):
         pytest.skip("__dataframe__ added to pandas in 1.5.0")

-    np_array = np.array([0, np.nan, 2], dtype=np_float)
+    np_array = np.array([0, np.nan, 2], dtype=np.dtype(np_float_str))
     datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)]
     df = pd.DataFrame({
         # float, ColumnNullType.USE_NAN
@@ -364,6 +368,7 @@ def test_pandas_to_pyarrow_float16_with_missing():
         pi.from_dataframe(df)

+@pytest.mark.numpy
 @pytest.mark.parametrize(
     "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
 )
@@ -371,16 +376,16 @@ def test_pandas_to_pyarrow_float16_with_missing():
     "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
 )
 @pytest.mark.parametrize(
-    "float, np_float", [
-        (pa.float16(), np.float16),
-        (pa.float32(), np.float32),
-        (pa.float64(), np.float64)
+    "float, np_float_str", [
+        (pa.float16(), "float16"),
+        (pa.float32(), "float32"),
+        (pa.float64(), "float64")
     ]
 )
 @pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
 @pytest.mark.parametrize("tz", ['America/New_York', '+07:30', '-04:30'])
 @pytest.mark.parametrize("offset, length", [(0, 3), (0, 2), (1, 2), (2, 1)])
-def test_pyarrow_roundtrip(uint, int, float, np_float,
+def test_pyarrow_roundtrip(uint, int, float, np_float_str,
                            unit, tz, offset, length):
     from datetime import datetime as dt

@@ -391,7 +396,7 @@ def test_pyarrow_roundtrip(uint, int, float, np_float,
         {
             "a": pa.array(arr, type=uint),
             "b": pa.array(arr, type=int),
-            "c": pa.array(np.array(arr, dtype=np_float),
+            "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)),
                           type=float, from_pandas=True),
             "d": [True, False, True],
             "e": [True, False, None],
diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py
index 826089652bca6..d060f7842c2fe 100644
--- a/python/pyarrow/tests/interchange/test_interchange_spec.py
+++ b/python/pyarrow/tests/interchange/test_interchange_spec.py
@@ -19,10 +19,13 @@

 import hypothesis as h
 import hypothesis.strategies as st
-import numpy as np
+import pytest
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import pyarrow as pa
 import pyarrow.tests.strategies as past
-import pytest

 all_types = st.deferred(
@@ -39,6 +42,7 @@
 # datetime is tested in test_extra.py
 # dictionary is tested in test_categorical()
+@pytest.mark.numpy
 @h.given(past.arrays(all_types, size=3))
 def test_dtypes(arr):
     table = pa.table([arr], names=["a"])
@@ -51,6 +55,7 @@ def test_dtypes(arr):
     assert df.get_column(0).offset == 0

+@pytest.mark.numpy
 @pytest.mark.parametrize(
     "uint, uint_bw",
     [
@@ -68,17 +73,17 @@ def test_dtypes(arr):
     ]
 )
 @pytest.mark.parametrize(
-    "float, float_bw, np_float", [
-        (pa.float16(), 16, np.float16),
-        (pa.float32(), 32, np.float32),
-        (pa.float64(), 64, np.float64)
+    "float, float_bw, np_float_str", [
+        (pa.float16(), 16, "float16"),
+        (pa.float32(), 32, "float32"),
+        (pa.float64(), 64, "float64")
     ]
 )
 @pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
 @pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30'])
 @pytest.mark.parametrize("use_batch", [False, True])
 def test_mixed_dtypes(uint, uint_bw, int, int_bw,
-                      float, float_bw, np_float, unit, tz,
+                      float, float_bw, np_float_str, unit, tz,
                       use_batch):
     from datetime import datetime as dt
     arr = [1, 2, 3]
@@ -87,7 +92,7 @@ def test_mixed_dtypes(uint, uint_bw, int, int_bw,
         {
             "a": pa.array(arr, type=uint),
             "b": pa.array(arr, type=int),
-            "c": pa.array(np.array(arr, dtype=np_float), type=float),
+            "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)), type=float),
             "d": [True, False, True],
             "e": ["a", "", "c"],
             "f": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
@@ -200,16 +205,16 @@ def test_column_get_chunks(use_batch, size, n_chunks):
     "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
 )
 @pytest.mark.parametrize(
-    "float, np_float", [
-        (pa.float16(), np.float16),
-        (pa.float32(), np.float32),
-        (pa.float64(), np.float64)
+    "float, np_float_str", [
+        (pa.float16(), "float16"),
+        (pa.float32(), "float32"),
+        (pa.float64(), "float64")
     ]
 )
 @pytest.mark.parametrize("use_batch", [False, True])
-def test_get_columns(uint, int, float, np_float, use_batch):
+def test_get_columns(uint, int, float, np_float_str, use_batch):
     arr = [[1, 2, 3], [4, 5]]
-    arr_float = np.array([1, 2, 3, 4, 5], dtype=np_float)
+    arr_float = np.array([1, 2, 3, 4, 5], dtype=np.dtype(np_float_str))
     table = pa.table(
         {
             "a": pa.chunked_array(arr, type=uint),
diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py
index b4a57ba0b1556..fd6ad94fbd6d3 100644
--- a/python/pyarrow/tests/parquet/common.py
+++ b/python/pyarrow/tests/parquet/common.py
@@ -17,7 +17,10 @@

 import io

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import pyarrow as pa
 from pyarrow.tests import util
diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py
index 194af7415e863..6496aa99092b8 100644
--- a/python/pyarrow/tests/parquet/test_basic.py
+++ b/python/pyarrow/tests/parquet/test_basic.py
@@ -22,7 +22,6 @@
 from shutil import copytree
 from decimal import Decimal

-import numpy as np
 import pytest

 import pyarrow as pa
@@ -47,6 +46,10 @@
 except ImportError:
     pd = tm = None

+try:
+    import numpy as np
+except ImportError:
+    np = None

 # Marks all of the tests in this module
 # Ignore these with pytest ... -m 'not parquet'
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index e6b66b00428fb..79dd96948261c 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -17,8 +17,12 @@

 import decimal
 import io
+import random

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import pytest

 import pyarrow as pa
@@ -173,6 +177,7 @@ def test_direct_read_dictionary_subfield():
     assert result[0].num_chunks == 1

+@pytest.mark.numpy
 def test_dictionary_array_automatically_read():
     # ARROW-3246

@@ -334,10 +339,10 @@ def test_column_of_lists(tempdir):

 def test_large_list_records():
     # This was fixed in PARQUET-1100
-    list_lengths = np.random.randint(0, 500, size=50)
-    list_lengths[::10] = 0
+    list_lengths = [random.randint(0, 500) for _ in range(50)]
+    list_lengths[::10] = [0, 0, 0, 0, 0]

-    list_values = [list(map(int, np.random.randint(0, 100, size=x)))
+    list_values = [list(map(int, [random.randint(0, 100) for _ in range(x)]))
                    if i % 8 else None
                    for i, x in enumerate(list_lengths)]
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
index 47e608a1404ff..f68f1aa9cdb46 100644
--- a/python/pyarrow/tests/parquet/test_dataset.py
+++ b/python/pyarrow/tests/parquet/test_dataset.py
@@ -20,7 +20,10 @@

 import os
 import pathlib

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import pytest

 import unittest.mock as mock
diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py
index 08fb1098322be..b89fd97cb91e6 100644
--- a/python/pyarrow/tests/parquet/test_datetime.py
+++ b/python/pyarrow/tests/parquet/test_datetime.py
@@ -19,7 +19,10 @@

 import io
 import warnings

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import pytest

 import pyarrow as pa
diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py
index c29213ebc3d42..14ce9bbfcdd58 100644
--- a/python/pyarrow/tests/parquet/test_metadata.py
+++ b/python/pyarrow/tests/parquet/test_metadata.py
@@ -20,7 +20,10 @@

 from collections import OrderedDict
 import io

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import pytest

 import pyarrow as pa
@@ -584,7 +587,7 @@ def test_table_large_metadata():
     my_schema = pa.schema([pa.field('f0', 'double')],
                           metadata={'large': 'x' * 10000000})

-    table = pa.table([np.arange(10)], schema=my_schema)
+    table = pa.table([range(10)], schema=my_schema)
     _check_roundtrip(table)
diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
index b5913bf5c6b6e..2ea2f46873aef 100644
--- a/python/pyarrow/tests/parquet/test_pandas.py
+++ b/python/pyarrow/tests/parquet/test_pandas.py
@@ -18,7 +18,10 @@

 import io
 import json

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None
 import pytest

 import pyarrow as pa
diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py
index db0aa1397123d..7a1b31a4d9d77 100644
--- a/python/pyarrow/tests/strategies.py
+++ b/python/pyarrow/tests/strategies.py
@@ -21,7 +21,10 @@

 import pytest
 import hypothesis as h
 import hypothesis.strategies as st
-import hypothesis.extra.numpy as npst
+try:
+    import hypothesis.extra.numpy as npst
+except ImportError:
+    npst = None
 try:
     import hypothesis.extra.pytz as tzst
 except ImportError:
@@ -35,7 +38,10 @@
     import tzdata  # noqa:F401
 except ImportError:
     zoneinfo = None
-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None

 import pyarrow as pa
diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py
index cd381cf427dc3..76a766984dab6 100644
--- a/python/pyarrow/tests/test_adhoc_memory_leak.py
+++ b/python/pyarrow/tests/test_adhoc_memory_leak.py
@@ -17,7 +17,10 @@

 import pytest

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None

 import pyarrow as pa
 import pyarrow.tests.util as test_util
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index c44ec3f8e1afe..4160d64829483 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -27,7 +27,10 @@

 import sys
 import weakref

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None

 import pyarrow as pa
 import pyarrow.tests.strategies as past
@@ -157,6 +160,7 @@ def test_binary_total_values_length():
     assert large_arr.slice(1, 3).total_values_length == 11

+@pytest.mark.numpy
 def test_to_numpy_zero_copy():
     arr = pa.array(range(10))

@@ -176,6 +180,7 @@ def test_to_numpy_zero_copy():
     np.testing.assert_array_equal(np_arr, expected)

+@pytest.mark.numpy
 def test_chunked_array_to_numpy_zero_copy():
     elements = [[2, 2, 4], [4, 5, 100]]

@@ -191,6 +196,7 @@ def test_chunked_array_to_numpy_zero_copy():
     np.testing.assert_array_equal(np_arr, expected)

+@pytest.mark.numpy
 def test_to_numpy_unsupported_types():
     # ARROW-2871: Some primitive types are not yet supported in to_numpy
     bool_arr = pa.array([True, False, True])
@@ -217,6 +223,7 @@ def test_to_numpy_unsupported_types():
         arr.to_numpy()

+@pytest.mark.numpy
 def test_to_numpy_writable():
     arr = pa.array(range(10))
     np_arr = arr.to_numpy()
@@ -234,6 +241,7 @@ def test_to_numpy_writable():
         arr.to_numpy(zero_copy_only=True, writable=True)

+@pytest.mark.numpy
 @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
 @pytest.mark.parametrize('tz', [None, "UTC"])
 def test_to_numpy_datetime64(unit, tz):
@@ -243,6 +251,7 @@ def test_to_numpy_datetime64(unit, tz):
     np.testing.assert_array_equal(np_arr, expected)

+@pytest.mark.numpy
 @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
 def test_to_numpy_timedelta64(unit):
     arr = pa.array([1, 2, 3], pa.duration(unit))
@@ -251,6 +260,7 @@ def test_to_numpy_timedelta64(unit):
     np.testing.assert_array_equal(np_arr, expected)

+@pytest.mark.numpy
 def test_to_numpy_dictionary():
     # ARROW-7591
     arr = pa.array(["a", "b", "a"]).dictionary_encode()
@@ -427,6 +437,11 @@ def test_array_getitem():
         with pytest.raises(IndexError):
             arr[idx]

+
+@pytest.mark.numpy
+def test_array_getitem_numpy_scalars():
+    arr = pa.array(range(10, 15))
+    lst = arr.to_pylist()
     # check that numpy scalars are supported
     for idx in range(-len(arr), len(arr)):
         assert arr[np.int32(idx)].as_py() == lst[idx]
@@ -469,9 +484,11 @@ def test_array_slice():
         res.validate()
         expected = arr.to_pylist()[start:stop]
         assert res.to_pylist() == expected
-        assert res.to_numpy().tolist() == expected
+        if np is not None:
+            assert res.to_numpy().tolist() == expected

+@pytest.mark.numpy
 def test_array_slice_negative_step():
     # ARROW-2714
     np_arr = np.arange(20)
@@ -542,6 +559,7 @@ def test_struct_array_slice():
                                  {'a': 5, 'b': 6.5}]

+@pytest.mark.numpy
 def test_array_factory_invalid_type():

     class MyObject:
@@ -552,6 +570,7 @@ class MyObject:
         pa.array(arr)

+@pytest.mark.numpy
 def test_array_ref_to_ndarray_base():
     arr = np.array([1, 2, 3])

@@ -576,6 +595,7 @@ def test_array_eq():
     assert (arr1 == None) is False  # noqa: E711

+@pytest.mark.numpy
 def test_array_from_buffers():
     values_buf = pa.py_buffer(np.int16([4, 5, 6, 7]))
     nulls_buf = pa.py_buffer(np.uint8([0b00001101]))
@@ -773,6 +793,7 @@ def test_dictionary_from_buffers(offset):
     assert a[offset:] == b

+@pytest.mark.numpy
 def test_dictionary_from_numpy():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
@@ -795,6 +816,7 @@ def test_dictionary_from_numpy():
         assert d2[i].as_py() == dictionary[indices[i]]

+@pytest.mark.numpy
 def test_dictionary_to_numpy():
     expected = pa.array(
         ["foo", "bar", None, "foo"]
@@ -865,6 +887,7 @@ def test_dictionary_to_numpy():
     )

+@pytest.mark.numpy
 def test_dictionary_from_boxed_arrays():
     indices = np.repeat([0, 1, 2], 2)
     dictionary = np.array(['foo', 'bar', 'baz'], dtype=object)
@@ -910,6 +933,7 @@ def test_dictionary_indices():
     arr.indices.validate(full=True)

+@pytest.mark.numpy
 @pytest.mark.parametrize(('list_array_type', 'list_type_factory'),
                          [(pa.ListArray, pa.list_),
                           (pa.LargeListArray, pa.large_list)])
@@ -1052,6 +1076,7 @@ def test_map_from_dict():
     assert tup_arr.equals(dict_arr)

+@pytest.mark.numpy
 def test_map_from_arrays():
     offsets_arr = np.array([0, 2, 5, 8], dtype='i4')
     offsets = pa.array(offsets_arr, type='int32')
@@ -1472,6 +1497,7 @@ def _check_cast_case(case, *, safe=True, check_array_construction=True):
     assert in_arr.equals(expected)

+@pytest.mark.numpy
 def test_cast_integers_safe():
     safe_cases = [
         (np.array([0, 1, 2, 3], dtype='i1'), 'int8',
@@ -1558,6 +1584,7 @@ def test_chunked_array_data_warns():
     assert isinstance(res, pa.ChunkedArray)

+@pytest.mark.numpy
 def test_cast_integers_unsafe():
     # We let NumPy do the unsafe casting.
     # Note that NEP50 in the NumPy spec no longer allows
@@ -1578,6 +1605,7 @@ def test_cast_integers_unsafe():
         _check_cast_case(case, safe=False)

+@pytest.mark.numpy
 def test_floating_point_truncate_safe():
     safe_cases = [
         (np.array([1.0, 2.0, 3.0], dtype='float32'), 'float32',
@@ -1591,6 +1619,7 @@ def test_floating_point_truncate_safe():
         _check_cast_case(case, safe=True)

+@pytest.mark.numpy
 def test_floating_point_truncate_unsafe():
     unsafe_cases = [
         (np.array([1.1, 2.2, 3.3], dtype='float32'), 'float32',
@@ -1635,6 +1664,7 @@ def test_decimal_to_int_safe():
         _check_cast_case(case, safe=True)

+@pytest.mark.numpy
 def test_decimal_to_int_value_out_of_bounds():
     out_of_bounds_cases = [
         (
@@ -1735,6 +1765,7 @@ def test_decimal_to_decimal():
         result = arr.cast(pa.decimal128(5, 2))

+@pytest.mark.numpy
 def test_safe_cast_nan_to_int_raises():
     arr = pa.array([np.nan, 1.])

@@ -1742,6 +1773,7 @@ def test_safe_cast_nan_to_int_raises():
         arr.cast(pa.int64(), safe=True)

+@pytest.mark.numpy
 def test_cast_signed_to_unsigned():
     safe_cases = [
         (np.array([0, 1, 2, 3], dtype='i1'), pa.uint8(),
@@ -1992,6 +2024,7 @@ def test_dictionary_decode():
     assert result.equals(expected)

+@pytest.mark.numpy
 def test_cast_time32_to_int():
     arr = pa.array(np.array([0, 1, 2], dtype='int32'),
                    type=pa.time32('s'))
@@ -2001,6 +2034,7 @@ def test_cast_time32_to_int():
     assert result.equals(expected)

+@pytest.mark.numpy
 def test_cast_time64_to_int():
     arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                    type=pa.time64('us'))
@@ -2010,6 +2044,7 @@ def test_cast_time64_to_int():
     assert result.equals(expected)

+@pytest.mark.numpy
 def test_cast_timestamp_to_int():
     arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                    type=pa.timestamp('us'))
@@ -2035,6 +2070,7 @@ def test_cast_date32_to_int():
     assert result2.equals(arr)
+@pytest.mark.numpy
 def test_cast_duration_to_int():
     arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                    type=pa.duration('us'))
@@ -2044,6 +2080,7 @@ def test_cast_duration_to_int():
     assert result.equals(expected)

+@pytest.mark.numpy
 def test_cast_binary_to_utf8():
     binary_arr = pa.array([b'foo', b'bar', b'baz'], type=pa.binary())
     utf8_arr = binary_arr.cast(pa.utf8())
@@ -2064,6 +2101,7 @@ def test_cast_binary_to_utf8():
     assert casted.null_count == 1

+@pytest.mark.numpy
 def test_cast_date64_to_int():
     arr = pa.array(np.array([0, 1, 2], dtype='int64'),
                    type=pa.date64())
@@ -2146,6 +2184,7 @@ def test_array_pickle_dictionary(pickle_module):
     assert array.equals(result)

+@pytest.mark.numpy
 @h.settings(suppress_health_check=(h.HealthCheck.too_slow,))
 @h.given(
     past.arrays(
@@ -2177,9 +2216,9 @@ def test_array_pickle_protocol5(data, typ, pickle_module):
     assert result_addresses == addresses

-@pytest.mark.parametrize(
-    'narr',
-    [
+@pytest.mark.numpy
+def test_to_numpy_roundtrip():
+    for narr in [
         np.arange(10, dtype=np.int64),
         np.arange(10, dtype=np.int32),
         np.arange(10, dtype=np.int16),
@@ -2191,23 +2230,23 @@ def test_array_pickle_protocol5(data, typ, pickle_module):
         np.arange(10, dtype=np.float64),
         np.arange(10, dtype=np.float32),
         np.arange(10, dtype=np.float16),
-    ]
-)
-def test_to_numpy_roundtrip(narr):
-    arr = pa.array(narr)
-    assert narr.dtype == arr.to_numpy().dtype
-    np.testing.assert_array_equal(narr, arr.to_numpy())
-    np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
-    np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
-    np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())
+    ]:
+        arr = pa.array(narr)
+        assert narr.dtype == arr.to_numpy().dtype
+        np.testing.assert_array_equal(narr, arr.to_numpy())
+        np.testing.assert_array_equal(narr[:6], arr[:6].to_numpy())
+        np.testing.assert_array_equal(narr[2:], arr[2:].to_numpy())
+        np.testing.assert_array_equal(narr[2:6], arr[2:6].to_numpy())

+@pytest.mark.numpy
 def test_array_uint64_from_py_over_range():
     arr = pa.array([2 ** 63], type=pa.uint64())
     expected = pa.array(np.array([2 ** 63], dtype='u8'))
     assert arr.equals(expected)

+@pytest.mark.numpy
 def test_array_conversions_no_sentinel_values():
     arr = np.array([1, 2, 3, 4], dtype='int8')
     refcount = sys.getrefcount(arr)
@@ -2249,6 +2288,7 @@ def test_time32_time64_from_integer():
     assert result.equals(expected)

+@pytest.mark.numpy
 def test_binary_string_pandas_null_sentinels():
     # ARROW-6227
     def _check_case(ty):
@@ -2259,6 +2299,7 @@ def _check_case(ty):
         _check_case('utf8')

+@pytest.mark.numpy
 def test_pandas_null_sentinels_raise_error():
     # ARROW-6227
     cases = [
@@ -2299,6 +2340,7 @@ def test_pandas_null_sentinels_index():
     assert result.equals(expected)

+@pytest.mark.numpy
 def test_array_roundtrip_from_numpy_datetimeD():
     arr = np.array([None, datetime.date(2017, 4, 4)],
                    dtype='datetime64[D]')
@@ -2319,6 +2361,7 @@ def test_array_from_naive_datetimes():
     assert arr.type == pa.timestamp('us', tz=None)

+@pytest.mark.numpy
 @pytest.mark.parametrize(('dtype', 'type'), [
     ('datetime64[s]', pa.timestamp('s')),
     ('datetime64[ms]', pa.timestamp('ms')),
@@ -2342,6 +2385,7 @@ def test_array_from_numpy_datetime(dtype, type):
     assert arr.equals(expected)

+@pytest.mark.numpy
 def test_array_from_different_numpy_datetime_units_raises():
     data = [
         None,
@@ -2356,6 +2400,7 @@ def test_array_from_different_numpy_datetime_units_raises():
         pa.array(data)

+@pytest.mark.numpy
 @pytest.mark.parametrize('unit', ['ns', 'us', 'ms', 's'])
 def test_array_from_list_of_timestamps(unit):
     n = np.datetime64('NaT', unit)
@@ -2370,6 +2415,7 @@ def test_array_from_list_of_timestamps(unit):
     assert a1[0] == a2[0]

+@pytest.mark.numpy
 def test_array_from_timestamp_with_generic_unit():
     n = np.datetime64('NaT')
     x = np.datetime64('2017-01-01 01:01:01.111111111')
@@ -2380,6 +2426,7 @@ def test_array_from_timestamp_with_generic_unit():
         pa.array([n, x, y])

+@pytest.mark.numpy
 @pytest.mark.parametrize(('dtype', 'type'), [
     ('timedelta64[s]', pa.duration('s')),
     ('timedelta64[ms]', pa.duration('ms')),
@@ -2408,6 +2455,7 @@ def test_array_from_numpy_timedelta(dtype, type):
     assert arr.to_pylist() == data

+@pytest.mark.numpy
 def test_array_from_numpy_timedelta_incorrect_unit():
     # generic (no unit)
     td = np.timedelta64(1)
@@ -2423,6 +2471,7 @@ def test_array_from_numpy_timedelta_incorrect_unit():
         pa.array(data)

+@pytest.mark.numpy
 def test_array_from_numpy_ascii():
     arr = np.array(['abcde', 'abc', ''], dtype='|S5')

@@ -2567,6 +2616,7 @@ def test_interval_array_from_dateoffset():
     assert list(actual_list[0]) == expected_from_pandas

+@pytest.mark.numpy
 def test_array_from_numpy_unicode():
     dtypes = ['<U5', '>U5']

@@ -2599,12 +2649,14 @@ def test_array_from_numpy_unicode():
     assert arrow_arr.equals(expected)

+@pytest.mark.numpy
 def test_array_string_from_non_string():
     # ARROW-5682 - when converting to string raise on non string-like dtype
     with pytest.raises(TypeError):
         pa.array(np.array([1, 2, 3]), type=pa.string())

+@pytest.mark.numpy
 def test_array_string_from_all_null():
     # ARROW-5682
     vals = np.array([None, None], dtype=object)
@@ -2619,6 +2671,7 @@ def test_array_string_from_all_null():
     assert arr.null_count == 2

+@pytest.mark.numpy
 def test_array_from_masked():
     ma = np.ma.array([1, 2, 3, 4], dtype='int64',
                      mask=[False, False, True, False])
@@ -2630,6 +2683,7 @@ def test_array_from_masked():
         pa.array(ma, mask=np.array([True, False, False, False]))

+@pytest.mark.numpy
 def test_array_from_shrunken_masked():
     ma = np.ma.array([0], dtype='int64')
     result = pa.array(ma)
@@ -2637,6 +2691,7 @@ def test_array_from_shrunken_masked():
     assert expected.equals(result)

+@pytest.mark.numpy
 def test_array_from_invalid_dim_raises():
     msg = "only handle 1-dimensional arrays"
     arr2d = np.array([[1, 2, 3], [4, 5, 6]])
@@ -2648,6 +2703,7 @@ def test_array_from_invalid_dim_raises():
         pa.array(arr0d)

+@pytest.mark.numpy
 def test_array_from_strided_bool():
     # ARROW-6325
     arr = np.ones((3, 2), dtype=bool)
@@ -2659,6 +2715,7 @@ def test_array_from_strided_bool():
     assert result.equals(expected)

+@pytest.mark.numpy
 def test_array_from_strided():
     pydata = [
         ([b"ab", b"cd", b"ef"], (pa.binary(), pa.binary(2))),
@@ -2683,6 +2740,7 @@ def test_boolean_true_count_false_count():
     assert arr.false_count == 1000

+@pytest.mark.numpy
 def test_buffers_primitive():
     a = pa.array([1, 2, None, 4], type=pa.int16())
     buffers = a.buffers()
@@ -2755,6 +2813,7 @@ def test_buffers_nested():
     assert struct.unpack('4xh', values) == (43,)

+@pytest.mark.numpy
 def test_total_buffer_size():
     a = pa.array(np.array([4, 5, 6], dtype='int64'))
     assert a.nbytes == 8 * 3
@@ -3153,6 +3212,7 @@ def test_nested_dictionary_array():
     assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a']

+@pytest.mark.numpy
 def test_array_from_numpy_str_utf8():
     # ARROW-3890 -- in Python 3, NPY_UNICODE arrays are produced, but in Python
     # 2 they are NPY_STRING (binary), so we must do UTF-8 validation
@@ -3179,6 +3239,7 @@ def test_array_from_numpy_str_utf8():
         pa.array(vec, pa.string(), mask=np.array([False]))

+@pytest.mark.numpy
 @pytest.mark.slow
 @pytest.mark.large_memory
 def test_numpy_binary_overflow_to_chunked():
@@ -3237,6 +3298,7 @@ def test_list_child_overflow_to_chunked():
     assert len(arr.chunk(1)) == 1

+@pytest.mark.numpy
 def test_infer_type_masked():
     # ARROW-5208
     ty = pa.infer_type(['foo', 'bar', None, 2],
@@ -3252,6 +3314,7 @@ def test_infer_type_masked():
     assert pa.infer_type([], mask=[]) == pa.null()

+@pytest.mark.numpy
 def test_array_masked():
     # ARROW-5208
     arr = pa.array([4, None, 4, 3.],
@@ -3264,6 +3327,7 @@ def test_array_masked():
     assert arr.type == pa.int64()

+@pytest.mark.numpy
 def test_array_supported_masks():
     # ARROW-13883
     arr = pa.array([4, None, 4, 3.],
@@ -3322,6 +3386,7 @@ def test_array_supported_pandas_masks():
     assert arr.to_pylist() == [None, 1]

+@pytest.mark.numpy
 def test_binary_array_masked():
     # ARROW-12431
     masked_basic = pa.array([b'\x05'], type=pa.binary(1),
@@ -3354,6 +3419,7 @@ def test_binary_array_masked():
     assert ([b'aaa', b'bbb', b'ccc']*10) == arrow_array.to_pylist()

+@pytest.mark.numpy
 def test_binary_array_strided():
     # Masked
     nparray = np.array([b"ab", b"cd", b"ef"])
@@ -3367,6 +3433,7 @@ def test_binary_array_strided():
     assert [b"ab", b"ef"] == arrow_array.to_pylist()

+@pytest.mark.numpy
 def test_array_invalid_mask_raises():
     # ARROW-10742
     cases = [
@@ -3400,6 +3467,7 @@ def test_array_from_large_pyints():
         pa.array([int(2 ** 63)])

+@pytest.mark.numpy
 def test_numpy_array_protocol():
     # test the __array__ method on pyarrow.Array
     arr = pa.array([1, 2, 3])
@@ -3446,6 +3514,7 @@ def test_numpy_array_protocol():
     assert result.dtype == "float64"

+@pytest.mark.numpy
 def test_array_protocol():

     class MyArray:
@@ -3769,6 +3838,7 @@ def test_run_end_encoded_from_buffers():
                                         1, offset, children)

+@pytest.mark.numpy
 def test_run_end_encoded_from_array_with_type():
     run_ends = [1, 3, 6]
     values = [1, 2, 3]
@@ -3808,6 +3878,7 @@ def test_run_end_encoded_from_array_with_type():
     assert result.equals(expected)

+@pytest.mark.numpy
 def test_run_end_encoded_to_numpy():
     arr = [1, 2, 2, 3, 3, 3]
     ree_array = pa.array(arr, pa.run_end_encoded(pa.int32(), pa.int64()))
@@ -4023,6 +4094,7 @@ def test_list_view_slice(list_view_type):
     assert sliced_array[0].as_py() == sliced_array.values[i:j].to_pylist() == [4]

+@pytest.mark.numpy
 @pytest.mark.parametrize('numpy_native_dtype', ['u2', 'i4', 'f8'])
 def test_swapped_byte_order_fails(numpy_native_dtype):
     # ARROW-39129
diff --git a/python/pyarrow/tests/test_builder.py b/python/pyarrow/tests/test_builder.py
index abc8a0013df37..9187a19b5fc24 100644
--- a/python/pyarrow/tests/test_builder.py
+++ b/python/pyarrow/tests/test_builder.py
@@ -15,10 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.
+import math
 import weakref

-import numpy as np
-
 import pyarrow as pa
 from pyarrow.lib import StringBuilder, StringViewBuilder
@@ -35,7 +34,7 @@ def test_string_builder_append():
     sbuilder = StringBuilder()
     sbuilder.append(b"a byte string")
     sbuilder.append("a string")
-    sbuilder.append(np.nan)
+    sbuilder.append(math.nan)
     sbuilder.append(None)
     assert len(sbuilder) == 4
     assert sbuilder.null_count == 2
@@ -50,7 +49,7 @@ def test_string_builder_append():

 def test_string_builder_append_values():
     sbuilder = StringBuilder()
-    sbuilder.append_values([np.nan, None, "text", None, "other text"])
+    sbuilder.append_values([math.nan, None, "text", None, "other text"])
     assert sbuilder.null_count == 3
     arr = sbuilder.finish()
     assert arr.null_count == 3
@@ -60,7 +59,7 @@ def test_string_builder_append_values():

 def test_string_builder_append_after_finish():
     sbuilder = StringBuilder()
-    sbuilder.append_values([np.nan, None, "text", None, "other text"])
+    sbuilder.append_values([math.nan, None, "text", None, "other text"])
     arr = sbuilder.finish()
     sbuilder.append("No effect")
     expected = [None, None, "text", None, "other text"]
@@ -72,7 +71,7 @@ def test_string_view_builder():
     builder.append(b"a byte string")
     builder.append("a string")
     builder.append("a longer not-inlined string")
-    builder.append(np.nan)
+    builder.append(math.nan)
     builder.append_values([None, "text"])
     assert len(builder) == 6
     assert builder.null_count == 2
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
index 64fe7f1deb510..d4307cd24f8fc 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -28,7 +28,10 @@

 import sys
 import textwrap

-import numpy as np
+try:
+    import numpy as np
+except ImportError:
+    np = None

 try:
     import pandas as pd
@@ -44,27 +47,6 @@
 except ImportError:
     pas = None

-all_array_types = [
-    ('bool', [True, False, False, True, True]),
-    ('uint8', np.arange(5)),
-    ('int8', np.arange(5)),
-    ('uint16', np.arange(5)),
-    ('int16', np.arange(5)),
-    ('uint32', np.arange(5)),
-    ('int32', np.arange(5)),
-    ('uint64', np.arange(5, 10)),
-    ('int64', np.arange(5, 10)),
-    ('float', np.arange(0, 0.5, 0.1)),
-    ('double', np.arange(0, 0.5, 0.1)),
-    ('string', ['a', 'b', None, 'ddd', 'ee']),
-    ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
-    (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
-    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
-    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
-    (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
-        {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
-]
-
 exported_functions = [
     func for (name, func) in sorted(pc.__dict__.items())
     if hasattr(func, '__arrow_compute_function__')]
@@ -87,6 +69,28 @@
 ]

+all_array_types = [
+    ('bool', [True, False, False, True, True]),
+    ('uint8', range(5)),
+    ('int8', range(5)),
+    ('uint16', range(5)),
+    ('int16', range(5)),
+    ('uint32', range(5)),
+    ('int32', range(5)),
+    ('uint64', range(5, 10)),
+    ('int64', range(5, 10)),
+    ('float', [0, 0.1, 0.2, 0.3, 0.4]),
+    ('double', [0, 0.1, 0.2, 0.3, 0.4]),
+    ('string', ['a', 'b', None, 'ddd', 'ee']),
+    ('binary', [b'a', b'b', b'c', b'ddd', b'ee']),
+    (pa.binary(3), [b'abc', b'bcd', b'cde', b'def', b'efg']),
+    (pa.list_(pa.int8()), [[1, 2], [3, 4], [5, 6], None, [9, 16]]),
+    (pa.large_list(pa.int16()), [[1], [2, 3, 4], [5, 6], None, [9, 16]]),
+    (pa.struct([('a', pa.int8()), ('b', pa.int8())]), [
+        {'a': 1, 'b': 2}, None, {'a': 3, 'b': 4}, None, {'a': 5, 'b': 6}]),
+]
+
+
 def test_exported_functions():
test_exported_functions(): # Check that all exported concrete functions can be called with # the right number of arguments. @@ -263,6 +267,7 @@ def test_get_function_hash_aggregate(): pc.HashAggregateKernel, 1) +@pytest.mark.numpy def test_call_function_with_memory_pool(): arr = pa.array(["foo", "bar", "baz"]) indices = np.array([2, 2, 1]) @@ -1172,7 +1177,7 @@ def test_take_on_chunked_array(): ] ]) - indices = np.array([0, 5, 1, 6, 9, 2]) + indices = pa.array([0, 5, 1, 6, 9, 2]) result = arr.take(indices) expected = pa.chunked_array([["a", "f", "b", "g", "j", "c"]]) assert result.equals(expected) @@ -1304,12 +1309,6 @@ def test_filter(ty, values): result.validate() assert result.equals(pa.array([values[0], values[3], None], type=ty)) - # same test with different array type - mask = np.array([True, False, False, True, None]) - result = arr.filter(mask, null_selection_behavior='drop') - result.validate() - assert result.equals(pa.array([values[0], values[3]], type=ty)) - # non-boolean dtype mask = pa.array([0, 1, 0, 1, 0]) with pytest.raises(NotImplementedError): @@ -1321,6 +1320,17 @@ def test_filter(ty, values): arr.filter(mask) +@pytest.mark.numpy +@pytest.mark.parametrize(('ty', 'values'), all_array_types) +def test_filter_numpy_array_mask(ty, values): + arr = pa.array(values, type=ty) + # same test as test_filter with different array type + mask = np.array([True, False, False, True, None]) + result = arr.filter(mask, null_selection_behavior='drop') + result.validate() + assert result.equals(pa.array([values[0], values[3]], type=ty)) + + def test_filter_chunked_array(): arr = pa.chunked_array([["a", None], ["c", "d", "e"]]) expected_drop = pa.chunked_array([["a"], ["e"]]) @@ -1586,9 +1596,11 @@ def test_round_to_integer(ty): for round_mode, expected in rmode_and_expected.items(): options = RoundOptions(round_mode=round_mode) result = round(values, options=options) - np.testing.assert_array_equal(result, pa.array(expected)) + expected_array = pa.array(expected, type=pa.float64()) + assert expected_array.equals(result) +@pytest.mark.numpy def test_round(): values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None] ndigits_and_expected = { @@ -1607,6 +1619,7 @@ def test_round(): assert pc.round(values, ndigits, "half_towards_infinity") == result +@pytest.mark.numpy def test_round_to_multiple(): values = [320, 3.5, 3.075, 4.5, -3.212, -35.1234, -3.045, None] multiple_and_expected = { @@ -1670,7 +1683,7 @@ def test_is_null(): expected = pa.chunked_array([[True, True], [True, False]]) assert result.equals(expected) - arr = pa.array([1, 2, 3, None, np.nan]) + arr = pa.array([1, 2, 3, None, float("nan")]) result = arr.is_null() expected = pa.array([False, False, False, True, False]) assert result.equals(expected) @@ -1681,7 +1694,7 @@ def test_is_null(): def test_is_nan(): - arr = pa.array([1, 2, 3, None, np.nan]) + arr = pa.array([1, 2, 3, None, float("nan")]) result = arr.is_nan() expected = pa.array([False, False, False, None, True]) assert result.equals(expected) @@ -1986,6 +1999,7 @@ def check_cast_float_to_decimal(float_ty, float_val, decimal_ty, decimal_ctx, # Cannot test float32 as case generators above assume float64 +@pytest.mark.numpy @pytest.mark.parametrize('float_ty', [pa.float64()], ids=str) @pytest.mark.parametrize('decimal_ty', decimal_type_traits, ids=lambda v: v.name) @@ -2003,6 +2017,7 @@ def test_cast_float_to_decimal(float_ty, decimal_ty, case_generator): ctx, decimal_ty.max_precision) +@pytest.mark.numpy @pytest.mark.parametrize('float_ty', [pa.float32(), 
pa.float64()], ids=str) @pytest.mark.parametrize('decimal_traits', decimal_type_traits, ids=lambda v: v.name) @@ -2908,6 +2923,7 @@ def test_min_max_element_wise(): assert result == pa.array([1, 2, None]) +@pytest.mark.numpy @pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_sum(start, skip_nulls): @@ -2962,6 +2978,7 @@ def test_cumulative_sum(start, skip_nulls): pc.cumulative_sum([1, 2, 3], start=strt) +@pytest.mark.numpy @pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_prod(start, skip_nulls): @@ -3016,6 +3033,7 @@ def test_cumulative_prod(start, skip_nulls): pc.cumulative_prod([1, 2, 3], start=strt) +@pytest.mark.numpy @pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_max(start, skip_nulls): @@ -3073,6 +3091,7 @@ def test_cumulative_max(start, skip_nulls): pc.cumulative_max([1, 2, 3], start=strt) +@pytest.mark.numpy @pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) @pytest.mark.parametrize('skip_nulls', (True, False)) def test_cumulative_min(start, skip_nulls): @@ -3407,6 +3426,7 @@ def create_sample_expressions(): # Tests the Arrow-specific serialization mechanism +@pytest.mark.numpy def test_expression_serialization_arrow(pickle_module): for expr in create_sample_expressions()["all"]: assert isinstance(expr, pc.Expression) @@ -3414,6 +3434,7 @@ def test_expression_serialization_arrow(pickle_module): assert expr.equals(restored) +@pytest.mark.numpy @pytest.mark.substrait def test_expression_serialization_substrait(): diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 6140163a8ee8c..c3589877e6423 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -23,8 +23,11 @@ import re import hypothesis as h -import numpy as np import pytest +try: + import numpy as np +except ImportError: + np = None from pyarrow.pandas_compat import _pandas_api # noqa import pyarrow as pa @@ -32,17 +35,17 @@ int_type_pairs = [ - (np.int8, pa.int8()), - (np.int16, pa.int16()), - (np.int32, pa.int32()), - (np.int64, pa.int64()), - (np.uint8, pa.uint8()), - (np.uint16, pa.uint16()), - (np.uint32, pa.uint32()), - (np.uint64, pa.uint64())] + ("int8", pa.int8()), + ("int16", pa.int16()), + ("int32", pa.int32()), + ("int64", pa.int64()), + ("uint8", pa.uint8()), + ("uint16", pa.uint16()), + ("uint32", pa.uint32()), + ("uint64", pa.uint64())] -np_int_types, pa_int_types = zip(*int_type_pairs) +np_str_int_types, pa_int_types = zip(*int_type_pairs) class StrangeIterable: @@ -174,7 +177,9 @@ def _as_set(xs): return set(xs) -SEQUENCE_TYPES = [_as_list, _as_tuple, _as_numpy_array] +SEQUENCE_TYPES = [_as_list, _as_tuple] +if np is not None: + SEQUENCE_TYPES.append(_as_numpy_array) ITERABLE_TYPES = [_as_set, _as_dict_values] + SEQUENCE_TYPES COLLECTIONS_TYPES = [_as_deque] + ITERABLE_TYPES @@ -217,6 +222,7 @@ def test_sequence_boolean(seq): assert arr.to_pylist() == expected +@pytest.mark.numpy @parametrize_with_sequence_types def test_sequence_numpy_boolean(seq): expected = [np.bool_(True), None, np.bool_(False), None] @@ -225,6 +231,7 @@ def test_sequence_numpy_boolean(seq): assert arr.to_pylist() == [True, None, False, None] +@pytest.mark.numpy @parametrize_with_sequence_types def test_sequence_mixed_numpy_python_bools(seq): values = np.array([True, False]) @@ -278,11 +285,14 @@ 
def test_list_with_non_list(seq): @parametrize_with_sequence_types +@pytest.mark.parametrize( + "inner_seq", SEQUENCE_TYPES +) @pytest.mark.parametrize("factory", [ pa.list_, pa.large_list, pa.list_view, pa.large_list_view]) -def test_nested_arrays(seq, factory): - arr = pa.array(seq([np.array([], dtype=np.int64), - np.array([1, 2], dtype=np.int64), None]), +def test_nested_arrays(seq, inner_seq, factory): + arr = pa.array(seq([inner_seq([]), + inner_seq([1, 2]), None]), type=factory(pa.int64())) assert len(arr) == 3 assert arr.null_count == 1 @@ -290,6 +300,7 @@ def test_nested_arrays(seq, factory): assert arr.to_pylist() == [[], [1, 2], None] +@pytest.mark.numpy @parametrize_with_sequence_types def test_nested_fixed_size_list(seq): # sequence of lists @@ -334,10 +345,12 @@ def test_sequence_all_none(seq): assert arr.to_pylist() == [None, None] +@pytest.mark.numpy @parametrize_with_sequence_types @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs) def test_sequence_integer(seq, np_scalar_pa_type): - np_scalar, pa_type = np_scalar_pa_type + np_str_scalar, pa_type = np_scalar_pa_type + np_scalar = getattr(np, np_str_scalar) expected = [1, None, 3, None, np.iinfo(np_scalar).min, np.iinfo(np_scalar).max] arr = pa.array(seq(expected), type=pa_type) @@ -347,12 +360,12 @@ def test_sequence_integer(seq, np_scalar_pa_type): assert arr.to_pylist() == expected +@pytest.mark.numpy @parametrize_with_collections_types -@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs) -def test_sequence_integer_np_nan(seq, np_scalar_pa_type): +@pytest.mark.parametrize("pa_type", pa_int_types) +def test_sequence_integer_np_nan(seq, pa_type): # ARROW-2806: numpy.nan is a double value and thus should produce # a double array. - _, pa_type = np_scalar_pa_type with pytest.raises(ValueError): pa.array(seq([np.nan]), type=pa_type, from_pandas=False) @@ -364,12 +377,12 @@ def test_sequence_integer_np_nan(seq, np_scalar_pa_type): assert arr.to_pylist() == expected +@pytest.mark.numpy @parametrize_with_sequence_types -@pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs) -def test_sequence_integer_nested_np_nan(seq, np_scalar_pa_type): +@pytest.mark.parametrize("pa_type", pa_int_types) +def test_sequence_integer_nested_np_nan(seq, pa_type): # ARROW-2806: numpy.nan is a double value and thus should produce # a double array. 
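# Standalone sketch of the convention used in these conversion tests (test
# name illustrative): @pytest.mark.parametrize arguments are evaluated at
# collection time, so NumPy scalar types become dtype-name strings and are
# resolved with getattr(np, name) only inside the numpy-marked test body.
import pytest

try:
    import numpy as np
except ImportError:
    np = None


@pytest.mark.numpy
@pytest.mark.parametrize("name", ["int8", "uint16", "int64"])
def test_integer_scalar_roundtrip(name):
    # Runs only when NumPy is installed; otherwise the test is
    # deselected via -m 'not numpy'.
    np_scalar = getattr(np, name)
    assert int(np_scalar(42)) == 42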
- _, pa_type = np_scalar_pa_type with pytest.raises(ValueError): pa.array(seq([[np.nan]]), type=pa.list_(pa_type), from_pandas=False) @@ -391,10 +404,12 @@ def test_sequence_integer_inferred(seq): assert arr.to_pylist() == expected +@pytest.mark.numpy @parametrize_with_sequence_types @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs) def test_sequence_numpy_integer(seq, np_scalar_pa_type): - np_scalar, pa_type = np_scalar_pa_type + np_str_scalar, pa_type = np_scalar_pa_type + np_scalar = getattr(np, np_str_scalar) expected = [np_scalar(1), None, np_scalar(3), None, np_scalar(np.iinfo(np_scalar).min), np_scalar(np.iinfo(np_scalar).max)] @@ -405,10 +420,12 @@ def test_sequence_numpy_integer(seq, np_scalar_pa_type): assert arr.to_pylist() == expected +@pytest.mark.numpy @parametrize_with_sequence_types @pytest.mark.parametrize("np_scalar_pa_type", int_type_pairs) def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type): - np_scalar, pa_type = np_scalar_pa_type + np_str_scalar, pa_type = np_scalar_pa_type + np_scalar = getattr(np, np_str_scalar) expected = [np_scalar(1), None, np_scalar(3), None] expected += [np_scalar(np.iinfo(np_scalar).min), np_scalar(np.iinfo(np_scalar).max)] @@ -434,6 +451,7 @@ def test_broken_integers(seq): pa.array(seq(data), type=pa.int64()) +@pytest.mark.numpy def test_numpy_scalars_mixed_type(): # ARROW-4324 data = [np.int32(10), np.float32(0.5)] @@ -448,6 +466,7 @@ def test_numpy_scalars_mixed_type(): assert arr.equals(expected) +@pytest.mark.numpy @pytest.mark.xfail(reason="Type inference for uint64 not implemented", raises=OverflowError) def test_uint64_max_convert(): @@ -491,7 +510,7 @@ def test_integer_from_string_error(seq, typ): def test_convert_with_mask(): data = [1, 2, 3, 4, 5] - mask = np.array([False, True, False, False, True]) + mask = [False, True, False, False, True] result = pa.array(data, mask=mask) expected = pa.array([1, None, 3, 4, None]) @@ -559,6 +578,7 @@ def test_double_integer_coerce_representable_range(): pa.array(invalid_values2) +@pytest.mark.numpy def test_float32_integer_coerce_representable_range(): f32 = np.float32 valid_values = [f32(1.5), 1 << 24, -(1 << 24)] @@ -587,14 +607,16 @@ def test_mixed_sequence_errors(): pa.array([1.5, 'foo']) +@pytest.mark.numpy @parametrize_with_sequence_types -@pytest.mark.parametrize("np_scalar,pa_type", [ - (np.float16, pa.float16()), - (np.float32, pa.float32()), - (np.float64, pa.float64()) +@pytest.mark.parametrize("np_str_scalar,pa_type", [ + ("float16", pa.float16()), + ("float32", pa.float32()), + ("float64", pa.float64()) ]) @pytest.mark.parametrize("from_pandas", [True, False]) -def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas): +def test_sequence_numpy_double(seq, np_str_scalar, pa_type, from_pandas): + np_scalar = getattr(np, np_str_scalar) data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan] arr = pa.array(seq(data), from_pandas=from_pandas) assert len(arr) == 6 @@ -616,27 +638,29 @@ def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas): assert np.isnan(arr.to_pylist()[5]) +@pytest.mark.numpy @pytest.mark.parametrize("from_pandas", [True, False]) -@pytest.mark.parametrize("inner_seq", [np.array, list]) -def test_ndarray_nested_numpy_double(from_pandas, inner_seq): +def test_ndarray_nested_numpy_double(from_pandas): # ARROW-2806 - data = np.array([ - inner_seq([1., 2.]), - inner_seq([1., 2., 3.]), - inner_seq([np.nan]), - None - ], dtype=object) - arr = pa.array(data, from_pandas=from_pandas) - assert len(arr) == 4 - 
assert arr.null_count == 1 - assert arr.type == pa.list_(pa.float64()) - if from_pandas: - assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None] - else: - np.testing.assert_equal(arr.to_pylist(), - [[1., 2.], [1., 2., 3.], [np.nan], None]) + for inner_seq in (np.array, list): + data = np.array([ + inner_seq([1., 2.]), + inner_seq([1., 2., 3.]), + inner_seq([np.nan]), + None + ], dtype=object) + arr = pa.array(data, from_pandas=from_pandas) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == pa.list_(pa.float64()) + if from_pandas: + assert arr.to_pylist() == [[1.0, 2.0], [1.0, 2.0, 3.0], [None], None] + else: + np.testing.assert_equal(arr.to_pylist(), + [[1., 2.], [1., 2., 3.], [np.nan], None]) +@pytest.mark.numpy def test_nested_ndarray_in_object_array(): # ARROW-4350 arr = np.empty(2, dtype=object) @@ -664,6 +688,7 @@ def test_nested_ndarray_in_object_array(): assert result.to_pylist() == [[[1], [2]], [[1], [2]]] +@pytest.mark.numpy @pytest.mark.xfail(reason=("Type inference for multidimensional ndarray " "not yet implemented"), raises=AssertionError) @@ -682,6 +707,7 @@ def test_multidimensional_ndarray_as_nested_list(): assert result.equals(expected) +@pytest.mark.numpy @pytest.mark.parametrize(('data', 'value_type'), [ ([True, False], pa.bool_()), ([None, None], pa.null()), @@ -711,6 +737,7 @@ def test_list_array_from_object_ndarray(data, value_type): assert arr.to_pylist() == [data] +@pytest.mark.numpy @pytest.mark.parametrize(('data', 'value_type'), [ ([[1, 2], [3]], pa.list_(pa.int64())), ([[1, 2], [3, 4]], pa.list_(pa.int64(), 2)), @@ -730,13 +757,14 @@ def test_array_ignore_nan_from_pandas(): # See ARROW-4324, this reverts logic that was introduced in # ARROW-2240 with pytest.raises(ValueError): - pa.array([np.nan, 'str']) + pa.array([float("nan"), 'str']) - arr = pa.array([np.nan, 'str'], from_pandas=True) + arr = pa.array([float("nan"), 'str'], from_pandas=True) expected = pa.array([None, 'str']) assert arr.equals(expected) +@pytest.mark.numpy def test_nested_ndarray_different_dtypes(): data = [ np.array([1, 2, 3], dtype='int64'), @@ -1238,6 +1266,7 @@ def test_sequence_timestamp_out_of_bounds_nanosecond(): assert arr.to_pylist()[0] == datetime.datetime(2262, 4, 12) +@pytest.mark.numpy def test_sequence_numpy_timestamp(): data = [ np.datetime64(datetime.datetime(2007, 7, 13, 1, 23, 34, 123456)), @@ -1407,14 +1436,25 @@ class CustomClass(): pa.array([1, CustomClass()], type=ty) -@pytest.mark.parametrize('np_scalar', [True, False]) -def test_sequence_duration(np_scalar): +def test_sequence_duration(): td1 = datetime.timedelta(2, 3601, 1) td2 = datetime.timedelta(1, 100, 1000) - if np_scalar: - data = [np.timedelta64(td1), None, np.timedelta64(td2)] - else: - data = [td1, None, td2] + data = [td1, None, td2] + + arr = pa.array(data) + assert len(arr) == 3 + assert arr.type == pa.duration('us') + assert arr.null_count == 1 + assert arr[0].as_py() == td1 + assert arr[1].as_py() is None + assert arr[2].as_py() == td2 + + +@pytest.mark.numpy +def test_sequence_duration_np_scalar(): + td1 = datetime.timedelta(2, 3601, 1) + td2 = datetime.timedelta(1, 100, 1000) + data = [np.timedelta64(td1), None, np.timedelta64(td2)] arr = pa.array(data) assert len(arr) == 3 @@ -1480,6 +1520,7 @@ def test_sequence_duration_nested_lists_with_explicit_type(factory): assert arr.to_pylist() == data +@pytest.mark.numpy def test_sequence_duration_nested_lists_numpy(): td1 = datetime.timedelta(1, 1, 1000) td2 = datetime.timedelta(1, 100) @@ -1769,6 +1810,7 @@ def 
test_struct_from_dicts_bytes_keys(): ] +@pytest.mark.numpy def test_struct_from_tuples(): ty = pa.struct([pa.field('a', pa.int32()), pa.field('b', pa.string()), @@ -1915,6 +1957,7 @@ def test_struct_from_mixed_sequence(): pa.array(data, type=ty) +@pytest.mark.numpy def test_struct_from_dicts_inference(): expected_type = pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string()), @@ -1992,7 +2035,7 @@ def test_structarray_from_arrays_coerce(): def test_decimal_array_with_none_and_nan(): - values = [decimal.Decimal('1.234'), None, np.nan, decimal.Decimal('nan')] + values = [decimal.Decimal('1.234'), None, float("nan"), decimal.Decimal('nan')] with pytest.raises(TypeError): # ARROW-6227: Without from_pandas=True, NaN is considered a float @@ -2215,6 +2258,7 @@ def test_roundtrip_nanosecond_resolution_pandas_temporal_objects(): ] +@pytest.mark.numpy @h.given(past.all_arrays) def test_array_to_pylist_roundtrip(arr): seq = arr.to_pylist() @@ -2498,6 +2542,7 @@ def test_array_accepts_pyarrow_scalar(seq, data, scalar_data, value_type): assert expect.equals(result) +@pytest.mark.numpy @parametrize_with_collections_types def test_array_accepts_pyarrow_scalar_errors(seq): sequence = seq([pa.scalar(1), pa.scalar("a"), pa.scalar(3.0)]) diff --git a/python/pyarrow/tests/test_cpp_internals.py b/python/pyarrow/tests/test_cpp_internals.py index 83800b77f894b..7508d8f0b9816 100644 --- a/python/pyarrow/tests/test_cpp_internals.py +++ b/python/pyarrow/tests/test_cpp_internals.py @@ -18,6 +18,8 @@ import os.path from os.path import join as pjoin +import pytest + from pyarrow._pyarrow_cpp_tests import get_cpp_tests @@ -26,10 +28,16 @@ def inject_cpp_tests(ns): Inject C++ tests as Python functions into namespace `ns` (a dict). """ for case in get_cpp_tests(): + def wrapper(case=case): case() wrapper.__name__ = wrapper.__qualname__ = case.name wrapper.__module__ = ns['__name__'] + # Add numpy or pandas marks if the test requires it + if 'numpy' in case.name: + wrapper = pytest.mark.numpy(wrapper) + elif 'pandas' in case.name: + wrapper = pytest.mark.pandas(wrapper) ns[case.name] = wrapper diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 112129d9602ed..dcf96f68c4da7 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -24,6 +24,7 @@ import io import itertools import os +import random import select import shutil import signal @@ -36,8 +37,6 @@ import pytest -import numpy as np - import pyarrow as pa from pyarrow.csv import ( open_csv, read_csv, ReadOptions, ParseOptions, ConvertOptions, ISO8601, @@ -54,18 +53,32 @@ def generate_col_names(): yield first + second +def split_rows(arr, num_cols, num_rows): + # Split a num_cols x num_rows array into rows + for i in range(0, num_rows * num_cols, num_cols): + yield arr[i:i + num_cols] + + +def split_columns(arr, num_cols, num_rows): + # Split a num_cols x num_rows array into columns + for i in range(0, num_cols): + yield arr[i::num_cols] + + def make_random_csv(num_cols=2, num_rows=10, linesep='\r\n', write_names=True): - arr = np.random.RandomState(42).randint(0, 1000, size=(num_cols, num_rows)) + rnd = random.Random(42) + arr = [rnd.randint(0, 1000) for _ in range(num_cols * num_rows)] csv = io.StringIO() col_names = list(itertools.islice(generate_col_names(), num_cols)) if write_names: csv.write(",".join(col_names)) csv.write(linesep) - for row in arr.T: + for row in split_rows(arr, num_cols, num_rows): csv.write(",".join(map(str, row))) csv.write(linesep) csv = csv.getvalue().encode() - 
columns = [pa.array(a, type=pa.int64()) for a in arr] + columns = [pa.array(row, type=pa.int64()) + for row in split_columns(arr, num_cols, num_rows)] expected = pa.Table.from_arrays(columns, col_names) return csv, expected @@ -127,6 +140,25 @@ def __ne__(self, other): other.result != self.result) +def test_split_rows_and_columns_utility(): + num_cols = 5 + num_rows = 2 + arr = [x for x in range(1, 11)] + rows = list(split_rows(arr, num_cols, num_rows)) + assert rows == [ + [1, 2, 3, 4, 5], + [6, 7, 8, 9, 10] + ] + columns = list(split_columns(arr, num_cols, num_rows)) + assert columns == [ + [1, 6], + [2, 7], + [3, 8], + [4, 9], + [5, 10] + ] + + def test_read_options(pickle_module): cls = ReadOptions opts = cls() @@ -520,6 +552,7 @@ def test_skip_rows_after_names(self): assert (values[opts.skip_rows + opts.skip_rows_after_names:] == table_dict[name]) + @pytest.mark.numpy def test_row_number_offset_in_errors(self): # Row numbers are only correctly counted in serial reads def format_msg(msg_format, row, *args): @@ -1802,6 +1835,7 @@ def test_header_skip_rows(self): with pytest.raises(StopIteration): assert reader.read_next_batch() + @pytest.mark.numpy def test_skip_rows_after_names(self): super().test_skip_rows_after_names() diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index d55be651b1571..a71fa036503d7 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -26,7 +26,10 @@ import pytest import pyarrow as pa -import numpy as np +try: + import numpy as np +except ImportError: + pytestmark = pytest.mark.numpy cuda = pytest.importorskip("pyarrow.cuda") diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py index ff1722d278d5e..876f3c7f761cf 100644 --- a/python/pyarrow/tests/test_cuda_numba_interop.py +++ b/python/pyarrow/tests/test_cuda_numba_interop.py @@ -17,7 +17,10 @@ import pytest import pyarrow as pa -import numpy as np +try: + import numpy as np +except ImportError: + pytestmark = pytest.mark.numpy dtypes = ['uint8', 'int16', 'float32'] cuda = pytest.importorskip("pyarrow.cuda") diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py index 0eeae5d65f7d5..937d927f831b0 100644 --- a/python/pyarrow/tests/test_cython.py +++ b/python/pyarrow/tests/test_cython.py @@ -80,6 +80,9 @@ def check_cython_example_module(mod): mod.cast_scalar(scal, pa.list_(pa.int64())) +# NumPy is still a required build dependency: it is present in our +# headers and is needed to build the Cython tests.
+@pytest.mark.numpy @pytest.mark.cython def test_cython_api(tmpdir): """ @@ -162,6 +165,7 @@ def test_cython_api(tmpdir): env=subprocess_env) +@pytest.mark.numpy @pytest.mark.cython def test_visit_strings(tmpdir): with tmpdir.as_cwd(): diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3b0284bcb74a6..276cd2e78db37 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -20,6 +20,7 @@ import os import pathlib import posixpath +import random import sys import tempfile import textwrap @@ -28,7 +29,10 @@ from shutil import copytree from urllib.parse import quote -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pytest import pyarrow as pa @@ -684,8 +688,8 @@ def test_partitioning(): # test partitioning roundtrip table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10))], + pa.array(range(20)), pa.array(random.random() for _ in range(20)), + pa.array(['a'] * 10 + ['b'] * 10)], names=["f1", "f2", "part"] ) partitioning_schema = pa.schema([("part", pa.string())]) @@ -2494,7 +2498,7 @@ def _create_partitioned_dataset(basedir): pq.write_table(table.slice(3*i, 3), part / "test.parquet") full_table = table.append_column( - "part", pa.array(np.repeat([0, 1, 2], 3), type=pa.int32())) + "part", pa.array([0] * 3 + [1] * 3 + [2] * 3, type=pa.int32())) return full_table, path @@ -2532,7 +2536,7 @@ def test_open_dataset_partitioned_directory(tempdir, dataset_reader, pickle_modu result = dataset.to_table() expected = table.append_column( - "part", pa.array(np.repeat([0, 1, 2], 3), type=pa.int8())) + "part", pa.array([0] * 3 + [1] * 3 + [2] * 3, type=pa.int8())) assert result.equals(expected) @@ -3567,7 +3571,7 @@ def _create_parquet_dataset_simple(root_path): metadata_collector = [] for i in range(4): - table = pa.table({'f1': [i] * 10, 'f2': np.random.randn(10)}) + table = pa.table({'f1': [i] * 10, 'f2': [random.random() for _ in range(10)]}) pq.write_to_dataset( table, str(root_path), metadata_collector=metadata_collector ) @@ -4255,7 +4259,7 @@ def compare_tables_ignoring_order(t1, t2): def _generate_random_int_array(size=4, min=1, max=10): - return np.random.randint(min, max, size) + return [random.randint(min, max) for _ in range(size)] def _generate_data_and_columns(num_of_columns, num_of_records): @@ -4513,8 +4517,8 @@ def file_visitor(written_file): def test_write_table(tempdir): table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) + pa.array(range(20)), pa.array(random.random() for _ in range(20)), + pa.array(['a'] * 10 + ['b'] * 10) ], names=["f1", "f2", "part"]) base_dir = tempdir / 'single' @@ -4560,8 +4564,8 @@ def file_visitor(written_file): def test_write_table_multiple_fragments(tempdir): table = pa.table([ - pa.array(range(10)), pa.array(np.random.randn(10)), - pa.array(np.repeat(['a', 'b'], 5)) + pa.array(range(10)), pa.array(random.random() for _ in range(10)), + pa.array(['a'] * 5 + ['b'] * 5) ], names=["f1", "f2", "part"]) table = pa.concat_tables([table]*2) @@ -4596,8 +4600,8 @@ def test_write_table_multiple_fragments(tempdir): def test_write_iterable(tempdir): table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) + pa.array(range(20)), pa.array(random.random() for _ in range(20)), + pa.array(['a'] * 10 + ['b'] * 10) ], names=["f1", "f2", "part"]) base_dir = tempdir / 'inmemory_iterable' @@ -4618,8 
+4622,8 @@ def test_write_iterable(tempdir): def test_write_scanner(tempdir, dataset_reader): table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) + pa.array(range(20)), pa.array(random.random() for _ in range(20)), + pa.array(['a'] * 10 + ['b'] * 10) ], names=["f1", "f2", "part"]) dataset = ds.dataset(table) @@ -4647,7 +4651,7 @@ def test_write_table_partitioned_dict(tempdir): # specifying the dictionary values explicitly table = pa.table([ pa.array(range(20)), - pa.array(np.repeat(['a', 'b'], 10)).dictionary_encode(), + pa.array(['a'] * 10 + ['b'] * 10).dictionary_encode(), ], names=['col', 'part']) partitioning = ds.partitioning(table.select(["part"]).schema) @@ -4666,6 +4670,7 @@ def test_write_table_partitioned_dict(tempdir): assert result.equals(table) +@pytest.mark.numpy @pytest.mark.parquet def test_write_dataset_parquet(tempdir): table = pa.table([ @@ -4712,8 +4717,8 @@ def test_write_dataset_parquet(tempdir): def test_write_dataset_csv(tempdir): table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) + pa.array(range(20)), pa.array(random.random() for _ in range(20)), + pa.array(['a'] * 10 + ['b'] * 10) ], names=["f1", "f2", "chr1"]) base_dir = tempdir / 'csv_dataset' @@ -4739,8 +4744,8 @@ def test_write_dataset_csv(tempdir): @pytest.mark.parquet def test_write_dataset_parquet_file_visitor(tempdir): table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10)) + pa.array(range(20)), pa.array(random.random() for _ in range(20)), + pa.array(['a'] * 10 + ['b'] * 10) ], names=["f1", "f2", "part"]) visitor_called = False @@ -4763,7 +4768,7 @@ def test_partition_dataset_parquet_file_visitor(tempdir): f1_vals = [item for chunk in range(4) for item in [chunk] * 10] f2_vals = [item*10 for chunk in range(4) for item in [chunk] * 10] table = pa.table({'f1': f1_vals, 'f2': f2_vals, - 'part': np.repeat(['a', 'b'], 20)}) + 'part': ['a'] * 20 + ['b'] * 20}) root_path = tempdir / 'partitioned' partitioning = ds.partitioning( @@ -4841,8 +4846,8 @@ def test_write_dataset_s3(s3_example_simple): ) table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10))], + pa.array(range(20)), pa.array(random.random() for _ in range(20)), + pa.array(['a'] * 10 + ['b'] * 10)], names=["f1", "f2", "part"] ) part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") @@ -4918,8 +4923,8 @@ def test_write_dataset_s3_put_only(s3_server): _configure_s3_limited_user(s3_server, _minio_put_only_policy) table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), - pa.array(np.repeat(['a', 'b'], 10))], + pa.array(range(20)), pa.array(random.random() for _ in range(20)), + pa.array(['a']*10 + ['b'] * 10)], names=["f1", "f2", "part"] ) part = ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive") diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index 0d8b4a152ab9f..eb79121b1cdbe 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py @@ -17,7 +17,7 @@ import base64 from datetime import timedelta -import numpy as np +import random import pyarrow.fs as fs import pyarrow as pa @@ -187,7 +187,10 @@ def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes: row_count = 2**15 + 1 table = pa.Table.from_arrays( - [pa.array(np.random.rand(row_count), 
type=pa.float32())], names=["foo"] + [pa.array( + [random.random() for _ in range(row_count)], + type=pa.float32() + )], names=["foo"] ) kms_config = pe.KmsConnectionConfig() diff --git a/python/pyarrow/tests/test_dlpack.py b/python/pyarrow/tests/test_dlpack.py index 7cf3f4acdbd40..a18accb1e21df 100644 --- a/python/pyarrow/tests/test_dlpack.py +++ b/python/pyarrow/tests/test_dlpack.py @@ -19,12 +19,20 @@ from functools import wraps import pytest -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pyarrow as pa from pyarrow.vendored.version import Version +# Marks all of the tests in this module +# Ignore these with pytest ... -m 'not numpy' +pytestmark = pytest.mark.numpy + + def PyCapsule_IsValid(capsule, name): return ctypes.pythonapi.PyCapsule_IsValid(ctypes.py_object(capsule), name) == 1 @@ -52,45 +60,45 @@ def wrapper(*args, **kwargs): @check_bytes_allocated @pytest.mark.parametrize( - ('value_type', 'np_type'), + ('value_type', 'np_type_str'), [ - (pa.uint8(), np.uint8), - (pa.uint16(), np.uint16), - (pa.uint32(), np.uint32), - (pa.uint64(), np.uint64), - (pa.int8(), np.int8), - (pa.int16(), np.int16), - (pa.int32(), np.int32), - (pa.int64(), np.int64), - (pa.float16(), np.float16), - (pa.float32(), np.float32), - (pa.float64(), np.float64), + (pa.uint8(), "uint8"), + (pa.uint16(), "uint16"), + (pa.uint32(), "uint32"), + (pa.uint64(), "uint64"), + (pa.int8(), "int8"), + (pa.int16(), "int16"), + (pa.int32(), "int32"), + (pa.int64(), "int64"), + (pa.float16(), "float16"), + (pa.float32(), "float32"), + (pa.float64(), "float64"), ] ) -def test_dlpack(value_type, np_type): +def test_dlpack(value_type, np_type_str): if Version(np.__version__) < Version("1.24.0"): pytest.skip("No dlpack support in numpy versions older than 1.22.0, " "strict keyword in assert_array_equal added in numpy version " "1.24.0") - expected = np.array([1, 2, 3], dtype=np_type) + expected = np.array([1, 2, 3], dtype=np.dtype(np_type_str)) arr = pa.array(expected, type=value_type) check_dlpack_export(arr, expected) arr_sliced = arr.slice(1, 1) - expected = np.array([2], dtype=np_type) + expected = np.array([2], dtype=np.dtype(np_type_str)) check_dlpack_export(arr_sliced, expected) arr_sliced = arr.slice(0, 1) - expected = np.array([1], dtype=np_type) + expected = np.array([1], dtype=np.dtype(np_type_str)) check_dlpack_export(arr_sliced, expected) arr_sliced = arr.slice(1) - expected = np.array([2, 3], dtype=np_type) + expected = np.array([2, 3], dtype=np.dtype(np_type_str)) check_dlpack_export(arr_sliced, expected) arr_zero = pa.array([], type=value_type) - expected = np.array([], dtype=np_type) + expected = np.array([], dtype=np.dtype(np_type_str)) check_dlpack_export(arr_zero, expected) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index aacbd2cb6e756..b74eca75bdca9 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -23,12 +23,15 @@ from uuid import uuid4, UUID import sys -import numpy as np +import pytest +try: + import numpy as np +except ImportError: + np = None + import pyarrow as pa from pyarrow.vendored.version import Version -import pytest - @contextlib.contextmanager def registered_extension_type(ext_type): @@ -562,6 +565,7 @@ def test_ext_array_pickling(pickle_module): assert arr.storage.to_pylist() == [b"foo", b"bar"] +@pytest.mark.numpy def test_ext_array_conversion_to_numpy(): storage1 = pa.array([1, 2, 3], type=pa.int64()) storage2 = pa.array([b"123", 
b"456", b"789"], type=pa.binary(3)) @@ -619,6 +623,7 @@ def struct_w_ext_data(): return [sarr1, sarr2] +@pytest.mark.numpy def test_struct_w_ext_array_to_numpy(struct_w_ext_data): # ARROW-15291 # Check that we don't segfault when trying to build @@ -1233,6 +1238,7 @@ def test_parquet_extension_nested_in_extension(tmpdir): assert table == orig_table +@pytest.mark.numpy def test_to_numpy(): period_type = PeriodType('D') storage = pa.array([1, 2, 3, 4], pa.int64()) @@ -1285,7 +1291,11 @@ def test_empty_take(): (["cat", "dog", "horse"], LabelType) )) @pytest.mark.parametrize( - "into", ["to_numpy", pytest.param("to_pandas", marks=pytest.mark.pandas)]) + "into", [ + pytest.param("to_numpy", marks=pytest.mark.numpy), + pytest.param("to_pandas", marks=pytest.mark.pandas) + ] +) def test_extension_array_to_numpy_pandas(data, ty, into): storage = pa.array(data) ext_arr = pa.ExtensionArray.from_storage(ty(), storage) @@ -1301,6 +1311,7 @@ def test_extension_array_to_numpy_pandas(data, ty, into): assert np.array_equal(result, expected) +@pytest.mark.numpy def test_array_constructor(): ext_type = IntegerType() storage = pa.array([1, 2, 3], type=pa.int64()) @@ -1333,6 +1344,7 @@ def test_array_constructor_from_pandas(): assert result.equals(expected) +@pytest.mark.numpy @pytest.mark.cython def test_cpp_extension_in_python(tmpdir): from .test_cython import ( @@ -1430,38 +1442,45 @@ def test_tensor_type(): assert tensor_type.permutation is None -@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) -def test_tensor_class_methods(value_type): +@pytest.mark.numpy +@pytest.mark.parametrize("np_type_str", ("int8", "int64", "float32")) +def test_tensor_class_methods(np_type_str): from numpy.lib.stride_tricks import as_strided - arrow_type = pa.from_numpy_dtype(value_type) + arrow_type = pa.from_numpy_dtype(np.dtype(np_type_str)) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 3]) storage = pa.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], pa.list_(arrow_type, 6)) arr = pa.ExtensionArray.from_storage(tensor_type, storage) expected = np.array( - [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=value_type) + [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], + dtype=np.dtype(np_type_str) + ) np.testing.assert_array_equal(arr.to_tensor(), expected) np.testing.assert_array_equal(arr.to_numpy_ndarray(), expected) - expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=value_type) + expected = np.array([[[7, 8, 9], [10, 11, 12]]], dtype=np.dtype(np_type_str)) result = arr[1:].to_numpy_ndarray() np.testing.assert_array_equal(result, expected) values = [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]] - flat_arr = np.array(values[0], dtype=value_type) - bw = value_type.itemsize + flat_arr = np.array(values[0], dtype=np.dtype(np_type_str)) + bw = np.dtype(np_type_str).itemsize storage = pa.array(values, pa.list_(arrow_type, 12)) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 1, 2]) result = pa.ExtensionArray.from_storage(tensor_type, storage) expected = np.array( - [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], dtype=value_type) + [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]], + dtype=np.dtype(np_type_str) + ) np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) result = flat_arr.reshape(1, 2, 3, 2) expected = np.array( - [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], + dtype=np.dtype(np_type_str) + ) 
np.testing.assert_array_equal(result, expected) tensor_type = pa.fixed_shape_tensor(arrow_type, [2, 2, 3], permutation=[0, 2, 1]) @@ -1482,25 +1501,27 @@ def test_tensor_class_methods(value_type): assert result.to_tensor().strides == (12 * bw, 1 * bw, 6 * bw, 2 * bw) -@pytest.mark.parametrize("value_type", (np.int8(), np.int64(), np.float32())) -def test_tensor_array_from_numpy(value_type): +@pytest.mark.numpy +@pytest.mark.parametrize("np_type_str", ("int8", "int64", "float32")) +def test_tensor_array_from_numpy(np_type_str): from numpy.lib.stride_tricks import as_strided - arrow_type = pa.from_numpy_dtype(value_type) + arrow_type = pa.from_numpy_dtype(np.dtype(np_type_str)) arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], - dtype=value_type, order="C") + dtype=np.dtype(np_type_str), order="C") tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) assert isinstance(tensor_array_from_numpy.type, pa.FixedShapeTensorType) assert tensor_array_from_numpy.type.value_type == arrow_type assert tensor_array_from_numpy.type.shape == [2, 3] arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]], - dtype=value_type, order="F") + dtype=np.dtype(np_type_str), order="F") with pytest.raises(ValueError, match="First stride needs to be largest"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) - bw = value_type.itemsize + flat_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + dtype=np.dtype(np_type_str)) + bw = np.dtype(np_type_str).itemsize arr = flat_arr.reshape(1, 3, 4) tensor_array_from_numpy = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) @@ -1518,23 +1539,26 @@ def test_tensor_array_from_numpy(value_type): arr = flat_arr.reshape(1, 2, 3, 2) result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr) expected = np.array( - [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], dtype=value_type) + [[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]], + dtype=np.dtype(np_type_str) + ) np.testing.assert_array_equal(result.to_numpy_ndarray(), expected) - arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], dtype=value_type) + arr = np.array([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + dtype=np.dtype(np_type_str)) expected = arr[1:] result = pa.FixedShapeTensorArray.from_numpy_ndarray(arr)[1:].to_numpy_ndarray() np.testing.assert_array_equal(result, expected) - arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=value_type) + arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.dtype(np_type_str)) with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - arr = np.array(1, dtype=value_type) + arr = np.array(1, dtype=np.dtype(np_type_str)) with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr) - arr = np.array([], dtype=value_type) + arr = np.array([], dtype=np.dtype(np_type_str)) with pytest.raises(ValueError, match="Cannot convert 1D array or scalar to fixed"): pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((0))) @@ -1546,6 +1570,7 @@ def test_tensor_array_from_numpy(value_type): pa.FixedShapeTensorArray.from_numpy_ndarray(arr.reshape((3, 0, 2))) +@pytest.mark.numpy @pytest.mark.parametrize("tensor_type", ( pa.fixed_shape_tensor(pa.int8(), [2, 2, 3]), pa.fixed_shape_tensor(pa.int8(), [2, 2, 3], permutation=[0, 2, 1]), @@ -1801,6 +1826,7 @@ def 
test_bool8_to_bool_conversion(): assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr +@pytest.mark.numpy def test_bool8_to_numpy_conversion(): arr = pa.ExtensionArray.from_storage( pa.bool8(), @@ -1841,6 +1867,7 @@ def test_bool8_to_numpy_conversion(): assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address +@pytest.mark.numpy def test_bool8_from_numpy_conversion(): np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage( diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 0064006489088..18c8cd5b654e6 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -23,7 +23,10 @@ import hypothesis as h import hypothesis.strategies as st -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pyarrow as pa import pyarrow.tests.strategies as past @@ -135,6 +138,7 @@ def f(): pytest.raises(exc, f) +@pytest.mark.numpy def test_dataset(version): num_values = (100, 100) num_files = 5 @@ -354,6 +358,7 @@ def test_buffer_bounds_error(version): _check_arrow_roundtrip(table) +@pytest.mark.numpy def test_boolean_object_nulls(version): repeats = 100 table = pa.Table.from_arrays( @@ -540,6 +545,7 @@ def test_read_columns(version): columns=['boo', 'woo']) +@pytest.mark.numpy def test_overwritten_file(version): path = random_path() TEST_FILES.append(path) @@ -675,6 +681,7 @@ def test_v2_compression_options(): write_feather(df, buf, compression='snappy') +@pytest.mark.numpy def test_v2_lz4_default_compression(): # ARROW-8750: Make sure that the compression=None option selects lz4 if # it's available @@ -807,6 +814,7 @@ def test_nested_types(compression): _check_arrow_roundtrip(table, compression=compression) +@pytest.mark.numpy @h.given(past.all_tables, st.sampled_from(["uncompressed", "lz4", "zstd"])) def test_roundtrip(table, compression): _check_arrow_roundtrip(table, compression=compression) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 832c6a2dbdf9f..029a2695b9fd8 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -28,7 +28,10 @@ import traceback import json -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pytest import pyarrow as pa @@ -1588,6 +1591,7 @@ def test_flight_do_put_metadata(): assert idx == server_idx +@pytest.mark.numpy def test_flight_do_put_limit(): """Try a simple do_put call with a size limit.""" large_batch = pa.RecordBatch.from_arrays([ diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index ef499a3a8d76c..e2df1b1c46835 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -25,11 +25,15 @@ import os import pathlib import pytest +import random import sys import tempfile import weakref -import numpy as np +try: + import numpy as np +except ImportError: + np = None from pyarrow.util import guid from pyarrow import Codec @@ -464,6 +468,7 @@ def test_buffer_hex(val, expected_hex_buffer): assert buf.hex() == expected_hex_buffer +@pytest.mark.numpy def test_buffer_to_numpy(): # Make sure creating a numpy array from an arrow buffer works byte_array = bytearray(20) @@ -476,6 +481,7 @@ def test_buffer_to_numpy(): assert array.base == buf +@pytest.mark.numpy def test_buffer_from_numpy(): # C-contiguous arr = np.arange(12, dtype=np.int8).reshape((3, 4)) @@ -493,6 +499,7 @@ def test_buffer_from_numpy(): buf 
= pa.py_buffer(arr.T[::2]) +@pytest.mark.numpy def test_buffer_address(): b1 = b'some data!' b2 = bytearray(b1) @@ -513,6 +520,7 @@ def test_buffer_address(): assert buf.address == arr.ctypes.data +@pytest.mark.numpy def test_buffer_equals(): # Buffer.equals() returns true iff the buffers have the same contents def eq(a, b): @@ -624,6 +632,7 @@ def test_buffer_hashing(): hash(pa.py_buffer(b'123')) +@pytest.mark.numpy def test_buffer_protocol_respects_immutability(): # ARROW-3228; NumPy's frombuffer ctor determines whether a buffer-like # object is mutable by first attempting to get a mutable buffer using @@ -635,6 +644,7 @@ def test_buffer_protocol_respects_immutability(): assert not numpy_ref.flags.writeable +@pytest.mark.numpy def test_foreign_buffer(): obj = np.array([1, 2], dtype=np.int32) addr = obj.__array_interface__["data"][0] @@ -669,6 +679,7 @@ def test_allocate_buffer_resizable(): assert buf.size == 200 +@pytest.mark.numpy def test_non_cpu_buffer(pickle_module): cuda = pytest.importorskip("pyarrow.cuda") ctx = cuda.Context(0) @@ -798,6 +809,7 @@ def test_cache_options_pickling(pickle_module): assert pickle_module.loads(pickle_module.dumps(option)) == option +@pytest.mark.numpy @pytest.mark.parametrize("compression", [ pytest.param( "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) @@ -838,6 +850,7 @@ def test_compress_decompress(compression): pa.decompress(compressed_bytes, codec=compression) +@pytest.mark.numpy @pytest.mark.parametrize("compression", [ pytest.param( "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) @@ -996,6 +1009,7 @@ def make_buffer(bytes_obj): assert refcount_before == sys.getrefcount(val) +@pytest.mark.numpy def test_nativefile_write_memoryview(): f = pa.BufferOutputStream() data = b'ok' @@ -1058,8 +1072,8 @@ def test_mock_output_stream(): @pytest.fixture def sample_disk_data(request, tmpdir): SIZE = 4096 - arr = np.random.randint(0, 256, size=SIZE).astype('u1') - data = arr.tobytes()[:SIZE] + arr = [random.randint(0, 255) for _ in range(SIZE)] + data = bytes(arr[:SIZE]) path = os.path.join(str(tmpdir), guid()) @@ -1146,8 +1160,8 @@ def test_memory_map_writer(tmpdir): if sys.platform == "emscripten": pytest.xfail("Multiple memory maps to same file don't work on emscripten") SIZE = 4096 - arr = np.random.randint(0, 256, size=SIZE).astype('u1') - data = arr.tobytes()[:SIZE] + arr = [random.randint(0, 255) for _ in range(SIZE)] + data = bytes(arr[:SIZE]) path = os.path.join(str(tmpdir), guid()) with open(path, 'wb') as f: @@ -1187,9 +1201,9 @@ def test_memory_map_writer(tmpdir): def test_memory_map_resize(tmpdir): SIZE = 4096 - arr = np.random.randint(0, 256, size=SIZE).astype(np.uint8) - data1 = arr.tobytes()[:(SIZE // 2)] - data2 = arr.tobytes()[(SIZE // 2):] + arr = [random.randint(0, 255) for _ in range(SIZE)] + data1 = bytes(arr[:(SIZE // 2)]) + data2 = bytes(arr[(SIZE // 2):]) path = os.path.join(str(tmpdir), guid()) @@ -1202,7 +1216,7 @@ def test_memory_map_resize(tmpdir): mmap.close() with open(path, 'rb') as f: - assert f.read() == arr.tobytes() + assert f.read() == bytes(arr[:SIZE]) def test_memory_zero_length(tmpdir): @@ -1241,8 +1255,8 @@ def test_memory_map_deref_remove(tmpdir): def test_os_file_writer(tmpdir): SIZE = 4096 - arr = np.random.randint(0, 256, size=SIZE).astype('u1') - data = arr.tobytes()[:SIZE] + arr = [random.randint(0, 255) for _ in range(SIZE)] + data = bytes(arr[:SIZE]) path = os.path.join(str(tmpdir), guid()) with open(path, 'wb') as f: @@ -1523,6 +1537,7 @@ def 
test_buffered_input_stream_detach_non_seekable(): raw.seek(2) +@pytest.mark.numpy def test_buffered_output_stream(): np_buf = np.zeros(100, dtype=np.int8) # zero-initialized buffer buf = pa.py_buffer(np_buf) @@ -1540,6 +1555,7 @@ def test_buffered_output_stream(): assert np_buf[:10].tobytes() == b'123456789\0' +@pytest.mark.numpy def test_buffered_output_stream_detach(): np_buf = np.zeros(100, dtype=np.int8) # zero-initialized buffer buf = pa.py_buffer(np_buf) diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 1e5242efe40f0..4be5792a92f6d 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -20,11 +20,15 @@ import io import pathlib import pytest +import random import socket import threading import weakref -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pyarrow as pa from pyarrow.tests.util import changed_environ, invoke_script @@ -59,7 +63,7 @@ def write_batches(self, num_batches=5, as_table=False): batches = [] for i in range(num_batches): batch = pa.record_batch( - [np.random.randn(nrows), + [[random.random() for _ in range(nrows)], ['foo', None, 'bar', 'bazbaz', 'qux']], schema=schema) batches.append(batch) @@ -422,7 +426,7 @@ def test_stream_simple_roundtrip(stream_fixture, use_legacy_ipc_format): @pytest.mark.zstd def test_compression_roundtrip(): sink = io.BytesIO() - values = np.random.randint(0, 3, 10000) + values = [random.randint(0, 3) for _ in range(10000)] table = pa.Table.from_arrays([values], names=["values"]) options = pa.ipc.IpcWriteOptions(compression='zstd') diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index a0a6174266310..3bb4440e89750 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -23,7 +23,10 @@ import string import unittest -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pytest import pyarrow as pa @@ -297,6 +300,7 @@ def test_explicit_schema_with_unexpected_behaviour(self): match="JSON parse error: unexpected field"): self.read_bytes(rows, parse_options=opts) + @pytest.mark.numpy def test_small_random_json(self): data, expected = make_random_json(num_cols=2, num_rows=10) table = self.read_bytes(data) @@ -304,6 +308,7 @@ def test_small_random_json(self): assert table.equals(expected) assert table.to_pydict() == expected.to_pydict() + @pytest.mark.numpy def test_load_large_json(self): data, expected = make_random_json(num_cols=2, num_rows=100100) # set block size is 10MB @@ -312,6 +317,7 @@ def test_load_large_json(self): assert table.num_rows == 100100 assert expected.num_rows == 100100 + @pytest.mark.numpy def test_stress_block_sizes(self): # Test a number of small block sizes to stress block stitching data_base, expected = make_random_json(num_cols=2, num_rows=100) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 208812c3ac458..178a073ed59dc 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -27,9 +27,18 @@ import hypothesis as h import hypothesis.strategies as st -import numpy as np -import numpy.testing as npt import pytest +try: + import numpy as np + import numpy.testing as npt + try: + _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning + except AttributeError: + from numpy.exceptions import ( + VisibleDeprecationWarning as _np_VisibleDeprecationWarning + ) +except ImportError: + np = None from pyarrow.pandas_compat import get_logical_type, 
_pandas_api from pyarrow.tests.util import invoke_script, random_ascii, rands @@ -51,14 +60,6 @@ pass -try: - _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning -except AttributeError: - from numpy.exceptions import ( - VisibleDeprecationWarning as _np_VisibleDeprecationWarning - ) - - # Marks all of the tests in this module pytestmark = pytest.mark.pandas @@ -1202,9 +1203,11 @@ def test_datetime64_to_date32(self): @pytest.mark.parametrize('mask', [ None, - np.array([True, False, False, True, False, False]), + [True, False, False, True, False, False], ]) def test_pandas_datetime_to_date64(self, mask): + if mask: + mask = np.array(mask) s = pd.to_datetime([ '2018-05-10T00:00:00', '2018-05-11T00:00:00', @@ -1608,7 +1611,8 @@ def test_array_from_pandas_date_with_mask(self): assert pa.Array.from_pandas(expected).equals(result) @pytest.mark.skipif( - Version('1.16.0') <= Version(np.__version__) < Version('1.16.1'), + np is not None and Version('1.16.0') <= Version( + np.__version__) < Version('1.16.1'), reason='Until numpy/numpy#12745 is resolved') def test_fixed_offset_timezone(self): df = pd.DataFrame({ @@ -2921,23 +2925,23 @@ class TestConvertMisc: """ type_pairs = [ - (np.int8, pa.int8()), - (np.int16, pa.int16()), - (np.int32, pa.int32()), - (np.int64, pa.int64()), - (np.uint8, pa.uint8()), - (np.uint16, pa.uint16()), - (np.uint32, pa.uint32()), - (np.uint64, pa.uint64()), - (np.float16, pa.float16()), - (np.float32, pa.float32()), - (np.float64, pa.float64()), + ("int8", pa.int8()), + ("int16", pa.int16()), + ("int32", pa.int32()), + ("int64", pa.int64()), + ("uint8", pa.uint8()), + ("uint16", pa.uint16()), + ("uint32", pa.uint32()), + ("uint64", pa.uint64()), + ("float16", pa.float16()), + ("float32", pa.float32()), + ("float64", pa.float64()), # XXX unsupported # (np.dtype([('a', 'i2')]), pa.struct([pa.field('a', pa.int16())])), - (np.object_, pa.string()), - (np.object_, pa.binary()), - (np.object_, pa.binary(10)), - (np.object_, pa.list_(pa.int64())), + ("object", pa.string()), + ("object", pa.binary()), + ("object", pa.binary(10)), + ("object", pa.list_(pa.int64())), ] def test_all_none_objects(self): @@ -2950,8 +2954,8 @@ def test_all_none_category(self): _check_pandas_roundtrip(df) def test_empty_arrays(self): - for dtype, pa_type in self.type_pairs: - arr = np.array([], dtype=dtype) + for dtype_str, pa_type in self.type_pairs: + arr = np.array([], dtype=np.dtype(dtype_str)) _check_array_roundtrip(arr, type=pa_type) def test_non_threaded_conversion(self): diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index bc50697e1be17..3f4a53c473e7e 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -20,7 +20,10 @@ import pytest import weakref -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pyarrow as pa import pyarrow.compute as pc @@ -40,7 +43,6 @@ (1, pa.int64(), pa.Int64Scalar), (1, pa.uint64(), pa.UInt64Scalar), (1.0, None, pa.DoubleScalar), - (np.float16(1.0), pa.float16(), pa.HalfFloatScalar), (1.0, pa.float32(), pa.FloatScalar), (decimal.Decimal("1.123"), None, pa.Decimal128Scalar), (decimal.Decimal("1.1234567890123456789012345678901234567890"), @@ -98,6 +100,40 @@ def test_basics(value, ty, klass, pickle_module): assert wr() is None +# This test is a copy of test_basics but only for float16 (HalfFloatScalar), +# which currently requires a NumPy scalar to create it. Collecting the tests +# would fail if numpy appeared in the parametrization while not installed.
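# Standalone sketch of the failure mode avoided here (test name illustrative):
# with np = None, np.float16(1.0) inside a @pytest.mark.parametrize list would
# raise an AttributeError while pytest collects the module, before any mark
# could skip the test. Splitting the case into its own numpy-marked test keeps
# collection working on NumPy-free installs.
import pytest

@pytest.mark.numpy
def test_half_float_scalar():
    import numpy as np
    import pyarrow as pa
    s = pa.scalar(np.float16(1.0), type=pa.float16())
    assert isinstance(s, pa.HalfFloatScalar)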
+@pytest.mark.numpy +def test_basics_np_required(pickle_module): + value, ty, klass = np.float16(1.0), pa.float16(), pa.HalfFloatScalar + s = pa.scalar(value, type=ty) + s.validate() + s.validate(full=True) + assert isinstance(s, klass) + assert s.as_py() == value + assert s == pa.scalar(value, type=ty) + assert s != value + assert s != "else" + assert hash(s) == hash(s) + assert s.is_valid is True + assert s != None # noqa: E711 + + s = pa.scalar(None, type=s.type) + assert s.is_valid is False + assert s.as_py() is None + assert s != pa.scalar(value, type=ty) + + # test pickle roundtrip + restored = pickle_module.loads(pickle_module.dumps(s)) + assert s.equals(restored) + + # test that scalars are weak-referenceable + wr = weakref.ref(s) + assert wr() is not None + del s + assert wr() is None + + def test_invalid_scalar(): s = pc.cast(pa.scalar(b"\xff"), pa.string(), safe=False) s.validate() @@ -202,14 +238,15 @@ def test_numerics(): assert str(s) == "1.5" assert s.as_py() == 1.5 - # float16 - s = pa.scalar(np.float16(0.5), type='float16') - assert isinstance(s, pa.HalfFloatScalar) - # on numpy2 repr(np.float16(0.5)) == "np.float16(0.5)" - # on numpy1 repr(np.float16(0.5)) == "0.5" - assert repr(s) == f"" - assert str(s) == "0.5" - assert s.as_py() == 0.5 + if np is not None: + # float16 + s = pa.scalar(np.float16(0.5), type='float16') + assert isinstance(s, pa.HalfFloatScalar) + # on numpy2 repr(np.float16(0.5)) == "np.float16(0.5)" + # on numpy1 repr(np.float16(0.5)) == "0.5" + assert repr(s) == f"" + assert str(s) == "0.5" + assert s.as_py() == 0.5 def test_decimal128(): @@ -434,6 +471,7 @@ def test_timestamp_fixed_offset_print(): assert str(arr[0]) == "1970-01-01 02:00:00+02:00" +@pytest.mark.numpy def test_duration(): arr = np.array([0, 3600000000000], dtype='timedelta64[ns]') @@ -559,6 +597,7 @@ def test_list(ty, klass): s[2] +@pytest.mark.numpy @pytest.mark.parametrize('ty', [ pa.list_(pa.int64()), pa.large_list(pa.int64()), diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 1b05c58384cf0..bdcb6c2b42d78 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -20,7 +20,10 @@ import weakref import pytest -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pyarrow as pa import pyarrow.tests.util as test_util @@ -185,6 +188,7 @@ def test_time_types(): pa.time64('s') +@pytest.mark.numpy def test_from_numpy_dtype(): cases = [ (np.dtype('bool'), pa.bool_()), diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index aa7da0a742086..7ba9e2b3e13db 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -19,7 +19,10 @@ import sys import weakref -import numpy as np +try: + import numpy as np +except ImportError: + pytestmark = pytest.mark.numpy import pyarrow as pa try: diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py index 14fc949928c33..da50bcda52f2b 100644 --- a/python/pyarrow/tests/test_strategies.py +++ b/python/pyarrow/tests/test_strategies.py @@ -17,6 +17,8 @@ import hypothesis as h +import pytest + import pyarrow as pa import pyarrow.tests.strategies as past @@ -36,11 +38,13 @@ def test_schemas(schema): assert isinstance(schema, pa.lib.Schema) +@pytest.mark.numpy @h.given(past.all_arrays) def test_arrays(array): assert isinstance(array, pa.lib.Array) +@pytest.mark.numpy @h.given(past.arrays(past.primitive_types, nullable=False)) 
def test_array_nullability(array): assert array.null_count == 0 @@ -56,6 +60,7 @@ def test_record_batches(record_bath): assert isinstance(record_bath, pa.lib.RecordBatch) +@pytest.mark.numpy @h.given(past.all_tables) def test_tables(table): assert isinstance(table, pa.lib.Table) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 40700e4741321..01d468cd9e9cc 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -608,6 +608,7 @@ def table_provider(names, schema): assert res_tb == expected +@pytest.mark.numpy def test_scalar_aggregate_udf_basic(varargs_agg_func_fixture): test_table = pa.Table.from_pydict( @@ -756,6 +757,7 @@ def table_provider(names, _): assert res_tb == expected_tb +@pytest.mark.numpy def test_hash_aggregate_udf_basic(varargs_agg_func_fixture): test_table = pa.Table.from_pydict( diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index cd38909edf357..3b60cff2d8cf2 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -20,7 +20,10 @@ import sys import weakref -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pytest import pyarrow as pa import pyarrow.compute as pc @@ -125,6 +128,7 @@ def test_chunked_array_can_combine_chunks_with_no_chunks(): ).combine_chunks() == pa.array([], type=pa.bool_()) +@pytest.mark.numpy def test_chunked_array_to_numpy(): data = pa.chunked_array([ [1, 2, 3], @@ -173,6 +177,7 @@ def test_chunked_array_str(): ]""" +@pytest.mark.numpy def test_chunked_array_getitem(): data = [ pa.array([1, 2, 3]), @@ -972,12 +977,14 @@ def check_tensors(tensor, expected_tensor, type, size): assert tensor.strides == expected_tensor.strides -@pytest.mark.parametrize('typ', [ - np.uint8, np.uint16, np.uint32, np.uint64, - np.int8, np.int16, np.int32, np.int64, - np.float32, np.float64, +@pytest.mark.numpy +@pytest.mark.parametrize('typ_str', [ + "uint8", "uint16", "uint32", "uint64", + "int8", "int16", "int32", "int64", + "float32", "float64", ]) -def test_recordbatch_to_tensor_uniform_type(typ): +def test_recordbatch_to_tensor_uniform_type(typ_str): + typ = np.dtype(typ_str) arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] arr3 = [100, 100, 100, 100, 100, 100, 100, 100, 100] @@ -1031,6 +1038,7 @@ def test_recordbatch_to_tensor_uniform_type(typ): check_tensors(result, expected, pa.from_numpy_dtype(typ), 15) +@pytest.mark.numpy def test_recordbatch_to_tensor_uniform_float_16(): arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] @@ -1054,6 +1062,7 @@ def test_recordbatch_to_tensor_uniform_float_16(): check_tensors(result, expected, pa.float16(), 27) +@pytest.mark.numpy def test_recordbatch_to_tensor_mixed_type(): # uint16 + int16 = int32 arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] @@ -1105,6 +1114,7 @@ def test_recordbatch_to_tensor_mixed_type(): assert result.strides == expected.strides +@pytest.mark.numpy def test_recordbatch_to_tensor_unsupported_mixed_type_with_float16(): arr1 = [1, 2, 3, 4, 5, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, 80, 90] @@ -1124,6 +1134,7 @@ def test_recordbatch_to_tensor_unsupported_mixed_type_with_float16(): batch.to_tensor() +@pytest.mark.numpy def test_recordbatch_to_tensor_nan(): arr1 = [1, 2, 3, 4, np.nan, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, np.nan, 90] @@ -1144,6 +1155,7 @@ def test_recordbatch_to_tensor_nan(): assert result.strides == expected.strides +@pytest.mark.numpy def 
test_recordbatch_to_tensor_null(): arr1 = [1, 2, 3, 4, None, 6, 7, 8, 9] arr2 = [10, 20, 30, 40, 50, 60, 70, None, 90] @@ -1204,6 +1216,7 @@ def test_recordbatch_to_tensor_null(): assert result.strides == expected.strides +@pytest.mark.numpy def test_recordbatch_to_tensor_empty(): batch = pa.RecordBatch.from_arrays( [ @@ -1295,6 +1308,7 @@ def test_slice_zero_length_table(): table.to_pandas() +@pytest.mark.numpy def test_recordbatchlist_schema_equals(): a1 = np.array([1], dtype='uint32') a2 = np.array([4.0, 5.0], dtype='float64') @@ -2130,6 +2144,7 @@ def test_table_unsafe_casting(cls): assert casted_table.equals(expected_table) +@pytest.mark.numpy def test_invalid_table_construct(): array = np.array([0, 1], dtype=np.uint8) u8 = pa.uint8() @@ -3287,6 +3302,7 @@ def test_table_sort_by(cls): assert sorted_tab_dict["b"] == ["foo", "car", "bar", "foobar"] +@pytest.mark.numpy @pytest.mark.parametrize("constructor", [pa.table, pa.record_batch]) def test_numpy_asarray(constructor): table = constructor([[1, 2, 3], [4.0, 5.0, 6.0]], names=["a", "b"]) @@ -3319,6 +3335,7 @@ def test_numpy_asarray(constructor): assert result.dtype == "int32" +@pytest.mark.numpy @pytest.mark.parametrize("constructor", [pa.table, pa.record_batch]) def test_numpy_array_protocol(constructor): table = constructor([[1, 2, 3], [4.0, 5.0, 6.0]], names=["a", "b"]) diff --git a/python/pyarrow/tests/test_tensor.py b/python/pyarrow/tests/test_tensor.py index 29c6de65b1607..debb1066280c1 100644 --- a/python/pyarrow/tests/test_tensor.py +++ b/python/pyarrow/tests/test_tensor.py @@ -21,7 +21,10 @@ import warnings import weakref -import numpy as np +try: + import numpy as np +except ImportError: + pytestmark = pytest.mark.numpy import pyarrow as pa diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index d673f956527aa..cc680939ac46a 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -30,7 +30,10 @@ tzst = None import weakref -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pyarrow as pa import pyarrow.types as types import pyarrow.tests.strategies as past @@ -1265,14 +1268,16 @@ def test_field_modified_copies(): def test_is_integer_value(): assert pa.types.is_integer_value(1) - assert pa.types.is_integer_value(np.int64(1)) + if np is not None: + assert pa.types.is_integer_value(np.int64(1)) assert not pa.types.is_integer_value('1') def test_is_float_value(): assert not pa.types.is_float_value(1) assert pa.types.is_float_value(1.) 
- assert pa.types.is_float_value(np.float64(1)) + if np is not None: + assert pa.types.is_float_value(np.float64(1)) assert not pa.types.is_float_value('1.0') @@ -1280,8 +1285,9 @@ def test_is_boolean_value(): assert not pa.types.is_boolean_value(1) assert pa.types.is_boolean_value(True) assert pa.types.is_boolean_value(False) - assert pa.types.is_boolean_value(np.bool_(True)) - assert pa.types.is_boolean_value(np.bool_(False)) + if np is not None: + assert pa.types.is_boolean_value(np.bool_(True)) + assert pa.types.is_boolean_value(np.bool_(False)) @h.settings(suppress_health_check=(h.HealthCheck.too_slow,)) diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 22fefbbb58ba9..93004a30618a7 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -18,7 +18,10 @@ import pytest -import numpy as np +try: + import numpy as np +except ImportError: + np = None import pyarrow as pa from pyarrow import compute as pc @@ -749,6 +752,7 @@ def test_udt_datasource1_exception(): _test_datasource1_udt(datasource1_exception) +@pytest.mark.numpy def test_scalar_agg_basic(unary_agg_func_fixture): arr = pa.array([10.0, 20.0, 30.0, 40.0, 50.0], pa.float64()) result = pc.call_function("mean_udf", [arr]) @@ -756,6 +760,7 @@ def test_scalar_agg_basic(unary_agg_func_fixture): assert result == expected +@pytest.mark.numpy def test_scalar_agg_empty(unary_agg_func_fixture): empty = pa.array([], pa.float64()) @@ -775,6 +780,7 @@ def test_scalar_agg_wrong_output_type(wrong_output_type_agg_func_fixture): pc.call_function("y=wrong_output_type(x)", [arr]) +@pytest.mark.numpy def test_scalar_agg_varargs(varargs_agg_func_fixture): arr1 = pa.array([10, 20, 30, 40, 50], pa.int64()) arr2 = pa.array([1.0, 2.0, 3.0, 4.0, 5.0], pa.float64()) @@ -786,6 +792,7 @@ def test_scalar_agg_varargs(varargs_agg_func_fixture): assert result == expected +@pytest.mark.numpy def test_scalar_agg_exception(exception_agg_func_fixture): arr = pa.array([10, 20, 30, 40, 50, 60], pa.int64()) @@ -793,6 +800,7 @@ def test_scalar_agg_exception(exception_agg_func_fixture): pc.call_function("y=exception_len(x)", [arr]) +@pytest.mark.numpy def test_hash_agg_basic(unary_agg_func_fixture): arr1 = pa.array([10.0, 20.0, 30.0, 40.0, 50.0], pa.float64()) arr2 = pa.array([4, 2, 1, 2, 1], pa.int32()) @@ -811,6 +819,7 @@ def test_hash_agg_basic(unary_agg_func_fixture): assert result.sort_by('id') == expected.sort_by('id') +@pytest.mark.numpy def test_hash_agg_empty(unary_agg_func_fixture): arr1 = pa.array([], pa.float64()) arr2 = pa.array([], pa.int32()) @@ -841,6 +850,7 @@ def test_hash_agg_wrong_output_type(wrong_output_type_agg_func_fixture): table.group_by("id").aggregate([("value", "y=wrong_output_type(x)")]) +@pytest.mark.numpy def test_hash_agg_exception(exception_agg_func_fixture): arr1 = pa.array([10, 20, 30, 40, 50], pa.int64()) arr2 = pa.array([4, 2, 1, 2, 1], pa.int32()) @@ -850,6 +860,7 @@ def test_hash_agg_exception(exception_agg_func_fixture): table.group_by("id").aggregate([("value", "y=exception_len(x)")]) +@pytest.mark.numpy def test_hash_agg_random(sum_agg_func_fixture): """Test hash aggregate udf with randomly sampled data""" diff --git a/python/pyarrow/tests/test_without_numpy.py b/python/pyarrow/tests/test_without_numpy.py new file mode 100644 index 0000000000000..55c12602ce89a --- /dev/null +++ b/python/pyarrow/tests/test_without_numpy.py @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +import pyarrow as pa + +# Marks all of the tests in this module +# Ignore these with pytest ... -m 'not nonumpy' +pytestmark = pytest.mark.nonumpy + + +def test_array_to_np(): + arr = pa.array(range(10)) + + msg = "Cannot return a numpy.ndarray if NumPy is not present" + + with pytest.raises(ImportError, match=msg): + arr.to_numpy() + + +def test_chunked_array_to_np(): + data = pa.chunked_array([ + [1, 2, 3], + [4, 5, 6], + [] + ]) + msg = "Cannot return a numpy.ndarray if NumPy is not present" + + with pytest.raises(ImportError, match=msg): + data.to_numpy() + + +def test_tensor_to_np(): + tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + storage = pa.array(arr, pa.list_(pa.int32(), 4)) + tensor_array = pa.ExtensionArray.from_storage(tensor_type, storage) + + tensor = tensor_array.to_tensor() + msg = "Cannot return a numpy.ndarray if NumPy is not present" + + with pytest.raises(ImportError, match=msg): + tensor.to_numpy() diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 638eee9807335..aa6dd21f800c5 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -22,7 +22,6 @@ import contextlib import decimal import gc -import numpy as np import os import random import re @@ -110,27 +109,15 @@ def randdecimal(precision, scale): def random_ascii(length): - return bytes(np.random.randint(65, 123, size=length, dtype='i1')) + return bytes([random.randint(65, 122) for i in range(length)]) def rands(nchars): """ Generate one random string. 
""" - RANDS_CHARS = np.array( - list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) - return "".join(np.random.choice(RANDS_CHARS, nchars)) - - -def make_dataframe(): - import pandas as pd - - N = 30 - df = pd.DataFrame( - {col: np.random.randn(N) for col in string.ascii_uppercase[:4]}, - index=pd.Index([rands(10) for _ in range(N)]) - ) - return df + RANDS_CHARS = list(string.ascii_letters + string.digits) + return "".join(random.choice(RANDS_CHARS) for i in range(nchars)) def memory_leak_check(f, metric='rss', threshold=1 << 17, iterations=10, diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index f83ecc3aa4326..a46caff1f21a4 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -33,42 +33,50 @@ from cython import sizeof # These are imprecise because the type (in pandas 0.x) depends on the presence # of nulls -cdef dict _pandas_type_map = { - _Type_NA: np.object_, # NaNs - _Type_BOOL: np.bool_, - _Type_INT8: np.int8, - _Type_INT16: np.int16, - _Type_INT32: np.int32, - _Type_INT64: np.int64, - _Type_UINT8: np.uint8, - _Type_UINT16: np.uint16, - _Type_UINT32: np.uint32, - _Type_UINT64: np.uint64, - _Type_HALF_FLOAT: np.float16, - _Type_FLOAT: np.float32, - _Type_DOUBLE: np.float64, - # Pandas does not support [D]ay, so default to [ms] for date32 - _Type_DATE32: np.dtype('datetime64[ms]'), - _Type_DATE64: np.dtype('datetime64[ms]'), - _Type_TIMESTAMP: { - 's': np.dtype('datetime64[s]'), - 'ms': np.dtype('datetime64[ms]'), - 'us': np.dtype('datetime64[us]'), - 'ns': np.dtype('datetime64[ns]'), - }, - _Type_DURATION: { - 's': np.dtype('timedelta64[s]'), - 'ms': np.dtype('timedelta64[ms]'), - 'us': np.dtype('timedelta64[us]'), - 'ns': np.dtype('timedelta64[ns]'), - }, - _Type_BINARY: np.object_, - _Type_FIXED_SIZE_BINARY: np.object_, - _Type_STRING: np.object_, - _Type_LIST: np.object_, - _Type_MAP: np.object_, - _Type_DECIMAL128: np.object_, -} +cdef dict _pandas_type_map = {} + + +def _get_pandas_type_map(): + global _pandas_type_map + if not _pandas_type_map: + _pandas_type_map.update({ + _Type_NA: np.object_, # NaNs + _Type_BOOL: np.bool_, + _Type_INT8: np.int8, + _Type_INT16: np.int16, + _Type_INT32: np.int32, + _Type_INT64: np.int64, + _Type_UINT8: np.uint8, + _Type_UINT16: np.uint16, + _Type_UINT32: np.uint32, + _Type_UINT64: np.uint64, + _Type_HALF_FLOAT: np.float16, + _Type_FLOAT: np.float32, + _Type_DOUBLE: np.float64, + # Pandas does not support [D]ay, so default to [ms] for date32 + _Type_DATE32: np.dtype('datetime64[ms]'), + _Type_DATE64: np.dtype('datetime64[ms]'), + _Type_TIMESTAMP: { + 's': np.dtype('datetime64[s]'), + 'ms': np.dtype('datetime64[ms]'), + 'us': np.dtype('datetime64[us]'), + 'ns': np.dtype('datetime64[ns]'), + }, + _Type_DURATION: { + 's': np.dtype('timedelta64[s]'), + 'ms': np.dtype('timedelta64[ms]'), + 'us': np.dtype('timedelta64[us]'), + 'ns': np.dtype('timedelta64[ns]'), + }, + _Type_BINARY: np.object_, + _Type_FIXED_SIZE_BINARY: np.object_, + _Type_STRING: np.object_, + _Type_LIST: np.object_, + _Type_MAP: np.object_, + _Type_DECIMAL128: np.object_, + }) + return _pandas_type_map + cdef dict _pep3118_type_map = { _Type_INT8: b'b', @@ -149,14 +157,15 @@ def _is_primitive(Type type): def _get_pandas_type(arrow_type, coerce_to_ns=False): cdef Type type_id = arrow_type.id - if type_id not in _pandas_type_map: + cdef dict pandas_type_map = _get_pandas_type_map() + if type_id not in pandas_type_map: return None if coerce_to_ns: # ARROW-3789: Coerce date/timestamp types to datetime64[ns] if type_id == _Type_DURATION: 
return np.dtype('timedelta64[ns]') return np.dtype('datetime64[ns]') - pandas_type = _pandas_type_map[type_id] + pandas_type = pandas_type_map[type_id] if isinstance(pandas_type, dict): unit = getattr(arrow_type, 'unit', None) pandas_type = pandas_type.get(unit, None)
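
The types.pxi hunk above is the crux of making NumPy optional at runtime:
the old module-level `_pandas_type_map` was built eagerly at import time, so
loading the module required NumPy, while the new `_get_pandas_type_map()`
defers every NumPy attribute access until a pandas conversion actually asks
for it. Below is a minimal, self-contained sketch of the same
lazy-initialization pattern in plain Python; the names (`_dtype_map`,
`_get_dtype_map`) are illustrative only and not part of the patch:

    try:
        import numpy as np
    except ImportError:
        np = None  # module import still succeeds without NumPy

    _dtype_map = {}  # intentionally left empty at import time


    def _get_dtype_map():
        # Populate the NumPy-backed table on first use only, so that
        # merely importing this module never touches NumPy.
        if np is None:
            raise ImportError("NumPy is required for dtype lookups")
        if not _dtype_map:
            _dtype_map.update({
                "int64": np.int64,
                "float64": np.float64,
            })
        return _dtype_map

Callers go through `_get_dtype_map()` instead of reading the dict directly,
mirroring how `_get_pandas_type()` above now calls `_get_pandas_type_map()`
rather than indexing `_pandas_type_map`.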