Skip to content

Commit

Permalink
apacheGH-25118: [Python] Make NumPy an optional runtime dependency (a…
Browse files Browse the repository at this point in the history
…pache#41904)

### Rationale for this change

This change makes it possible to run pyarrow without requiring NumPy to be installed.

### What changes are included in this PR?

If NumPy is not present, pyarrow can still be imported and its core functionality used.
A new CI job has been created to run some basic tests without numpy.

### Are these changes tested?

Yes via CI.

### Are there any user-facing changes?

Yes: NumPy can be removed from the user's installation and pyarrow functionality still works.

* GitHub Issue: apache#25118

Lead-authored-by: Raúl Cumplido <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
3 people authored and zanmato1984 committed Sep 6, 2024
1 parent 9bacb0e commit 89ac0ed
Show file tree
Hide file tree
Showing 62 changed files with 1,008 additions and 420 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ jobs:
- conda-python-3.9-nopandas
- conda-python-3.8-pandas-1.0
- conda-python-3.10-pandas-latest
- conda-python-3.10-no-numpy
include:
- name: conda-python-docs
cache: conda-python-3.9
Expand All @@ -83,6 +84,11 @@ jobs:
title: AMD64 Conda Python 3.10 Pandas latest
python: "3.10"
pandas: latest
- name: conda-python-3.10-no-numpy
cache: conda-python-3.10
image: conda-python-no-numpy
title: AMD64 Conda Python 3.10 without NumPy
python: "3.10"
env:
PYTHON: ${{ matrix.python || 3.8 }}
UBUNTU: ${{ matrix.ubuntu || 20.04 }}
Expand Down
32 changes: 32 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ x-hierarchy:
- conda-python-hdfs
- conda-python-java-integration
- conda-python-jpype
- conda-python-no-numpy
- conda-python-spark
- conda-python-substrait
- conda-verify-rc
Expand Down Expand Up @@ -1258,6 +1259,37 @@ services:
volumes: *conda-volumes
command: *python-conda-command

conda-python-no-numpy:
# Usage:
# docker-compose build conda
# docker-compose build conda-cpp
# docker-compose build conda-python
# docker-compose build conda-python-no-numpy
# docker-compose run --rm conda-python-no-numpy
image: ${REPO}:${ARCH}-conda-python-${PYTHON}-no-numpy
build:
context: .
dockerfile: ci/docker/conda-python.dockerfile
cache_from:
- ${REPO}:${ARCH}-conda-python-${PYTHON}
args:
repo: ${REPO}
arch: ${ARCH}
python: ${PYTHON}
shm_size: *shm-size
environment:
<<: [*common, *ccache, *sccache]
PARQUET_REQUIRE_ENCRYPTION: # inherit
HYPOTHESIS_PROFILE: # inherit
PYARROW_TEST_HYPOTHESIS: # inherit
volumes: *conda-volumes
command:
["
/arrow/ci/scripts/cpp_build.sh /arrow /build &&
/arrow/ci/scripts/python_build.sh /arrow /build &&
mamba uninstall -y numpy &&
/arrow/ci/scripts/python_test.sh /arrow"]

conda-python-docs:
# Usage:
# archery docker run conda-python-docs
Expand Down
4 changes: 2 additions & 2 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -339,17 +339,17 @@ set(PYARROW_CPP_SRCS
${PYARROW_CPP_SOURCE_DIR}/gdb.cc
${PYARROW_CPP_SOURCE_DIR}/helpers.cc
${PYARROW_CPP_SOURCE_DIR}/inference.cc
${PYARROW_CPP_SOURCE_DIR}/init.cc
${PYARROW_CPP_SOURCE_DIR}/io.cc
${PYARROW_CPP_SOURCE_DIR}/ipc.cc
${PYARROW_CPP_SOURCE_DIR}/numpy_convert.cc
${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
${PYARROW_CPP_SOURCE_DIR}/numpy_to_arrow.cc
${PYARROW_CPP_SOURCE_DIR}/python_test.cc
${PYARROW_CPP_SOURCE_DIR}/python_to_arrow.cc
${PYARROW_CPP_SOURCE_DIR}/pyarrow.cc
${PYARROW_CPP_SOURCE_DIR}/serialize.cc
${PYARROW_CPP_SOURCE_DIR}/udf.cc)
set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/init.cc
set_source_files_properties(${PYARROW_CPP_SOURCE_DIR}/numpy_init.cc
PROPERTIES SKIP_PRECOMPILE_HEADERS ON
SKIP_UNITY_BUILD_INCLUSION ON)

Expand Down
16 changes: 12 additions & 4 deletions python/pyarrow/_compute.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ from pyarrow.util import _DEPR_MSG
from libcpp cimport bool as c_bool

import inspect
import numpy as np
try:
import numpy as np
except ImportError:
np = None
import warnings


Expand All @@ -43,6 +46,11 @@ _substrait_msg = (
)


# Python sequence types that compute functions accept as array-like input.
# numpy.ndarray is included only when NumPy could be imported (``np`` is
# None when running in a NumPy-free installation).
if np is None:
    SUPPORTED_INPUT_ARR_TYPES = (list, tuple)
else:
    SUPPORTED_INPUT_ARR_TYPES = (list, tuple, np.ndarray)


def _pas():
global __pas
if __pas is None:
Expand Down Expand Up @@ -473,7 +481,7 @@ cdef class MetaFunction(Function):

cdef _pack_compute_args(object values, vector[CDatum]* out):
for val in values:
if isinstance(val, (list, np.ndarray)):
if isinstance(val, SUPPORTED_INPUT_ARR_TYPES):
val = lib.asarray(val)

if isinstance(val, Array):
Expand Down Expand Up @@ -2189,7 +2197,7 @@ class QuantileOptions(_QuantileOptions):

def __init__(self, q=0.5, *, interpolation="linear", skip_nulls=True,
min_count=0):
if not isinstance(q, (list, tuple, np.ndarray)):
if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES):
q = [q]
self._set_options(q, interpolation, skip_nulls, min_count)

Expand Down Expand Up @@ -2222,7 +2230,7 @@ class TDigestOptions(_TDigestOptions):

def __init__(self, q=0.5, *, delta=100, buffer_size=500, skip_nulls=True,
min_count=0):
if not isinstance(q, (list, tuple, np.ndarray)):
if not isinstance(q, SUPPORTED_INPUT_ARR_TYPES):
q = [q]
self._set_options(q, delta, buffer_size, skip_nulls, min_count)

Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ cdef _sequence_to_array(object sequence, object mask, object size,


cdef inline _is_array_like(obj):
    # Return True when ``obj`` should be treated as an array-like value
    # (a NumPy ndarray, or — when pandas is available — a pandas
    # array-like as reported by pandas_api.is_array_like).
    # When NumPy is not installed (``np`` is None) nothing is treated as
    # array-like.
    if np is None:
        return False
    if isinstance(obj, np.ndarray):
        return True
    return pandas_api._have_pandas_internal() and pandas_api.is_array_like(obj)
Expand Down Expand Up @@ -1608,6 +1610,9 @@ cdef class Array(_PandasConvertible):
"""
self._assert_cpu()

if np is None:
raise ImportError(
"Cannot return a numpy.ndarray if NumPy is not present")
cdef:
PyObject* out
PandasOptions c_options
Expand Down
14 changes: 8 additions & 6 deletions python/pyarrow/builder.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.

import math


cdef class StringBuilder(_Weakrefable):
"""
Expand Down Expand Up @@ -42,10 +44,10 @@ cdef class StringBuilder(_Weakrefable):
value : string/bytes or np.nan/None
The value to append to the string array builder.
"""
if value is None or value is np.nan:
self.builder.get().AppendNull()
elif isinstance(value, (bytes, str)):
if isinstance(value, (bytes, str)):
self.builder.get().Append(tobytes(value))
elif value is None or math.isnan(value):
self.builder.get().AppendNull()
else:
raise TypeError('StringBuilder only accepts string objects')

Expand Down Expand Up @@ -108,10 +110,10 @@ cdef class StringViewBuilder(_Weakrefable):
value : string/bytes or np.nan/None
The value to append to the string array builder.
"""
if value is None or value is np.nan:
self.builder.get().AppendNull()
elif isinstance(value, (bytes, str)):
if isinstance(value, (bytes, str)):
self.builder.get().Append(tobytes(value))
elif value is None or math.isnan(value):
self.builder.get().AppendNull()
else:
raise TypeError('StringViewBuilder only accepts string objects')

Expand Down
13 changes: 12 additions & 1 deletion python/pyarrow/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from pyarrow.tests.util import windows_has_tzdata
import sys

import numpy as np

groups = [
'acero',
Expand All @@ -46,6 +45,8 @@
'lz4',
'memory_leak',
'nopandas',
'nonumpy',
'numpy',
'orc',
'pandas',
'parquet',
Expand Down Expand Up @@ -81,6 +82,8 @@
'lz4': Codec.is_available('lz4'),
'memory_leak': False,
'nopandas': False,
'nonumpy': False,
'numpy': False,
'orc': False,
'pandas': False,
'parquet': False,
Expand Down Expand Up @@ -158,6 +161,12 @@
except ImportError:
defaults['nopandas'] = True

try:
import numpy # noqa
defaults['numpy'] = True
except ImportError:
defaults['nonumpy'] = True

try:
import pyarrow.parquet # noqa
defaults['parquet'] = True
Expand Down Expand Up @@ -327,6 +336,7 @@ def unary_agg_func_fixture():
Register a unary aggregate function (mean)
"""
from pyarrow import compute as pc
import numpy as np

def func(ctx, x):
return pa.scalar(np.nanmean(x))
Expand All @@ -352,6 +362,7 @@ def varargs_agg_func_fixture():
Register a unary aggregate function
"""
from pyarrow import compute as pc
import numpy as np

def func(ctx, *args):
sum = 0.0
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py::internal" nogil:
CResult[PyObject*] StringToTzinfo(c_string)


cdef extern from "arrow/python/init.h":
cdef extern from "arrow/python/numpy_init.h" namespace "arrow::py":
int arrow_init_numpy() except -1


Expand Down
12 changes: 9 additions & 3 deletions python/pyarrow/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@

import datetime
import decimal as _pydecimal
import numpy as np
try:
import numpy as np
except ImportError:
np = None
import os
import sys

Expand All @@ -32,8 +35,11 @@ from pyarrow.includes.common cimport PyObject_to_object
cimport pyarrow.includes.libarrow_python as libarrow_python
cimport cpython as cp

# Initialize NumPy C API
arrow_init_numpy()

# Initialize NumPy C API only if numpy was able to be imported
if np is not None:
arrow_init_numpy()

# Initialize PyArrow C++ API
# (used from some of our C++ code, see e.g. ARROW-5260)
import_pyarrow()
Expand Down
79 changes: 47 additions & 32 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,17 @@
import re
import warnings

import numpy as np

try:
import numpy as np
except ImportError:
np = None
import pyarrow as pa
from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa


_logical_type_map = {}
_numpy_logical_type_map = {}
_pandas_logical_type_map = {}


def get_logical_type_map():
Expand Down Expand Up @@ -85,27 +89,32 @@ def get_logical_type(arrow_type):
return 'object'


_numpy_logical_type_map = {
np.bool_: 'bool',
np.int8: 'int8',
np.int16: 'int16',
np.int32: 'int32',
np.int64: 'int64',
np.uint8: 'uint8',
np.uint16: 'uint16',
np.uint32: 'uint32',
np.uint64: 'uint64',
np.float32: 'float32',
np.float64: 'float64',
'datetime64[D]': 'date',
np.str_: 'string',
np.bytes_: 'bytes',
}
def get_numpy_logical_type_map():
    """Return the mapping from NumPy types to logical type names.

    The table is built lazily on first call and memoized in the module
    global ``_numpy_logical_type_map``, so importing this module does
    not require NumPy to be installed.
    """
    global _numpy_logical_type_map
    if _numpy_logical_type_map:
        return _numpy_logical_type_map
    _numpy_logical_type_map.update({
        np.bool_: 'bool',
        np.int8: 'int8',
        np.int16: 'int16',
        np.int32: 'int32',
        np.int64: 'int64',
        np.uint8: 'uint8',
        np.uint16: 'uint16',
        np.uint32: 'uint32',
        np.uint64: 'uint64',
        np.float32: 'float32',
        np.float64: 'float64',
        'datetime64[D]': 'date',
        np.str_: 'string',
        np.bytes_: 'bytes',
    })
    return _numpy_logical_type_map


def get_logical_type_from_numpy(pandas_collection):
numpy_logical_type_map = get_numpy_logical_type_map()
try:
return _numpy_logical_type_map[pandas_collection.dtype.type]
return numpy_logical_type_map[pandas_collection.dtype.type]
except KeyError:
if hasattr(pandas_collection.dtype, 'tz'):
return 'datetimetz'
Expand Down Expand Up @@ -1023,18 +1032,23 @@ def _is_generated_index_name(name):
return re.match(pattern, name) is not None


_pandas_logical_type_map = {
'date': 'datetime64[D]',
'datetime': 'datetime64[ns]',
'datetimetz': 'datetime64[ns]',
'unicode': np.str_,
'bytes': np.bytes_,
'string': np.str_,
'integer': np.int64,
'floating': np.float64,
'decimal': np.object_,
'empty': np.object_,
}
def get_pandas_logical_type_map():
    """Return the mapping from pandas logical type names to NumPy types.

    The table is built lazily on first call and memoized in the module
    global ``_pandas_logical_type_map``, so importing this module does
    not require NumPy to be installed.
    """
    global _pandas_logical_type_map
    if _pandas_logical_type_map:
        return _pandas_logical_type_map
    _pandas_logical_type_map.update({
        'date': 'datetime64[D]',
        'datetime': 'datetime64[ns]',
        'datetimetz': 'datetime64[ns]',
        'unicode': np.str_,
        'bytes': np.bytes_,
        'string': np.str_,
        'integer': np.int64,
        'floating': np.float64,
        'decimal': np.object_,
        'empty': np.object_,
    })
    return _pandas_logical_type_map


def _pandas_type_to_numpy_type(pandas_type):
Expand All @@ -1050,8 +1064,9 @@ def _pandas_type_to_numpy_type(pandas_type):
dtype : np.dtype
The dtype that corresponds to `pandas_type`.
"""
pandas_logical_type_map = get_pandas_logical_type_map()
try:
return _pandas_logical_type_map[pandas_type]
return pandas_logical_type_map[pandas_type]
except KeyError:
if 'mixed' in pandas_type:
# catching 'mixed', 'mixed-integer' and 'mixed-integer-float'
Expand Down
Loading

0 comments on commit 89ac0ed

Please sign in to comment.