From 82aa01c0987d3dc9e5f93cdaf1052ec58d39100e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 10 Jan 2025 11:17:34 -0500 Subject: [PATCH] [Backport release-1.15] [python] Validate byteorder on `fastercsx` argument arrays (#3521) * validate byteorder on argument arrays (#3508) * Use byte-order detection without std::endian for C++17 * work around fiddly include-path issues with trying to use `fmt::format` --------- Co-authored-by: Bruce Martin Co-authored-by: John Kerl --- apis/python/src/tiledbsoma/fastercsx.cc | 53 ++++++++++++++++- apis/python/tests/test_libfastercsx.py | 79 +++++++++++++++++++++++++ libtiledbsoma/src/utils/fastercsx.h | 37 ++++++++---- 3 files changed, 156 insertions(+), 13 deletions(-) diff --git a/apis/python/src/tiledbsoma/fastercsx.cc b/apis/python/src/tiledbsoma/fastercsx.cc index e1de4f901b..111e0d4bd0 100644 --- a/apis/python/src/tiledbsoma/fastercsx.cc +++ b/apis/python/src/tiledbsoma/fastercsx.cc @@ -29,7 +29,7 @@ * * Python bindings for CSX conversion primitives. */ - +#include #include // Define to include extra debugging bindings (e.g., count_rows) @@ -107,6 +107,45 @@ tcb::span make_mutable_casted_span_(py::array arr) { return tcb::span(reinterpret_cast(p), arr.size()); } +/** + * @brief Return true if the NP byteorder is native (or equivalent). + */ +bool is_native_byteorder(const char byteorder) { + if (byteorder == '=') // native + return true; + if (byteorder == '|') // not-applicable + return true; + + // On the main branch we have C++ 20 and we use std::endian. + // On release-1.15, as of this writing, we have C++ 17 and + // we roll our own. 
+ union {
+ uint8_t u8[4];
+ uint32_t u32;
+ } test;
+ test.u32 = 0xaabbccdd;
+ if (test.u8[0] == 0xaa) {
+ // Big-endian: u8[0]..u8[3] are aa, bb, cc, dd
+ return byteorder == '>'; // big
+ } else if (test.u8[3] == 0xaa) {
+ // Little-endian: u8[0]..u8[3] are dd, cc, bb, aa
+ return byteorder == '<'; // little
+ } else {
+ throw std::runtime_error("Internal coding error detecting endianness");
+ }
+}
+
+/**
+ * @brief Check endianness/byteorder is native, and raise exception if not.
+ * Necessary because we dispatch on dtype().num(), which doesn't confirm
+ * byteorder is native.
+ */
+void check_byteorder(const py::dtype& dtype) {
+ if (!is_native_byteorder(dtype.byteorder()))
+ throw invalid_argument(
+ "All arrays must have native byteorder (endianness).");
+}
+
 /*
 * Value/data arrays are cast to an unsigned of the same width as the actual
 * value type. This is solely to reduce the combinatorics of template
@@ -208,6 +247,7 @@ T lookup_dtype_(
 const std::unordered_map& index,
 const py::dtype& dtype,
 const std::string& array_name) {
+ check_byteorder(dtype);
 try {
 return index.at(dtype.num());
 } catch (const std::out_of_range& oor) {
@@ -240,6 +280,7 @@ void compress_coo_validate_args_(
 5. ensure B* are writeable
 6. Ensure each element in A* tuples are same type
 7. Ensure each element in the A* tuples are the same length
+ 8. byteorder etc... 
Not checked: @@ -260,6 +301,7 @@ void compress_coo_validate_args_( if (arr.dtype().num() != vec[0].dtype().num()) throw pybind11::type_error( "All chunks of COO arrays must be of same type."); + check_byteorder(arr.dtype()); } } for (uint64_t chunk_idx = 0; chunk_idx < n_chunks; chunk_idx++) { @@ -268,9 +310,14 @@ void compress_coo_validate_args_( throw std::length_error( "All COO array tuple elements must be of the same size."); } + if (Bp.ndim() != 1 || Bj.ndim() != 1 || Bd.ndim() != 1) throw std::length_error("All arrays must be of dimension rank 1."); + check_byteorder(Bp.dtype()); + check_byteorder(Bj.dtype()); + check_byteorder(Bd.dtype()); + for (auto& arr : Ad) if (arr.dtype().num() != Bd.dtype().num()) throw pybind11::type_error("All data arrays must be of same type."); @@ -407,6 +454,10 @@ bool sort_csx_indices( if (!Bp.writeable() || !Bj.writeable() || !Bd.writeable()) throw std::invalid_argument("Output arrays must be writeable."); + check_byteorder(Bp.dtype()); + check_byteorder(Bj.dtype()); + check_byteorder(Bd.dtype()); + // Get dispatch types CsxIndexType csx_major_index_type = lookup_dtype_( csx_index_type_dispatch, Bp.dtype(), "CSx indptr array"); diff --git a/apis/python/tests/test_libfastercsx.py b/apis/python/tests/test_libfastercsx.py index 6e949a9916..461e6bec7b 100644 --- a/apis/python/tests/test_libfastercsx.py +++ b/apis/python/tests/test_libfastercsx.py @@ -2,6 +2,7 @@ from __future__ import annotations +import sys from typing import Any import numpy as np @@ -11,6 +12,8 @@ import tiledbsoma.pytiledbsoma as clib import tiledbsoma.pytiledbsoma.fastercsx as fastercsx +NON_NATIVE_BYTEORDER = ">" if sys.byteorder == "little" else "<" + @pytest.fixture def concurrency() -> int | None: @@ -243,6 +246,14 @@ def test_sort_csx_indices_bad_args( pbad[1] = -1 fastercsx.sort_csx_indices(context, pbad, j, d) + # non-native byteorder should throw + with pytest.raises(ValueError): + fastercsx.sort_csx_indices(context, 
p.astype(f"{NON_NATIVE_BYTEORDER}i4"), j, d) + with pytest.raises(ValueError): + fastercsx.sort_csx_indices(context, p, j.astype(f"{NON_NATIVE_BYTEORDER}i4"), d) + with pytest.raises(ValueError): + fastercsx.sort_csx_indices(context, p, j, d.astype(f"{NON_NATIVE_BYTEORDER}i4")) + def test_compress_coo_bad_args( rng: np.random.Generator, context: clib.SOMAContext @@ -312,6 +323,74 @@ def test_compress_coo_bad_args( context, sp.shape, (i,), (j,), (d[1:],), indptr, indices, data ) + # non-native byteorder should throw + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i.astype(f"{NON_NATIVE_BYTEORDER}i4"),), + (j,), + (d,), + indptr, + indices, + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j.astype(f"{NON_NATIVE_BYTEORDER}i4"),), + (d,), + indptr, + indices, + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j,), + (d.astype(f"{NON_NATIVE_BYTEORDER}i4"),), + indptr, + indices, + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j,), + (d,), + indptr.astype(f"{NON_NATIVE_BYTEORDER}i4"), + indices, + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j,), + (d,), + indptr, + indices.astype(f"{NON_NATIVE_BYTEORDER}i4"), + data, + ) + with pytest.raises(ValueError): + fastercsx.compress_coo( + context, + sp.shape, + (i,), + (j,), + (d,), + indptr, + indices, + data.astype(f"{NON_NATIVE_BYTEORDER}i4"), + ) + def test_ragged_chunk_error( rng: np.random.Generator, context: clib.SOMAContext diff --git a/libtiledbsoma/src/utils/fastercsx.h b/libtiledbsoma/src/utils/fastercsx.h index f58c46123e..7cb460aeb2 100644 --- a/libtiledbsoma/src/utils/fastercsx.h +++ b/libtiledbsoma/src/utils/fastercsx.h @@ -37,7 +37,6 @@ #include #include "span/span.hpp" -#include #include "parallel_functions.h" namespace tiledbsoma::fastercsx { @@ 
-201,11 +200,19 @@ void count_rows_( if ((row < 0) || (static_cast>(row) >= n_row)) [[unlikely]] { - throw std::out_of_range(fmt::format( - "First coordinate {} out of range {}.", - row, - 0, - n_row)); + // std::format isn't until C++ 20, and including + // utils/logger.h or spdlog/fmt/fmt.h is fiddly + // inside of a header file (fastercsx.h). For + // the moment (release-1.15, with C++ 17, as of + // 2025-01-10), we use stringstream. + // This is all temporary, this-branch-only, and + // will go away entirely once we have + // https://github.com/single-cell-data/TileDB-SOMA/issues/3154 + // resolved. + std::stringstream ss; + ss << "First coordinate " << row << " out of range " + << n_row << "."; + throw std::out_of_range(ss.str()); } counts[row]++; } @@ -229,8 +236,10 @@ void count_rows_( if ((row < 0) || (static_cast>(row) >= n_row)) [[unlikely]] { - throw std::out_of_range(fmt::format( - "First coordinate {} out of range {}.", row, 0, n_row)); + std::stringstream ss; + ss << "First coordinate " << row << " out of range " + << n_row << "."; + throw std::out_of_range(ss.str()); } Bp[row]++; } @@ -278,8 +287,10 @@ void compress_coo_inner_left_( if ((Aj_[n] < 0) || (static_cast>(Aj_[n]) >= n_col)) [[unlikely]] { - throw std::out_of_range(fmt::format( - "Second coordinate {} out of range {}.", Aj_[n], 0, n_col)); + std::stringstream ss; + ss << "Second coordinate " << Aj_[n] << " out of range " << n_col + << "."; + throw std::out_of_range(ss.str()); } Bj[dest] = Aj_[n]; Bd[dest] = Ad_[n]; @@ -313,8 +324,10 @@ void compress_coo_inner_right_( if ((Aj_[n] < 0) || (static_cast>(Aj_[n]) >= n_col)) [[unlikely]] { - throw std::out_of_range(fmt::format( - "Second coordinate {} out of range {}.", Aj_[n], 0, n_col)); + std::stringstream ss; + ss << "Second coordinate " << Aj_[n] << " out of range " << n_col + << "."; + throw std::out_of_range(ss.str()); } Bj[dest] = Aj_[n];