Skip to content

Commit

Permalink
[Backport release-1.15] [python] Validate byteorder on fastercsx ar…
Browse files Browse the repository at this point in the history
…gument arrays (#3521)

* validate byteorder on argument arrays (#3508)

* Use byte-order detection without std::endian for C++17

* work around fiddly include-path issues with trying to use `fmt::format`

---------

Co-authored-by: Bruce Martin <[email protected]>
Co-authored-by: John Kerl <[email protected]>
  • Loading branch information
3 people authored Jan 10, 2025
1 parent 451c5fc commit 82aa01c
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 13 deletions.
53 changes: 52 additions & 1 deletion apis/python/src/tiledbsoma/fastercsx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
*
* Python bindings for CSX conversion primitives.
*/

#include <bit>
#include <variant>

// Define to include extra debugging bindings (e.g., count_rows)
Expand Down Expand Up @@ -107,6 +107,45 @@ tcb::span<R> make_mutable_casted_span_(py::array arr) {
return tcb::span<R>(reinterpret_cast<R*>(p), arr.size());
}

/**
* @brief Return true if the NP byteorder is native (or equivalent).
*/
bool is_native_byteorder(const char byteorder) {
if (byteorder == '=') // native
return true;
if (byteorder == '|') // not-applicable
return true;

// On the main branch we have C++ 20 and we use std::endian.
// On release-1.15, as of this writing, we have C++ 17 and
// we roll our own.
union {
uint8_t u8[4];
uint32_t u32;
} test;
test.u32 = 0xaabbccdd;
if (test.u8[0] == 0xaa) {
// Big-endian: u8[0]..u8[3] are aa, bb, cc, dd
return byteorder == '>'; // big
} else if (test.u8[3] == 0xaa) {
// Little-endian: u8[0]..u8[3] are dd, cc, bb, aa
return byteorder == '<'; // little
} else {
throw std::runtime_error("Internal coding error detecting endianness");
}
}

/**
* @brief Check enddianness/byteorder is native, and raise exception if not.
* Necessary becuase we dispatch on dtype().num(), which doesn't confirm
* byteorder is native.
*/
void check_byteorder(const py::dtype& dtype) {
if (!is_native_byteorder(dtype.byteorder()))
throw invalid_argument(
"All arrays must have native byteorder (endianness).");
}

/*
* Value/data arrays are cast to an unsigned of the same width as the actual
* value type. This is solely to reduce the combinatorics of template
Expand Down Expand Up @@ -208,6 +247,7 @@ T lookup_dtype_(
const std::unordered_map<int, T>& index,
const py::dtype& dtype,
const std::string& array_name) {
check_byteorder(dtype);
try {
return index.at(dtype.num());
} catch (const std::out_of_range& oor) {
Expand Down Expand Up @@ -240,6 +280,7 @@ void compress_coo_validate_args_(
5. ensure B* are writeable
6. Ensure each element in A* tuples are same type
7. Ensure each element in the A* tuples are the same length
8. byteorder
etc...
Not checked:
Expand All @@ -260,6 +301,7 @@ void compress_coo_validate_args_(
if (arr.dtype().num() != vec[0].dtype().num())
throw pybind11::type_error(
"All chunks of COO arrays must be of same type.");
check_byteorder(arr.dtype());
}
}
for (uint64_t chunk_idx = 0; chunk_idx < n_chunks; chunk_idx++) {
Expand All @@ -268,9 +310,14 @@ void compress_coo_validate_args_(
throw std::length_error(
"All COO array tuple elements must be of the same size.");
}

if (Bp.ndim() != 1 || Bj.ndim() != 1 || Bd.ndim() != 1)
throw std::length_error("All arrays must be of dimension rank 1.");

check_byteorder(Bp.dtype());
check_byteorder(Bj.dtype());
check_byteorder(Bd.dtype());

for (auto& arr : Ad)
if (arr.dtype().num() != Bd.dtype().num())
throw pybind11::type_error("All data arrays must be of same type.");
Expand Down Expand Up @@ -407,6 +454,10 @@ bool sort_csx_indices(
if (!Bp.writeable() || !Bj.writeable() || !Bd.writeable())
throw std::invalid_argument("Output arrays must be writeable.");

check_byteorder(Bp.dtype());
check_byteorder(Bj.dtype());
check_byteorder(Bd.dtype());

// Get dispatch types
CsxIndexType csx_major_index_type = lookup_dtype_(
csx_index_type_dispatch, Bp.dtype(), "CSx indptr array");
Expand Down
79 changes: 79 additions & 0 deletions apis/python/tests/test_libfastercsx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import sys
from typing import Any

import numpy as np
Expand All @@ -11,6 +12,8 @@
import tiledbsoma.pytiledbsoma as clib
import tiledbsoma.pytiledbsoma.fastercsx as fastercsx

NON_NATIVE_BYTEORDER = ">" if sys.byteorder == "little" else "<"


@pytest.fixture
def concurrency() -> int | None:
Expand Down Expand Up @@ -243,6 +246,14 @@ def test_sort_csx_indices_bad_args(
pbad[1] = -1
fastercsx.sort_csx_indices(context, pbad, j, d)

# non-native byteorder should throw
with pytest.raises(ValueError):
fastercsx.sort_csx_indices(context, p.astype(f"{NON_NATIVE_BYTEORDER}i4"), j, d)
with pytest.raises(ValueError):
fastercsx.sort_csx_indices(context, p, j.astype(f"{NON_NATIVE_BYTEORDER}i4"), d)
with pytest.raises(ValueError):
fastercsx.sort_csx_indices(context, p, j, d.astype(f"{NON_NATIVE_BYTEORDER}i4"))


def test_compress_coo_bad_args(
rng: np.random.Generator, context: clib.SOMAContext
Expand Down Expand Up @@ -312,6 +323,74 @@ def test_compress_coo_bad_args(
context, sp.shape, (i,), (j,), (d[1:],), indptr, indices, data
)

# non-native byteorder should throw
with pytest.raises(ValueError):
fastercsx.compress_coo(
context,
sp.shape,
(i.astype(f"{NON_NATIVE_BYTEORDER}i4"),),
(j,),
(d,),
indptr,
indices,
data,
)
with pytest.raises(ValueError):
fastercsx.compress_coo(
context,
sp.shape,
(i,),
(j.astype(f"{NON_NATIVE_BYTEORDER}i4"),),
(d,),
indptr,
indices,
data,
)
with pytest.raises(ValueError):
fastercsx.compress_coo(
context,
sp.shape,
(i,),
(j,),
(d.astype(f"{NON_NATIVE_BYTEORDER}i4"),),
indptr,
indices,
data,
)
with pytest.raises(ValueError):
fastercsx.compress_coo(
context,
sp.shape,
(i,),
(j,),
(d,),
indptr.astype(f"{NON_NATIVE_BYTEORDER}i4"),
indices,
data,
)
with pytest.raises(ValueError):
fastercsx.compress_coo(
context,
sp.shape,
(i,),
(j,),
(d,),
indptr,
indices.astype(f"{NON_NATIVE_BYTEORDER}i4"),
data,
)
with pytest.raises(ValueError):
fastercsx.compress_coo(
context,
sp.shape,
(i,),
(j,),
(d,),
indptr,
indices,
data.astype(f"{NON_NATIVE_BYTEORDER}i4"),
)


def test_ragged_chunk_error(
rng: np.random.Generator, context: clib.SOMAContext
Expand Down
37 changes: 25 additions & 12 deletions libtiledbsoma/src/utils/fastercsx.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
#include <numeric>
#include "span/span.hpp"

#include <spdlog/fmt/fmt.h>
#include "parallel_functions.h"

namespace tiledbsoma::fastercsx {
Expand Down Expand Up @@ -201,11 +200,19 @@ void count_rows_(
if ((row < 0) ||
(static_cast<std::make_unsigned_t<COO_IDX>>(row) >=
n_row)) [[unlikely]] {
throw std::out_of_range(fmt::format(
"First coordinate {} out of range {}.",
row,
0,
n_row));
// std::format isn't until C++ 20, and including
// utils/logger.h or spdlog/fmt/fmt.h is fiddly
// inside of a header file (fastercsx.h). For
// the moment (release-1.15, with C++ 17, as of
// 2025-01-10), we use stringstream.
// This is all temporary, this-branch-only, and
// will go away entirely once we have
// https://github.com/single-cell-data/TileDB-SOMA/issues/3154
// resolved.
std::stringstream ss;
ss << "First coordinate " << row << " out of range "
<< n_row << ".";
throw std::out_of_range(ss.str());
}
counts[row]++;
}
Expand All @@ -229,8 +236,10 @@ void count_rows_(
if ((row < 0) ||
(static_cast<std::make_unsigned_t<COO_IDX>>(row) >= n_row))
[[unlikely]] {
throw std::out_of_range(fmt::format(
"First coordinate {} out of range {}.", row, 0, n_row));
std::stringstream ss;
ss << "First coordinate " << row << " out of range "
<< n_row << ".";
throw std::out_of_range(ss.str());
}
Bp[row]++;
}
Expand Down Expand Up @@ -278,8 +287,10 @@ void compress_coo_inner_left_(
if ((Aj_[n] < 0) ||
(static_cast<std::make_unsigned_t<COO_IDX>>(Aj_[n]) >= n_col))
[[unlikely]] {
throw std::out_of_range(fmt::format(
"Second coordinate {} out of range {}.", Aj_[n], 0, n_col));
std::stringstream ss;
ss << "Second coordinate " << Aj_[n] << " out of range " << n_col
<< ".";
throw std::out_of_range(ss.str());
}
Bj[dest] = Aj_[n];
Bd[dest] = Ad_[n];
Expand Down Expand Up @@ -313,8 +324,10 @@ void compress_coo_inner_right_(
if ((Aj_[n] < 0) ||
(static_cast<std::make_unsigned_t<COO_IDX>>(Aj_[n]) >= n_col))
[[unlikely]] {
throw std::out_of_range(fmt::format(
"Second coordinate {} out of range {}.", Aj_[n], 0, n_col));
std::stringstream ss;
ss << "Second coordinate " << Aj_[n] << " out of range " << n_col
<< ".";
throw std::out_of_range(ss.str());
}

Bj[dest] = Aj_[n];
Expand Down

0 comments on commit 82aa01c

Please sign in to comment.