From 7465d12869bbfb7d9049e9fa38734cb7418c1f10 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Sat, 11 Nov 2023 15:45:58 -0700 Subject: [PATCH] Add P1673R13 --- D1673/P1673R13.html | 11040 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 11040 insertions(+) create mode 100644 D1673/P1673R13.html diff --git a/D1673/P1673R13.html b/D1673/P1673R13.html new file mode 100644 index 00000000..1d914594 --- /dev/null +++ b/D1673/P1673R13.html @@ -0,0 +1,11040 @@ + + + + + + + + A free function linear algebra interface based on the BLAS + + + + + + + + +
+
+

A free function linear +algebra interface based on the BLAS

+ + + + + + + + + + + + + + + + + + +
Document #: P1673
Date: 2023-11-10
Project: Programming Language C++
+ LEWG
+
Reply-to: + Mark Hoemmen
<>
+ Daisy Hollman
<>
+ Christian Trott
<>
+ Daniel Sunderland
<>
+ Nevin Liber
<>
+ Alicia Klinvex
<>
+ Li-Ta Lo
<>
+ Damien Lebrun-Grandie
<>
+ Graham Lopez
<>
+ Peter Caday
<>
+ Sarah Knepper
<>
+ Piotr Luszczek
<>
+ Timothy Costa
<>
+
+ +
+
+
+

Contents

+ +
+

1 Authors and contributors

+

1.1 Authors

+
    +
  • Mark Hoemmen (mhoemmen@nvidia.com) (NVIDIA)

  • +
  • Daisy Hollman (cpp@dsh.fyi) (Google)

  • +
  • Christian Trott (crtrott@sandia.gov) (Sandia National +Laboratories)

  • +
  • Daniel Sunderland (dansunderland@gmail.com)

  • +
  • Nevin Liber (nliber@anl.gov) (Argonne National +Laboratory)

  • +
  • Alicia Klinvex (alicia.klinvex@unnpp.gov) (Naval Nuclear +Laboratory)

  • +
  • Li-Ta Lo (ollie@lanl.gov) (Los Alamos National +Laboratory)

  • +
  • Damien Lebrun-Grandie (lebrungrandt@ornl.gov) (Oak Ridge National +Laboratory)

  • +
  • Graham Lopez (glopez@nvidia.com) (NVIDIA)

  • +
  • Peter Caday (peter.caday@intel.com) (Intel)

  • +
  • Sarah Knepper (sarah.knepper@intel.com) (Intel)

  • +
  • Piotr Luszczek (luszczek@icl.utk.edu) (University of +Tennessee)

  • +
  • Timothy Costa (tcosta@nvidia.com) (NVIDIA)

  • +
+

1.2 Contributors

+
    +
  • Zach Laine (particular thanks for R12 review and +suggestions)

  • +
  • Chip Freitag (chip.freitag@amd.com) (AMD)

  • +
  • Bryce Adelstein Lelbach (brycelelbach@gmail.com) +(NVIDIA)

  • +
  • Srinath Vadlamani (Srinath.Vadlamani@arm.com) (ARM)

  • +
  • Rene Vanoostrum (Rene.Vanoostrum@amd.com) (AMD)

  • +
+

2 Revision history

+
    +
  • Revision 0 (pre-Cologne) submitted 2019-06-17

    +
      +
    • Received feedback in Cologne from SG6, LEWGI, and (???).
    • +
  • +
  • Revision 1 (pre-Belfast) to be submitted 2019-10-07

    +
      +
    • Account for Cologne 2019 feedback

      +
        +
      • Make interface more consistent with existing Standard +algorithms

      • +
      • Change dot, dotc, +vector_norm2, and vector_abs_sum to imitate +reduce, so that they return their result, instead of taking +an output parameter. Users may set the result type via optional +init parameter.

      • +
    • +
    • Minor changes to “expression template” classes, based on +implementation experience

    • +
    • Briefly address LEWGI request of exploring concepts for input +arguments.

    • +
    • Lazy ranges style API was NOT explored.

    • +
  • +
  • Revision 2 (pre-Prague) to be submitted 2020-01-13

    +
      +
    • Add “Future work” section.

    • +
    • Remove “Options and votes” section (which were addressed in SG6, +SG14, and LEWGI).

    • +
    • Remove basic_mdarray overloads.

    • +
    • Remove batched linear algebra operations.

    • +
    • Remove over- and underflow requirement for +vector_norm2.

    • +
    • Mandate any extent compatibility checks that can be done +at compile time.

    • +
    • Add missing functions +{symmetric,hermitian}_matrix_rank_k_update and +triangular_matrix_{left,right}_product.

    • +
    • Remove packed_view function.

    • +
    • Fix wording for +{conjugate,transpose,conjugate_transpose}_view, so that +implementations may optimize the return type. Make sure that +transpose_view of a layout_blas_packed matrix +returns a layout_blas_packed matrix with opposite +Triangle and StorageOrder.

    • +
    • Remove second template parameter T from +accessor_conjugate.

    • +
    • Make scaled_scalar and +conjugated_scalar exposition only.

    • +
    • Add in-place overloads of +triangular_matrix_matrix_{left,right}_solve, +triangular_matrix_{left,right}_product, and +triangular_matrix_vector_solve.

    • +
    • Add alpha overloads to +{symmetric,hermitian}_matrix_rank_{1,k}_update.

    • +
    • Add Cholesky factorization and solve examples.

    • +
  • +
  • Revision 3 (electronic) to be submitted 2021-04-15

    +
      +
    • Per LEWG request, add a section on our investigation of +constraining template parameters with concepts, in the manner of P1813R0 +with the numeric algorithms. We concluded that we disagree with the +approach of P1813R0, and that the Standard’s current +GENERALIZED_SUM approach better expresses numeric algorithms’ +behavior.

    • +
    • Update references to the current revision of P0009 +(mdspan).

    • +
    • Per LEWG request, introduce std::linalg namespace +and put everything in there.

    • +
    • Per LEWG request, replace the linalg_ prefix with +the aforementioned namespace. We renamed linalg_add to +add, linalg_copy to copy, and +linalg_swap to swap_elements.

    • +
    • Per LEWG request, do not use _view as a suffix, to +avoid confusion with “views” in the sense of Ranges. We renamed +conjugate_view to conjugated, +conjugate_transpose_view to +conjugate_transposed, scaled_view to +scaled, and transpose_view to +transposed.

    • +
    • Change wording from “then implementations will use +T’s precision or greater for intermediate terms in the +sum,” to “then intermediate terms in the sum use T’s +precision or greater.” Thanks to Jens Maurer for this suggestion (and +many others!).

    • +
    • Before, a Note on vector_norm2 said, “We recommend +that implementers document their guarantees regarding overflow and +underflow of vector_norm2 for floating-point return types.” +Implementations always document “implementation-defined behavior” per +[defs.impl.defined]. (Thanks to Jens Maurer for +pointing out that “We recommend…” does not belong in the Standard.) +Thus, we changed this from a Note to normative wording in Remarks: “If +either in_vector_t::element_type or T are +floating-point types or complex versions thereof, then any guarantees +regarding overflow and underflow of vector_norm2 are +implementation-defined.”

    • +
    • Define return types of the dot, dotc, +vector_norm2, and vector_abs_sum overloads +with auto return type.

    • +
    • Remove the explicitly stated constraint on add and +copy that the rank of the array arguments be no more than +2. This is redundant, because we already impose this via the existing +constraints on template parameters named in_object*_t, +inout_object*_t, or out_object*_t. If we later +wish to relax this restriction, then we only have to do so in one +place.

    • +
    • Add vector_sum_of_squares. First, this gives +implementers a path to implementing vector_norm2 in a way +that achieves the over/underflow guarantees intended by the BLAS +Standard. Second, this is a useful algorithm in itself for parallelizing +vector 2-norm computation.

    • +
    • Add matrix_frob_norm, matrix_one_norm, +and matrix_inf_norm (thanks to coauthor Piotr +Luszczek).

    • +
    • Address LEWG request for us to investigate support for GPU +memory. See section “Explicit support for asynchronous return of scalar +values.”

    • +
    • Add ExecutionPolicy overloads of the in-place +versions of triangular_matrix_vector_solve, +triangular_matrix_left_product, +triangular_matrix_right_product, +triangular_matrix_matrix_left_solve, and +triangular_matrix_matrix_right_solve.

    • +
  • +
  • Revision 4 (electronic), to be submitted 2021-08-15

    +
      +
    • Update authors’ contact info.

    • +
    • Rebase atop P2299R3, which in turn sits atop P0009R12. Make any +needed fixes due to these changes. (P1673R3 was based on P0009R10, +without P2299.) Update P0009 references to point to the latest version +(R12).

    • +
    • Fix requirements for +{symmetric,hermitian}_matrix_{left,right}_product.

    • +
    • Change SemiRegular<Scalar> to +semiregular<Scalar>.

    • +
    • Make Real requirements refer to +[complex.numbers.general], rather than explicitly +listing allowed types. Remove redundant constraints on +Real.

    • +
    • In [linalg.algs.reqs], clarify that “unique +layout” for output matrix, vector, or object types means +is_always_unique() equals true.

    • +
    • Change file format from Markdown to Bikeshed.

    • +
    • Impose requirements on the types on which algorithms compute, and +on the algorithms themselves (e.g., what rearrangements are permitted). +Add a section explaining how we came up with the requirements. Lift the +requirements into a new higher-level section that applies to the entire +contents of [linalg], not just to +[linalg.algs].

    • +
    • Add “Overview of contents” section.

    • +
    • In the last review, LEWG had asked us to consider using +exposition-only concepts and requires clauses to express +requirements more clearly. We decided not to do so, because we did not +think it would add clarity.

    • +
    • Add more examples.

    • +
  • +
  • Revision 5 (electronic), to be submitted 2021-10-15

    +
      +
    • P0009R13 (to be submitted 2021-10-15) changes mdspan to +use operator[] instead of operator() as the +array access operator. Revision 5 of P1673 adopts this change, and is +“rebased” atop P0009R13.
    • +
  • +
  • Revision 6 (electronic), to be submitted 2021-12-15

    +
      +
    • Update references to P0009 (P0009R14) and P2128 +(P2128R6).

    • +
    • Fix typos in *rank_2k descriptions.

    • +
    • Remove references to any mdspan rank greater than 2. +(These were left over from earlier versions of the proposal that +included “batched” operations.)

    • +
    • Fix vector_sum_of_squares name in BLAS comparison +table.

    • +
    • Replace “Requires” with “Preconditions,” per new wording +guidelines.

    • +
    • Remove all overloads of +symmetric_matrix_rank_k_update and +hermitian_matrix_rank_k_update that do not take an +alpha parameter. This prevents ambiguity between overloads +that take ExecutionPolicy&& but not +alpha, and overloads that take alpha but not +ExecutionPolicy&&.

    • +
    • Harmonize with the implementation, by adding +operator+, operator*, and comparison operators +to conjugated_scalar.

    • +
  • +
  • Revision 7 (electronic), to be submitted 2022-04-15

    +
      +
    • Update author affiliations and e-mail addresses

    • +
    • Update proposal references

    • +
    • Fix typo observed +here

    • +
    • Add missing ExecutionPolicy overload of in-place +triangular_matrix_vector_product; issue was observed +here

    • +
    • Fix mixed-up order of sum_of_squares_result +aggregate initialization arguments in vector_norm2 +note

    • +
    • Fill in missing parts of matrix_frob_norm and +vector_norm2 specification, addressing +this +issue

    • +
  • +
  • Revision 8 (electronic), to be submitted 2022-05-15

    +
      +
    • Fix Triangle and R[0,0] in Cholesky +TSQR example

    • +
    • Explain why we apply Triangle to the possibly +transformed input matrix, while the BLAS applies UPLO to +the original input matrix

    • +
    • Optimize transposed for all known layouts, so as to +avoid use of layout_transpose when not needed; fix +computation of strides for transposed layouts

    • +
    • Fix matrix extents in constraints and mandates for +symmetric_matrix_rank_k_update and +hermitian_matrix_rank_k_update (thanks to Mikołaj Zuzek +(NexGen Analytics, mikolaj.zuzek@ng-analytics.com) for +reporting the issue)

    • +
    • Resolve vagueness in constness of return type of +transposed

    • +
    • Resolve vagueness in constness of return type of +scaled, and make its element type the type of the product, +rather than forcing it back to the input mdspan’s element +type

    • +
    • Remove decay member function from +accessor_conjugate and accessor_scaled, as it +is no longer part of mdspan’s accessor policy +requirements

    • +
    • Make sure accessor_conjugate and +conjugated work correctly for user-defined complex types, +introduce conj-if-needed to simplify wording, and +resolve vagueness in constness of return type of +conjugated. Make sure that +conj-if-needed works for custom types where +conj is not type-preserving. (Thanks to Yu You (NVIDIA, +yuyou@nvidia.com) and Phil Miller (Intense Computing, +phil.miller@intensecomputing.com) for helpful +discussions.)

    • +
    • Fix typo in givens_rotation_setup for complex +numbers, and other typos (thanks to Phil Miller for reporting the +issue)

    • +
  • +
  • Revision 9 (electronic), to be submitted 2022-06-15

    +
      +
    • Apply to-be-submitted P0009R17 changes (see P2553 in particular) +to all layouts, accessors, and examples in this proposal.

    • +
    • Improve +triangular_matrix_matrix_{left,right}_solve() “mathematical +expression of the algorithm” wording.

    • +
    • layout_blas_packed: Fix +required_span_size() and operator() +wording

    • +
    • Make sure all definitions of lower and upper triangle are +consistent.

    • +
    • Changes to layout_transpose:

      +
        +
      • Make +layout_transpose::mapping(const nested_mapping_type&) +constructor explicit, to avoid inadvertent +transposition.

      • +
      • Remove the following Constraint on +layout_transpose::mapping: “for all specializations +E of extents with E::rank() equal +to 2, +typename Layout::template mapping<E>::is_always_unique() +is true.” (This Constraint was not correct, because the +underlying mapping is allowed to be nonunique.)

      • +
      • Make layout_transpose::mapping::stride wording +independent of rank() equals 2 constraint, to improve +consistency with rest of layout_transpose wording.

      • +
    • +
    • Changes to scaled and conjugated:

      +
        +
      • Include and specify all the needed overloaded arithmetic +operators for scaled_scalar and +conjugated_scalar, and fix accessor_scaled and +accessor_conjugate accordingly.

      • +
      • Simplify scaled to ensure preservation of order of +operations.

      • +
      • Add missing nested_accessor() to +accessor_scaled.

      • +
      • Add hidden friends abs, real, +imag, and conj to common subclass of +scaled_scalar and conjugated_scalar. Add +wording to algorithms that use abs, real, +and/or imag, to indicate that these functions are to be +found by unqualified lookup. (Algorithms that use conjugation already +use conj-if-needed in their wording.)

      • +
    • +
    • Changes suggested by SG6 small group review on 2022/06/09

      +
        +
      • Make existing exposition-only function +conj-if-needed use conj if it can +find it via unqualified (ADL-only) lookup, otherwise be the identity. +Make it a function object instead of a function, to prevent ADL +issues.

      • +
      • Algorithms that mathematically need to do division can now +distinguish left division and right division (for the case of +noncommutative multiplication), by taking an optional +BinaryDivideOp binary function object parameter. If none is +given, binary operator/ is used.

      • +
    • +
    • Changes suggested by LEWG review of P1673R8 on 2022/05/24

      +
        +
      • LEWG asked us to add a section to the paper explaining why we +don’t define an interface for customization of the “back-end” optimized +BLAS operations. This section already existed, but we rewrote it to +improve clarity. Please see the section titled “We do not require using +the BLAS library or any particular ‘back-end’.”

      • +
      • LEWG asked us to add a section to the paper showing how BLAS 1 +and ranges algorithms would coexist. We added this section, titled +“Criteria for including BLAS 1 algorithms; coexistence with +ranges.”

      • +
      • Address LEWG feedback to defer support for custom complex number +types (but see above SG6 small group response).

      • +
    • +
    • Fix P1674 links to point to R2.

    • +
  • +
  • Revision 10 (electronic), submitted 2022-10-15

    +
      +
    • Revise scaled and conjugated +wording.

    • +
    • Make all matrix view functions constexpr.

    • +
    • Rebase atop P2642R1. Remove wording saying that we rebase atop +P0009R17. (We don’t need that any more, because P0009 was merged into +the current C++ draft.)

    • +
    • Remove layout_blas_general, as it has been replaced +with the layouts proposed by P2642 (layout_left_padded and +layout_right_padded).

    • +
    • Update layout_blas_packed to match mdspan’s other +layout mappings in the current C++ Standard draft.

    • +
    • Update accessors to match mdspan’s other accessors in the current +C++ Standard draft.

    • +
    • Update non-wording to reflect current status of +mdspan (voted into C++ Standard draft) and +submdspan (P2630).

    • +
  • +
  • Revision 11, to be submitted 2023-01-15

    +
      +
    • Remove requirement that in_{vector,matrix,object}*_t +have unique layout.

    • +
    • Change from name-based requirements +(in_{vector,matrix,object}*_t) to exposition-only concepts. +(This is our interpretation of LEWG’s Kona 2022/11/10 request to +“explore expressing constraints with concepts instead of named type +requirements” (see +https://github.com/cplusplus/papers/issues/557#issuecomment-1311054803).) +Add new section for exposition-only concepts and traits.

    • +
    • Add new exposition-only concept +possibly-packed-inout-matrix to constrain +symmetric and Hermitian update algorithms. These may write either to a +unique-layout mdspan or to a layout_blas_packed mdspan +(whose layout is nonunique).

    • +
    • Remove Constraints made redundant by the new exposition-only +concepts.

    • +
    • Remove unnecessary constraint on all algorithms that input +mdspan parameter(s) have unique layout.

    • +
    • Remove the requirement that vector / matrix / object template +parameters may deduce a const lvalue reference or a +(non-const) rvalue reference to an mdspan. The +new exposition-only concepts make this unnecessary.

    • +
    • Fix dot Remarks to include both vector +types.

    • +
    • Fix wording of several functions and examples to use +mdspan’s value_type alias instead of its +(potentially cv-qualified) element_type alias. This +includes dot, vector_sum_of_squares, +vector_norm2, vector_abs_sum, +matrix_frob_norm, matrix_one_norm, +matrix_inf_norm, and the QR factorization example.

    • +
    • Make matrix_vector_product template parameter order +consistent with parameter order.

    • +
    • Follow LEWG guidance to simplify Effects and Constraints (e.g., +removing wording referring to the “the mathematical expression for the +algorithm”) (as a kind of expression-implying constraints) by describing +them mathematically (in math font). This is our interpretation of the +“hand wavy do math” poll option that received a majority of votes at +Kona on 2022/11/10 (see +https://github.com/cplusplus/papers/issues/557#issuecomment-1311054803). +Revise [linalg.reqs.val] accordingly.

    • +
    • Change matrix_one_norm Precondition (that the result +of abs of a matrix element is convertible to +T) to a Constraint.

    • +
    • Change vector_abs_sum Precondition (that the result +of init + abs(v[i]) is convertible to T) to a +Constraint.

    • +
    • Reformat entire document to use Pandoc instead of Bikeshed. This +made it possible to add paragraph numbers and fix exposition-only +italics.

    • +
    • In conjugated-scalar, make +conjugatable<ReferenceValue> a +Constraint instead of a Mandate.

    • +
    • Delete “If an algorithm in [linalg.algs] accesses the +elements of an out-vector, +out-matrix, or out-object, +it will do so in write-only fashion.” This would prevent implementations +from, e.g., zero-initializing the elements and then updating them with ++=.

    • +
    • Rebase atop P2642R2 (updating from R1).

    • +
    • Add explicit defaulted default constructors to all +the tag types, in imitation of in_place_t.

    • +
    • If two objects refer to the same matrix or vector, we no longer +say that they must have the same layout. First, “same layout” doesn’t +have to mean the same type. For example, a +layout_stride::mapping instance may represent the same +layout as a layout_left::mapping instance. Second, the two +objects can’t represent “the same matrix” or “the same vector” if they +have different layout mappings (in the mathematical sense, not in the +type sense).

    • +
    • Make sure that all updating methods say that the input(s) can be +the same as the output (not all inputs, just the ones for which that +makes sense – e.g., for the matrix-vector product z = y + Ax, +y and z can refer to the same vector, but not +x and z).

    • +
    • Change givens_rotation_setup to return outputs in +the new givens_rotation_setup_result struct, instead of as +output parameters.

    • +
  • +
  • Revision 12, to be submitted 2023-03-15

    +
      +
    • Change “complex version(s) thereof” to “specialization(s) of +complex”

    • +
    • Remove Note (“conjugation is self-annihilating”) repeating the +contents of a note a few lines before

    • +
    • Remove Notes with editorial or tutorial content

    • +
    • Remove outdated wildcard name-based requirements +language

    • +
    • Remove Notes that incorrectly stated that the input matrix was +symmetric or Hermitian (it’s not necessarily symmetric or Hermitian; +it’s just interpreted that way)

    • +
    • Remove implementation freedom Notes

    • +
    • Update non-wording text referring to P1467 (which was voted into +C++23)

    • +
    • Change “the following requirement(s)” to “the following +element(s),” as a “requirement” is a kind of element

    • +
    • Change “algorithm or method” to “function”

    • +
    • Change Preconditions elements that should be something else +(generally Mandates) to that something else

    • +
    • In some cases where it makes sense, use +extents::operator== in elements, instead of comparing +extent(r) for each r

    • +
    • For vector_sum_of_squares, remove last remnants of +R10 “mathematical expression of the algorithm” wording

    • +
    • Add section [linalg.general] (“General”) that explains +mathematical notation, the interpretation of Triangle t +parameters, and that calls to abs, conj, +imag, and real are unqualified. Move the +definitions of lower triangle, upper triangle, and diagonal from +[linalg.tags.triangle] into [linalg.general]. Move the +definitions of implicit unit diagonal and explicit diagonal from +[linalg.tags.diagonal] into [linalg.general]. Remove +Remarks on Triangle and DiagonalStorage and +definitions of AT and AH that +[linalg.general] makes redundant.

    • +
    • Replace “Effects: Equivalent to: return X;” +with “Returns: X”. Fix formatting of multiple +Returns: cases. Change “name(s) the type” to “is” or +“be.”

    • +
    • In [linalg.tags.order], add missing forward +reference.

    • +
    • Replace “if applicable” with hopefully canonical +wording.

    • +
    • Audit complexity elements and revise their wording.

    • +
    • Nonwording: Update reference to P2128 to reflect its adoption +into C++23, and remove outdated future work.

    • +
    • Nonwording: Add implementation experience section.

    • +
    • Nonwording: Update P1385 reference from R6 to R7, and move the +“interoperable with other linear algebra proposals” section to a more +fitting place.

    • +
    • Remove reference from the list of linear algebra +value types in [linalg.reqs.val].

    • +
    • Add feature test macro __cpp_lib_linalg.

    • +
    • Remove “Throws: Nothing” from givens_rotation_setup +because it doesn’t need to be explicitly stated, and make all overloads +noexcept.

    • +
    • transposed no longer returns a read-only +mdspan.

    • +
    • Remove const from by-value parameters, and pass +linear algebra value types by value, including +complex.

    • +
    • Rename vector_norm2 to +vector_two_norm.

    • +
    • symmetric_matrix_rank_k_update and +hermitian_matrix_rank_k_update (“the BLAS 3 +*_update functions”) now have overloads that do not take an +alpha scaling parameter.

    • +
    • Change +{symmetric,hermitian,triangular}_matrix_{left,right}_product +to {symmetric,hermitian,triangular}_matrix_product. +Distinguish the left and right cases by order of the A, t +and B (or C, in the case of in-place +triangular_matrix_product) parameters. NOTE: As a result, +triangular_matrix_product (for the in-place right product +case) is now an exception to the rule that output or in/out parameters +appear last.

    • +
    • In [in]out-{matrix,vector,object}, instead +of checking if the element type is const, check if the element type can +be assigned to the reference type.

    • +
    • Split [linalg.reqs.val] into two sections: requirements +on linear algebra value types (same label) and algorithm and class +requirements [linalg.reqs.alg].

    • +
    • In wording of conj-if-needed, add missing +definition of the type T.

    • +
    • Add exposition-only real-if-needed and +imag-if-needed, and use them to make +vector_abs_sum behave the same for custom complex types as +for std::complex.

    • +
    • Fix two design issues with idx_abs_max.

      +
        +
      1. For the complex case, make it behave like the BLAS’s ICAMAX or +IZAMAX, by using abs-if-needed( +real-if-needed (z[k])) + +abs-if-needed( +imag-if-needed (z[k])) as the element +absolute value definition for complex numbers, instead of +abs(z[k]).

      2. +
      3. Make idx_abs_max behave the same for custom complex +types as for std::complex.

      4. +
    • +
    • Simplify wording for proxy-reference’s +hidden friends real, imag, and +conj, so that they just defer to the corresponding +*-if-needed exposition-only functions, rather than +duplicating the wording of those functions.

    • +
    • Add exposition-only function abs-if-needed +to address std::abs not being defined for unsigned integer +types (which manifests as an ambiguous lookup compiler error). Simplify +wording for proxy-reference’s hidden friend +abs to defer to abs-if-needed. Use +abs-if-needed instead of abs +throughout P1673.

    • +
    • Change remarks about aliasing (e.g., “y and z may refer to the same vector”) to +say “object” instead of “vector” or “matrix.”

    • +
    • For in-place overwriting triangular matrix-matrix {left, right} +product, restore the “left” and “right” in their names, and always put +the input/output parameter at the end. (This restores +triangular_matrix_left_product and +triangular_matrix_right_product for only the in-place +overwriting case. See above changes for this revision.)

    • +
    • Add Demmel 2002 reference to the (C++ Standard) +Bibliography.

    • +
    • Rephrase [linalg.reqs.val] to use “The type T must” +language, and add real-if-needed and +imag-if-needed to the list of expressions +there.

    • +
    • Rephrase [linalg.reqs.alg] to make it more like +sort, e.g., not making it explicitly a constraint that +certain expressions are well formed.

    • +
    • Add to [linalg.reqs.val] that a value-initialized object +of linear algebra value type acts as the additive identity.

    • +
    • Define what it means for two mdspan to “alias” each other. +Instead of saying that two things may refer to the same object, say that +they may alias.

    • +
    • Change the name of the T init parameters and the +template parameter of sum_of_squares_result to allow +simplification of [linalg.reqs.val].

    • +
    • Delete Note explaining BLAS 1, 2, and 3.

    • +
  • +
  • Revision 13 - running revision for LWG review

    +
      +
    • make scaled_accessor::reference +const element_type

    • +
    • add converting, default and copy ctor to +scaled_accessor

    • +
    • rename accessor_conjugate to +conjugated_accessor

    • +
    • rename accessor_scaled to +scaled_accessor

    • +
    • rename givens_rotation_setup to +setup_givens_rotation

    • +
    • rename givens_rotation_apply to +apply_givens_rotation

    • +
    • fix example for scaled

    • +
    • implement helper functions for algorithm mandates and +preconditions and apply it to gemv

    • +
    • fix hanging sections throughout

    • +
    • use stable tags instead of “in this clause”

    • +
    • make linalg-reqs-flpt a note

    • +
    • fix linalg.algs.reqs constraint -> precondition

    • +
    • fix unary plus and well formed

    • +
    • remove proxy-reference , +scaled_scalar and +conjugated_scalar

    • +
    • Redo conjugated to simply rely on deduction guide +for mdspan

    • +
    • fix numbering via dummy sections

    • +
    • Define what it means for two mdspan to “overlap” each other. +Replace “shall view a disjoint set of elements of” wording (was +[linalg.concepts] 3 in R12) with “shall not overlap.” “Alias” retains +its R12 meaning (view the same elements in the same order). This lets us +retain existing use of “alias” in describing algorithms.

    • +
    • rename idx_abs_max to +vector_idx_abs_max

    • +
    • make “may alias” a transitive verb phrase, and put all such +wording expressions in the form “Output may alias Input”

    • +
    • fix matrix_rank_1_update* wording (rename template +parameters to InOutMat, and fix Effects so that the +algorithms are updating; use new “Computes” wording)

    • +
    • make sure all Effects-equivalent-to that use the execution policy +use std::forward<ExecutionPolicy>(exec), instead of +passing exec directly

    • +
    • change [linalg.alg.*] stable names to [linalg.algs.*], for +consistency

    • +
    • fix stable names for BLAS 2 rank-1 symmetric and Hermitian +updates

    • +
    • Remove any wording (e.g., for transposed) that +depends on P2642 (padded mdspan layouts). We can restore and correct +that wording later.

    • +
    • Fix [linalg.algs.reqs] 1 by changing “type requirements” to +“Constraints.” Add the Constraint that ExecutionPolicy is an execution +policy. Remove the requirement that the algorithms that take +ExecutionPolicy are parallel algorithms, because that would +be circular with [algorithms.parallel] 2 (“A +parallel algorithm is a function template listed in this +document with a template parameter named +ExecutionPolicy”).

    • +
    • make layout_blas_packed::mapping::operator() take +exactly two parameters, rather than a pack

    • +
    • for default BinaryDivideOp, replace lambda with +divides<void>{}

    • +
    • add definitions of the “rows” and “columns” of a matrix to +[linalg.general], so that [linalg.tags.order] can refer to rows and +columns

    • +
    • layout_blas_packed::mapping::operator(): Pass input +parameters through index-cast before using them in +the formulas.

    • +
    • remove spurious return value from +layout_blas_packed::mapping::stride (the case where the +Precondition would have been violated anyway)

    • +
    • Move exposition-only helpers +transpose-extents and +transpose-extents-t to the new section +[linalg.transp.helpers]. Redefine +transpose-extents-t in terms of +transpose-extents, rather than the other way +around.

    • +
    • For triangular_*, symmetric_*, and +hermitian_* functions that take a Triangle +parameter and an mdspan with +layout_blas_packed layout, change the requirement that the +layout’s Triangle match the function’s +Triangle parameter, from a Constraint to a Mandate. This +should not result in ambiguous overloads, since Triangle is +already Constrained to be upper_triangle_t or +lower_triangle_t. Add a nonwording section explaining this +design choice.

    • +
    • Add triangle_type and +storage_order_type public type aliases to +layout_blas_packed.

    • +
    • Fix layout_blas_packed requirements so that wording +of operator() and other members doesn’t need to consider +overflow.

    • +
    • Add possibly-packed-inout-matrix to the +list of concepts in [linalg.helpers.concepts] 3 that forbid overlap +unless explicitly permitted.

    • +
    • Make sure that Complexity clauses use a math multiplication +symbol instead of code-font *. (The latter would cause +unwarranted overflow issues, especially for BLAS 3 functions.)

    • +
    • Many more wording fixes based on LWG review

    • +
  • +
+

3 Purpose of this paper

+

This paper proposes a C++ Standard Library dense linear algebra +interface based on the dense Basic Linear Algebra Subroutines (BLAS). +This corresponds to a subset of the BLAS +Standard. Our proposal implements the following classes of +algorithms on arrays that represent matrices and vectors:

+
    +
  • elementwise vector sums;

  • +
  • multiplying all elements of a vector or matrix by a +scalar;

  • +
  • 2-norms and 1-norms of vectors;

  • +
  • vector-vector, matrix-vector, and matrix-matrix products +(contractions);

  • +
  • low-rank updates of a matrix;

  • +
  • triangular solves with one or more “right-hand side” vectors; +and

  • +
  • generating and applying plane (Givens) rotations.

  • +
+

Our algorithms work with most of the matrix storage formats that the +BLAS Standard supports:

+
    +
  • “general” dense matrices, in column-major or row-major +format;

  • +
  • symmetric or Hermitian (for complex numbers only) dense matrices, +stored either as general dense matrices, or in a packed format; +and

  • +
  • dense triangular matrices, stored either as general dense +matrices or in a packed format.

  • +
+

Our proposal also has the following distinctive characteristics.

+
    +
  • It uses free functions, not arithmetic operator +overloading.

  • +
  • The interface is designed in the spirit of the C++ Standard +Library’s algorithms.

  • +
  • It uses mdspan (adopted into C++23), a +multidimensional array view, to represent matrices and vectors.

  • +
  • The interface permits optimizations for matrices and vectors with +small compile-time dimensions; the standard BLAS interface does +not.

  • +
  • Each of our proposed operations supports all element types for +which that operation makes sense, unlike the BLAS, which only supports +four element types.

  • +
  • Our operations permit “mixed-precision” computation with matrices +and vectors that have different element types. This subsumes most +functionality of the Mixed-Precision BLAS specification, comprising +Chapter 4 of the +BLAS +Standard.

  • +
  • Like the C++ Standard Library’s algorithms, our operations take +an optional execution policy argument. This is a hook to support +parallel execution and hierarchical parallelism.

  • +
  • Unlike the BLAS, our proposal can be expanded to support +“batched” operations (see P2901) with almost no interface differences. +This will support machine learning and other applications that need to +do many small matrix or vector operations at once.

  • +
+

Here are some examples of what this proposal offers. In these +examples, we ignore std:: namespace qualification for +anything in our proposal or for mdspan. We start with a +“hello world” that scales the elements of a 1-D mdspan by a +constant factor, first sequentially, then in parallel.

+
  constexpr size_t N = 40;
+  std::vector<double> x_vec(N);
+
+  mdspan x(x_vec.data(), N);
+  for(size_t i = 0; i < N; ++i) {
+    x[i] = double(i);
+  }
+
+  linalg::scale(2.0, x); // x = 2.0 * x
+  linalg::scale(std::execution::par_unseq, 3.0, x);
+  for(size_t i = 0; i < N; ++i) {
+    assert(x[i] == 6.0 * double(i));
+  }
+

Here is a matrix-vector product example. It illustrates the +scaled function that makes our interface more concise, +while still permitting the BLAS’ performance optimization of fusing +computations with multiplications by a scalar. It also shows the ability +to exploit dimensions known at compile time, and to mix compile-time and +run-time dimensions arbitrarily.

+
constexpr size_t N = 40;
+constexpr size_t M = 20;
+
+std::vector<double> A_vec(N*M);
+std::vector<double> x_vec(M);
+std::array<double, N> y_vec;
+
+mdspan A(A_vec.data(), N, M);
+mdspan x(x_vec.data(), M);
+mdspan y(y_vec.data(), N);
+
+for(int i = 0; i < A.extent(0); ++i) {
+  for(int j = 0; j < A.extent(1); ++j) {
+    A[i,j] = 100.0 * i + j;
+  }
+}
+for(int j = 0; j < x.extent(0); ++j) {
+  x[j] = 1.0 * j;
+}
+for(int i = 0; i < y.extent(0); ++i) {
+  y[i] = -1.0 * i;
+}
+
+linalg::matrix_vector_product(A, x, y); // y = A * x
+
+// y = 0.5 * y + 2 * A * x
+linalg::matrix_vector_product(std::execution::par,
+  linalg::scaled(2.0, A), x,
+  linalg::scaled(0.5, y), y);
+

This example illustrates the ability to perform mixed-precision +computations, and the ability to compute on subviews of a matrix or +vector by using submdspan (P2630, adopted into the C++26 +draft). (submdspan was separated from the rest of P0009 as +a way to avoid delaying the adoption of P0009 into C++23. The reference +implementation of mdspan includes +submdspan.)

+
constexpr size_t M = 40;
+
+std::vector<float> A_vec(M*8*4);
+std::vector<double> x_vec(M*4);
+std::vector<double> y_vec(M*8);
+
+mdspan<float, extents<size_t, dynamic_extent, 8, 4>> A(A_vec.data(), M);
+mdspan<double, extents<size_t, 4, dynamic_extent>> x(x_vec.data(), M);
+mdspan<double, extents<size_t, dynamic_extent, 8>> y(y_vec.data(), M);
+
+for(size_t m = 0; m < A.extent(0); ++m) {
+  for(size_t i = 0; i < A.extent(1); ++i) {
+    for(size_t j = 0; j < A.extent(2); ++j) {
+      A[m,i,j] = 1000.0 * m + 100.0 * i + j;
+    }
+  }
+}
+for(size_t i = 0; i < x.extent(0); ++i) {
+  for(size_t m = 0; m < x.extent(1); ++m) {
+    x[i,m] = 33. * i + 0.33 * m;
+  }
+}
+for(size_t m = 0; m < y.extent(0); ++m) {
+  for(size_t i = 0; i < y.extent(1); ++i) {
+    y[m,i] = 33. * m + 0.33 * i;
+  }
+}
+
+for(size_t m = 0; m < M; ++m) {
+  auto A_m = submdspan(A, m, full_extent, full_extent);
+  auto x_m = submdspan(x, full_extent, m);
+  auto y_m = submdspan(y, m, full_extent);
+  // y_m = A * x_m
+  linalg::matrix_vector_product(A_m, x_m, y_m);
+}
+

4 Overview of contents

+

Section +5 motivates considering any dense +linear algebra proposal for the C++ Standard Library.

+

Section +6 shows why we chose the BLAS as a starting +point for our proposed library. The BLAS is an existing standard with +decades of use, a rich set of functions, and many optimized +implementations.

+

Section 7 lists what we consider general criteria for +including algorithms in the C++ Standard Library. We rely on these +criteria to justify the algorithms in this proposal.

+

Section 8 describes BLAS notation and conventions in +C++ terms. Understanding this will give readers context for algorithms, +and show how our proposed algorithms expand on BLAS functionality.

+

Section 9 lists functionality that we intentionally +exclude from our proposal. We imitate the BLAS in aiming to be a set of +“performance primitives” on which external libraries or applications may +build a more complete linear algebra solution.

+

Section 10 elaborates on our design justification. This +section explains

+
    +
  • why we use mdspan to represent matrix and vector +parameters;

  • +
  • how we translate the BLAS’ Fortran-centric idioms into +C++;

  • +
  • how the BLAS’ different “matrix types” map to different +algorithms, rather than different mdspan layouts;

  • +
  • how we express quality-of-implementation recommendations about +avoiding undue overflow and underflow;

  • +
  • how we impose requirements on algorithms’ behavior and on the +various value types that algorithms encounter;

  • +
  • how we support for user-defined complex number types and address +type preservation and domain issues with std::abs, +std::conj, std::real, and +std::imag;

  • +
  • how we support division for triangular solves, for value types +with noncommutative multiplication; and

  • +
  • how we address consistency between +layout_blas_packed having a Triangle template +parameter, and functions also taking a Triangle +parameter.

  • +
+

Section 11 +lists future work, that is, ways future proposals could build on this +one.

+

Section +12 gives the data structures and utilities from +other proposals on which we depend. In particular, we rely heavily on +mdspan (adopted into C++23), and add custom layouts and +accessors.

+

Section 13 briefly summarizes the existing +implementations of this proposal.

+

Section +14 explains how this proposal is interoperable +with other linear algebra proposals currently under WG21 review. In +particular, we believe this proposal is complementary to P1385, and the +authors of P1385 have expressed the same view.

+

Section 15 +credits funding agencies and contributors.

+

Section 16 is +our bibliography.

+

Section 17 is where +readers will find the normative wording we propose.

+

Finally, Section 18 gives some more elaborate examples of linear +algebra algorithms that use our proposal. The examples show how +mdspan’s features let users easily describe “submatrices” +with submdspan, proposed in P2630 as a follow-on to mdspan. +(The reference implementation of mdspan includes +submdspan.) This integrates naturally with “block” +factorizations of matrices. The resulting notation is concise, yet still +computes in place, without unnecessary copies of any part of the +matrix.

+

Here is a table that maps between Reference BLAS function name, and +algorithm or function name in our proposal. The mapping is not always +one to one. “N/A” in the “BLAS name(s)” field means that the function is +not in the BLAS.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+BLAS name(s) + +Our name(s) +
+xLARTG + +setup_givens_rotation +
+xROT + +apply_givens_rotation +
+xSWAP + +swap_elements +
+xSCAL + +scale, scaled +
+xCOPY + +copy +
+xAXPY + +add, scaled +
+xDOT, xDOTU + +dot +
+xDOTC + +dotc +
+N/A + +vector_sum_of_squares +
+xNRM2 + +vector_two_norm +
+xASUM + +vector_abs_sum +
+xIAMAX + +vector_idx_abs_max +
+N/A + +matrix_frob_norm +
+N/A + +matrix_one_norm +
+N/A + +matrix_inf_norm +
+xGEMV + +matrix_vector_product +
+xSYMV + +symmetric_matrix_vector_product +
+xHEMV + +hermitian_matrix_vector_product +
+xTRMV + +triangular_matrix_vector_product +
+xTRSV + +triangular_matrix_vector_solve +
+xGER, xGERU + +matrix_rank_1_update +
+xGERC + +matrix_rank_1_update_c +
+xSYR + +symmetric_matrix_rank_1_update +
+xHER + +hermitian_matrix_rank_1_update +
+xSYR2 + +symmetric_matrix_rank_2_update +
+xHER2 + +hermitian_matrix_rank_2_update +
+xGEMM + +matrix_product +
+xSYMM + +symmetric_matrix_product +
+xHEMM + +hermitian_matrix_product +
+xTRMM + +triangular_matrix_product +
+xSYRK + +symmetric_matrix_rank_k_update +
+xHERK + +hermitian_matrix_rank_k_update +
+xSYR2K + +symmetric_matrix_rank_2k_update +
+xHER2K + +hermitian_matrix_rank_2k_update +
+xTRSM + +triangular_matrix_matrix_{left,right}_solve +
+

5 Why include dense linear algebra +in the C++ Standard Library?

+
    +
  1. “Direction for ISO C++” (P0939R4) explicitly calls out “Linear +Algebra” as a potential priority for C++23.

  2. +
  3. C++ applications in “important application areas” (see P0939R4) +have depended on linear algebra for a long time.

  4. +
  5. Linear algebra is like sort: obvious algorithms are +slow, and the fastest implementations call for hardware-specific +tuning.

  6. +
  7. Dense linear algebra is core functionality for most of linear +algebra, and can also serve as a building block for tensor +operations.

  8. +
  9. The C++ Standard Library includes plenty of “mathematical +functions.” Linear algebra operations like matrix-matrix multiply are at +least as broadly useful.

  10. +
  11. The set of linear algebra operations in this proposal are derived +from a well-established, standard set of algorithms that has changed +very little in decades. It is one of the strongest possible examples of +standardizing existing practice that anyone could bring to C++.

  12. +
  13. This proposal follows in the footsteps of many recent successful +incorporations of existing standards into C++, including the UTC and TAI +standard definitions from the International Telecommunication Union, +the time zone database standard from the Internet Assigned Numbers +Authority, and the ongoing effort to integrate the ISO Unicode +standard.

  14. +
+

Linear algebra has had wide use in C++ applications for nearly three +decades (see P1417R0 for a historical survey). For much of that time, +many third-party C++ libraries for linear algebra have been available. +Many different subject areas depend on linear algebra, including machine +learning, data mining, web search, statistics, computer graphics, +medical imaging, geolocation and mapping, engineering, and physics-based +simulations.

+

“Directions for ISO C++” (P0939R4) not only lists “Linear Algebra” +explicitly as a potential C++23 priority, it also offers the following +in support of adding linear algebra to the C++ Standard Library.

+
    +
  • P0939R4 calls out “Support for demanding applications” in +“important application areas, such as medical, finance, automotive, and +games (e.g., key libraries…)” as an “area of general concern” that “we +should not ignore.” All of these areas depend on linear +algebra.

  • +
  • “Is my proposal essential for some important application domain?” +Many large and small private companies, science and engineering +laboratories, and academics in many different fields all depend on +linear algebra.

  • +
  • “We need better support for modern hardware”: Modern hardware +spends many of its cycles in linear algebra. For decades, hardware +vendors, some represented at WG21 meetings, have provided and continue +to provide features specifically to accelerate linear algebra +operations. Some of them even implement specific linear algebra +operations directly in hardware. Examples include NVIDIA’s +Tensor +Cores and Cerebras’ +Wafer Scale Engine. +Several large computer system vendors offer optimized linear algebra +libraries based on or closely resembling the BLAS. These include AMD’s +BLIS, ARM’s Performance Libraries, Cray’s LibSci, Intel’s Math Kernel +Library (MKL), IBM’s Engineering and Scientific Subroutine Library +(ESSL), and NVIDIA’s cuBLAS.

  • +
+

Obvious algorithms for some linear algebra operations like dense +matrix-matrix multiply are asymptotically slower than less-obvious +algorithms. (For details, please refer to a survey one of us coauthored, + “Communication +lower bounds and optimal algorithms for numerical linear algebra.”) +Furthermore, writing the fastest dense matrix-matrix multiply depends on +details of a specific computer architecture. This makes such operations +comparable to sort in the C++ Standard Library: worth +standardizing, so that Standard Library implementers can get them right +and hardware vendors can optimize them. In fact, almost all C++ linear +algebra libraries end up calling non-C++ implementations of these +algorithms, especially the implementations in optimized BLAS libraries +(see below). In this respect, linear algebra is also analogous to +standard library features like random_device: often +implemented directly in assembly or even with special hardware, and thus +an essential component of allowing no room for another language “below” +C++ (see P0939R4) and Stroustrup’s “The Design and Evolution of +C++”).

+

Dense linear algebra is the core component of most algorithms and +applications that use linear algebra, and the component that is most +widely shared over different application areas. For example, tensor +computations end up spending most of their time in optimized dense +linear algebra functions. Sparse matrix computations get best +performance when they spend as much time as possible in dense linear +algebra.

+

The C++ Standard Library includes many +“mathematical special +functions” ([sf.cmath]), like incomplete elliptic +integrals, Bessel functions, and other polynomials and functions named +after various mathematicians. Any of them comes with its own theory and +set of applications for which robust and accurate implementations are +indispensible. We think that linear algebra operations are at least as +broadly useful, and in many cases significantly more so.

+

6 Why base a C++ linear algebra +library on the BLAS?

+
    +
  1. The BLAS is a standard that codifies decades of existing +practice.

  2. +
  3. The BLAS separates “performance primitives” for hardware experts +to tune, from mathematical operations that rely on those primitives for +good performance.

  4. +
  5. Benchmarks reward hardware and system vendors for providing +optimized BLAS implementations.

  6. +
  7. Writing a fast BLAS implementation for common element types is +nontrivial, but well understood.

  8. +
  9. Optimized third-party BLAS implementations with liberal software +licenses exist.

  10. +
  11. Building a C++ interface on top of the BLAS is a straightforward +exercise, but has pitfalls for unaware developers.

  12. +
+

Linear algebra has had a cross-language standard, the Basic Linear +Algebra Subroutines (BLAS), since 2002. The Standard came out of a +standardization +process that started in 1995 and held meetings three times a year +until 1999. Participants in the process came from industry, academia, +and government research laboratories. The dense linear algebra subset of +the BLAS codifies forty years of evolving practice, and has existed in +recognizable form since 1990 (see P1417R0).

+

The BLAS interface was specifically designed as the distillation of +the “computer science” or performance-oriented parts of linear algebra +algorithms. It cleanly separates operations most critical for +performance, from operations whose implementation takes expertise in +mathematics and rounding-error analysis. This gives vendors +opportunities to add value, without asking for expertise outside the +typical required skill set of a Standard Library implementer.

+

Well-established benchmarks such as the +LINPACK benchmark, +reward computer hardware vendors for optimizing their BLAS +implementations. Thus, many vendors provide an optimized BLAS library +for their computer architectures. Writing fast BLAS-like operations is +not trivial, and depends on computer architecture. However, it is a +well-understood problem whose solutions could be parameterized for a +variety of computer architectures. See, for example, +Goto and van de Geijn +2008. There are optimized third-party BLAS implementations for +common architectures, like +ATLAS and +GotoBLAS. +A (slow but correct) +reference +implementation of the BLAS exists and it has a liberal software +license for easy reuse.

+

We have experience in the exercise of wrapping a C or Fortran BLAS +implementation for use in portable C++ libraries. We describe this +exercise in detail in our paper “Evolving a Standard C++ Linear Algebra +Library from the BLAS” (P1674). It +is straightforward for vendors, but has pitfalls for developers. For +example, Fortran’s application binary interface (ABI) differs across +platforms in ways that can cause run-time errors (even incorrect +results, not just crashing). Historical examples of vendors’ C BLAS +implementations have also had ABI issues that required work-arounds. +This dependence on ABI details makes availability in a standard C++ +library valuable.

+

7 Criteria for including +algorithms

+

7.1 Criteria for all the +algorithms

+

We include algorithms in our proposal based on the following +criteria, ordered by decreasing importance. Many of our algorithms +satisfy multiple criteria.

+
    +
  1. Getting the desired asymptotic run time is nontrivial

  2. +
  3. Opportunity for vendors to provide hardware-specific +optimizations

  4. +
  5. Opportunity for vendors to provide quality-of-implementation +improvements, especially relating to accuracy or reproducibility with +respect to floating-point rounding error

  6. +
  7. User convenience (familiar name, or tedious to +implement)

  8. +
+

Regarding (1), “nontrivial” means “at least for novices to the +field.” Dense matrix-matrix multiply is a good example. Getting close to +the asymptotic lower bound on the number of memory reads and writes +matters a lot for performance, and calls for a nonintuitive loop +reordering. An analogy to the current C++ Standard Library is +sort, where intuitive algorithms that many humans use are +not asymptotically optimal.

+

Regarding (2), a good example is copying multidimensional arrays. The +Kokkos library spends +about 2500 lines of code on multidimensional array copy, yet still +relies on system libraries for low-level optimizations. An analogy to +the current C++ Standard Library is copy or even +memcpy.

+

Regarding (3), accurate floating-point summation is nontrivial. +Well-meaning compiler optimizations might defeat even simple techniques, +like compensated summation. The most obvious way to compute a vector’s +Euclidean norm (square root of sum of squares) can cause overflow or +underflow, even when the exact answer is much smaller than the overflow +threshold, or larger than the underflow threshold. Some users care +deeply about sums, even parallel sums, that always get the same answer, +despite rounding error. This can help debugging, for example. It is +possible to make floating-point sums completely independent of parallel +evaluation order. See e.g., the +ReproBLAS effort. +Naming these algorithms and providing ExecutionPolicy +customization hooks gives vendors a chance to provide these +improvements. An analogy to the current C++ Standard Library is +hypot, whose language in the C++ Standard alludes to the +tighter POSIX requirements.

+

Regarding (4), the C++ Standard Library is not entirely minimalist. +One example is std::string::contains. Existing Standard +Library algorithms already offered this functionality, but a member +contains function is easy for novices to find and use, and +avoids the tedium of comparing the result of find to +npos.

+

The BLAS exists mainly for the first two reasons. It includes +functions that were nontrivial for compilers to optimize in its time, +like scaled elementwise vector sums, as well as functions that generally +require human effort to optimize, like matrix-matrix multiply.

+

7.2 Criteria for including BLAS 1 +algorithms; coexistence with ranges

+

The BLAS developed in three “levels”: 1, 2, and 3. BLAS 1 includes +“vector-vector” operations like dot products, norms, and vector +addition. BLAS 2 includes “matrix-vector” operations like matrix-vector +products and outer products. BLAS 3 includes “matrix-matrix” operations +like matrix-matrix products and triangular solve with multiple +“right-hand side” vectors. The BLAS level coincides with the number of +nested loops in a naïve sequential implementation of the operation. +Increasing level also comes with increasing potential for data reuse. +For history of the BLAS “levels” and a bibliography, see +P1417.

+

We mention this here because some reviewers have asked how the +algorithms in our proposal would coexist with the existing ranges +algorithms in the C++ Standard Library. (Ranges was a feature added to +the C++ Standard Library in C++20.) This question actually encloses two +questions.

+
    +
  1. Will our proposed algorithms syntactically collide with existing +ranges algorithms?

  2. +
  3. How much overlap do our proposed algorithms have with the +existing ranges algorithms? (That is, do we really need these new +algorithms?)

  4. +
+

7.2.1 Low risk of syntactic +collision with ranges

+

We think there is low risk of our proposal colliding syntactically +with existing ranges algorithms, for the following reasons.

+
    +
  • We propose our algorithms in a new namespace +std::linalg.

  • +
  • None of the algorithms we propose share names with any existing +ranges algorithms.

  • +
  • We take care not to use _view as a suffix, in order +to avoid confusion or name collisions with “views” in the sense of +ranges.

  • +
  • We specifically do not use the names transpose or +transpose_view, since LEWG has advised us that ranges +algorithms may want to claim these names. (One could imagine +“transposing” a range of ranges.)

  • +
  • We constrain our algorithms only to take vector and matrix +parameters as mdspan. mdspan is not currently +a range, and there are currently no proposals in flight that would make +it a range. Changing mdspan of arbitrary rank to be a range +would require a design for multidimensional iterators. P0009’s coauthors +have not proposed a design, and it has proven challenging to get +compilers to optimize any existing design for multidimensional +iterators.

  • +
+

7.2.2 Minimal overlap with ranges +is justified by user convenience

+

The rest of this section answers the second question. The BLAS 2 and +3 algorithms require multiple nested loops, and high-performing +implementations generally need intermediate storage. This would make it +unnatural and difficult to express them in terms of ranges. Only the +BLAS 1 algorithms in our proposal might have a reasonable translation to +ranges algorithms. There, we limit ourselves to the BLAS 1 algorithms in +what follows.

+

Any rank-1 mdspan x can be translated into +the following range:

+
auto x_range = views::iota(size_t(0), x.extent(0)) |
+    views::transform([x] (auto k) { return x[k]; });
+

with specializations possible for mdspan whose layout +mapping’s range is a contiguous or fixed-stride set of offsets. However, +just because code could be written in a certain way doesn’t +mean that it should be. We have ranges even though the language +has for loops; we don’t need to step in the Turing tar-pit +on purpose (see Perlis 1982). Thus, we will analyze the BLAS 1 +algorithms in this proposal in the context of the previous section’s +four general criteria.

+

Our proposal would add 61 new unique names to the C++ Standard +Library. Of those, 16 are BLAS 1 algorithms, while 24 are BLAS 2 and 3 +algorithms. The 16 BLAS 1 algorithms fall into three categories.

+
    +
  1. Optimization hooks, like copy. As with +memcpy, the fastest implementation may depend closely on +the computer architecture, and may differ significantly from a +straightforward implementation. Some of these algorithms, like +copy, can operate on multidimensional arrays as well, +though it is traditional to list them as BLAS 1 algorithms.

  2. +
  3. Floating-point quality-of-implementation hooks, like +vector_sum_of_squares. These give vendors opportunities to +avoid preventable floating-point underflow and overflow (as with +hypot), improve accuracy, and reduce or even avoid parallel +nondeterminism and order dependence of floating-point sums.

  4. +
  5. Uncomplicated elementwise algorithms like add, +idx_abs_max, and scale.

  6. +
+

We included the first category mainly because of Criterion (2) +“Opportunity for vendors to provide hardware-specific optimizations,” +and the second mainly because of Criterion (3) (“Opportunity for vendors +to provide quality-of-implementation improvements”). Fast +implementations of algorithms in the first category are not likely to be +simple uses of ranges algorithms.

+

Algorithms in the second category could be presented as ranges +algorithms, as mdspan algorithms, or as both. The +“iterating over elements” part of those algorithms is not the most +challenging part of their implementation, nor is it what makes an +implementation “high quality.”

+

Algorithms in the third category could be replaced with a few lines +of C++ that use ranges algorithms. For example, here is a parallel +implementation of idx_abs_max, with simplifications for +exposition. (It omits template parameters’ constraints, uses +std::abs instead of abs-if-needed, +and does not address the complex number case. Here is a +Compiler Explorer link to +a working example.)

+
template<class Element, class Extents,
+  class Layout, class Accessor>
+typename Extents::size_type idx_abs_max(
+  std::mdspan<Element, Extents, Layout, Accessor> x)
+{
+  auto theRange = std::views::iota(size_t(0), x.extent(0)) |
+    std::views::transform([=] (auto k) { return std::abs(x[k]); });
+  auto iterOfMax =
+    std::max_element(std::execution::par_unseq, theRange.begin(), theRange.end());
+  auto indexOfMax = std::ranges::distance(theRange.begin(), iterOfMax);
+  // In GCC 12.1, the return type is __int128.
+  return static_cast<typename Extents::size_type>(indexOfMax);
+}
+

Even though the algorithms in the third category could be implemented +straightforwardly with ranges, we provide them because of Criterion 4 +(“User convenience”). Criterion (4) applies to all the algorithms in +this proposal, and particularly to the BLAS 1 algorithms. Matrix +algorithm developers need BLAS 1 and 2 as well as BLAS 3, because matrix +algorithms tend to decompose into vector algorithms. This is true even +of so-called “block” matrix algorithms that have been optimized to use +matrix-matrix operations wherever possible, in order to improve memory +locality. Demmel et al. 1987 (p. 4) explains.

+
+

Block algorithms generally require an unblocked version of the same +algorithm to be available to operate on a single block. Therefore, the +development of the software will fall naturally into two phases: first, +develop unblocked versions of the routines, calling the Level 2 BLAS +wherever possible; then develop blocked versions where possible, calling +the Level 3 BLAS.

+
+

Dongarra et al. 1990 (pp. 12-15) outlines this development process +for the specific example of Cholesky factorization. The Cholesky +factorization algorithm (on p. 14) spends most of its time (for a +sufficiently large input matrix) in matrix-matrix multiplies +(DGEMM), rank-k symmetric matrix updates +(DSYRK, a special case of matrix-matrix multiply), and +triangular solves with multiple “right-hand side” vectors +(DTRSM). However, it still needs an “unblocked” Cholesky +factorization as the blocked factorization’s “base case.” This is called +DLLT in Dongarra et al. 1990 (p. 15), and it uses +DDOT, DSCAL (both BLAS 1), and +DGEMV (BLAS 2). In the case of Cholesky factorization, it’s +possible to express the “unblocked” case without using BLAS 1 or 2 +operations, by using recursion. This is the approach that LAPACK takes +with the blocked Cholesky factorization +DPOTRF +and its unblocked base case +DPOTRF2. +However, even a recursive formulation of most matrix factorizations +needs to use BLAS 1 or 2 operations. For example, the unblocked base +case +DGETRF2 +of LAPACK’s blocked LU factorization +DGETRF +needs to invoke vector-vector operations like DSCAL.

+

In summary, matrix algorithm developers need vector algorithms, +because matrix algorithms decompose into vector algorithms. If our +proposal lacked BLAS 1 algorithms, even simple ones like +add and scale, matrix algorithm developers +would end up writing them anyway.

+

8 Notation and conventions

+

8.1 The BLAS uses Fortran terms

+

The BLAS’ “native” language is Fortran. It has a C binding as well, +but the BLAS Standard and documentation use Fortran terms. Where +applicable, we will call out relevant Fortran terms and highlight +possibly confusing differences with corresponding C++ ideas. Our paper +P1674 (“Evolving a Standard C++ +Linear Algebra Library from the BLAS”) goes into more detail on these +issues.

+

8.2 We call “subroutines” +functions

+

Like Fortran, the BLAS distinguishes between functions that return a +value, and subroutines that do not return a value. In what follows, we +will refer to both as “BLAS functions” or “functions.”

+

8.3 Element types and BLAS function +name prefix

+

The BLAS implements functionality for four different matrix, vector, +or scalar element types:

+
    +
  • REAL (float in C++ terms)

  • +
  • DOUBLE PRECISION (double in C++ +terms)

  • +
  • COMPLEX (complex<float> in C++ +terms)

  • +
  • DOUBLE COMPLEX (complex<double> +in C++ terms)

  • +
+

The BLAS’ Fortran 77 binding uses a function name prefix to +distinguish functions based on element type:

+
    +
  • S for REAL (“single”)

  • +
  • D for DOUBLE PRECISION

  • +
  • C for COMPLEX

  • +
  • Z for DOUBLE COMPLEX

  • +
+

For example, the four BLAS functions SAXPY, +DAXPY, CAXPY, and ZAXPY all +perform the vector update Y = Y + ALPHA*X for vectors +X and Y and scalar ALPHA, but for +different vector and scalar element types.

+

The convention is to refer to all of these functions together as +xAXPY. In general, a lower-case x is a +placeholder for all data type prefixes that the BLAS provides. For most +functions, the x is a prefix, but for a few functions like +IxAMAX, the data type “prefix” is not the first letter of +the function name. (IxAMAX is a Fortran function that +returns INTEGER, and therefore follows the old Fortran +implicit naming rule that integers start with I, +J, etc.) Other examples include the vector 2-norm functions +SCNRM2 and DZNRM2, where the first letter +indicates the return type and the second letter indicates the vector +element type.

+

Not all BLAS functions exist for all four data types. These come in +three categories:

+
    +
  1. The BLAS provides only real-arithmetic (S and +D) versions of the function, since the function only makes +mathematical sense in real arithmetic.

  2. +
  3. The complex-arithmetic versions perform a different mathematical +operation than the real-arithmetic versions, so they have a different +base name.

  4. +
  5. The complex-arithmetic versions offer a choice between +nonconjugated or conjugated operations.

  6. +
+

As an example of the second category, the BLAS functions +SASUM and DASUM compute the sums of absolute +values of a vector’s elements. Their complex counterparts +SCASUM and DZASUM compute the sums of absolute +values of real and imaginary components of a vector v, that +is, the sum of |ℜ(vi)| + |ℑ(vi)| +for all i in the domain of v. This operation +is still useful as a vector norm, but it requires fewer arithmetic +operations.

+

Examples of the third category include the following:

+
    +
  • nonconjugated dot product xDOTU and conjugated dot +product xDOTC; and

  • +
  • rank-1 symmetric (xGERU) vs. Hermitian +(xGERC) matrix update.

  • +
+

The conjugate transpose and the (nonconjugated) transpose are the +same operation in real arithmetic (if one considers real arithmetic +embedded in complex arithmetic), but differ in complex arithmetic. +Different applications have different reasons to want either. The C++ +Standard includes complex numbers, so a Standard linear algebra library +needs to respect the mathematical structures that go along with complex +numbers.

+

9 What we exclude from the design

+

9.1 Most functions not in the +Reference BLAS

+

The BLAS Standard includes functionality that appears neither in the +Reference +BLAS library, nor in the classic BLAS “level” 1, 2, and 3 papers. +(For history of the BLAS “levels” and a bibliography, see P1417R0. For a +paper describing functions not in the Reference BLAS, see “An updated +set of basic linear algebra subprograms (BLAS),” listed in “Other +references” below.) For example, the BLAS Standard has

+
    +
  • several new dense functions, like a fused vector update and dot +product;

  • +
  • sparse linear algebra functions, like sparse matrix-vector +multiply and an interface for constructing sparse matrices; and

  • +
  • extended- and mixed-precision dense functions (though we subsume +some of their functionality; see below).

  • +
+

Our proposal only includes core Reference BLAS functionality, for the +following reasons:

+
    +
  1. Vendors who implement a new component of the C++ Standard Library +will want to see and test against an existing reference +implementation.

  2. +
  3. Many applications that use sparse linear algebra also use dense, +but not vice versa.

  4. +
  5. The Sparse BLAS interface is a stateful interface that is not +consistent with the dense BLAS, and would need more extensive redesign +to translate into a modern C++ idiom. See discussion in +P1417R0.

  6. +
  7. Our proposal subsumes some dense mixed-precision functionality +(see below).

  8. +
+

We have included vector sum-of-squares and matrix norms as +exceptions, for the same reason that we include vector 2-norm: to expose +hooks for quality-of-implementation improvements that avoid underflow or +overflow when computing with floating-point values.

+ +

The LAPACK Fortran +library implements solvers for the following classes of mathematical +problems:

+
    +
  • linear systems,

  • +
  • linear least-squares problems, and

  • +
  • eigenvalue and singular value problems.

  • +
+

It also provides matrix factorizations and related linear algebra +operations. LAPACK deliberately relies on the BLAS for good performance; +in fact, LAPACK and the BLAS were designed together. See history +presented in P1417R0.

+

Several C++ libraries provide slices of LAPACK functionality. Here is +a brief, noninclusive list, in alphabetical order, of some libraries +actively being maintained:

+ +

P1417R0 gives some history of +C++ linear algebra libraries. The authors of this proposal have designed, +written, and maintained +LAPACK wrappers in C++. Some authors have LAPACK founders as PhD +advisors. Nevertheless, we have excluded LAPACK-like functionality from +this proposal, for the following reasons:

+
    +
  1. LAPACK is a Fortran library, unlike the BLAS, which is a +multilanguage standard.

  2. +
  3. We intend to support more general element types, beyond the four +that LAPACK supports. It’s much more straightforward to make a C++ BLAS +work for general element types, than to make LAPACK algorithms work +generically.

  4. +
+

First, unlike the BLAS, LAPACK is a Fortran library, not a standard. +LAPACK was developed concurrently with the “level 3” BLAS functions, and +the two projects share contributors. Nevertheless, only the BLAS and not +LAPACK got standardized. Some vendors supply LAPACK implementations with +some optimized functions, but most implementations likely depend heavily +on “reference” LAPACK. There have been a few efforts by LAPACK +contributors to develop C++ LAPACK bindings, from Lapack++ in pre-templates C++ +circa 1993, to the recent “C++ +API for BLAS and LAPACK”. (The latter shares coauthors with this +proposal.) However, these are still just C++ bindings to a Fortran +library. This means that if vendors had to supply C++ functionality +equivalent to LAPACK, they would either need to start with a Fortran +compiler, or would need to invest a lot of effort in a C++ +reimplementation. Mechanical translation from Fortran to C++ introduces +risk, because many LAPACK functions depend critically on details of +floating-point arithmetic behavior.

+

Second, we intend to permit use of matrix or vector element types +other than just the four types that the BLAS and LAPACK support. This +includes “short” floating-point types, fixed-point types, integers, and +user-defined arithmetic types. Doing this is easier for BLAS-like +operations than for the much more complicated numerical algorithms in +LAPACK. LAPACK strives for a “generic” design (see Jack Dongarra +interview summary in P1417R0), but only supports two real floating-point +types and two complex floating-point types. Directly translating LAPACK +source code into a “generic” version could lead to pitfalls. Many LAPACK +algorithms only make sense for number systems that aim to approximate +real numbers (or their complex extensions). Some LAPACK functions output +error bounds that rely on properties of floating-point arithmetic.

+

For these reasons, we have left LAPACK-like functionality for future +work. It would be natural for a future LAPACK-like C++ library to build +on our proposal.

+

9.3 Extended-precision BLAS

+

Our interface subsumes some functionality of the Mixed-Precision BLAS +specification (Chapter 4 of the BLAS Standard). For example, users may +multiply two 16-bit floating-point matrices (assuming that a 16-bit +floating-point type exists) and accumulate into a 32-bit floating-point +matrix, just by providing a 32-bit floating-point matrix as output. +Users may specify the precision of a dot product result. If it is +greater than the input vectors’ element type precisions (e.g., +double vs. float), then this effectively +performs accumulation in higher precision. Our proposal imposes semantic +requirements on some functions, like vector_two_norm, to +behave in this way.

+

However, we do not include the “Extended-Precision BLAS” in this +proposal. The BLAS Standard lets callers decide at run time whether to +use extended precision floating-point arithmetic for internal +evaluations. We could support this feature at a later time. +Implementations of our interface also have the freedom to use more +accurate evaluation methods than typical BLAS implementations. For +example, it is possible to make floating-point sums completely independent of parallel +evaluation order.

+

9.4 Arithmetic operators and +associated expression templates

+

Our proposal omits arithmetic operators on matrices and vectors. We +do so for the following reasons:

+
    +
  1. We propose a low-level, minimal interface.

  2. +
  3. operator* could have multiple meanings for matrices +and vectors. Should it mean elementwise product (like +valarray) or matrix product? Should libraries reinterpret +“vector times vector” as a dot product (row vector times column vector)? +We prefer to let a higher-level library decide this, and make everything +explicit at our lower level.

  4. +
  5. Arithmetic operators require defining the element type of the +vector or matrix returned by an expression. Functions let users specify +this explicitly, and even let users use different output types for the +same input types in different expressions.

  6. +
  7. Arithmetic operators may require allocation of temporary matrix +or vector storage. This prevents use of nonowning data +structures.

  8. +
  9. Arithmetic operators strongly suggest expression templates. These +introduce problems such as dangling references and aliasing.

  10. +
+

Our goal is to propose a low-level interface. Other libraries, such +as that proposed by P1385, could +use our interface to implement overloaded arithmetic for matrices and +vectors. A constrained, function-based, BLAS-like interface builds +incrementally on the many years of BLAS experience.

+

Arithmetic operators on matrices and vectors would require the +library, not necessarily the user, to specify the element type of an +expression’s result. This gets tricky if the terms have mixed element +types. For example, what should the element type of the result of the +vector sum x + y be, if x has element type +complex<float> and y has element type +double? It’s tempting to use common_type_t, +but common_type_t<complex<float>, double> is +complex<float>. This loses precision. Some users may +want complex<double>; others may want +complex<long double> or something else, and others +may want to choose different types in the same program.

+

P1385 lets users customize the return type of such arithmetic +expressions. However, different algorithms may call for the same +expression with the same inputs to have different output types. For +example, iterative refinement of linear systems Ax=b can +work either with an extended-precision intermediate residual vector +r = b - A*x, or with a residual vector that has the same +precision as the input linear system. Each choice produces a different +algorithm with different convergence characteristics, per-iteration run +time, and memory requirements. Thus, our library lets users specify the +result element type of linear algebra operations explicitly, by calling +a named function that takes an output argument explicitly, rather than +an arithmetic operator.

+

Arithmetic operators on matrices or vectors may also need to allocate +temporary storage. Users may not want that. When LAPACK’s developers +switched from Fortran 77 to a subset of Fortran 90, their users rejected +the option of letting LAPACK functions allocate temporary storage on +their own. Users wanted to control memory allocation. Also, allocating +storage precludes use of nonowning input data structures like +mdspan, that do not know how to allocate.

+

Arithmetic expressions on matrices or vectors strongly suggest +expression templates, as a way to avoid allocation of temporaries and to +fuse computational kernels. They do not require expression +templates. For example, valarray offers overloaded +operators for vector arithmetic, but the Standard lets implementers +decide whether to use expression templates. However, all of the current +C++ linear algebra libraries that we mentioned above have some form of +expression templates for overloaded arithmetic operators, so users will +expect this and rely on it for good performance. This was, indeed, one +of the major complaints about initial implementations of +valarray: its lack of mandate for expression templates +meant that initial implementations were slow, and thus users did not +want to rely on it. (See Josuttis 1999, p. 547, and Vandevoorde and +Josuttis 2003, p. 342, for a summary of the history. Fortran has an +analogous issue, in which (under certain conditions) it is +implementation defined whether the run-time environment needs to copy +noncontiguous slices of an array into contiguous temporary storage.)

+

Expression templates work well, but have issues. Our papers +P1417R0 and “Evolving a Standard +C++ Linear Algebra Library from the BLAS” +(P1674) give more detail on these +concerns. A particularly troublesome one is that C++ auto +type deduction makes it easy for users to capture expressions before the +expression templates system has the chance to evaluate them and write +the result into the output. For matrices and vectors with container +semantics, this makes it easy to create dangling references. Users might +not realize that they need to assign expressions to named types before +actual work and storage happen. Eigen’s +documentation describes this common problem.

+

Our scaled, conjugated, +transposed, and conjugate_transposed functions +make use of one aspect of expression templates, namely modifying the +mdspan array access operator. However, we intend these +functions for use only as in-place modifications of arguments of a +function call. Also, when modifying mdspan, these functions +merely view the same data that their input mdspan views. +They introduce no more potential for dangling references than +mdspan itself. The use of views like mdspan is +self-documenting; it tells users that they need to take responsibility +for scope of the viewed data.

+

9.5 Banded matrix layouts

+

This proposal omits banded matrix types. It would be easy to add the +required layouts and specializations of algorithms later. The packed and +unpacked symmetric and triangular layouts in this proposal cover the +major concerns that would arise in the banded case, like nonstrided and +nonunique layouts, and matrix types that forbid access to some +multi-indices in the Cartesian product of extents.

+

9.6 Tensors

+

We exclude tensors from this proposal, for the following reasons. +First, tensor libraries naturally build on optimized dense linear +algebra libraries like the BLAS, so a linear algebra library is a good +first step. Second, mdspan has natural use as a low-level +representation of dense tensors, so we are already partway there. Third, +even simple tensor operations that naturally generalize the BLAS have +infinitely many more cases than linear algebra. It’s not clear to us +which to optimize. Fourth, even though linear algebra is a special case +of tensor algebra, users of linear algebra have different interface +expectations than users of tensor algebra. Thus, it makes sense to have +two separate interfaces.

+

9.7 Explicit support for +asynchronous return of scalar values

+

After we presented revision 2 of this paper, LEWG asked us to +consider support for discrete graphics processing units (GPUs). GPUs +have two features of interest here. First, they might have memory that +is not accessible from ordinary C++ code, but could be accessed in a +standard algorithm (or one of our proposed algorithms) with the right +implementation-specific ExecutionPolicy. (For instance, a +policy could say “run this algorithm on the GPU.”) Second, they might +execute those algorithms asynchronously. That is, they might write to +output arguments at some later time after the algorithm invocation +returns. This would imply different interfaces in some cases. For +instance, a hypothetical asynchronous vector 2-norm might write its +scalar result via a pointer to GPU memory, instead of returning the +result “on the CPU.”

+

Nothing in principle prevents mdspan from viewing memory +that is inaccessible from ordinary C++ code. This is a major feature of +the Kokkos::View class from the Kokkos library, and +Kokkos::View directly inspired mdspan. The C++ +Standard does not currently define how such memory behaves, but +implementations could define its behavior and make it work with +mdspan. This would, in turn, let implementations define our +algorithms to operate on such memory efficiently, if given the right +implementation-specific ExecutionPolicy.

+

Our proposal excludes algorithms that might write to their output +arguments at some time after the algorithm returns. First, LEWG insisted +that our proposed algorithms that compute a scalar result, like +vector_two_norm, return that result in the manner of +reduce, rather than writing the result to an output +reference or pointer. (Previous revisions of our proposal used the +latter interface pattern.) Second, it’s not clear whether writing a +scalar result to a pointer is the right interface for asynchronous +algorithms. Follow-on proposals to Executors (P0443R14) include +asynchronous algorithms, but none of these suggest returning results +asynchronously by pointer. Our proposal deliberately imitates the +existing standard algorithms. Right now, we have no standard +asynchronous algorithms to imitate.

+

10 Design justification

+

We take a step-wise approach. We begin with core BLAS dense linear +algebra functionality. We then deviate from that only as much as +necessary to get algorithms that behave as much as reasonable like the +existing C++ Standard Library algorithms. Future work or collaboration +with other proposals could implement a higher-level interface.

+

Please refer to our papers “Evolving a Standard C++ Linear Algebra +Library from the BLAS” (P1674) and +“Historical lessons for C++ linear algebra library standardization” +(P1417). They will give details and +references for many of the points that we summarize here.

+

10.1 We do not require using the +BLAS library or any particular “back-end”

+

Our proposal is inspired by and extends the dense BLAS interface. A +natural implementation might look like this:

+
    +
  1. wrap an existing C or Fortran BLAS library,

  2. +
  3. hope that the BLAS library is optimized, and then

  4. +
  5. extend the wrapper to include straightforward Standard C++ +implementations of P1673’s algorithms for matrix and vector value types +and data layouts that the BLAS does not support.

  6. +
+

P1674 describes the process of writing such an implementation. +However, P1673 does not require implementations to wrap the BLAS. In +particular, P1673 does not specify a “back-end” C-style interface that +would let users or implementers “swap out” different BLAS libraries. +Here are some reasons why we made this choice.

+

First, it’s possible to write an optimized implementation entirely in +Standard C++, without calling external C or Fortran functions. For +example, one can write a cache-blocked matrix-matrix multiply +implementation entirely in Standard C++.

+

Second, different vendors may have their own libraries that support +matrix and vector value types and/or layouts beyond what the standard +dense BLAS supports. For example, they may have C functions for +mixed-precision matrix-matrix multiply, like BLIS’ bli_gemm +(example +here), or NVIDIA’s cublasGemmEx +(example +here).

+

Third, just because a C or Fortran BLAS library can be found, doesn’t +mean that it’s optimized at all or optimized well. For example, many +Linux distributions have a BLAS software package that is built by +compiling the Reference BLAS. This will give poor performance for BLAS 3 +operations. Even “optimized” vendor BLAS libraries may not optimize all +cases. Release notes even for recent versions show performance +improvements.

+

In summary: While a natural way to implement this proposal would be +to wrap an existing C or Fortran BLAS library, we do not want to require +this. Thus, we do not specify a “back-end” C-style interface.

+

10.2 Why use mdspan?

+

10.2.1 View of a multidimensional +array

+

The BLAS operates on what C++ programmers might call views of +multidimensional arrays. Users of the BLAS can store their data in +whatever data structures they like, and handle their allocation and +lifetime as they see fit, as long as the data have a BLAS-compatible +memory layout.

+

The corresponding C++ data structure is mdspan. This +class encapsulates the large number of pointer and integer arguments +that BLAS functions take, that represent views of matrices and vectors. +Using mdspan in the C++ interface reduces the number of +arguments and avoids common errors, like mixing up the order of +arguments. It supports all the array memory layouts that the BLAS +supports, including row major and column major. It also expresses the +same data ownership model that the BLAS expresses. Users may manage +allocation and deallocation however they wish. In addition, +mdspan lets our algorithms exploit any dimensions known at +compile time.

+

10.2.2 Ease of use

+

The mdspan class’ layout and accessor policies let us +simplify our interfaces, by encapsulating transpose, conjugate, and +scalar arguments. Features of mdspan make implementing +BLAS-like algorithms much less error prone and easier to read. These +include its encapsulation of matrix indexing and its built-in “slicing” +capabilities via submdspan.

+

10.2.3 BLAS and mdspan +are low level

+

The BLAS is low level; it imposes no mathematical meaning on +multidimensional arrays. This gives users the freedom to develop +mathematical libraries with the semantics they want. Similarly, +mdspan is just a view of a multidimensional array; it has +no mathematical meaning on its own.

+

We mention this because “matrix,” “vector,” and “tensor” are +mathematical ideas that mean more than just arrays of numbers. This is +more than just a theoretical concern. Some BLAS functions operate on +“triangular,” “symmetric,” or “Hermitian” matrices, but they do not +assert that a matrix has any of these mathematical properties. Rather, +they read only one side of the matrix (the lower or upper +triangle), and compute as if the other side of the matrix satisfies the +mathematical property. A key feature of the BLAS and libraries that +build on it, like LAPACK, is that they can operate on the matrix’s data +in place. These operations change both the matrix’s mathematical +properties and its representation in memory. For example, one might have +an N x N array representing a matrix that is symmetric in theory, but +computed and stored in a way that might not result in exactly symmetric +data. In order to solve linear systems with this matrix, one might give +the array to LAPACK’s xSYTRF to compute an LDLT +factorization, asking xSYTRF only to access the array’s +lower triangle. If xSYTRF finishes successfully, it has +overwritten the lower triangle of its input with a representation of +both the lower triangular factor L and the block diagonal matrix D, as +computed assuming that the matrix is the sum of the lower triangle and +the transpose of the lower triangle. The resulting N x N array no longer +represents a symmetric matrix. Rather, it contains part of the +representation of an LDLT +factorization of the matrix. The upper triangle still contains the +original input matrix’s data. One may then solve linear systems by +giving xSYTRS the lower triangle, along with other output +of xSYTRF.

+

The point of this example is that a “symmetric matrix class” is the +wrong way to model this situation. There’s an N x N array, whose +mathematical interpretation changes with each in-place operation +performed on it. The low-level mdspan data structure +carries no mathematical properties in itself, so it models this +situation better.

+

10.2.4 Hook for future expansion

+

The mdspan class treats its layout as an extension +point. This lets our interface support layouts beyond what the BLAS +Standard permits. The accessor extension point offers us a hook for +future expansion to support heterogeneous memory spaces. (This is a key +feature of Kokkos::View, the data structure that inspired +mdspan.) In addition, using mdspan has made it +easier for us to propose an efficient “batched” interface in our +separate proposal P2901, with +almost no interface differences.

+

10.2.5 Generic enough to replace a +“multidimensional array concept”

+

Our functions differ from the C++ Standard algorithms, in that they +take a concrete type mdspan with template parameters, +rather than any of an open set of types that satisfy some concept. LEWGI +requested in the 2019 Cologne meeting that we explore using a concept +instead of mdspan to define the arguments for the linear +algebra functions. This would mean that instead of having our functions +take mdspan parameters, the functions would be generic on +one or more suitably constrained multidimensional array types. The +constraints would form a “multidimensional array concept.”

+

We investigated this option, and rejected it, for the following +reasons. First, our proposal uses enough features of mdspan +that any concept generally applicable to all functions we propose would +replicate almost the entire definition of mdspan. This +proposal refers to almost all of mdspan’s features, +including extents, layouts, and accessors. The +conjugated, scaled, and +transposed functions in this proposal depend specifically +on custom layouts and accessors. These features make the algorithms have +more functionality than their C or Fortran BLAS equivalents, while +reducing the number of parameters that the algorithms take. They also +make the interface more consistent, in that each mdspan +parameter of a function behaves as itself and is not otherwise +“modified” by other parameters. Second, conversely, we think that +mdspan’s potential for customization gives it the power to +represent any reasonable multidimensional array view. Thus, +mdspan “is the concept.” Third, this proposal could support +any reasonable multidimensional array type, if the type just made it +convertible to mdspan, for example via a general +customization point get_mdspan that returns an +mdspan that views the array’s elements. Fourth, a +multidimensional array concept would only have value if nearly all +multidimensional arrays “in the wild” had the same interface, and if +that were actually the interface we wanted. However, the adoption of +P2128R6 into C++23 makes +operator[] the preferred multidimensional array access +operator. As the discussion in P2128 points out, operator[] +not supporting multiple parameters before C++23 meant that different +multidimensional array classes exposed array access with different +syntax. While many of them used the function call operator +operator(), mdspan quite deliberately does +not. P2128 explains why it’s a bad idea for a multidimensional array +type to support both operator() and +operator[]. 
Thus, a hypothetical multidimensional array +concept could not represent both pre-C++23 and post-C++23 +multidimensional arrays. After further discussion at the 2019 Belfast +meeting, LEWGI accepted our position that it is reasonable for our +algorithms to take the concrete (yet highly customizable) type +mdspan, instead of template parameters constrained by a +multidimensional array concept.

+

10.3 Function argument aliasing and +zero scalar multipliers

+

Summary:

+
    +
  1. The BLAS Standard forbids aliasing any input (read-only) argument +with any output (write-only or read-and-write) argument.

  2. +
  3. The BLAS uses INTENT(INOUT) (read-and-write) +arguments to express “updates” to a vector or matrix. By contrast, C++ +Standard algorithms like transform take input and output +iterator ranges as different parameters, but may let input and output +ranges be the same.

  4. +
  5. The BLAS uses the values of scalar multiplier arguments (“alpha” +or “beta”) of vectors or matrices at run time, to decide whether to +treat the vectors or matrices as write only. This matters both for +performance and semantically, assuming IEEE floating-point +arithmetic.

  6. +
  7. We decide separately, based on the category of BLAS function, how +to translate INTENT(INOUT) arguments into a C++ idiom:

    +
      +
    1. For triangular solve and triangular multiply, in-place behavior +is essential for computing matrix factorizations in place, without +requiring extra storage proportional to the input matrix’s dimensions. +However, in-place functions may hinder implementations’ use of some +forms of parallelism. Thus, we have both not-in-place and in-place +overloads. Both take an optional ExecutionPolicy&&, +as some forms of parallelism (e.g., vectorization) may still be +effective with in-place operations.

    2. +
    3. Else, if the BLAS function unconditionally updates (like +xGER), we retain read-and-write behavior for that +argument.

    4. +
    5. Else, if the BLAS function uses a scalar beta +argument to decide whether to read the output argument as well as write +to it (like xGEMM), we provide two versions: a write-only +version (as if beta is zero), and a read-and-write version +(as if beta is nonzero).

    6. +
  8. +
+

For a detailed analysis, please see our paper “Evolving a Standard +C++ Linear Algebra Library from the BLAS” +(P1674).

+

10.4 Support for different matrix +layouts

+

Summary:

+
    +
  1. The dense BLAS supports several different dense matrix “types.” +Type is a mixture of “storage format” (e.g., packed, banded) and +“mathematical property” (e.g., symmetric, Hermitian, +triangular).

  2. +
  3. Some “types” can be expressed as custom mdspan +layouts. Other types actually represent algorithmic constraints: for +instance, what entries of the matrix the algorithm is allowed to +access.

  4. +
  5. Thus, a C++ BLAS wrapper cannot overload on matrix “type” simply +by overloading on mdspan specialization. The wrapper must +use different function names, tags, or some other way to decide what the +matrix type is.

  6. +
+

For more details, including a list and description of the matrix +“types” that the dense BLAS supports, please see our paper “Evolving a +Standard C++ Linear Algebra Library from the BLAS” +(P1674).

+

A C++ linear algebra library has a few possibilities for +distinguishing the matrix “type”:

+
    +
  1. It could imitate the BLAS, by introducing different function +names, if the layouts and accessors do not sufficiently describe the +arguments.

  2. +
  3. It could introduce a hierarchy of higher-level classes for +representing linear algebra objects, use mdspan (or +something like it) underneath, and write algorithms to those +higher-level classes.

  4. +
  5. It could use the layout and accessor types in mdspan +simply as tags to indicate the matrix “type.” Algorithms could +specialize on those tags.

  6. +
+

We have chosen Approach 1. Our view is that a BLAS-like interface +should be as low-level as possible. Approach 2 is more like a “Matlab in +C++”; a library that implements this could build on our proposal’s +lower-level library. Approach 3 sounds attractive. However, +most BLAS matrix “types” do not have a natural representation as +layouts. Trying to hack them in would pollute mdspan – a +simple class meant to be easy for the compiler to optimize – with extra +baggage for representing what amounts to sparse matrices. We think that +BLAS matrix “type” is better represented with a higher-level library +that builds on our proposal.

+

10.5 Interpretation of “lower / +upper triangular”

+

10.5.1 Triangle refers to what part +of the matrix is accessed

+

The triangular, symmetric, and Hermitian algorithms in this proposal +all take a Triangle tag that specifies whether the +algorithm should access the upper or lower triangle of the matrix. This +has the same function as the UPLO argument of the +corresponding BLAS routines. The upper or lower triangular argument only +refers to what part of the matrix the algorithm will access. The “other +triangle” of the matrix need not contain useful data. For example, with +the symmetric algorithms, A[j, i] need not equal +A[i, j] for any i and j in the +domain of A with i not equal to +j. The algorithm just accesses one triangle and interprets +the other triangle as the result of flipping the accessed triangle over +the diagonal.

+

This “interpretation” approach to representing triangular matrices is +critical for matrix factorizations. For example, LAPACK’s LU +factorization (xGETRF) overwrites a matrix A with both its +L (lower triangular, implicitly represented diagonal of all ones) and U +(upper triangular, explicitly stored diagonal) factors. Solving linear +systems Ax=b with this factorization, as LAPACK’s xGETRS +routine does, requires solving first a linear system with the upper +triangular matrix U, and then solving a linear system with the lower +triangular matrix L. If the BLAS required that the “other triangle” of a +triangular matrix had all zero elements, then LU factorization would +require at least twice the storage. For symmetric and Hermitian +matrices, only accessing the matrix’s elements nonredundantly ensures +that the matrix remains mathematically symmetric resp. Hermitian, even +in the presence of rounding error.

+

10.5.2 BLAS applies UPLO to +original matrix; we apply Triangle to transformed matrix

+

The BLAS routines that take an UPLO argument generally +also take a TRANS argument. The TRANS argument +says whether to apply the matrix, its transpose, or its conjugate +transpose. The BLAS applies the UPLO argument to the +“original” matrix, not to the transposed matrix. For example, if +TRANS='T' or TRANS='C', UPLO='U' +means the routine will access the upper triangle of the matrix, not the +upper triangle of the matrix’s transpose.

+

Our proposal takes the opposite approach. It applies +Triangle to the input matrix, which may be the result of a +transformation such as transposed or +conjugate_transposed. For example, if Triangle +is upper_triangle_t, the algorithm will always access the +matrix for i,j in its domain with i ≤ +j (or i strictly less than j, if +the algorithm takes a Diagonal tag and +Diagonal is implicit_unit_diagonal_t). If the +input matrix is transposed(A) for a +layout_left mdspan A, this means +that the algorithm will access the upper triangle of +transposed(A), which is actually the lower +triangle of A.

+

We took this approach because our interface permits arbitrary +layouts, with possibly arbitrary nesting of layout transformations. This +comes from mdspan’s design itself, not even necessarily +from our proposal. For example, users might define +antitranspose(A), that flips indices over the antidiagonal +(the “other diagonal” that goes from the lower left to the upper right +of the matrix, instead of from the upper left to the lower right). +Layout transformations need not even be one-to-one, because layouts +themselves need not be (hence is_unique). Since it’s not +possible to “undo” a general layout, there’s no way to get back to the +“original matrix.”

+

Our approach, while not consistent with the BLAS, is internally +consistent. Triangle always has a clear meaning, no matter +what transformations users apply to the input. Layout transformations +like transposed have the same interpretation for all the +matrix algorithms, whether for general, triangular, symmetric, or +Hermitian matrices. This interpretation is consistent with the standard +meaning of mdspan layouts.

+

C BLAS implementations already apply layout transformations like this +so that they can use an existing column-major Fortran BLAS to implement +operations on matrices with different layouts. For example, the +transpose of an M x N layout_left +matrix is just the same data, viewed as an N x +M layout_right matrix. Thus, +transposed is consistent with current practice. In fact, +transposed need not use a special +layout_transpose, if it knows how to reinterpret the input +layout.

+

10.5.3 Summary

+
    +
  1. BLAS applies UPLO to the original matrix, before any +transposition. Our proposal applies Triangle to the +transformed matrix, after any transposition.

  2. +
  3. Our approach is the only reasonable way to handle the full +generality of user-defined layouts and layout transformations.

  4. +
+

10.6 1-norms and infinity-norms for +vectors and matrices of complex numbers

+

10.6.1 Summary

+

We define complex 1-norms and infinity-norms for matrices using the +magnitude of each element, but for vectors using the sum of absolute +values of the real and imaginary components of each element.

+

We do so because the BLAS exists for the implementation of algorithms +to solve linear systems, linear least-squares problems, and eigenvalue +problems. The BLAS does not aim to provide a complete set of +mathematical operations. Every function in the BLAS exists because some +LINPACK or LAPACK algorithm needs it.

+

For vectors, we use the sum of absolute values of the components +because

+
    +
  • this more accurately expresses the condition number of the sum of +the vector’s elements,

  • +
  • it results in a tighter error bound, and

  • +
  • it avoids a square root per element, with potentially the +additional cost of preventing undue underflow or overflow (as +hypot implementations do).

  • +
+

The resulting functions are not actually norms in the mathematical +sense, so their names vector_abs_sum and +vector_idx_abs_max do not include the word “norm.”

+

For matrices, we use the magnitude because the only reason LAPACK +ever actually computes matrix 1-norms or infinity-norms is for +estimating the condition number of a matrix. For this case, LAPACK +actually needs to compute the “true” matrix 1-norm (and infinity-norm), +that uses the magnitude.

+

10.6.2 Vectors

+

The 1-norm of a vector of real numbers is the sum of the absolute +values of the vector’s elements. The infinity-norm of a vector of real +numbers is the maximum of the absolute values of the vector’s elements. +Both of these are useful for analyzing rounding errors when solving +common linear algebra problems. For example, the 1-norm of a vector +expresses the condition number of the sum of the vector’s elements (see +Higham 2002, Section 4.2), while the infinity-norm expresses the +normwise backward error of the computed solution vector when solving a +linear system using Gaussian elimination (see LAPACK Users’ Guide).

+

The straightforward extension of both of these definitions for +vectors of complex numbers would be to replace “absolute value” with +“magnitude.” C++ suggests this by defining std::abs for +complex arguments as the magnitude. However, the BLAS instead uses the +sum of the absolute values of the real and imaginary components of each +element. For example, the BLAS functions SASUM and +DASUM compute the actual 1-norm ∑ᵢ |vᵢ| +of their length-n input vector +of real elements v, while +their complex counterparts CSASUM and DZASUM +compute ∑ᵢ (|ℜ(zᵢ)| + |ℑ(zᵢ)|) +for their length-n input +vector of complex elements z. +Likewise, the real BLAS functions ISAMAX and +IDAMAX find maxᵢ |vᵢ|, +while their complex counterparts ICAMAX and +IZAMAX find maxᵢ (|ℜ(zᵢ)| + |ℑ(zᵢ)|).

+

This definition of CSASUM and DZASUM +accurately expresses the condition number of the sum of a complex +vector’s elements. This is because complex numbers are added +componentwise, so summing a complex vector componentwise is really like +summing two real vectors separately. Thus, it is the logical +generalization of the vector 1-norm.

+

Annex A.1 of the BLAS Standard (p. 173) explains that this is also a +performance optimization, to avoid the expense of one square root per +vector element. Computing the magnitude is equivalent to two-parameter +hypot. Thus, for the same reason, high-quality +implementations of magnitude for floating-point arguments may do extra +work besides the square root, to prevent undue underflow or +overflow.

+

This approach also results in tighter error bounds. We mentioned +above how adding complex numbers sums their real and imaginary parts +separately. Thus, the rounding error committed by the sum can be +considered as a two-component vector. For a vector x of length 2, ∥x∥₁ ≤ √2 ∥x∥₂ and ∥x∥₂ ≤ ∥x∥₁ +(see LAPACK Users’ Guide), so using the “1-norm” |ℜ(z)| + |ℑ(z)| of a +complex number z, instead of +the “2-norm” (magnitude) |z|, gives a +tighter error bound.

+

This is why P1673’s vector_abs_sum (vector 1-norm) and +vector_idx_abs_max (vector infinity-norm) functions use +|ℜ(z)| + |ℑ(z)| +instead of |z| for the +“absolute value” of each vector element.

+

One disadvantage of these definitions is that the resulting +quantities are not actually norms. This is because they do not preserve +scaling factors. For magnitude, |αz| equals |α||z| for any complex number +α and complex number z. However, |ℜ(αz)| + |ℑ(αz)| +does not equal |α|(|ℜ(z)|+|ℑ(z)|) +in general; consider, for example, α = (1+i)/√2 and z = 1. As a result, the names of the functions +vector_abs_sum and vector_idx_abs_max do not +include the word “norm.”

+

10.6.3 Matrices

+

The 1-norm ∥A∥₁ +of a matrix A is the maximum of ∥Ax∥₁ over all vectors x with ∥x∥₁ = 1 and with the +same number of elements as A has columns. The infinity-norm ∥A∥∞ of a matrix A is the maximum of ∥Ax∥∞ over all vectors x with ∥x∥∞ = 1 and with the +same number of elements as A has columns. The 1-norm of a matrix is the +infinity-norm of its transpose, and vice versa.

+

Given that these norms are defined using the corresponding vector +norms, it would seem reasonable to use the BLAS’s optimizations for +vectors of complex numbers. However, the BLAS exists to serve LINPACK +and its successor, LAPACK (see P1417 for citations and a summary of the +history). Thus, looking at what LAPACK actually computes would be the +best guide. LAPACK uses matrix 1-norms and infinity-norms in two +different ways.

+

First, equilibration reduces errors when solving linear +systems using Gaussian elimination. It does so by scaling rows and +columns to minimize the matrix’s condition number ∥A∥₁∥A⁻¹∥₁ +(or ∥A∥∞∥A⁻¹∥∞; +minimizing one minimizes the other). This effectively tries to make the +maximum absolute value of each row and column of the matrix as close to +1 as possible. (We say “close to 1,” because LAPACK equilibrates using +scaling factors that are powers of two, to prevent rounding error in +binary floating-point arithmetic.) LAPACK performs equilibration with +the routines xyzEQU, where x represents the +matrix’s element type and the two letters yz represent the +kind of matrix (e.g., GE for a general dense nonsymmetric +matrix). The complex versions of these routines use |ℜ(Aᵢⱼ)| + |ℑ(Aᵢⱼ)| +for the “absolute value” of each matrix element Aᵢⱼ. +This aligns mathematically with how LAPACK measures errors when solving +a linear system Ax = b.

+

Second, condition number estimation estimates the condition +number of a matrix A. LAPACK +relies on the 1-norm condition number ∥A∥₁∥A⁻¹∥₁ +to estimate errors for nearly all of its computations. LAPACK’s +xyzCON routines perform condition number estimation. It +turns out that the complex versions of these routines require the “true” +1-norm that uses the magnitude of each matrix element (as explained in +Higham 1988).

+

LAPACK only ever actually computes matrix 1-norms or +infinity-norms when it estimates the matrix condition number. Thus, +LAPACK actually needs to compute the “true” matrix 1-norm (and +infinity-norm). This is why P1673 defines matrix_one_norm +and matrix_inf_norm to return the “true” matrix 1-norm +resp. infinity-norm.

+

10.7 Over- and underflow wording +for vector 2-norm

+

SG6 recommended to us at Belfast 2019 to change the special overflow +/ underflow wording for vector_two_norm to imitate the BLAS +Standard more closely. The BLAS Standard does say something about +overflow and underflow for vector 2-norms. We reviewed this wording and +conclude that it is either a nonbinding quality of implementation (QoI) +recommendation, or too vaguely stated to translate directly into C++ +Standard wording. Thus, we removed our special overflow / underflow +wording. However, the BLAS Standard clearly expresses the intent that +implementations document their underflow and overflow guarantees for +certain functions, like vector 2-norms. The C++ Standard requires +documentation of “implementation-defined behavior.” Therefore, we added +language to our proposal that makes “any guarantees regarding overflow +and underflow” of those certain functions “implementation-defined.”

+

Previous versions of this paper asked implementations to compute +vector 2-norms “without undue overflow or underflow at intermediate +stages of the computation.” “Undue” imitates existing C++ Standard +wording for hypot. This wording hints at the stricter +requirements in F.9 (normative, but optional) of the C Standard for math +library functions like hypot, without mandating those +requirements. In particular, paragraph 9 of F.9 says:

+
+

Whether or when library functions raise an undeserved “underflow” +floating-point exception is unspecified. Otherwise, as implied by F.7.6, +the <math.h> functions do not raise spurious +floating-point exceptions (detectable by the user) [including the +“overflow” exception discussed in paragraph 6], other than the “inexact” +floating-point exception.

+
+

However, these requirements are for math library functions like +hypot, not for general algorithms that return +floating-point values. SG6 did not raise a concern that we should treat +vector_two_norm like a math library function; their concern +was that we imitate the BLAS Standard’s wording.

+

The BLAS Standard says of several operations, including vector +2-norm: “Here are the exceptional routines where we ask for particularly +careful implementations to avoid unnecessary over/underflows, that could +make the output unnecessarily inaccurate or unreliable” (p. 35).

+

The BLAS Standard does not define phrases like “unnecessary +over/underflows.” The likely intent is to avoid naïve implementations +that simply add up the squares of the vector elements. These would +overflow even if the norm in exact arithmetic is significantly less than +the overflow threshold. The POSIX Standard (IEEE Std 1003.1-2017) +analogously says that hypot must “take precautions against +overflow during intermediate steps of the computation.”

+

The phrase “precautions against overflow” is too vague for us to +translate into a requirement. The authors likely meant to exclude naïve +implementations, but not require implementations to know whether a +result computed in exact arithmetic would overflow or underflow. The +latter is a special case of computing floating-point sums exactly, which +is costly for vectors of arbitrary length. While it would be a useful +feature, it is difficult enough that we do not want to require it, +especially since the BLAS Standard itself does not. The implementation +of vector 2-norms in the Reference BLAS included with LAPACK 3.10.0 +partitions the running sum of squares into three different accumulators: +one for big values (that might cause the sum to overflow without +rescaling), one for small values (that might cause the sum to underflow +without rescaling), and one for the remaining “medium” values. (See +Anderson 2017.) Earlier implementations merely rescaled by the current +maximum absolute value of all the vector entries seen thus far. (See +Blue 1978.) Implementations could also just compute the sum of squares +in a straightforward loop, then check floating-point status flags for +underflow or overflow, and recompute if needed.

+

For all of the functions listed on p. 35 of the BLAS Standard as +needing “particularly careful implementations,” except vector +norm, the BLAS Standard has an “Advice to implementors” section with +extra accuracy requirements. The BLAS Standard does have an “Advice to +implementors” section for matrix norms (see Section 2.8.7, p. 69), which +have similar over- and underflow concerns as vector norms. However, the +Standard merely states that “[h]igh-quality implementations of these +routines should be accurate” and should document their accuracy, and +gives examples of “accurate implementations” in LAPACK.

+

The BLAS Standard never defines what “Advice to implementors” means. +However, the BLAS Standard shares coauthors and audience with the +Message Passing Interface (MPI) Standard, which defines “Advice to +implementors” as “primarily commentary to implementors” and permissible +to skip (see e.g., MPI 3.0, Section 2.1, p. 9). We thus interpret +“Advice to implementors” in the BLAS Standard as a nonbinding quality of +implementation (QoI) recommendation.

+

10.8 Constraining matrix and vector +element types and scalars

+

10.8.1 Introduction

+

The BLAS only accepts four different types of scalars and matrix and +vector elements. In C++ terms, these correspond to float, +double, complex<float>, and +complex<double>. The algorithms we propose generalize +the BLAS by accepting any matrix, vector, or scalar element types that +make sense for each algorithm. Those may be built-in types, like +floating-point numbers or integers, or they may be custom types. Those +custom types might not behave like conventional real or complex numbers. +For example, quaternions have noncommutative multiplication +(a * b might not equal b * a), polynomials in +one variable over a field lack division, and some types might not even +have subtraction defined. Nevertheless, many BLAS operations would make +sense for all of these types.

+

“Constraining matrix and vector element types and scalars” means +defining how these types must behave in order for our algorithms to make +sense. This includes both syntactic and semantic constraints. We have +three goals:

+
    +
  1. to help implementers implement our algorithms correctly;

  2. +
  3. to give implementers the freedom to make quality of +implementation (QoI) enhancements, for both performance and accuracy; +and

  4. +
  5. to help users understand what types they may use with our +algorithms.

  6. +
+

The whole point of the BLAS was to identify key operations for +vendors to optimize. Thus, performance is a major concern. “Accuracy” +here refers either to rounding error or to approximation error (for +matrix or vector element types where either makes sense).

+

10.8.2 Value type constraints do +not suffice to describe algorithm behavior

+

LEWG’s 2020 review of P1673R2 asked us to investigate +conceptification of its algorithms. “Conceptification” here refers to an +effort like that of P1813R0 (“A Concept Design for the Numeric +Algorithms”), to come up with concepts that could be used to constrain +the template parameters of numeric algorithms like reduce +or transform. (We are not referring to LEWGI’s request for +us to consider generalizing our algorithm’s parameters from +mdspan to a hypothetical multidimensional array concept. We +discuss that above, in the “Why use mdspan?” section.) The +numeric algorithms are relevant to P1673 because many of the algorithms +proposed in P1673 look like generalizations of reduce or +transform. We intend for our algorithms to be generic on +their matrix and vector element types, so these questions matter a lot +to us.

+

We agree that it is useful to set constraints that make it possible +to reason about correctness of algorithms. However, we do not think +constraints on value types suffice for this purpose. First, requirements +like associativity are too strict to be useful for practical types. +Second, what we really want to do is describe the behavior of +algorithms, regardless of value types’ semantics. “The algorithm may +reorder sums” means something different than “addition on the terms in +the sum is associative.”

+

10.8.3 Associativity is too +strict

+

P1813R0 requires associative addition for many algorithms, such as +reduce. However, many practical arithmetic systems that +users might like to use with algorithms like reduce have +non-associative addition. These include

+
    +
  • systems with rounding;

  • +
  • systems with an “infinity”: e.g., if 10 is Inf, 3 + 8 - 7 could +be either Inf or 4; and

  • +
  • saturating arithmetic: e.g., if 10 saturates, 3 + 8 - 7 could be +either 3 or 4.

  • +
+

Note that the latter two arithmetic systems have nothing to do with +rounding error. With saturating integer arithmetic, parenthesizing a sum +in different ways might give results that differ by as much as the +saturation threshold. It’s true that many non-associative arithmetic +systems behave “associatively enough” that users don’t fear +parallelizing sums. However, a concept with an exact property (like +“commutative semigroup”) isn’t the right match for “close enough,” just +like operator== isn’t the right match for describing +“nearly the same.” For some number systems, a rounding error bound might +be more appropriate, or guarantees on when underflow or overflow may +occur (as in POSIX’s hypot).

+

The problem is a mismatch between the requirement we want to express +– that “the algorithm may reparenthesize addition” – and the constraint +that “addition is associative.” The former describes the algorithm’s +behavior, while the latter describes the types used with that algorithm. +Given the huge variety of possible arithmetic systems, an approach like +the Standard’s use of GENERALIZED_SUM to describe +reduce and its kin seems more helpful. If the Standard +describes an algorithm in terms of GENERALIZED_SUM, then that +tells the caller what the algorithm might do. The caller then takes +responsibility for interpreting the algorithm’s results.

+

We think this is important both for adding new algorithms (like those +in this proposal) and for defining behavior of an algorithm with respect +to different ExecutionPolicy arguments. (For instance, +execution::par_unseq could imply that the algorithm might +change the order of terms in a sum, while execution::par +need not. Compare to MPI_Op_create’s commute +parameter, that affects the behavior of algorithms like +MPI_Reduce when used with the resulting user-defined +reduction operator.)

+

10.8.4 Generalizing associativity +helps little

+

Suppose we accept that associativity and related properties are not +useful for describing our proposed algorithms. Could there be a +generalization of associativity that would be useful? P1813R0’s +most general concept is a magma. Mathematically, a +magma is a set M with a binary operation ×, such that if a and +b are in M, then a × b is in M. The operation need not be associative or +commutative. While this seems almost too general to be useful, there are +two reasons why even a magma is too specific for our proposal.

+
    +
  1. A magma only assumes one set, that is, one type. This does not +accurately describe what the algorithms do, and it excludes useful +features like mixed precision and types that use expression +templates.

  2. +
  3. A magma is too specific, because algorithms are useful even if +the binary operation is not closed.

  4. +
+

First, even for simple linear algebra operations that “only” use plus +and times, there is no one “set M” over which plus and times operate. +There are actually three operations: plus, times, and assignment. Each +operation may have completely heterogeneous input(s) and output. The +sets (types) that may occur vary from algorithm to algorithm, depending +on the input type(s), and the algebraic expression(s) that the algorithm +is allowed to use. We might need several different concepts to cover all +the expressions that algorithms use, and the concepts would end up being +less useful to users than the expressions themselves.

+

For instance, consider the Level 1 BLAS “AXPY” function. This +computes y[i] = alpha * x[i] + y[i] elementwise. What type +does the expression alpha * x[i] + y[i] have? It doesn’t +need to have the same type as y[i]; it just needs to be +assignable to y[i]. The types of alpha, +x[i], and y[i] could all differ. As a simple +example, alpha might be int, x[i] +might be float, and y[i] might be +double. The types of x[i] and +y[i] might be more complicated; e.g., x[i] +might be a polynomial with double coefficients, and +y[i] a polynomial with float coefficients. If +those polynomials use expression templates, then slightly different sum +expressions involving x[i] and/or y[i] (e.g., +alpha * x[i] + y[i], x[i] + y[i], or +y[i] + x[i]) might all have different types, all of which +differ from value type of x or y. All of these +types must be assignable and convertible to the output value type.

+

We could try to describe this with a concept that expresses a sum +type. The sum type would include all the types that might show up in the +expression. However, we do not think this would improve clarity over +just the expression itself. Furthermore, different algorithms may need +different expressions, so we would need multiple concepts, one for each +expression. Why not just use the expressions to describe what the +algorithms can do?

+

Second, the magma concept is not helpful even if we only had one set +M, because our algorithms would still be useful even if binary +operations were not closed over that set. For example, consider a +hypothetical user-defined rational number type, where plus and times +throw if representing the result of the operation would take more than a +given fixed amount of memory. Programmers might handle this exception by +falling back to different algorithms. Neither plus or times on this type +would satisfy the magma requirement, but the algorithms would still be +useful for such a type. One could consider the magma requirement +satisfied in a purely syntactic sense, because of the return type of +plus and times. However, saying that would not accurately express the +type’s behavior.

+

This point returns us to the concerns we expressed earlier about +assuming associativity. “Approximately associative” or “usually +associative” are not useful concepts without further refinement. The way +to refine these concepts usefully is to describe the behavior of a type +fully, e.g., the way that IEEE 754 describes the behavior of +floating-point numbers. However, algorithms rarely depend on all the +properties in a specification like IEEE 754. The problem, again, is that +we need to describe what algorithms do – e.g., that they can rearrange +terms in a sum – not how the types that go into the algorithms +behave.

+

In summary:

+
    +
  • Many useful types have nonassociative or even non-closed +arithmetic.

  • +
  • Lack of (e.g.,) associativity is not just a rounding error +issue.

  • +
  • It can be useful to let algorithms do things like reparenthesize +sums or products, even for types that are not associative.

  • +
  • Permission for an algorithm to reparenthesize sums is not the +same as a concept constraining the terms in the sum.

  • +
  • We can and do use existing Standard language, like +GENERALIZED_SUM, for expressing permissions that algorithms +have.

  • +
+

In the sections that follow, we will describe a different way to +constrain the matrix and vector element types and scalars in our +algorithms. We will start by categorizing the different quality of +implementation (QoI) enhancements that implementers might like to make. +These enhancements call for changing algorithms in different ways. We +will distinguish textbook from non-textbook ways of changing algorithms, +explain that we only permit non-textbook changes for floating-point +types, then develop constraints on types that permit textbook +changes.

+

10.8.5 Categories of QoI +enhancements

+

An important goal of constraining our algorithms is to give +implementers the freedom to make QoI enhancements. We categorize QoI +enhancements in three ways:

+
    +
  1. those that depend entirely on the computer architecture;

  2. +
  3. those that might have architecture-dependent parameters, but +could otherwise be written in an architecture-independent way; +and

  4. +
  5. those that diverge from a textbook description of the algorithm, +and depend on element types having properties more specific than what +that description requires.

  6. +
+

An example of Category (1) would be special hardware instructions +that perform matrix-matrix multiplications on small, fixed-size blocks. +The hardware might only support a few types, such as integers, +fixed-point reals, or floating-point types. Implementations might use +these instructions for the entire algorithm, if the problem sizes and +element types match the instruction’s requirements. They might also use +these instructions to solve subproblems. In either case, these +instructions might reorder sums or create temporary values.

+

Examples of Category (2) include blocking to increase cache or +translation lookaside buffer (TLB) reuse, or using SIMD instructions +(given the Parallelism TS’ inclusion of SIMD). Many of these +optimizations relate to memory locality or parallelism. For an overview, +see (Goto 2008) or Section 2.6 of (Demmel 1997). All such optimizations +reorder sums and create temporary values.

+

Examples of Category (3) include Strassen’s algorithm for matrix +multiplication. The textbook formulation of matrix multiplication only +uses additions and multiplies, but Strassen’s algorithm also performs +subtractions. A common feature of Category (3) enhancements is that +their implementation diverges from a “textbook description of the +algorithm” in ways beyond just reordering sums. As a “textbook,” we +recommend either (Strang 2016), or the concise mathematical description +of operations in the BLAS Standard. In the next section, we will list +properties of textbook descriptions, and explain some ways in which QoI +enhancements might fail to adhere to those properties.

+

10.8.6 Properties of textbook +algorithm descriptions

+

“Textbook descriptions” of the algorithms we propose tend to have the +following properties. For each property, we give an example of a +“non-textbook” algorithm, and how it assumes something extra about the +matrix’s element type.

+
    +
  1. They compute floating-point sums straightforwardly (possibly +reordered, or with temporary intermediate values), rather than using any +of several algorithms that improve accuracy (e.g., compensated +summation) or even make the result independent of evaluation order (see +Demmel 2013). All such non-straightforward algorithms depend on +properties of floating-point arithmetic. We will define below what +“possibly reordered, or with temporary intermediate values” +means.

  2. +
  3. They use only those arithmetic operations on the matrix and +vector element types that the textbook description of the algorithm +requires, even if using other kinds of arithmetic operations would +improve performance or give an asymptotically faster algorithm.

  4. +
  5. They use exact algorithms (not considering rounding error), +rather than approximations (that would not be exact even if computing +with real numbers).

  6. +
  7. They do not use parallel algorithms that would give an +asymptotically faster parallelization in the theoretical limit of +infinitely many available parallel processing units, at the cost of +likely unacceptable rounding error in floating-point +arithmetic.

  8. +
+

As an example of (b), the textbook matrix multiplication algorithm +only adds or multiplies the matrices’ elements. In contrast, Strassen’s +algorithm for matrix-matrix multiply subtracts as well as adds and +multiplies the matrices’ elements. Use of subtraction assumes that +arbitrary elements have an additive inverse, but the textbook matrix +multiplication algorithm makes sense even for element types that lack +additive inverses for all elements. Also, use of subtractions changes +floating-point rounding behavior, though that change is understood and +often considered acceptable (see Demmel 2007).

+

As an example of (c), the textbook substitution algorithm for solving +triangular linear systems is exact. In contrast, one can approximate +triangular solve with a stationary iteration. (See, e.g., Section 5 of +(Chow 2015). That paper concerns the sparse matrix case; we cite it +merely as an example of an approximate algorithm, not as a +recommendation for dense triangular solve.) Approximation only makes +sense for element types that have enough precision for the approximation +to be accurate. If the approximation checks convergence, then the +algorithm also requires less-than comparison of absolute values of +differences.

+

Multiplication by the reciprocal of a number, rather than division by +that number, could fit into (b) or (c). As an example of (c), +implementations for hardware where floating-point division is slow +compared with multiplication could use an approximate reciprocal +multiplication to implement division.

+

As an example of (d), the textbook substitution algorithm for solving +triangular linear systems has data dependencies that limit its +theoretical parallelism. In contrast, one can solve a triangular linear +system by building all powers of the matrix in parallel, then solving +the linear system as with a Krylov subspace method. This approach is +exact for real numbers, but commits too much rounding error to be useful +in practice for all but the smallest linear systems. In fact, the +algorithm requires that the matrix’s element type have precision +exponential in the matrix’s dimension.

+

Many of these non-textbook algorithms rely on properties of +floating-point arithmetic. Strassen’s algorithm makes sense for unsigned +integer types, but it could lead to unwarranted and unexpected overflow +for signed integer types. Thus, we think it best to limit implementers +to textbook algorithms, unless all matrix and vector element types are +floating-point types. We always forbid non-textbook algorithms of type +(d). If all matrix and vector element types are floating-point types, we +permit non-textbook algorithms of Types (a), (b), and (c), under two +conditions:

+
    +
  1. they satisfy the complexity requirements; and

  2. +
  3. they result in a logarithmically stable algorithm, in +the sense of (Demmel 2007).

  4. +
+

We believe that Condition (2) is a reasonable interpretation of +Section 2.7 of the BLAS Standard. This says that “no particular +computational order is mandated by the function specifications. In other +words, any algorithm that produces results ‘close enough’ to the usual +algorithms presented in a standard book on matrix computations is +acceptable.” Examples of what the BLAS Standard considers “acceptable” +include Strassen’s algorithm, and implementing matrix multiplication as +C = (alpha * A) * B + (beta * C), +C = alpha * (A * B) + (beta * C), or +C = A * (alpha * B) + (beta * C).

+

“Textbook algorithms” includes optimizations commonly found in BLAS +implementations. This includes any available hardware acceleration, as +well as the locality and parallelism optimizations we describe below. +Thus, we think restricting generic implementations to textbook +algorithms will not overly limit implementers.

+

Acceptance of P1467 (“Extended floating-point types and standard +names”) into C++23 means that the set of floating-point types has grown. +Before P1467, this set had three members: float, +double, and long double. After P1467, it +includes implementation-specific types, such as short or +extended-precision floats. This change may require implementers to look +carefully at the definition of logarithmically stable before making +certain algorithmic choices, especially for short floats.

+

10.8.7 Reordering sums and creating +temporaries

+

Even textbook descriptions of linear algebra algorithms presume the +freedom to reorder sums and create temporary values. Optimizations for +memory locality and parallelism depend on this. This freedom imposes +requirements on algorithms’ matrix and vector element types.

+

We could get this freedom either by limiting our proposal to the +Standard’s current arithmetic types, or by forbidding reordering and +temporaries for types other than arithmetic types. However, doing so +would unnecessarily prevent straightforward optimizations for small and +fast types that act just like arithmetic types. This includes so-called +“short floats” such as bfloat16 or binary16, extended-precision +floating-point numbers, and fixed-point reals. Some of these types may +be implementation defined, and others may be user-specified. We intend +to permit implementers to optimize for these types as well. This +motivates us to describe our algorithms’ type requirements in a generic +way.

+

10.8.7.1 Special case: Only one +element type

+

We find it easier to think about type requirements by starting with +the assumption that all element and scalar types in algorithms are the +same. One can then generalize to input element type(s) that might differ +from the output element type and/or scalar result type.

+

Optimizations for memory locality and parallelism both create +temporary values, and change the order of sums. For example, +reorganizing matrix data to reduce stride involves making a temporary +copy of a subset of the matrix, and accumulating partial sums into the +temporary copy. Thus, both kinds of optimizations impose a common set of +requirements and assumptions on types. Let value_type be +the output mdspan’s value_type. +Implementations may:

+
    +
  1. create arbitrarily many objects of type value_type, +value-initializing them or direct-initializing them with any existing +object of that type;

  2. +
  3. perform sums in any order; or

  4. +
  5. replace any value with the sum of that value and a +value-initialized value_type object.

  6. +
+

Assumption (1) implies that the output value type is +semiregular. Contrast with +[algorithms.parallel.exec]: “Unless otherwise stated, +implementations may make arbitrary copies of elements of type +T, from sequences where +is_trivially_copy_constructible_v<T> and +is_trivially_destructible_v<T> are true.” We omit the +trivially constructible and destructible requirements here and permit +any semiregular type. Linear algebra algorithms assume +mathematical properties that let us impose more specific requirements +than general parallel algorithms. Nevertheless, implementations may want +to enable optimizations that create significant temporary storage only +if the value type is trivially constructible, trivially destructible, +and not too large.

+

Regarding Assumption (2): The freedom to compute sums in any order is +not necessarily a type constraint. Rather, it’s a right that the +algorithm claims, regardless of whether the type’s addition is +associative or commutative. For example, floating-point sums are not +associative, yet both parallelization and customary linear algebra +optimizations rely on reordering sums. See the above “Value type +constraints do not suffice to describe algorithm behavior” section for a +more detailed explanation.

+

Regarding Assumption (3), we do not actually say that +value-initialization produces a two-sided additive identity. What +matters is what the algorithm’s implementation may do, not whether the +type actually behaves in this way.

+

10.8.7.2 General case: Multiple +input element types

+

An important feature of P1673 is the ability to compute with mixed +matrix or vector element types. For instance, +add(y, scaled(alpha, x), z) implements the operation z = y ++ alpha*x, an elementwise scaled vector sum. The element types of the +vectors x, y, and z could be all different, and could differ from the +type of alpha.

+
10.8.7.2.1 Accumulate into output +value type
+

Generic algorithms would use the output mdspan’s +value_type to accumulate partial sums, and for any +temporary results. This is the analog of std::reduce’s +scalar result type T. Implementations for floating-point +types might accumulate into higher-precision temporaries, or use other +ways to increase accuracy when accumulating partial sums, but the output +mdspan’s value_type would still control +accumulation behavior in general.

+
10.8.7.2.2 Proxy references or +expression templates
+
    +
  1. Proxy references: The input and/or output mdspan +might have an accessor with a reference type other than +element_type&. For example, the output +mdspan might have a value type value_type, but +its reference type might be +atomic_ref<value_type>.

  2. +
  3. Expression templates: The element types themselves might have +arithmetic operations that defer the actual computation until the +expression is assigned. These “expression template” types typically hold +some kind of reference or pointer to their input arguments.

  4. +
+

Neither proxy references nor expression template types are +semiregular, because they behave like references, not like +values. However, we can still require that their underlying value type +be semiregular. For instance, the possibility of proxy +references just means that we need to use the output +mdspan’s value_type when constructing or +value-initializing temporary values, rather than trying to deduce the +value type from the type of an expression that indexes into the output +mdspan. Expression templates just mean that we need to use +the output mdspan’s value_type to construct or +value-initialize temporaries, rather than trying to deduce the +temporaries’ type from the right-hand side of the expression.

+

The z = y + alpha*x example above shows that some of the +algorithms we propose have multiple terms in a sum on the right-hand +side of the expression that defines the algorithm. If algorithms have +permission to rearrange the order of sums, then they need to be able to +break up such expressions into separate terms, even if some of those +expressions are expression templates.

+

10.8.8 “Textbook” algorithm +description in semiring terms

+

As we explain in the “Value type constraints do not suffice to +describe algorithm behavior” section above, we deliberately do not constrain +matrix and vector element types to require associative addition. This +means that we do not, for instance, define concepts like “ring” or +“group.” We cannot even speak of a single set of values that would +permit defining things like a “ring” or “group.” This is because our +algorithms must handle mixed value types, expression templates, and +proxy references. However, it may still be helpful to use mathematical +language to explain what we mean by “a textbook description of the +algorithm.”

+

Most of the algorithms we propose only depend on addition and +multiplication. We describe these algorithms as if working on elements +of a semiring with possibly noncommutative multiplication. The +only difference between a semiring and a ring is that a semiring does +not require all elements to have an additive inverse. That is, addition +is allowed, but not subtraction. Implementers may apply any mathematical +transformation to the expressions that would give the same result for +any semiring.

+

10.8.8.1 Why a semiring?

+

We use a semiring because

+
    +
  1. we generally want to reorder terms in sums, but we do not want to +order terms in products; and

  2. +
  3. we do not want to assume that subtraction works.

  4. +
+

The first is because linear algebra computations are useful for +matrix or vector element types with noncommutative multiplication, such +as quaternions or matrices. The second is because algebra operations +might be useful for signed integers, where a formulation using +subtraction risks unexpected undefined behavior.

+

10.8.8.2 Semirings and testing

+

It’s important that implementers be able to test our proposed +algorithms for custom element types, not just the built-in arithmetic +types. We don’t want to require hypothetical “exact real arithmetic” +types that take particular expertise to implement. Instead, we propose +testing with simple classes built out of unsigned integers. This section +is not part of our Standard Library proposal, but we include it to give +guidance to implementers and to show that it’s feasible to test our +proposal.

+

10.8.8.3 Commutative +multiplication

+

C++ unsigned integers implement commutative rings. (Rings always have +commutative addition; a “commutative ring” has commutative +multiplication as well.) We may transform (say) uint32_t +into a commutative semiring by wrapping it in a class that does not +provide unary or binary operator-. Adding a “tag” template +parameter to this class would let implementers build tests for mixed +element types.

+

10.8.8.4 Noncommutative +multiplication

+

The semiring of 2x2 matrices with element type a commutative semiring +is itself a semiring, but with noncommutative multiplication. This is a +good way to build a noncommutative semiring for testing.

+

10.8.9 Summary

+
    +
  • Constraining the matrix and vector element types and scalar types +in our functions gives implementers the freedom to make QoI enhancements +without risking correctness.

  • +
  • We think describing algorithms’ behavior and implementation +freedom is more useful than mathematical concepts like “ring.” For +example, we permit implementations to reorder sums, but this does not +mean that they assume sums are associative. This is why we do not define +a hierarchy of number concepts.

  • +
  • We categorize different ways that implementers might like to +change algorithms, list categories we exclude and categories we permit, +and use the permitted categories to derive constraints on the types of +matrix and vector elements and scalar results.

  • +
  • We explain how a semiring is a good way to talk about +implementation freedom, even though we do not think it is a good way to +constrain types. We then use the semiring description to explain how +implementers can test generic algorithms.

  • +
+

10.9 Fix issues with +complex, and support user-defined complex number types

+

10.9.1 Motivation and summary of +solution

+

The BLAS provides functionality specifically for vectors and matrices +of complex numbers. Built into the BLAS is the assumption that the +complex conjugate of a real number is just the number. This makes it +possible to write algorithms that are generic on whether the vector or +matrix value type is real or complex. For example, such generic +algorithms can use the “conjugate transpose” for both cases. P1673 users +may thus want to apply conjugate_transposed to matrices of +arbitrary value types.

+

The BLAS also needs to distinguish between complex and real value +types, because some BLAS functions behave differently for each. For +example, for real numbers, the BLAS functions SASUM and +DASUM compute the sum of absolute values of the vector’s +elements. For complex numbers, the corresponding BLAS functions +SCASUM and DZASUM compute the sum of absolute +values of the real and imaginary components, rather than the sum of +magnitudes. This is an optimization to avoid square roots (as Appendix +A.1 of the BLAS Standard explains), but it is well founded in the theory +of accuracy of matrix factorizations.

+

The C++ Standard’s functions conj, real, +and imag return the complex conjugate, real part, and +imaginary part of a complex number, respectively. They also have +overloads for arithmetic types that give the expected mathematical +behavior: the conjugate or real part of a real number is the number +itself, and the imaginary part of a real number is zero. Thus, it would +make sense for P1673 to use these functions to express generic +algorithms with complex or real numbers. However, these functions have +three issues that preclude their direct use in P1673’s algorithms.

+
    +
  1. The existing overloads of these functions in the std +namespace do not always preserve the type of their argument. For +arguments of arithmetic type, conj returns +complex. For arguments of integral type, real +and imag always return double. The resulting +value type change is mathematically unexpected in generic algorithms, +can cause information loss for 64-bit integers, and can hinder use of an +optimized BLAS.

  2. +
  3. Users sometimes need to define their own “custom” complex number +types instead of using specializations of complex, but +users cannot overload functions in the std namespace for +these types.

  4. +
  5. How does P1673 tell if a user-defined type represents a complex +number?

  6. +
+

Libraries such as Kokkos address Issue (2) by defining +conj, real, and imag functions in +the library’s namespace (not std). Generic code then can +rely on argument-dependent lookup (ADL) by invoking these functions +without namespace qualification. The library’s functions help make the +custom complex type “interface-compatible” with +std::complex. This ADL solution long predates any more +elaborate customization point technique. In our experience, many +mathematical libraries use this approach.

+

This common practice suggests a way for P1673 to solve the other +issues as well. A user-defined type represents a complex number if and +only if it has ADL-findable conj, real, and +imag functions. This means that P1673 can “wrap” these +functions to make them behave as mathematically expected for any value +type. These “wrappers” can also adjust the behavior of +std::conj, std::real, and +std::imag for arithmetic types.

+

P1673 defines these wrappers as the exposition-only functions +conj-if-needed, +real-if-needed, and +imag-if-needed.

+

10.9.2 Why users define their own +complex number types

+

Users define their own complex number types for three reasons.

+
    +
  1. The Standard only permits complex<R> if +R is a floating-point type.

  2. +
  3. complex<R> only has sizeof(R) +alignment, not 2 * sizeof(R).

  4. +
  5. Some C++ extensions cannot use complex<R>, +because they require annotations on a type’s member functions.

  6. +
+

First, the C++ Standard currently permits +complex<R> only if R is a floating-point +type. Before adoption of P1467 into C++23, this meant that +R could only be float, double, or +long double. Even after adoption of P1467, implementations +are not required to provide other floating-point types. This prevents +use of other types in portable C++, such as

+
    +
  • “short” low-precision floating-point or fixed-point number types +that are critical for performance of machine learning and signal +processing applications;

  • +
  • signed integers (the resulting complex numbers represent the +Gaussian integers);

  • +
  • extended-precision floating-point types that can improve the +accuracy of floating-point sums, reduce parallel nondeterminism, and +make sums less dependent on evaluation order; and

  • +
  • custom number types such as “bigfloats” (arbitrary-precision +floating-point types) or fractions of integers.

  • +
+

Second, the Standard explicitly specifies that +complex<R> has the same alignment as +R[2]. That is, it is aligned to sizeof(R). +Some systems would give better parallel or vectorized performance if +complex numbers were aligned to 2 * sizeof(R). Some C++ +extensions define their own complex number types partly for this reason. +Software libraries that use these custom complex number types tempt +users to alias between complex<R> and these custom +types, which would have the same bit representation except for +alignment. This has led to crashes or worse in software projects that +the authors have worked on. Third, some C++ extensions cannot use +complex, because they require types’ member functions to +have special annotations, in order to compile code to make use of +accelerator hardware.

+

These issues have led several software libraries and C++ extensions +to define their own complex number types. These include CUDA, Kokkos, +and Thrust. The SYCL standard is contemplating adding a custom complex +number type. One of the authors wrote Kokkos::complex circa +2014 to make it possible to build and run Trilinos’ linear solvers with +such C++ extensions.

+

10.9.3 Why users want to +“conjugate” matrices of real numbers

+

It’s possible to describe many linear algebra algorithms in a way +that works for both complex and real numbers, by treating conjugation as +the identity for real numbers. This makes the “conjugate transpose” just +the transpose for a matrix of real numbers. Matlab takes this approach, +by defining the single quote operator to take the conjugate transpose if +its argument is complex, and the transpose if its argument is real. The +Fortran BLAS also supports this, by letting users specify the +'Conjugate Transpose' (TRANSA='C') even for +real routines like DGEMM (double-precision general +matrix-matrix multiply). Krylov subspace methods in Trilinos’ Anasazi +and Belos packages also follow a Matlab-like generic approach.

+

Even though we think it should be possible to write “generic” (real +or complex) linear algebra code using conjugate_transposed, +we still need to distinguish between symmetric and Hermitian matrix +algorithms. This is because symmetric does not mean the same +thing as Hermitian for matrices of complex numbers. For example, a +matrix whose off-diagonal elements are all 3 + 4i is +symmetric, but not Hermitian. Complex symmetric matrices are useful in +practice, for example when modeling damped vibrations (Craven 1969).

+

10.9.4 Effects of +conj’s real-to-complex change

+

The fact that conj for arithmetic-type arguments returns +complex may complicate or prevent implementers from using +an existing optimized BLAS library. If the user calls +matrix_product with matrices all of value type +double, use of the (mathematically harmless) +conjugate_transposed function would make one matrix have +value type complex<double>. Implementations could +undo this value type change for known layouts and accessors, but would +need to revert to generic code otherwise.

+

For example, suppose that a custom real value type +MyReal has arithmetic operators defined to permit all +needed mixed-type expressions with double, where +double times MyReal and MyReal +times double both “promote” to MyReal. Users +may then call matrix_product(A, B, C) with A +having value type double, B having value type +MyReal, and C having a value type +MyReal. However, +matrix_product(conjugate_transposed(A), B, C) would not +compile, due to +complex<decltype(declval<double>() * declval<MyReal>())> +not being well formed.

+

10.9.5 LEWG feedback on R8 +solution

+

In R8 of this paper, we proposed an exposition-only function +conj-if-needed. For arithmetic types, it would be +the identity function. This would fix Issue (1). For all other types, it +would call conj through argument-dependent lookup (ADL), +just like how iter_swap calls swap. This would +fix Issue (2). However, it would force users who define custom +real number types to define a trivial conj (in +their own namespace) for their type. The alternative would be to make +conj-if-needed the identity if it could not find +conj via ADL lookup. However, that would cause silently +incorrect results for users who define a custom complex number type, but +forget or misspell conj.

+

When reviewing R8, LEWG expressed a preference for a different +solution.

+
    +
  1. Temporarily change P1673 to permit use of conjugated +and conjugate_transposed only for value types that are +either complex or arithmetic types. Add a Note that reminds +readers to look out for Steps (2) and (3) below.

  2. +
  3. Write a separate paper which introduces a user-visible +customization point, provisionally named conjugate. The +paper could use any of various proposed library-only customization point +mechanisms, such as the customization point objects used by ranges or +tag_invoke (see +P1895R0), with the expectation +that LEWG and perhaps also EWG (see e.g., +P2547) may express a preference +for a different mechanism.

  4. +
  5. If LEWG accepts the conjugate customization point, +then change P1673 again to use conjugate (thus replacing +R8’s conj-if-needed). This would thus reintroduce +support for custom complex numbers.

  6. +
+

10.9.6 SG6’s response to LEWG’s R8 +feedback

+

SG6 small group (there was no quorum) reviewed P1673 on 2022/06/09, +after LEWG’s R8 review on 2022/05/24. SG6 small group expressed the +following:

+
    +
  • Being able to write conjugated(A) or +conjugate_transposed(A) for a matrix or vector +A of user-defined types is reasonably integral to the +proposal. We generally oppose deferring it based on the hope that +we’ll be able to specify it in a nicer way in the future, with some new +customization point syntax.

  • +
  • A simple, teachable rule: Do ADL-ONLY lookup (preventing finding +std::conj for primitive types) for conj (as +with ranges); if you find something you use it, and if you don’t, you do +nothing (conjugation is the identity). (“Primitives aren’t that +special.”) Benefit is that custom real types work out of the +box.

  • +
  • The alternative: specify that if users choose to use +conjugated or conjugate_transposed with a +user-defined type, then they MUST supply the conj +ADL-findable thing, else ill-formed. This is a safety mechanism that may +not have been considered previously by LEWG. (Make primitives special, +to regain safety. The cost is that custom real types need to have a +conj ADL-findable, if users use conjugated or +conjugate_transposed.)

  • +
+

10.9.7 Current solution

+

We have adopted SG6 small group’s recommendation, with a slight +wording modification to make it obvious that the conjugate of an +arithmetic type returns the same type as its input.

+

We propose an exposition-only function object +conj-if-needed. For arithmetic types, it behaves +as the identity function. If it can call conj through +ADL-only (unqualified) lookup, it does so. Otherwise, it again behaves +as the identity function.

+

We take the same approach to fix the issues discussed above with +real and imag. That is, we define +exposition-only functions real-if-needed and +imag-if-needed. These assume that any +non-arithmetic type without ADL-findable real resp. +imag is a non-complex type.

+

This approach has the following advantages.

+
    +
  1. Most custom number types, noncomplex or complex, will work “out +of the box.” Existing custom complex number types likely already have +ADL-findable conj, real, and +imag. If they do not, then users can define them.

  2. +
  3. It ensures type preservation for arithmetic types.

  4. +
  5. It uses existing C++ idioms and interfaces for complex +numbers.

  6. +
  7. It does not depend on a future customization point syntax or +library convention.

  8. +
+

10.10 Support for division with +noncommutative multiplication

+

An important feature of this proposal is its support for value types +that have noncommutative multiplication. Examples include square +matrices with a fixed number of rows and columns, and quaternions and +their generalizations. Most of the algorithms in this proposal only add +or multiply arbitrary value types, so preserving the order of +multiplication arguments is straightforward. The various triangular +solve algorithms are an exception, because they need to perform +divisions as well.

+

If multiplication commutes and if a type has division, then the +division x ÷ y is just x times (the multiplicative inverse of y), +assuming that the multiplicative inverse of y exists. However, if +multiplication does not commute, “x times (the multiplicative inverse of +y)” need not equal “(the multiplicative inverse of y) times x.” The C++ +binary operator/ does not give callers a way to distinguish +between these two cases.

+

This suggests four ways to express “ordered division.”

+
    +
  1. Explicitly divide one by the quotient: x * (1/y), or +(1/y) * x

  2. +
  3. Like (1), but instead of using literal 1, get “one”
+as a value_type input to the algorithm:
+x * (one/y), or (one/y) * x

  4. +
  5. inverse as a unary callable input to the algorithm: +x * inverse(y), or inverse(y) * x

  6. +
  7. divide as a binary callable input to the algorithm: +divide(x, y), or divide(y, x)

  8. +
+

Both SG6 small group (in its review of this proposal on 2022/06/09) +and the authors prefer Way (4), the divide binary callable +input. The binary callable would be optional, and ordinary binary +operator/ would be used as the default. This would imitate +existing Standard Library algorithms like reduce, with its +optional BinaryOp that defaults to addition. For +mixed-precision computation, std::divides<void>{} or +the equivalent +[](const auto& x, const auto& y) { return x / y; } +should work just fine. Users should avoid specifying T +other than void in std::divides<T> for +mixed-precision computation, as std::divides<T> would +coerce both its arguments to T and then force the division +result to be T as well. Way (4) also preserves the original +rounding behavior for types with commutative multiplication.

+

The main disadvantage of the other approaches is that they would +change rounding behavior for floating-point types. They also require two +operations – computing an inverse, and multiplication – rather than one. +“Ordered division” may actually be the operation users want, and the +“inverse” might be just a byproduct. This is the case for square +matrices, where users often “compute an inverse” only because they want +to solve a linear system. Each of the other approaches has its own other +disadvantages.

+
    +
  • Way (1) would assume that an overloaded +operator/(int, value_type) exists, and that the literal +1 behaves like a multiplicative identity. In practice, not +all custom number types may have defined mixed arithmetic with +int.

  • +
  • Way (2) would complicate the interface. Users might make the +mistake of passing in literal 1 (of type int) +or 1.0 (of type double) as the value of one, +thus leading to Way (1)’s issues.

  • +
  • Way (3) would again complicate the interface. Users would be +tempted to use [](const auto& y) { return 1 / y; } as +the inverse function, thus leading back to Way (1)’s issues.

  • +
+

10.11 Packed layout’s Triangle must +match function’s Triangle

+

10.11.1 Summary

+
    +
  • P1673 symmetric_*, hermitian_*, and +triangular_* functions always take Triangle +(and DiagonalStorage, if applicable) parameters so that the +functions always have the same signature, regardless of +mdspan layout.

  • +
  • For symmetric or Hermitian algorithms, mismatching the mdspan +layout’s Triangle and the function’s Triangle would have no +effect.

  • +
  • For triangular algorithms, mismatch would have an effect that +users likely would not want. (It means “use the other triangle,” which +is zero.) Thus, it’s reasonable to make mismatch an error.

  • +
  • In practice, users aren’t likely to encounter a triangular packed +matrix in isolation. Such matrices usually occur in context of symmetric +or Hermitian packed matrices. A common user error might thus be +mismatching the Triangles for both symmetric or Hermitian functions, and +triangular functions. The first is harmless; the second is likely an +error.

  • +
  • Therefore, we recommend

    +
      +
    1. retaining the Triangle (and +DiagonalStorage, if applicable) parameters;

    2. +
    3. making it a Mandate that the layout’s Triangle match +the function’s Triangle parameter, for all the +functions (not just the triangular_* ones).

    4. +
  • +
+

10.11.2 When do packed formats show +up in practice?

+

Users aren’t likely to encounter a triangular packed matrix in +isolation. It generally comes as an in-place transformation (e.g., +factorization) of a symmetric or Hermitian packed matrix. For example, +LAPACK’s DSPTRF (Double-precision Symmetric Packed +TRiangular Factorization) computes a symmetric LDLT +(or UDUT) +factorization in place, overwriting the input symmetric packed matrix +A. LAPACK’s +DSPTRS (Double-precision Symmetric Packed TRiangular Solve) +then uses the result of DSPTRF to solve a linear system. +DSPTRF overwrites A with the triangle L (if A uses lower triangle storage, or +U, if A uses upper triangle storage). This +is an idiom for which the BLAS was designed: factorizations typically +overwrite their input, and thus reinterpret the input’s “data structure” +on the fly.

+

10.11.3 What the BLAS does

+

For a summary of the BLAS’ packed storage formats, please refer to +the “Packed +Storage” section of the LAPACK Users’ Guide, Third Edition +(1999).

+

BLAS routines for packed storage have only a single argument, +UPLO. This describes both whether the caller is storing the +upper or lower triangle, and the triangle of the matrix on which the +routine will operate. (Packed BLAS formats always store the diagonal +explicitly; they don’t have the analog of DiagonalStorage.) +An example of a BLAS triangular packed routine is DTPMV, +Double-precision (D) Triangular Packed (TP) Matrix-Vector (MV) +product.

+

BLAS packed formats don’t represent metadata explicitly; the caller +is responsible for knowing whether they are storing the upper or lower +triangle. Getting the UPLO argument wrong makes the matrix +wrong. For example, suppose that the matrix is 4 x 4, and the user’s +array input for the matrix is [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]. If the +user is storing the upper triangle (in column-major order, as the +Fortran BLAS requires), then the matrix looks like this.

+ + + + + + + + + + + + + + + + + + + + + + + + + +
+1 + +2 + +4 + +7 +
+ +3 + +5 + +8 +
+ + +6 + +9 +
+ + + +10 +
+

Mismatching the UPLO argument (by passing in +'Lower triangle' instead of 'Upper triangle') +would result in an entirely wrong matrix – not even the transpose. Note +how the diagonal elements differ, for instance.

+ + + + + + + + + + + + + + + + + + + + + + + + + +
+1 + + + +
+2 + +5 + + +
+3 + +6 + +8 + +
+4 + +7 + +9 + +10 +
+

This would be incorrect for triangular, symmetric, or Hermitian +matrices.

+

10.11.4 P1673’s interpretation of +the BLAS

+

P1673 offers packed formats that encode the Triangle. +This means that the mdspan alone conveys the data +structure. P1673 retains the function’s separate Triangle +parameter so that the function’s signature doesn’t change based on the +mdspan’s layout. P1673 requires that the function’s +Triangle match the mdspan’s Triangle.

+

If P1673 were to permit mismatching the two Triangles, how would the +function reasonably interpret the user’s intent? For triangular matrices +with explicit diagonal, mismatch would mean multiplying by or solving +with a zero triangle matrix. For triangular matrices with implicit unit +diagonal, mismatch would mean multiplying by or solving with a diagonal +matrix of ones – that is, the identity matrix. Users wouldn’t want to do +either one of those.

+

For symmetric matrices, mismatch has no effect; the +mdspan layout’s Triangle rules. For example, +the lower triangle of an upper triangle storage format is just the upper +triangle again. For Hermitian matrices, again, mismatch has no effect. +For example, suppose that the following is the lower triangle +representation of a complex-valued Hermitian matrix (where i is the imaginary unit).

+ + + + + + + + + + + + + + + + +
+1 + 1i + + +
+2 + 2i + +4 + 4i + +
+3 + 3i + +5 + 5i + +6 + 6i +
+

If the user asks the function to operate on the upper triangle of +this matrix, that would imply the following.

+ + + + + + + + + + + + + + + + +
+1 + 1i + +2 − 2i + +3 − 3i +
+ +4 + 4i + +5 − 5i +
+ + +6 + 6i +
+

(Note that the imaginary parts now have negative sign. The matrix is +Hermitian, so A[j,i] equals conj(A[i,j]).) +That’s just the “other triangle” of the matrix. These are Hermitian +algorithms, so they will interpret the “other triangle of the other +triangle” in a way that restores the original matrix. Even though the +user never stores the original matrix, it would look like this +mathematically.

+ + + + + + + + + + + + + + + + +
+1 + 1i + +2 − 2i + +3 − 3i +
+2 + 2i + +4 + 4i + +5 − 5i +
+3 + 3i + +5 + 5i + +6 + 6i +
+

11 Future work

+

11.1 Batched linear algebra

+

We have submitted a separate proposal, +P2901, that adds “batched” +versions of linear algebra functions to this proposal. “Batched” linear +algebra functions solve many independent problems all at once, in a +single function call. For discussion, please see also Section 6.2 of our +background paper P1417R0. +Batched interfaces have the following advantages.

+
    +
  • They expose more parallelism and vectorization opportunities for +many small linear algebra operations.

  • +
  • They are useful for many different fields, including machine +learning.

  • +
  • Hardware vendors currently offer both hardware features and +optimized software libraries to support batched linear algebra.

  • +
  • There is an ongoing interface +standardization effort, in which we participate.

  • +
+

The mdspan data structure makes it easy to represent a +batch of linear algebra objects, and to optimize their data layout.

+

With few exceptions, the extension of this proposal to support +batched operations will not require new functions or interface changes. +Only the requirements on functions will change. Output arguments can +have an additional rank; if so, then the leftmost extent will refer to +the batch dimension. Input arguments may also have an additional rank to +match; if they do not, the function will use (“broadcast”) the same +input argument for all the output arguments in the batch.

+

12 Data structures and utilities +borrowed from other proposals

+

12.1 mdspan

+

This proposal depends on mdspan, a feature proposed by +P0009 and follow-on papers, and voted into C++23. The +mdspan class template views the elements of a +multidimensional array. The rank (number of dimensions) is fixed at +compile time. Users may specify some dimensions at run time and others +at compile time; the type of the mdspan expresses this. +mdspan also has two customization points:

+
    +
  • Layout expresses the array’s memory layout: e.g., +row-major (C++ style), column-major (Fortran style), or strided. We use +a custom Layout later in this paper to implement a +“transpose view” of an existing mdspan.

  • +
  • Accessor defines the storage handle +(data_handle_type) stored in the mdspan, as +well as the reference type returned by its access operator. This is an +extension point for modifying how access happens, for example by using +atomic_ref to get atomic access to every element. We use +custom Accessors later in this paper to implement “scaled +views” and “conjugated views” of an existing +mdspan.

  • +
+

12.2 New mdspan +layouts in this proposal

+

Our proposal uses the layout mapping policy of mdspan in
+order to represent different matrix and vector data layouts. The current
+C++ Standard draft includes three layouts: layout_left,
+layout_right, and layout_stride.
+P2642 proposes two more:
+layout_left_padded and layout_right_padded.
+These two layouts represent exactly the data layout assumed by the
+General (GE) matrix type in the BLAS’ C and Fortran bindings. They have
+two advantages.

+
    +
  1. Unlike layout_left and layout_right, +any “submatrix” (subspan of consecutive rows and consecutive columns) of +a matrix with layout_left_padded resp. +layout_right_padded layout also has +layout_left_padded resp. layout_right_padded +layout.

  2. +
  3. Unlike layout_stride, the two layouts always have +compile-time unit stride in one of the matrix’s two extents.

  4. +
+

BLAS functions call the possibly nonunit stride of the matrix the +“leading dimension” of that matrix. For example, a BLAS function +argument corresponding to the leading dimension of the matrix +A is called LDA, for “leading dimension of the +matrix A.”

+

This proposal introduces a new layout, +layout_blas_packed. This describes the layout used by the +BLAS’ Symmetric Packed (SP), Hermitian Packed (HP), and Triangular +Packed (TP) “types.” The layout_blas_packed class has a +“tag” template parameter that controls its properties; see below.

+

We do not include layouts for unpacked “types,” such as Symmetric
+(SY), Hermitian (HE), and Triangular (TR). Our paper
+P1674 explains our reasoning. In
+summary: Their actual layout – the arrangement of matrix elements in
+memory – is the same as General. The only differences are constraints on
+what entries of the matrix algorithms may access, and assumptions about
+the matrix’s mathematical properties. Trying to express those
+constraints or assumptions as “layouts” or “accessors” violates the
+spirit (and sometimes the law) of mdspan. We address these
+different matrix types with different function names.

+

The packed matrix “types” do describe actual arrangements of matrix +elements in memory that are not the same as in General. This is why we +provide layout_blas_packed. Note that +layout_blas_packed is the first addition to the existing +layouts that is neither always unique, nor always strided.

+

Algorithms cannot be written generically if they permit output +arguments with nonunique layouts. Nonunique output arguments require +specialization of the algorithm to the layout, since there’s no way to +know generically at compile time what indices map to the same matrix +element. Thus, we will impose the following rule: Any +mdspan output argument to our functions must always have +unique layout (is_always_unique() is true), +unless otherwise specified.

+

Some of our functions explicitly require outputs with specific +nonunique layouts. This includes low-rank updates to symmetric or +Hermitian matrices.

+

13 Implementation experience

+

As far as the authors know, there are currently two implementations +of the proposal:

+
    +
  • the reference +implementation written and maintained by the authors and others, +and

  • +
  • an implementation by NVIDIA, which was released as part of their +HPC SDK, and which uses NVIDIA libraries such as cuBLAS to accelerate +many of the algorithms.

  • +
+

This proposal depends on mdspan (adopted into C++23), +submdspan (P2630, adopted into the current C++ draft), and +(indirectly) on the padded mdspan layouts in P2642. The +reference implementation of +mdspan, written and maintained by the authors and +others, includes implementations of all these features.

+

14 Interoperable with other linear +algebra proposals

+

We believe this proposal is complementary to P1385, a proposal for a +C++ Standard linear algebra library that introduces matrix and vector +classes with overloaded arithmetic operators. The P1385 authors and we +have expressed together in a joint paper, +P1891, +that P1673 and P1385 “are orthogonal. They are not competing papers; … +there is no overlap of functionality.”

+

We designed P1673 in part as a natural foundation or implementation +layer for existing libraries with similar design and goals as P1385. Our +view is that a free function interface like P1673’s clearly separates +algorithms from data structures, and more naturally allows for a richer +set of operations such as what the BLAS provides. Our paper +P1674 explains why we think our +proposal is a minimal C++ “respelling” of the BLAS.

+

A natural extension of the present proposal would include accepting +P1385’s matrix and vector objects as input for the algorithms proposed +here. A straightforward way to do that would be for P1385’s matrix and +vector objects to make views of their data available as +mdspan.

+

15 Acknowledgments

+

Sandia National Laboratories is a multimission laboratory managed and +operated by National Technology & Engineering Solutions of Sandia, +LLC, a wholly owned subsidiary of Honeywell International, Inc., for the +U.S. Department of Energy’s National Nuclear Security Administration +under contract DE-NA0003525.

+

Special thanks to Bob Steagall and Guy Davidson for boldly leading +the charge to add linear algebra to the C++ Standard Library, and for +many fruitful discussions. Thanks also to Andrew Lumsdaine for his +pioneering efforts and history lessons. In addition, I very much +appreciate feedback from Davis Herring on constraints wording.

+

16 References

+

16.1 References by coauthors

+
    +
  • G. Ballard, E. Carson, J. Demmel, M. Hoemmen, N. Knight, and O. +Schwartz, “Communication lower +bounds and optimal algorithms for numerical linear algebra,”, +Acta Numerica, Vol. 23, May 2014, pp. 1-155.

  • +
  • G. Davidson, M. Hoemmen, D. S. Hollman, B. Steagall, and C. +Trott, P1891R0, +Oct. 2019.

  • +
  • M. Hoemmen, D. S. Hollman, and C. Trott, “Evolving a Standard C++ +Linear Algebra Library from the BLAS,” +P1674R2, May 2022.

  • +
  • M. Hoemmen, J. Badwaik, M. Brucher, A. Iliopoulos, and J. +Michopoulos, “Historical lessons for C++ linear algebra library +standardization,” P1417R0, +Jan. 2019.

  • +
  • M. Hoemmen, D. S. Hollman, C. Jabot, I. Muerte, and C. Trott, +“Multidimensional subscript operator,” +P2128R6, Sep. 2021.

  • +
  • M. Hoemmen, K. Liegeois, and C. Trott, “Extending Linear Algebra +Support to Batched Operations,” +P2901R0, May 2023.

  • +
  • D. S. Hollman, C. Kohlhoff, B. A. Lelbach, J. Hoberock, G. Brown, +and M. Dominiak, “A General Property Customization Mechanism,” +P1393R0, Jan. 2019.

  • +
  • C. Trott, D. S. Hollman, D. Lebrun-Grandie, M. Hoemmen, D. +Sunderland, H. C. Edwards, B. A. Lelbach, M. Bianco, B. Sander, A. +Iliopoulos, J. Michopoulos, and N. Liber, “mdspan,” +P0009R18, Jul. 2022.

  • +
  • C. Trott, D. S. Hollman, M. Hoemmen, D. Sunderland, and D. +Lebrun-Grandie, “mdarray: An Owning Multidimensional Array +Analog of mdspan”, +P1684R5, May 2023.

  • +
  • C. Trott, D. Lebrun-Grandie, M. Hoemmen, and N. Liber, +“submdspan,” +P2630R4, Jul. 2023.

  • +
+

16.2 Other references

+
    +
  • E. Anderson, “Algorithm 978: Safe Scaling in the Level 1 BLAS,” +ACM Transactions on Mathematical Software, Vol. 44, pp. 1-28, +2017.

  • +
  • E. Anderson et al., LAPACK Users’ +Guide, Third Edition, SIAM, 1999.

  • +
  • “Basic Linear +Algebra Subprograms Technical (BLAST) Forum Standard,” +International Journal of High Performance Applications and +Supercomputing, Vol. 16, No. 1, Spring 2002.

  • +
  • L. S. Blackford, J. Demmel, J. Dongarra, I. Duff, S. Hammarling, +G. Henry, M. Heroux, L. Kaufman, A. Lumsdaine, A. Petitet, R. Pozo, K. +Remington, and R. C. Whaley, “An updated set of basic +linear algebra subprograms (BLAS),” ACM Transactions on +Mathematical Software, Vol. 28, No. 2, Jun. 2002, +pp. 135-151.

  • +
  • J. L. Blue, “A Portable Fortran Program to Find the Euclidean +Norm of a Vector,” ACM Transactions on Mathematical Software, +Vol. 4, pp. 15-23, 1978.

  • +
  • B. D. Craven, +“Complex symmetric +matrices”, Journal of the Australian Mathematical Society, +Vol. 10, No. 3-4, Nov. 1969, pp. 341–354.

  • +
  • E. Chow and A. Patel, “Fine-Grained Parallel Incomplete LU +Factorization”, SIAM J. Sci. Comput., Vol. 37, No. 2, +C169-C193, 2015.

  • +
  • G. Davidson and B. Steagall, “A proposal to add linear algebra +support to the C++ standard library,” +P1385R7, Oct. 2022.

  • +
  • B. Dawes, H. Hinnant, B. Stroustrup, D. Vandevoorde, and M. Wong, +“Direction for ISO C++,” +P0939R4, Oct. 2019.

  • +
  • J. Demmel, “Applied Numerical Linear Algebra,” Society for +Industrial and Applied Mathematics, Philadelphia, PA, 1997, ISBN +0-89871-389-7.

  • +
  • J. Demmel, I. Dumitriu, and O. Holtz, “Fast linear algebra is +stable,” Numerische Mathematik 108 (59-91), 2007.

  • +
  • J. Demmel and H. D. Nguyen, “Fast Reproducible Floating-Point +Summation,” 2013 IEEE 21st Symposium on Computer Arithmetic, 2013, +pp. 163-172, doi: 10.1109/ARITH.2013.9.

  • +
  • J. Dongarra, J. Du Croz, S. Hammarling, and I. Duff, +“A set of level 3 +basic linear algebra subprograms,” ACM Transactions on +Mathematical Software, Vol. 16, No. 1, pp. 1-17, +Mar. 1990.

  • +
  • J. Dongarra, R. Pozo, and D. Walker, “LAPACK++: A Design Overview +of Object-Oriented Extensions for High Performance Linear Algebra,” in +Proceedings of Supercomputing ’93, IEEE Computer Society Press, 1993, +pp. 162-171.

  • +
  • M. Gates, P. Luszczek, A. Abdelfattah, J. Kurzak, J. Dongarra, K. +Arturov, C. Cecka, and C. Freitag, +“C++ +API for BLAS and LAPACK,” SLATE Working Notes, Innovative Computing +Laboratory, University of Tennessee Knoxville, Feb. 2018.

  • +
  • K. Goto and R. A. van de Geijn, “Anatomy of +high-performance matrix multiplication,”, ACM Transactions on +Mathematical Software, Vol. 34, No. 3, pp. 1-25, May 2008.

  • +
  • N. J. Higham, Accuracy and Stability of Numerical
+Algorithms, Second Edition, SIAM, 2002.

  • +
  • N. J. Higham, “FORTRAN codes for +estimating the one-norm of a real or complex matrix, with applications +to condition estimation,” ACM Transactions on Mathematical +Software, Vol. 14, No. 4, pp. 381-396, Dec. 1988.

  • +
  • N. A. Josuttis, “The C++ Standard Library: A Tutorial and +Reference,” Addison-Wesley, 1999.

  • +
  • M. Kretz, “Data-Parallel Vector Types & Operations,” +P0214R9, Mar. 2018.

  • +
  • A. J. Perlis, “Epigrams on programming,” SIGPLAN Notices, Vol. +17, No. 9, pp. 7-13, 1982.

  • +
  • G. Strang, “Introduction to Linear Algebra,” 5th Edition, +Wellesley - Cambridge Press, 2016, ISBN 978-0-9802327-7-6, x+574 +pages.

  • +
  • D. Vandevoorde and N. A. Josuttis, “C++ Templates: The Complete +Guide,” Addison-Wesley Professional, 2003.

  • +
+

17 Dummy Heading To Align Wording +Numbering

+

18 Dummy Heading To Align Wording +Numbering

+

19 Dummy Heading To Align Wording +Numbering

+

20 Dummy Heading To Align Wording +Numbering

+

21 Dummy Heading To Align Wording +Numbering

+

22 Dummy Heading To Align Wording +Numbering

+

23 Dummy Heading To Align Wording +Numbering

+

24 Dummy Heading To Align Wording +Numbering

+

25 Dummy Heading To Align Wording +Numbering

+

26 Dummy Heading To Align Wording +Numbering

+

27 Dummy Heading To Align Wording +Numbering

+

28 Dummy Heading To Align Wording +Numbering

+

28.1 Dummy Heading To Align Wording +Numbering

+

28.2 Dummy Heading To Align Wording +Numbering

+

28.3 Dummy Heading To Align Wording +Numbering

+

28.4 Dummy Heading To Align Wording +Numbering

+

28.5 Dummy Heading To Align Wording +Numbering

+

28.6 Dummy Heading To Align Wording +Numbering

+

28.7 Dummy Heading To Align Wording +Numbering

+

The preceding headings just push the automatic numbering of the +document generator so that “Wording” is in the place of 28.8 Numbers

+

28.8 Wording

+
+

Text in blockquotes is not proposed wording, but rather instructions +for generating proposed wording. The � character is used to denote a +placeholder section number which the editor shall determine.

+

In the Bibliography, add the following reference:

+
+

J. Demmel, I. Dumitriu, and O. Holtz, “Fast linear algebra is +stable,” Numerische Mathematik 108 (59-91), 2007.

+
+

In [algorithms.parallel.defns] modify paragraph 3.1:

+
+
    +
  • (3.1) All +operations of the categories of the iterators or mdspan +types that the algorithm is instantiated with.
  • +
+
+

In [algorithms.parallel.user] modify paragraph 1:

+
+

1 +Unless otherwise specified, function objects passed into parallel +algorithms as objects of type Predicate, +BinaryPredicate, Compare, +UnaryOperation, BinaryOperation, +BinaryOperation1, BinaryOperation2, +BinaryDivideOp and the operators +used by the analogous overloads to these parallel algorithms that are +formed by an invocation with the specified default predicate or +operation (where applicable) shall not directly or indirectly modify +objects via their arguments, nor shall they rely on the identity of the +provided objects.

+
+

In [headers], add the header <linalg> to +[tab:headers.cpp].

+
+
+

In [diff.cpp23] add a new subsection

+
+

Clause 16: library introduction [diff.cpp23.library]

+

Affected subclause: 16.4.2.3

+

Change: New headers

+

Rationale: New functionality.

+

Effect on original feature: The following C++ +headers are new: <linalg>. Valid C++2023 code that +includes headers with these names may be invalid in this revision of +C++.

+
+

In [version.syn], add

+
+
#define __cpp_lib_linalg YYYYMML // also in <linalg>
+
+

Adjust the placeholder value as needed so as to denote this +proposal’s date of adoption.

+

At the end of Table � (“Numerics library summary”) in +[numerics.general], add the following: [linalg], Linear +algebra, <linalg>.

+

At the end of [numerics] (after subsection 28.8 +[numbers]), add all the material that follows.

+
+

28.9 Basic linear algebra +algorithms [linalg]

+

28.9.1 Overview [linalg.overview]

+

1 +Subclause [linalg] defines basic linear algebra algorithms. The +algorithms that access the elements of arrays view those elements +through mdspan [mdspan].

+

28.9.2 Header +<linalg> synopsis [linalg.syn]

+
namespace std::linalg {
+// [linalg.tags.order], storage order tags
+struct column_major_t;
+inline constexpr column_major_t column_major;
+struct row_major_t;
+inline constexpr row_major_t row_major;
+
+// [linalg.tags.triangle], triangle tags
+struct upper_triangle_t;
+inline constexpr upper_triangle_t upper_triangle;
+struct lower_triangle_t;
+inline constexpr lower_triangle_t lower_triangle;
+
+// [linalg.tags.diagonal], diagonal tags
+struct implicit_unit_diagonal_t;
+inline constexpr implicit_unit_diagonal_t implicit_unit_diagonal;
+struct explicit_diagonal_t;
+inline constexpr explicit_diagonal_t explicit_diagonal;
+
+// [linalg.layout.packed], class template layout_blas_packed
+template<class Triangle,
+         class StorageOrder>
+class layout_blas_packed;
+
+// [linalg.helpers], exposition-only helpers
+
+// [linalg.helpers.concepts], linear algebra argument concepts
+
+template<class T>
+constexpr bool is-mdspan = see below; // exposition only
+
+template<class T>
+concept in-vector = see below; // exposition only
+
+template<class T>
+concept out-vector = see below; // exposition only
+
+template<class T>
+concept inout-vector = see below; // exposition only
+
+template<class T>
+concept in-matrix = see below; // exposition only
+
+template<class T>
+concept out-matrix = see below; // exposition only
+
+template<class T>
+concept inout-matrix = see below; // exposition only
+
+template<class T>
+concept possibly-packed-inout-matrix = see below; // exposition only
+
+template<class T>
+concept in-object = see below; // exposition only
+
+template<class T>
+concept out-object = see below; // exposition only
+
+template<class T>
+concept inout-object = see below; // exposition only
+
+// [linalg.scaled], scaled in-place transformation
+
+// [linalg.scaled.scaledaccessor], class template scaled_accessor
+template<class ScalingFactor,
+         class NestedAccessor>
+class scaled_accessor;
+
+// [linalg.scaled.scaled], function template scaled
+template<class ScalingFactor,
+         class ElementType,
+         class Extents,
+         class Layout,
+         class Accessor>
+constexpr auto scaled(
+  ScalingFactor alpha,
+  mdspan<ElementType, Extents, Layout, Accessor> x);
+
+// [linalg.conj], conjugated in-place transformation
+
+// [linalg.conj.conjugatedaccessor], class template conjugated_accessor
+template<class NestedAccessor>
+class conjugated_accessor;
+
+// [linalg.conj.conjugated], function template conjugated
+template<class ElementType,
+         class Extents,
+         class Layout,
+         class Accessor>
+constexpr auto conjugated(
+  mdspan<ElementType, Extents, Layout, Accessor> a);
+
+// [linalg.transp], transpose in-place transformation
+
+// [linalg.transp.layout.transpose], class template layout_transpose
+template<class Layout>
+class layout_transpose;
+
+// [linalg.transp.transposed], function template transposed
+template<class ElementType,
+         class Extents,
+         class Layout,
+         class Accessor>
+constexpr auto transposed(
+  mdspan<ElementType, Extents, Layout, Accessor> a);
+
+// [linalg.conjtransposed],
+// conjugated transpose in-place transformation
+template<class ElementType,
+         class Extents,
+         class Layout,
+         class Accessor>
+constexpr auto conjugate_transposed(
+  mdspan<ElementType, Extents, Layout, Accessor> a);
+
+// [linalg.algs.blas1], BLAS 1 algorithms
+
+// [linalg.algs.blas1.givens], Givens rotations
+
+// [linalg.algs.blas1.givens.lartg], compute Givens rotation
+
+template<class Real>
+struct setup_givens_rotation_result {
+  Real c;
+  Real s;
+  Real r;
+};
+template<class Real>
+struct setup_givens_rotation_result<complex<Real>> {
+  Real c;
+  complex<Real> s;
+  complex<Real> r;
+};
+
+template<class Real>
+setup_givens_rotation_result<Real>
+setup_givens_rotation(Real a, Real b) noexcept;
+
+template<class Real>
+setup_givens_rotation_result<complex<Real>>
+setup_givens_rotation(complex<Real> a, complex<Real> b) noexcept;
+
+// [linalg.algs.blas1.givens.rot], apply computed Givens rotation
+template<inout-vector InOutVec1,
+         inout-vector InOutVec2,
+         class Real>
+void apply_givens_rotation(
+  InOutVec1 x,
+  InOutVec2 y,
+  Real c,
+  Real s);
+template<class ExecutionPolicy,
+         inout-vector InOutVec1,
+         inout-vector InOutVec2,
+         class Real>
+void apply_givens_rotation(
+  ExecutionPolicy&& exec,
+  InOutVec1 x,
+  InOutVec2 y,
+  Real c,
+  Real s);
+template<inout-vector InOutVec1,
+         inout-vector InOutVec2,
+         class Real>
+void apply_givens_rotation(
+  InOutVec1 x,
+  InOutVec2 y,
+  Real c,
+  complex<Real> s);
+template<class ExecutionPolicy,
+         inout-vector InOutVec1,
+         inout-vector InOutVec2,
+         class Real>
+void apply_givens_rotation(
+  ExecutionPolicy&& exec,
+  InOutVec1 x,
+  InOutVec2 y,
+  Real c,
+  complex<Real> s);
+
+// [linalg.algs.blas1.swap], swap elements
+template<inout-object InOutObj1,
+         inout-object InOutObj2>
+void swap_elements(InOutObj1 x,
+                   InOutObj2 y);
+template<class ExecutionPolicy,
+         inout-object InOutObj1,
+         inout-object InOutObj2>
+void swap_elements(ExecutionPolicy&& exec,
+                   InOutObj1 x,
+                   InOutObj2 y);
+
+// [linalg.algs.blas1.scal], multiply elements by scalar
+template<class Scalar,
+         inout-object InOutObj>
+void scale(Scalar alpha,
+           InOutObj x);
+template<class ExecutionPolicy,
+         class Scalar,
+         inout-object InOutObj>
+void scale(ExecutionPolicy&& exec,
+           Scalar alpha,
+           InOutObj x);
+
+// [linalg.algs.blas1.copy], copy elements
+template<in-object InObj,
+         out-object OutObj>
+void copy(InObj x,
+          OutObj y);
+template<class ExecutionPolicy,
+         in-object InObj,
+         out-object OutObj>
+void copy(ExecutionPolicy&& exec,
+          InObj x,
+          OutObj y);
+
+// [linalg.algs.blas1.add], add elementwise
+template<in-object InObj1,
+         in-object InObj2,
+         out-object OutObj>
+void add(InObj1 x,
+         InObj2 y,
+         OutObj z);
+template<class ExecutionPolicy,
+         in-object InObj1,
+         in-object InObj2,
+         out-object OutObj>
+void add(ExecutionPolicy&& exec,
+         InObj1 x,
+         InObj2 y,
+         OutObj z);
+
+// [linalg.algs.blas1.dot],
+// dot product of two vectors
+
+// [linalg.algs.blas1.dot.dotu],
+// nonconjugated dot product of two vectors
+template<in-vector InVec1,
+         in-vector InVec2,
+         class Scalar>
+Scalar dot(InVec1 v1,
+      InVec2 v2,
+      Scalar init);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         class Scalar>
+Scalar dot(ExecutionPolicy&& exec,
+      InVec1 v1,
+      InVec2 v2,
+      Scalar init);
+template<in-vector InVec1,
+         in-vector InVec2>
+auto dot(InVec1 v1,
+         InVec2 v2);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2>
+auto dot(ExecutionPolicy&& exec,
+         InVec1 v1,
+         InVec2 v2);
+
+// [linalg.algs.blas1.dot.dotc],
+// conjugated dot product of two vectors
+template<in-vector InVec1,
+         in-vector InVec2,
+         class Scalar>
+Scalar dotc(InVec1 v1,
+            InVec2 v2,
+            Scalar init);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         class Scalar>
+Scalar dotc(ExecutionPolicy&& exec,
+            InVec1 v1,
+            InVec2 v2,
+            Scalar init);
+template<in-vector InVec1,
+         in-vector InVec2>
+auto dotc(InVec1 v1,
+          InVec2 v2);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2>
+auto dotc(ExecutionPolicy&& exec,
+          InVec1 v1,
+          InVec2 v2);
+
+// [linalg.algs.blas1.ssq],
+// Scaled sum of squares of a vector's elements
+template<class Scalar>
+struct sum_of_squares_result {
+  Scalar scaling_factor;
+  Scalar scaled_sum_of_squares;
+};
+template<in-vector InVec,
+         class Scalar>
+sum_of_squares_result<Scalar> vector_sum_of_squares(
+  InVec v,
+  sum_of_squares_result<Scalar> init);
+template<class ExecutionPolicy,
+         in-vector InVec,
+         class Scalar>
+sum_of_squares_result<Scalar> vector_sum_of_squares(
+  ExecutionPolicy&& exec,
+  InVec v,
+  sum_of_squares_result<Scalar> init);
+
+// [linalg.algs.blas1.nrm2],
+// Euclidean norm of a vector
+template<in-vector InVec,
+         class Scalar>
+Scalar vector_two_norm(InVec v,
+                       Scalar init);
+template<class ExecutionPolicy,
+         in-vector InVec,
+         class Scalar>
+Scalar vector_two_norm(ExecutionPolicy&& exec,
+                       InVec v,
+                       Scalar init);
+template<in-vector InVec>
+auto vector_two_norm(InVec v);
+template<class ExecutionPolicy,
+         in-vector InVec>
+auto vector_two_norm(ExecutionPolicy&& exec,
+                     InVec v);
+
+// [linalg.algs.blas1.asum],
+// sum of absolute values of vector elements
+template<in-vector InVec,
+         class Scalar>
+Scalar vector_abs_sum(InVec v,
+                      Scalar init);
+template<class ExecutionPolicy,
+         in-vector InVec,
+         class Scalar>
+Scalar vector_abs_sum(ExecutionPolicy&& exec,
+                      InVec v,
+                      Scalar init);
+template<in-vector InVec>
+auto vector_abs_sum(InVec v);
+template<class ExecutionPolicy,
+         in-vector InVec>
+auto vector_abs_sum(ExecutionPolicy&& exec,
+                    InVec v);
+
+// [linalg.algs.blas1.iamax],
+// index of maximum absolute value of vector elements
+template<in-vector InVec>
+typename InVec::size_type vector_idx_abs_max(InVec v);
+template<class ExecutionPolicy,
+         in-vector InVec>
+typename InVec::size_type vector_idx_abs_max(
+  ExecutionPolicy&& exec,
+  InVec v);
+
+// [linalg.algs.blas1.matfrobnorm],
+// Frobenius norm of a matrix
+template<in-matrix InMat,
+         class Scalar>
+Scalar matrix_frob_norm(InMat A,
+  Scalar init);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Scalar>
+Scalar matrix_frob_norm(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Scalar init);
+template<in-matrix InMat>
+auto matrix_frob_norm(
+  InMat A);
+template<class ExecutionPolicy,
+         in-matrix InMat>
+auto matrix_frob_norm(
+  ExecutionPolicy&& exec,
+  InMat A);
+
+// [linalg.algs.blas1.matonenorm],
+// One norm of a matrix
+template<in-matrix InMat,
+         class Scalar>
+Scalar matrix_one_norm(
+  InMat A,
+  Scalar init);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Scalar>
+Scalar matrix_one_norm(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Scalar init);
+template<in-matrix InMat>
+auto matrix_one_norm(
+  InMat A);
+template<class ExecutionPolicy,
+         in-matrix InMat>
+auto matrix_one_norm(
+  ExecutionPolicy&& exec,
+  InMat A);
+
+// [linalg.algs.blas1.matinfnorm],
+// Infinity norm of a matrix
+template<in-matrix InMat,
+         class Scalar>
+Scalar matrix_inf_norm(
+  InMat A,
+  Scalar init);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Scalar>
+Scalar matrix_inf_norm(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Scalar init);
+template<in-matrix InMat>
+auto matrix_inf_norm(
+  InMat A);
+template<class ExecutionPolicy,
+         in-matrix InMat>
+auto matrix_inf_norm(
+  ExecutionPolicy&& exec,
+  InMat A);
+
+// [linalg.algs.blas2], BLAS 2 algorithms
+
+// [linalg.algs.blas2.gemv],
+// general matrix-vector product
+template<in-matrix InMat,
+         in-vector InVec,
+         out-vector OutVec>
+void matrix_vector_product(InMat A,
+                           InVec x,
+                           OutVec y);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         in-vector InVec,
+         out-vector OutVec>
+void matrix_vector_product(ExecutionPolicy&& exec,
+                           InMat A,
+                           InVec x,
+                           OutVec y);
+template<in-matrix InMat,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void matrix_vector_product(InMat A,
+                           InVec1 x,
+                           InVec2 y,
+                           OutVec z);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void matrix_vector_product(ExecutionPolicy&& exec,
+                           InMat A,
+                           InVec1 x,
+                           InVec2 y,
+                           OutVec z);
+
+// [linalg.algs.blas2.symv],
+// symmetric matrix-vector product
+template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void symmetric_matrix_vector_product(InMat A,
+                                     Triangle t,
+                                     InVec x,
+                                     OutVec y);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void symmetric_matrix_vector_product(ExecutionPolicy&& exec,
+                                     InMat A,
+                                     Triangle t,
+                                     InVec x,
+                                     OutVec y);
+template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void symmetric_matrix_vector_product(
+  InMat A,
+  Triangle t,
+  InVec1 x,
+  InVec2 y,
+  OutVec z);
+
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void symmetric_matrix_vector_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  InVec1 x,
+  InVec2 y,
+  OutVec z);
+
+// [linalg.algs.blas2.hemv],
+// Hermitian matrix-vector product
+template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void hermitian_matrix_vector_product(InMat A,
+                                     Triangle t,
+                                     InVec x,
+                                     OutVec y);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void hermitian_matrix_vector_product(ExecutionPolicy&& exec,
+                                     InMat A,
+                                     Triangle t,
+                                     InVec x,
+                                     OutVec y);
+template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void hermitian_matrix_vector_product(InMat A,
+                                     Triangle t,
+                                     InVec1 x,
+                                     InVec2 y,
+                                     OutVec z);
+
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void hermitian_matrix_vector_product(ExecutionPolicy&& exec,
+                                     InMat A,
+                                     Triangle t,
+                                     InVec1 x,
+                                     InVec2 y,
+                                     OutVec z);
+
+// [linalg.algs.blas2.trmv],
+// Triangular matrix-vector product
+
+// Overwriting triangular matrix-vector product
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec>
+void triangular_matrix_vector_product(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec x,
+  OutVec y);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec>
+void triangular_matrix_vector_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec x,
+  OutVec y);
+
+// In-place triangular matrix-vector product
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec>
+void triangular_matrix_vector_product(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec y);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec>
+void triangular_matrix_vector_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec y);
+
+// Updating triangular matrix-vector product
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void triangular_matrix_vector_product(InMat A,
+                                      Triangle t,
+                                      DiagonalStorage d,
+                                      InVec1 x,
+                                      InVec2 y,
+                                      OutVec z);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void triangular_matrix_vector_product(ExecutionPolicy&& exec,
+                                      InMat A,
+                                      Triangle t,
+                                      DiagonalStorage d,
+                                      InVec1 x,
+                                      InVec2 y,
+                                      OutVec z);
+
+// [linalg.algs.blas2.trsv],
+// Solve a triangular linear system
+
+// Solve a triangular linear system, not in place
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec,
+         class BinaryDivideOp>
+void triangular_matrix_vector_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec b,
+  OutVec x,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec,
+         class BinaryDivideOp>
+void triangular_matrix_vector_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec b,
+  OutVec x,
+  BinaryDivideOp divide);
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec>
+void triangular_matrix_vector_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec b,
+  OutVec x);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec>
+void triangular_matrix_vector_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec b,
+  OutVec x);
+
+// Solve a triangular linear system, in place
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec,
+         class BinaryDivideOp>
+void triangular_matrix_vector_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec b,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec,
+         class BinaryDivideOp>
+void triangular_matrix_vector_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec b,
+  BinaryDivideOp divide);
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec>
+void triangular_matrix_vector_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec b);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec>
+void triangular_matrix_vector_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec b);
+
+// [linalg.algs.blas2.rank1],
+// nonsymmetric rank-1 matrix update
+template<in-vector InVec1,
+         in-vector InVec2,
+         inout-matrix InOutMat>
+void matrix_rank_1_update(
+  InVec1 x,
+  InVec2 y,
+  InOutMat A);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         inout-matrix InOutMat>
+void matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  InVec1 x,
+  InVec2 y,
+  InOutMat A);
+
+template<in-vector InVec1,
+         in-vector InVec2,
+         inout-matrix InOutMat>
+void matrix_rank_1_update_c(
+  InVec1 x,
+  InVec2 y,
+  InOutMat A);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         inout-matrix InOutMat>
+void matrix_rank_1_update_c(
+  ExecutionPolicy&& exec,
+  InVec1 x,
+  InVec2 y,
+  InOutMat A);
+
+// [linalg.algs.blas2.symherrank1],
+// symmetric or Hermitian rank-1 matrix update
+template<in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_1_update(
+  InVec x,
+  InOutMat A,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+template<class Scalar,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_1_update(
+  Scalar alpha,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+template<class ExecutionPolicy,
+         class Scalar,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  Scalar alpha,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+
+template<in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_1_update(
+  InVec x,
+  InOutMat A,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+template<class Scalar,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_1_update(
+  Scalar alpha,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+template<class ExecutionPolicy,
+         class Scalar,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  Scalar alpha,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+
+// [linalg.algs.blas2.rank2],
+// Symmetric and Hermitian rank-2 matrix updates
+
+// symmetric rank-2 matrix update
+template<in-vector InVec1,
+         in-vector InVec2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_2_update(
+  InVec1 x,
+  InVec2 y,
+  InOutMat A,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_2_update(
+  ExecutionPolicy&& exec,
+  InVec1 x,
+  InVec2 y,
+  InOutMat A,
+  Triangle t);
+
+// Hermitian rank-2 matrix update
+template<in-vector InVec1,
+         in-vector InVec2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_2_update(
+  InVec1 x,
+  InVec2 y,
+  InOutMat A,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_2_update(
+  ExecutionPolicy&& exec,
+  InVec1 x,
+  InVec2 y,
+  InOutMat A,
+  Triangle t);
+
+// [linalg.algs.blas3], BLAS 3 algorithms
+
+// [linalg.algs.blas3.gemm],
+// general matrix-matrix product
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void matrix_product(InMat1 A,
+                    InMat2 B,
+                    OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void matrix_product(ExecutionPolicy&& exec,
+                    InMat1 A,
+                    InMat2 B,
+                    OutMat C);
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void matrix_product(InMat1 A,
+                    InMat2 B,
+                    InMat3 E,
+                    OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void matrix_product(ExecutionPolicy&& exec,
+                    InMat1 A,
+                    InMat2 B,
+                    InMat3 E,
+                    OutMat C);
+
+// [linalg.algs.blas3.xxmm],
+// symmetric, Hermitian, and triangular matrix-matrix product
+
+template<in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         class DiagonalStorage,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  DiagonalStorage d,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         class DiagonalStorage,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  DiagonalStorage d,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  DiagonalStorage d,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  DiagonalStorage d,
+  InMat3 E,
+  OutMat C);
+
+// [linalg.algs.blas3.trmm],
+// in-place triangular matrix-matrix product
+
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_left_product(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_left_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat C);
+
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_right_product(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_right_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat C);
+
+// [linalg.algs.blas3.rankk],
+// rank-k update of a symmetric or Hermitian matrix
+
+// rank-k symmetric matrix update
+template<class Scalar,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_k_update(
+  Scalar alpha,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         class Scalar,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_k_update(
+  ExecutionPolicy&& exec,
+  Scalar alpha,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+
+template<in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_k_update(
+  InMat A,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_k_update(
+  ExecutionPolicy&& exec,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+
+// rank-k Hermitian matrix update
+template<class Scalar,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_k_update(
+  Scalar alpha,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         class Scalar,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_k_update(
+  ExecutionPolicy&& exec,
+  Scalar alpha,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+
+template<in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_k_update(
+  InMat A,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_k_update(
+  ExecutionPolicy&& exec,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+
+// [linalg.algs.blas3.rank2k],
+// rank-2k update of a symmetric or Hermitian matrix
+
+// rank-2k symmetric matrix update
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_2k_update(
+  InMat1 A,
+  InMat2 B,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_2k_update(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  InOutMat C,
+  Triangle t);
+
+// rank-2k Hermitian matrix update
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_2k_update(
+  InMat1 A,
+  InMat2 B,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_2k_update(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  InOutMat C,
+  Triangle t);
+
+// [linalg.algs.blas3.trsm],
+// solve multiple triangular linear systems
+
+// solve multiple triangular systems on the left, not-in-place
+template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_left_solve(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_left_solve(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X,
+  BinaryDivideOp divide);
+template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_matrix_left_solve(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_matrix_left_solve(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X);
+
+// solve multiple triangular systems on the right, not-in-place
+template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_right_solve(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_right_solve(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X,
+  BinaryDivideOp divide);
+template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_matrix_right_solve(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_matrix_right_solve(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X);
+
+// solve multiple triangular systems on the left, in-place
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_left_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_left_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B,
+  BinaryDivideOp divide);
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_matrix_left_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_matrix_left_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B);
+
+// solve multiple triangular systems on the right, in-place
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_right_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_right_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B,
+  BinaryDivideOp divide);  
+template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_matrix_right_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_matrix_right_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B);
+

28.9.3 General [linalg.general]

+

1 +For the effects of all functions in [linalg], when the effects are +described as “computes R = EXPR” +or “compute R = EXPR” +(for some R and mathematical +expression EXPR), +the following apply:

+
    +
  • (1.1) +EXPR has +the conventional mathematical meaning as written.

  • +
  • (1.2) The +pattern xT +should be read as “the transpose of x.”

  • +
  • (1.3) The +pattern xH +should be read as “the conjugate transpose of x.”

  • +
  • (1.4) When +R is the same name as a +function parameter whose type is a template parameter with +Out in its name, the intent is that the result of the +computation is written to the elements of the function parameter +R.

  • +
+

2 +Some of the functions and types in [linalg] distinguish between the +“rows” and the “columns” of a matrix. For a matrix A and a +multidimensional index i, j in +A.extents(),

+
    +
  • (2.1) +row i of A is the set of elements +A[i, k1] for all k1 such that +i, k1 is in A.extents(); and

  • +
  • (2.2) +column j of A is the set of elements +A[k0, j] for all k0 such that +k0, j is in A.extents().

  • +
+

3 +Some of the functions in [linalg] distinguish between the “upper +triangle,” “lower triangle,” and “diagonal” of a matrix.

+
    +
  • (3.1) The +diagonal is the set of all elements of A accessed +by A[i,i] for 0 ≤ i < +min(A.extent(0), A.extent(1)).

  • +
  • (3.2) The +upper triangle of a matrix A is the set of all +elements of A accessed by A[i,j] with +ij. It includes the diagonal.

  • +
  • (3.3) The +lower triangle of A is the set of all elements of +A accessed by A[i,j] with i ≥ +j. It includes the diagonal.

  • +
+

4 +For any function F that takes a parameter named +t, t applies to accesses done through the +parameter preceding t in the parameter list of +F. Let m be such an access-modified function +parameter. F will only access the triangle of +m specified by t. For accesses +m[i, j] outside the triangle specified by t, +F will use the value

+
    +
  • conj-if-needed(m[j, i]) if the +name of F starts with hermitian,

  • +
  • m[j, i] if the name of F starts with +symmetric, or

  • +
  • the additive identity if the name of F starts with +triangular.

  • +
+

[Example: Small vector product accessing only specified +triangle. It would not be a precondition violation for the non-accessed +matrix element to be non-zero.

+
template<class Triangle>
+void triangular_matrix_vector_2x2_product(
+       mdspan<const float, extents<int, 2, 2>> m,
+       Triangle t,
+       mdspan<const float, extents<int, 2>> x,
+       mdspan<float, extents<int, 2>> y) {
+
+  static_assert(is_same_v<Triangle, lower_triangle_t> ||
+                is_same_v<Triangle, upper_triangle_t>);
+
+  if constexpr (is_same_v<Triangle, lower_triangle_t>) {
+    y[0] = m[0,0] * x[0]; // + 0 * x[1]
+    y[1] = m[1,0] * x[0] + m[1,1] * x[1];
+  } else { // upper_triangle_t
+    y[0] = m[0,0] * x[0] + m[0,1] * x[1];
+    y[1] = /* 0 * x[0] + */ m[1,1] * x[1];
+  }
+}
+

end example]

+

5 +For any function F that takes a parameter named +d, d applies to accesses done through the +previous-of-the-previous parameter of d in the parameter +list of F. Let m be such an access-modified +function parameter. If d specifies that an implicit unit +diagonal is to be assumed, then

+
    +
  • F will not access the diagonal of m; +and

  • +
  • the algorithm will interpret m as if it has a unit +diagonal, that is, a diagonal each of whose elements behaves as a +two-sided multiplicative identity (even if m’s value type +does not have a two-sided multiplicative identity).

  • +
+

Otherwise, if d specifies that an explicit diagonal is +to be assumed, then F will access the diagonal of +m.

+

6 +Within all the functions in [linalg], any calls to abs, +conj, imag, and real are +unqualified.

+

7 +Two mdspan objects x and y +alias each other, if they have the same extents e, +and for every pack of integers i which is a +multidimensional index in e, x[i...] and +y[i...] refer to the same element. [Note: This means +that x and y view the same elements in the +same order. – end note]

+

8 +Two mdspan objects x and y +overlap each other, if for some pack of integers i +that is a multidimensional index in x.extents(), there +exists a pack of integers j that is a multidimensional +index in y.extents(), such that x[i...] and +y[j...] refer to the same element. [Note: Aliasing +is a special case of overlapping. If x and y +do not overlap, then they also do not alias each other. – end +note]

+

28.9.4 Requirements [linalg.reqs]

+

28.9.4.1 Linear algebra value types +[linalg.reqs.val]

+

1 +Throughout [linalg], the following types are linear algebra value +types:

+
    +
  • (1.1) the +value_type type alias of any input or output +mdspan parameter(s) of any function in [linalg]; +and

  • +
  • (1.2) the +Scalar template parameter (if any) of any function or class +in [linalg].

  • +
+

2 +Linear algebra value types shall model semiregular.

+

3 +A value-initialized object of linear algebra value type shall act as the +additive identity.

+

28.9.4.2 Algorithm and class +requirements [linalg.reqs.alg]

+

1 +[linalg.reqs.alg] lists common requirements for all algorithms and +classes in [linalg].

+

2 +All of the following statements presume that the algorithm’s asymptotic +complexity requirements, if any, are satisfied.

+
    +
  • (2.1) The +function may make arbitrarily many objects of any linear algebra value +type, value-initializing or direct-initializing them with any existing +object of that type.

  • +
  • (2.2) The +triangular solve algorithms in [linalg.algs] +either have a BinaryDivideOp template parameter (see +[linalg.algs.reqs]) and a binary function object +parameter divide of that type, or they have effects +equivalent to invoking such an algorithm. Triangular solve algorithms +interpret divide(a, b) as a times the +multiplicative inverse of b. Each triangular solve +algorithm uses a sequence of evaluations of *, +*=, divide, unary +, binary ++, +=, unary -, binary +-, -=, and = operators that would +produce the result specified by the algorithm’s Effects and Remarks when +operating on elements of a field with noncommutative multiplication. It +is a precondition of the algorithm that any addend, any subtrahend, any +partial sum of addends in any order (treating any difference as a sum +with the second term negated), any factor, any partial product of +factors respecting their order, any numerator (first argument of +divide), any denominator (second argument of +divide), and any assignment is a well-formed expression. +

  • +
  • (2.3) +Otherwise, the function will use a sequence of evaluations of +*, *=, +, +=, and += operators that would produce the result specified by the +algorithm’s Effects and Remarks when operating on elements of a semiring +with noncommutative multiplication. It is a precondition of the +algorithm that any addend, any partial sum of addends in any order, any +factor, any partial product of factors respecting their order, and any +assignment is a well-formed expression.

  • +
  • (2.4) If +the function has an output mdspan, then all addends, +subtrahends (for the triangular solve algorithms), or results of the +divide parameter on intermediate terms (if the function +takes a divide parameter) are assignable and convertible to +the output mdspan’s value_type.

  • +
  • (2.5) The +function may reorder addends and partial sums arbitrarily. [Note: +Factors in each product are not reordered; multiplication is not +necessarily commutative. – end note]

  • +
+

[Note: The above requirements do not prohibit implementation +approaches and optimization techniques which are not user-observable. In +particular, if for all input and output arguments the +value_type is a floating-point type, implementers are free +to leverage approximations, use arithmetic operations not explicitly +listed above, and compute floating point sums in any way that improves +their accuracy. – end note]

+

[Note: For all functions in [linalg], suppose that all input +and output mdspan have as value_type a +floating-point type, and any Scalar template argument has a +floating-point type.

+

Then, functions may do all of the following:

+
    +
  • compute floating-point sums in any way that improves their +accuracy for arbitrary input;

  • +
  • perform additional arithmetic operations (other than those +specified by the function’s wording and +[linalg.reqs.alg]) in order to improve performance or +accuracy; and

  • +
  • use approximations (that might not be exact even if computing +with real numbers), instead of computations that would be exact if it +were possible to compute without rounding error;

  • +
+

as long as

+
    +
  • the function satisfies the complexity requirements; and

  • +
  • the function is logarithmically stable, as defined in Demmel +2007. Strassen’s algorithm for matrix-matrix multiply is an example of a +logarithmically stable algorithm. – end note]

  • +
+

28.9.5 Tag classes [linalg.tags]

+

28.9.5.1 Storage order tags +[linalg.tags.order]

+

1 +The storage order tags describe the order of elements in an +mdspan with layout_blas_packed +([linalg.layout.packed]) layout.

+
struct column_major_t {
+  explicit column_major_t() = default;
+};
+inline constexpr column_major_t column_major{};
+
+struct row_major_t {
+  explicit row_major_t() = default;
+};
+inline constexpr row_major_t row_major{};
+

2 +column_major_t indicates a column-major order, and +row_major_t indicates a row-major order.

+

28.9.5.2 Triangle tags +[linalg.tags.triangle]

+
struct upper_triangle_t {
+  explicit upper_triangle_t() = default;
+};
+inline constexpr upper_triangle_t upper_triangle{};
+
+struct lower_triangle_t {
+  explicit lower_triangle_t() = default;
+};
+inline constexpr lower_triangle_t lower_triangle{};
+

1 +These tag classes specify whether algorithms and other users of a matrix +(represented as an mdspan) access the upper triangle +(upper_triangle_t) or lower triangle +(lower_triangle_t) of the matrix (see also +[linalg.general]). This is also subject to the restrictions of +implicit_unit_diagonal_t if that tag is also used as a +function argument; see below.

+

28.9.5.3 Diagonal tags +[linalg.tags.diagonal]

+
struct implicit_unit_diagonal_t {
+  explicit implicit_unit_diagonal_t() = default;
+};
+inline constexpr implicit_unit_diagonal_t
+  implicit_unit_diagonal{};
+
+struct explicit_diagonal_t {
+  explicit explicit_diagonal_t() = default;
+};
+inline constexpr explicit_diagonal_t explicit_diagonal{};
+

1 +These tag classes specify whether algorithms access the matrix’s +diagonal entries, and if not, then how algorithms interpret the matrix’s +implicitly represented diagonal values.

+

2 +The implicit_unit_diagonal_t tag indicates that an implicit +unit diagonal is to be assumed ([linalg.general]).

+

3 +The explicit_diagonal_t tag indicates that an explicit +diagonal is used ([linalg.general]).

+

28.9.6 Layouts for packed matrix +types [linalg.layout.packed]

+

28.9.6.1 Overview +[linalg.layout.packed.overview]

+

1 +layout_blas_packed is an mdspan layout mapping +policy that represents a square matrix that stores only the entries in +one triangle, in a packed contiguous format. Its Triangle +template parameter determines whether an mdspan with this +layout stores the upper or lower triangle of the matrix. Its +StorageOrder template parameter determines whether the +layout packs the matrix’s elements in column-major or row-major +order.

+

2 +A StorageOrder of column_major_t indicates +column-major ordering. This packs matrix elements starting with the +leftmost (least column index) column, and proceeding column by column, +from the top entry (least row index).

+

3 +A StorageOrder of row_major_t indicates +row-major ordering. This packs matrix elements starting with the topmost +(least row index) row, and proceeding row by row, from the leftmost +(least column index) entry.

+

[Note: layout_blas_packed describes the data +layout used by the BLAS’ Symmetric Packed (SP), Hermitian Packed (HP), +and Triangular Packed (TP) matrix types. – end note]

+
template<class Triangle,
+         class StorageOrder>
+class layout_blas_packed {
+public:
+  using triangle_type = Triangle;
+  using storage_order_type = StorageOrder;
+
+  template<class Extents>
+  struct mapping {
+  public:
+    using extents_type = Extents;
+    using index_type = typename extents_type::index_type;
+    using size_type = typename extents_type::size_type;
+    using rank_type = typename extents_type::rank_type;
+    using layout_type = layout_blas_packed;
+
+    // [linalg.layout.packed.cons], constructors
+    constexpr mapping() noexcept = default;
+    constexpr mapping(const mapping&) noexcept = default;
+    constexpr mapping(const extents_type&) noexcept;
+    template<class OtherExtents>
+      constexpr explicit(! is_convertible_v<OtherExtents, extents_type>)
+        mapping(const mapping<OtherExtents>& other) noexcept;
+
+    constexpr mapping& operator=(const mapping&) noexcept = default;
+
+    // [linalg.layout.packed.obs], observers
+    constexpr const extents_type& extents() const noexcept { return extents_; }
+
+    constexpr index_type required_span_size() const noexcept;
+
+    template<class Index0, class Index1>
+      constexpr index_type operator() (Index0 ind0, Index1 ind1) const noexcept;
+
+    static constexpr bool is_always_unique() noexcept {
+      return (extents_type::static_extent(0) != dynamic_extent &&
+              extents_type::static_extent(0) < 2) ||
+             (extents_type::static_extent(1) != dynamic_extent &&
+              extents_type::static_extent(1) < 2);
+    }
+    static constexpr bool is_always_exhaustive() noexcept { return true; }
+    static constexpr bool is_always_strided() noexcept
+      { return is_always_unique(); }
+
+    constexpr bool is_unique() const noexcept {
+      return extents_.extent(0) < 2;
+    }
+    constexpr bool is_exhaustive() const noexcept { return true; }
+    constexpr bool is_strided() const noexcept {
+      return extents_.extent(0) < 2;
+    }
+
+    constexpr index_type stride(rank_type) const noexcept;
+
+    template<class OtherExtents>
+      friend constexpr bool
+        operator==(const mapping&, const mapping<OtherExtents>&) noexcept;
+
+  private:
+    extents_type extents_{}; // exposition only
+  };
+};
+

4 +Mandates:

+
    +
  • (4.1) +Triangle is either upper_triangle_t or +lower_triangle_t,

  • +
  • (4.2) +StorageOrder is either column_major_t or +row_major_t,

  • +
  • (4.3) +Extents is a specialization of +std::extents,

  • +
  • (4.4) +Extents::rank() equals 2,

  • +
  • (4.5) one +of extents_type::static_extent(0) == dynamic_extent, +extents_type::static_extent(1) == dynamic_extent, or +extents_type::static_extent(0) == extents_type::static_extent(1) +is true, and

  • +
  • (4.6) if +Extents::rank_dynamic() == 0 is true, let +Ns be +equal to Extents::static_extent(0); then, Ns × (Ns+1) +is representable as a value of type index_type.

  • +
+

5 +layout_blas_packed<T, SO>::mapping<E> is a +trivially copyable type that models regular for each +T, SO, and E.

+

28.9.6.2 Constructors +[linalg.layout.packed.cons]

+
constexpr mapping(const extents_type& e) noexcept;
+

6 +Preconditions:

+
    +
  • (6.1) Let +N be equal to +e.extent(0). Then, N × (N+1) is representable +as a value of type index_type +([basic.fundamental]).

  • +
  • (6.2) +e.extent(0) equals e.extent(1).

  • +
+

7 +Effects: Direct-non-list-initializes +extents_ with e.

+
template<class OtherExtents>
+  explicit(! is_convertible_v<OtherExtents, extents_type>)
+    constexpr mapping(const mapping<OtherExtents>& other) noexcept;
+

8 +Constraints: +is_constructible_v<extents_type, OtherExtents> is +true.

+

9 +Preconditions: Let N +be other.extents().extent(0). Then, N × (N+1) is representable +as a value of type index_type ([basic.fundamental]).

+

10 +Effects: Direct-non-list-initializes +extents_ with other.extents().

+

28.9.6.3 Observers +[linalg.layout.packed.obs]

+
constexpr index_type required_span_size() const noexcept;
+

11 +Returns: +extents_.extent(0) * (extents_.extent(0) + 1)/2. +[Note: For example, a 5 x 5 packed matrix only stores 15 matrix +elements. – end note]

+
template<class Index0, class Index1>
+  constexpr index_type operator() (Index0 ind0, Index1 ind1) const noexcept;
+

12 +Constraints:

+
    +
  • (12.1) +is_convertible_v<Index0, index_type> is +true,

  • +
  • (12.2) +is_convertible_v<Index1, index_type> is +true,

  • +
  • (12.3) +is_nothrow_constructible_v<index_type, Index0> is +true, and

  • +
  • (12.4) +is_nothrow_constructible_v<index_type, Index1> is +true.

  • +
+

13 +Preconditions: +extents_type::index-cast(ind0), extents_type::index-cast(ind1) +is a multidimensional index in extents_ +([mdspan.overview]).

+

14 +Returns: Let N be +extents_.extent(0), let +i be +extents_type::index-cast(ind0), +and let j be +extents_type::index-cast(ind1). +Then

+
    +
  • (14.1) +(*this)(j, i) if i > j is +true; otherwise

  • +
  • (14.2) +i + j * (j + 1)/2 if +is_same_v<StorageOrder, column_major_t> && is_same_v<Triangle, upper_triangle_t> +is true or +is_same_v<StorageOrder, row_major_t> && is_same_v<Triangle, lower_triangle_t> +is true; otherwise

  • +
  • (14.3) +j + N * i - i * (i + 1)/2.

  • +
+
constexpr index_type stride(rank_type r) const noexcept;
+

15 +Preconditions:

+
    +
  • (15.1) +is_strided() is true, and

  • +
  • (15.2) +r < extents_type::rank() is true.

  • +
+

16 +Returns: 1.

+
template<class OtherExtents>
+  friend constexpr bool
+    operator==(const mapping& x, const mapping<OtherExtents>& y) noexcept;
+

17 +Effects: Equivalent to: +return x.extents() == y.extents();

+

28.9.7 Exposition-only helpers +[linalg.helpers]

+

28.9.7.1 +abs-if-needed [linalg.helpers.abs]

+

1 +The name abs-if-needed denotes an exposition-only +function object. The expression +abs-if-needed(E) for subexpression +E whose type is T is expression-equivalent +to:

+
    +
  • (1.1) +E if T is an unsigned integer;

  • +
  • (1.2) +otherwise, std::abs(E) if T is an arithmetic +type,

  • +
  • (1.3) +otherwise, abs(E), if that expression is valid, with +overload resolution performed in a context that includes the declaration +template<class T> T abs(T) = delete;. If the function +selected by overload resolution does not return the absolute value of +its input, the program is ill-formed, no diagnostic required.

  • +
+

28.9.7.2 +conj-if-needed [linalg.helpers.conj]

+

1 +The name conj-if-needed denotes an exposition-only +function object. The expression +conj-if-needed(E) for subexpression +E whose type is T is expression-equivalent +to:

+
    +
  • (1.1) +conj(E), if T is not an arithmetic type and +the expression conj(E) is valid, with overload resolution +performed in a context that includes the declaration +template<class T> T conj(const T&) = delete;. If +the function selected by overload resolution does not return the complex +conjugate of its input, the program is ill-formed, no diagnostic +required;

  • +
  • (1.2) +otherwise, E.

  • +
+

28.9.7.3 +real-if-needed [linalg.helpers.real]

+

1 +The name real-if-needed denotes an exposition-only +function object. The expression +real-if-needed(E) for subexpression +E whose type is T is expression-equivalent +to:

+
    +
  • (1.1) +real(E), if T is not an arithmetic type and +the expression real(E) is valid, with overload resolution +performed in a context that includes the declaration +template<class T> T real(const T&) = delete;. If +the function selected by overload resolution does not return the real +part of its input, the program is ill-formed, no diagnostic +required;

  • +
  • (1.2) +otherwise, E.

  • +
+ + +

28.9.7.4 +imag-if-needed [linalg.helpers.imag]

+

1 +The name imag-if-needed denotes an exposition-only +function object. The expression +imag-if-needed(E) for subexpression +E whose type is T is expression-equivalent +to:

+
    +
  • (1.1) +imag(E), if T is not an arithmetic type and +the expression imag(E) is valid, with overload resolution +performed in a context that includes the declaration +template<class T> T imag(const T&) = delete;. If +the function selected by overload resolution does not return the +imaginary part of its input, the program is ill-formed, no diagnostic +required;

  • +
  • (1.2) +otherwise, ((void)E, T{}).

  • +
+ +

28.9.7.5 Linear algebra argument +concepts [linalg.helpers.concepts]

+

1 +The exposition-only concepts defined in this section constrain the +algorithms in [linalg.algs].

+
template<class T>
+constexpr bool is-mdspan = false; // exposition only
+
+template<class ElementType, class Extents, class Layout, class Accessor>
+constexpr bool is-mdspan<mdspan<ElementType, Extents, Layout, Accessor>> = true; // exposition only
+
+template<class T>
+concept in-vector = // exposition only
+  is-mdspan<T> &&
+  T::rank() == 1;
+
+template<class T>
+concept out-vector = // exposition only
+  is-mdspan<T> &&
+  T::rank() == 1 &&
+  is_assignable_v<typename T::reference, typename T::element_type> &&
+  T::is_always_unique();
+
+template<class T>
+concept inout-vector = // exposition only
+  is-mdspan<T> &&
+  T::rank() == 1 &&
+  is_assignable_v<typename T::reference, typename T::element_type> &&
+  T::is_always_unique();
+
+template<class T>
+concept in-matrix = // exposition only
+  is-mdspan<T> &&
+  T::rank() == 2;
+
+template<class T>
+concept out-matrix = // exposition only
+  is-mdspan<T> &&
+  T::rank() == 2 &&
+  is_assignable_v<typename T::reference, typename T::element_type> &&
+  T::is_always_unique();
+
+template<class T>
+concept inout-matrix = // exposition only
+  is-mdspan<T> &&
+  T::rank() == 2 &&
+  is_assignable_v<typename T::reference, typename T::element_type> &&
+  T::is_always_unique();
+
+template<class T>
+constexpr bool is-layout-blas-packed = false; // exposition only
+
+template<class Triangle, class StorageOrder>
+constexpr bool is-layout-blas-packed<layout_blas_packed<Triangle, StorageOrder>> = true; // exposition only
+
+template<class T>
+concept possibly-packed-inout-matrix = // exposition only
+  is-mdspan<T> &&
+  T::rank() == 2 &&
+  is_assignable_v<typename T::reference, typename T::element_type> &&
+  (T::is_always_unique() || is-layout-blas-packed<typename T::layout_type>);
+
+template<class T>
+concept in-object = // exposition only
+  is-mdspan<T> &&
+  (T::rank() == 1 || T::rank() == 2);
+
+template<class T>
+concept out-object = // exposition only
+  is-mdspan<T> &&
+  (T::rank() == 1 || T::rank() == 2) &&
+  is_assignable_v<typename T::reference, typename T::element_type> &&
+  T::is_always_unique();
+
+template<class T>
+concept inout-object = // exposition only
+  is-mdspan<T> &&
+  (T::rank() == 1 || T::rank() == 2) &&
+  is_assignable_v<typename T::reference, typename T::element_type> &&
+  T::is_always_unique();
+

2 +If a function in [linalg.algs] accesses the elements of a +parameter constrained by in-vector, +in-matrix, or in-object, +those accesses will not modify the elements.

+

3 +Unless explicitly permitted, any inout-vector, +inout-matrix, inout-object, +possibly-packed-inout-matrix, +out-vector, out-matrix, or +out-object parameter of a function in +[linalg.algs] shall not overlap any other mdspan +parameter of the function.

+

28.9.7.6 Exposition-only helpers +for algorithm mandates [linalg.helpers.mandates]

+

[Note: These exposition-only helper functions use the less +constraining input concepts even for the output arguments, because the +additional constraint for assignability of elements is not necessary, +and they are sometimes used in a context where the third argument is an +input type too. – end note]

+
template<class MDS1, class MDS2>
+requires(is-mdspan<MDS1> && is-mdspan<MDS2>) 
+constexpr bool compatible-static-extents(size_t r1, size_t r2) // exposition only
+{
+  return MDS1::static_extent(r1) == dynamic_extent ||
+         MDS2::static_extent(r2) == dynamic_extent || 
+         MDS1::static_extent(r1) == MDS2::static_extent(r2);
+}
+
template<in-vector In1, in-vector In2, in-vector Out>
+constexpr bool possibly-addable() // exposition only
+{
+  return compatible-static-extents<Out, In1>(0, 0) &&
+         compatible-static-extents<Out, In2>(0, 0) &&
+         compatible-static-extents<In1, In2>(0, 0);
+}
+
template<in-matrix In1, in-matrix In2, in-matrix Out>
+constexpr bool possibly-addable() // exposition only
+{
+  return compatible-static-extents<Out, In1>(0, 0) &&
+         compatible-static-extents<Out, In1>(1, 1) &&
+         compatible-static-extents<Out, In2>(0, 0) &&
+         compatible-static-extents<Out, In2>(1, 1) &&
+         compatible-static-extents<In1, In2>(0, 0) &&
+         compatible-static-extents<In1, In2>(1, 1);
+}
+
template<in-matrix InMat, in-vector InVec, in-vector OutVec>
+constexpr bool possibly-multipliable() // exposition only
+{
+  return compatible-static-extents<OutVec, InMat>(0, 0) &&
+         compatible-static-extents<InMat, InVec>(1, 0);
+}
+
template<in-vector InVec, in-matrix InMat, in-vector OutVec>
+constexpr bool possibly-multipliable() // exposition only
+{
+  return compatible-static-extents<OutVec, InMat>(0, 1) &&
+         compatible-static-extents<InMat, InVec>(0, 0);
+}
+
template<in-matrix InMat1, in-matrix InMat2, in-matrix OutMat>
+constexpr bool possibly-multipliable() // exposition only
+{
+  return compatible-static-extents<OutMat, InMat1>(0, 0) &&
+         compatible-static-extents<OutMat, InMat2>(1, 1) &&
+         compatible-static-extents<InMat1, InMat2>(1, 0);
+}
+

28.9.7.7 Exposition-only checks for +algorithm preconditions [linalg.helpers.precond]

+

[Note: These helpers use the less constraining input concepts +even for the output arguments, because the additional constraint for +assignability of elements is not necessary, and they are sometimes used +in a context where the third argument is an +input type too. – end note]

+
constexpr bool addable( // exposition only
+  const in-vector auto& in1,
+  const in-vector auto& in2,
+  const in-vector auto& out)
+{
+  return out.extent(0) == in1.extent(0) &&
+         out.extent(0) == in2.extent(0);
+}
+
constexpr bool addable( // exposition only
+  const in-matrix auto& in1,
+  const in-matrix auto& in2,
+  const in-matrix auto& out)
+{
+  return out.extent(0) == in1.extent(0) &&
+         out.extent(1) == in1.extent(1) &&
+         out.extent(0) == in2.extent(0) &&
+         out.extent(1) == in2.extent(1);
+}
+
constexpr bool multipliable( // exposition only
+  const in-matrix auto& in_mat,
+  const in-vector auto& in_vec,
+  const in-vector auto& out_vec)
+{
+  return out_vec.extent(0) == in_mat.extent(0) &&
+         in_mat.extent(1) == in_vec.extent(0);
+}
+
constexpr bool multipliable( // exposition only
+  const in-vector auto& in_vec,
+  const in-matrix auto& in_mat,
+  const in-vector auto& out_vec)
+{
+  return out_vec.extent(0) == in_mat.extent(1) &&
+         in_mat.extent(0) == in_vec.extent(0);
+}
+
constexpr bool multipliable( // exposition only
+  const in-matrix auto& in_mat1,
+  const in-matrix auto& in_mat2,
+  const in-matrix auto& out_mat)
+{
+  return out_mat.extent(0) == in_mat1.extent(0) &&
+         out_mat.extent(1) == in_mat2.extent(1) &&
+         in_mat1.extent(1) == in_mat2.extent(0);
+}
+

28.9.8 Scaled in-place +transformation [linalg.scaled]

+

28.9.8.1 Introduction +[linalg.scaled.intro]

+

1 +The scaled function takes a value alpha and an +mdspan x, and returns a new read-only +mdspan that represents the elementwise product of +alpha with each element of x.

+

[Example:

+
using Vec = mdspan<double, dextents<size_t, 1>>;
+
+// z = alpha * x + y
+void z_equals_alpha_times_x_plus_y(
+  double alpha, Vec x,
+  Vec y,
+  Vec z)
+{
+  add(scaled(alpha, x), y, z);
+}
+
+// z = alpha * x + beta * y
+void z_equals_alpha_times_x_plus_beta_times_y(
+  double alpha, Vec x,
+  double beta, Vec y,
+  Vec z)
+{
+  add(scaled(alpha, x), scaled(beta, y), z);
+}
+

end example]

+ +

28.9.8.2 Class template +scaled_accessor [linalg.scaled.scaledaccessor]

+

1 +The class template scaled_accessor is an +mdspan accessor policy which upon access produces scaled +elements. It is part of the implementation of +scaled [linalg.scaled.scaled].

+
template<class ScalingFactor,
+         class NestedAccessor>
+class scaled_accessor {
+public:
+  using element_type = add_const_t<decltype(declval<ScalingFactor>() * declval<typename NestedAccessor::element_type>())>;
+  using reference = remove_const_t<element_type>;
+  using data_handle_type = typename NestedAccessor::data_handle_type;
+  using offset_policy = scaled_accessor<ScalingFactor, typename NestedAccessor::offset_policy>;
+
+  constexpr scaled_accessor() = default;
+  template<class OtherNestedAccessor>
+    explicit(!is_convertible_v<OtherNestedAccessor, NestedAccessor>)
+      constexpr scaled_accessor(const scaled_accessor<ScalingFactor, OtherNestedAccessor>& other);
+  constexpr scaled_accessor(const ScalingFactor& s, const NestedAccessor& a);
+
+  constexpr reference access(data_handle_type p, size_t i) const;
+  constexpr offset_policy::data_handle_type
+    offset(data_handle_type p, size_t i) const;
+
+  constexpr const ScalingFactor& scaling_factor() const noexcept { return scaling-factor; }
+  constexpr const NestedAccessor& nested_accessor() const noexcept { return nested-accessor; }
+
+private:
+  ScalingFactor scaling-factor{}; // exposition only
+  NestedAccessor nested-accessor{}; // exposition only
+};
+

2 +Mandates:

+
    +
  • (2.1) +element_type is valid and denotes a type,

  • +
  • (2.2) +is_copy_constructible_v<reference> is +true,

  • +
  • (2.3) +is_reference_v<element_type> is +false,

  • +
  • (2.4) +ScalingFactor models semiregular, and

  • +
  • (2.5) +NestedAccessor meets the accessor policy requirements +[mdspan.accessor.reqmts].

  • +
+
template<class OtherNestedAccessor>
+  explicit(!is_convertible_v<OtherNestedAccessor, NestedAccessor>)
+    constexpr scaled_accessor(const scaled_accessor<ScalingFactor, OtherNestedAccessor>& other);
+

3 +Constraints: +is_constructible_v<NestedAccessor, const OtherNestedAccessor&> +is true.

+

4 +Effects:

+
    +
  • (4.1) +Direct-non-list-initializes scaling-factor with +other.scaling_factor(), and

  • +
  • (4.2) +direct-non-list-initializes nested-accessor with +other.nested_accessor().

  • +
+
constexpr scaled_accessor(const ScalingFactor& s, const NestedAccessor& a);
+

5 +Effects:

+
    +
  • (5.1) +Direct-non-list-initializes scaling-factor with +s, and

  • +
  • (5.2) +direct-non-list-initializes nested-accessor with +a.

  • +
+
constexpr reference access(data_handle_type p, size_t i) const;
+

6 +Returns: +scaling_factor() * NestedAccessor::element_type( +nested-accessor.access(p, i))

+
constexpr offset_policy::data_handle_type
+offset(data_handle_type p, size_t i) const;
+

7 +Returns: +nested-accessor.offset(p, i)

+

28.9.8.3 Function template +scaled [linalg.scaled.scaled]

+

1 +The scaled function template takes a scaling factor +alpha and an mdspan x, and +returns a new read-only mdspan with the same domain as +x, that represents the elementwise product of +alpha with each element of x.

+
template<class ScalingFactor,
+         class ElementType,
+         class Extents,
+         class Layout,
+         class Accessor>
+constexpr auto scaled(
+  ScalingFactor alpha,
+  mdspan<ElementType, Extents, Layout, Accessor> x);
+

2 +Let SA be +scaled_accessor<ScalingFactor, Accessor>

+

3 +Returns: +mdspan<typename SA::element_type, Extents, Layout, SA>(x.data_handle(), x.mapping(), SA(alpha, x.accessor()))

+ +

[Example:

+
void test_scaled(mdspan<double, extents<int, 10>> x)
+{
+  auto x_scaled = scaled(5.0, x);
+  for(int i = 0; i < x.extent(0); ++i) {
+    assert(x_scaled[i] == 5.0 * x[i]);
+  }
+}
+

end example]

+

28.9.9 Conjugated in-place +transformation [linalg.conj]

+

28.9.9.1 Introduction +[linalg.conj.intro]

+

1 +The conjugated function takes an mdspan +x, and returns a new read-only mdspan +y with the same domain as x, whose elements +are the complex conjugates of the corresponding elements of +x.

+ +

28.9.9.2 Class template +conjugated_accessor [linalg.conj.conjugatedaccessor]

+

1 +The class template conjugated_accessor is an +mdspan accessor policy which upon access produces conjugate +elements. It is part of the implementation of conjugated +[linalg.conj.conjugated].

+
template<class NestedAccessor>
+class conjugated_accessor {
+public:
+  using element_type = add_const_t<decltype(conj-if-needed(declval<typename NestedAccessor::element_type>()))>;
+  using reference = remove_const_t<element_type>;
+  using data_handle_type = typename NestedAccessor::data_handle_type;
+  using offset_policy = conjugated_accessor<typename NestedAccessor::offset_policy>;
+
+  constexpr conjugated_accessor() = default;
+  constexpr conjugated_accessor(const NestedAccessor& acc);
+  template<class OtherNestedAccessor>
+    explicit(!is_convertible_v<OtherNestedAccessor, NestedAccessor>)
+    constexpr conjugated_accessor(const conjugated_accessor<OtherNestedAccessor>& other);
+
+  constexpr reference access(data_handle_type p, size_t i) const;
+
+  constexpr typename offset_policy::data_handle_type
+    offset(data_handle_type p, size_t i) const;
+
+  constexpr const NestedAccessor& nested_accessor() const noexcept { return nested-accessor_; }
+
+private:
+  NestedAccessor nested-accessor_{}; // exposition only
+};
+

2 +Mandates:

+
    +
  • (2.1) +element_type is valid and denotes a type,

  • +
  • (2.2) +is_copy_constructible_v<reference> is +true,

  • +
  • (2.3) +is_reference_v<element_type> is false, +and

  • +
  • (2.4) +NestedAccessor meets the accessor policy requirements +[mdspan.accessor.reqmts].

  • +
+
constexpr conjugated_accessor(const NestedAccessor& acc);
+

3 +Effects: Direct-non-list-initializes +nested-accessor_ with acc.

+
template<class OtherNestedAccessor>
+  explicit(!is_convertible_v<OtherNestedAccessor, NestedAccessor>)
+    constexpr conjugated_accessor(const conjugated_accessor<OtherNestedAccessor>& other);
+

4 +Constraints: +is_constructible_v<NestedAccessor, const OtherNestedAccessor&> +is true.

+

5 +Effects: Direct-non-list-initializes +nested-accessor_ with +other.nested_accessor().

+
constexpr reference access(data_handle_type p, size_t i) const;
+

6 +Returns: +conj-if-needed(NestedAccessor::element_type(nested-accessor_.access(p, i)))

+
constexpr typename offset_policy::data_handle_type
+  offset(data_handle_type p, size_t i) const;
+

7 +Returns: +nested-accessor_.offset(p, i)

+

28.9.9.3 Function template +conjugated [linalg.conj.conjugated]

+
template<class ElementType,
+         class Extents,
+         class Layout,
+         class Accessor>
+constexpr auto conjugated(
+  mdspan<ElementType, Extents, Layout, Accessor> a);
+

1 +Let A be +remove_cvref_t<decltype(a.accessor().nested_accessor())> +if Accessor is a specialization of +conjugated_accessor, and otherwise +conjugated_accessor<Accessor>.

+

2 +Returns:

+
    +
  • (2.1) +mdspan<typename A::element_type, Extents, Layout, A>(a.data_handle(), a.mapping(), a.accessor().nested_accessor()) +if Accessor is a specialization of +conjugated_accessor; otherwise

  • +
  • (2.2) +mdspan<typename A::element_type, Extents, Layout, A>(a.data_handle(), a.mapping(), conjugated_accessor(a.accessor())).

  • +
+

[Example:

+
void test_conjugated_complex(
+  mdspan<complex<double>, extents<int, 10>> a)
+{
+  auto a_conj = conjugated(a);
+  for(int i = 0; i < a.extent(0); ++i) {
+    assert(a_conj[i] == conj(a[i]));
+  }
+  auto a_conj_conj = conjugated(a_conj);
+  for(int i = 0; i < a.extent(0); ++i) {
+    assert(a_conj_conj[i] == a[i]);
+  }
+}
+
+void test_conjugated_real(
+  mdspan<double, extents<int, 10>> a)
+{
+  auto a_conj = conjugated(a);
+  for(int i = 0; i < a.extent(0); ++i) {
+    assert(a_conj[i] == a[i]);
+  }
+  auto a_conj_conj = conjugated(a_conj);
+  for(int i = 0; i < a.extent(0); ++i) {
+    assert(a_conj_conj[i] == a[i]);
+  }
+}
+

end example]

+

28.9.10 Transpose in-place +transformation [linalg.transp]

+

28.9.10.1 Introduction +[linalg.transp.intro]

+

1 +layout_transpose is an mdspan layout mapping +policy that swaps the two indices, extents, and strides of any unique +mdspan layout mapping policy.

+

2 +The transposed function takes an mdspan +representing a matrix, and returns a new mdspan +representing the transpose of the input matrix.

+

28.9.10.2 Exposition-only helpers +for layout_transpose and transposed +[linalg.transp.helpers]

+

1 +The exposition-only transpose-extents function +takes an extents object representing the extents of a +matrix, and returns a new extents object representing the +extents of the transpose of the matrix.

+

2 +The exposition-only alias template +transpose-extents-t<InputExtents> gives the +type of transpose-extents(e) for a +given extents object e of type +InputExtents.

+
template<class IndexType, size_t InputExtent0, size_t InputExtent1>
+constexpr extents<IndexType, InputExtent1, InputExtent0>
+transpose-extents(const extents<IndexType, InputExtent0, InputExtent1>& in); // exposition only
+

3 +Returns: +extents<IndexType, InputExtent1, InputExtent0>(in.extent(1), in.extent(0))

+
template<class InputExtents>
+using transpose-extents-t =
+  decltype(transpose-extents(declval<InputExtents>())); // exposition only
+

28.9.10.3 Class template +layout_transpose [linalg.transp.layout.transpose]

+

1 +layout_transpose is an mdspan layout mapping +policy that swaps the two indices, extents, and strides of any +mdspan layout mapping policy.

+
template<class Layout>
+class layout_transpose {
+public:
+  using nested_layout_type = Layout;
+
+  template<class Extents>
+  struct mapping {
+  private:
+    using nested-mapping-type =
+      typename Layout::template mapping<
+        transpose-extents-t<Extents>>; // exposition only
+
+  public:
+    using extents_type = Extents;
+    using index_type = typename extents_type::index_type;
+    using size_type = typename extents_type::size_type;
+    using rank_type = typename extents_type::rank_type;
+    using layout_type = layout_transpose;
+
+    constexpr explicit mapping(const nested-mapping-type&);
+
+    constexpr const extents_type& extents() const noexcept
+      { return extents_; }
+
+    constexpr index_type required_span_size() const
+      { return nested-mapping_.required_span_size(); }
+
+    template<class Index0, class Index1>
+      constexpr index_type operator()(Index0 ind0, Index1 ind1) const
+      { return nested-mapping_(ind1, ind0); }
+
+    constexpr const nested-mapping-type& nested_mapping() const noexcept
+      { return nested-mapping_; }
+
+    static constexpr bool is_always_unique() noexcept
+      { return nested-mapping-type::is_always_unique(); }
+    static constexpr bool is_always_exhaustive() noexcept
+      { return nested-mapping-type::is_always_exhaustive(); }
+    static constexpr bool is_always_strided() noexcept
+      { return nested-mapping-type::is_always_strided(); }
+
+    constexpr bool is_unique() const
+      { return nested-mapping_.is_unique(); }
+    constexpr bool is_exhaustive() const
+      { return nested-mapping_.is_exhaustive(); }
+    constexpr bool is_strided() const
+      { return nested-mapping_.is_strided(); }
+
+    constexpr index_type stride(size_t r) const;
+
+    template<class OtherExtents>
+      friend constexpr bool
+        operator==(const mapping& x, const mapping<OtherExtents>& y);
+  };
+
+  private:
+    nested-mapping-type nested-mapping_; // exposition only
+    extents_type extents_; // exposition only
+};
+

2 +Layout shall meet the layout mapping policy requirements +([mdspan.layout.policy.reqmts]).

+

3 +Mandates:

+
    +
  • (3.1) +Extents is a specialization of std::extents, +and

  • +
  • (3.2) +Extents::rank() equals 2.

  • +
+
constexpr explicit mapping(const nested-mapping-type& map);
+

4 +Effects:

+
    +
  • (4.1) +Initializes nested-mapping_ with map, +and

  • +
  • (4.2) +initializes extents_ with +transpose-extents(map.extents()).


  • +
+
constexpr index_type stride(size_t r) const;
+

5 +Preconditions:

+
    +
  • (5.1) +is_strided() is true, and

  • +
  • (5.2) +r < 2 is true.

  • +
+

6 +Returns: nested-mapping_ +.stride(r == 0 ? 1 : 0)

+
template<class OtherExtents>
+  friend constexpr bool
+    operator==(const mapping& x, const mapping<OtherExtents>& y);
+

7 +Constraints: The expression +x.nested-mapping_ +== y.nested-mapping_ is well-formed +and its result is convertible to bool.

+

8 +Returns: x.nested-mapping_ +== y.nested-mapping_.

+

28.9.10.4 Function template +transposed [linalg.transp.transposed]

+

1 +The transposed function takes a rank-2 mdspan +representing a matrix, and returns a new mdspan +representing the input matrix’s transpose. The input matrix’s data are +not modified, and the returned mdspan accesses the input +matrix’s data in place.

+
template<class ElementType,
+         class Extents,
+         class Layout,
+         class Accessor>
+constexpr auto transposed(
+  mdspan<ElementType, Extents, Layout, Accessor> a);
+

2 +Mandates: Extents::rank() == 2 is +true.

+

3 +Let ReturnExtents be +transpose-extents-t<Extents>. +Let R be +mdspan<ElementType, ReturnExtents, ReturnLayout, Accessor>, +where ReturnLayout is:

+
    +
  • (3.1) +layout_right if Layout is +layout_left;

  • +
  • (3.2) +otherwise, layout_left if Layout is +layout_right;

  • +
  • (3.3) +otherwise, layout_stride if Layout is +layout_stride;

  • +
  • (3.4) +otherwise, +layout_blas_packed<OppositeTriangle, OppositeStorageOrder>, +if Layout is +layout_blas_packed<Triangle, StorageOrder> for some +Triangle and StorageOrder, where

    +
      +
    • (3.4.1) +OppositeTriangle is +conditional_t<is_same_v<Triangle, upper_triangle_t>, lower_triangle_t, upper_triangle_t>, +and

    • +
    • (3.4.2) +OppositeStorageOrder is +conditional_t<is_same_v<StorageOrder, column_major_t>, row_major_t, column_major_t>;

    • +
  • +
  • (3.5) +otherwise, NestedLayout if Layout is +layout_transpose<NestedLayout> for some +NestedLayout;

  • +
  • (3.6) +otherwise, layout_transpose<Layout>.

  • +
+

4 +Returns: With ReturnMapping being the type +typename ReturnLayout::template mapping<ReturnExtents>:

+
    +
  • (4.1) +R(a.data_handle(), ReturnMapping(transpose-extents(a.mapping().extents())), a.accessor()) +if Layout is layout_left, +layout_right, or a specialization of +layout_blas_packed;

  • +
  • (4.2) +otherwise, +R(a.data_handle(), ReturnMapping(transpose-extents(a.mapping().extents()), array{a.mapping().stride(1), a.mapping().stride(0)}), a.accessor()) +if Layout is layout_stride; +

  • +
  • (4.3) +otherwise, +R(a.data_handle(), a.mapping().nested_mapping(), a.accessor()) +if Layout is a specialization of +layout_transpose; +

  • +
  • (4.4) +otherwise, +R(a.data_handle(), ReturnMapping(a.mapping()), a.accessor()).

  • +
+

[Example:

+
void test_transposed(mdspan<double, extents<size_t, 3, 4>> a)
+{
+  const auto num_rows = a.extent(0);
+  const auto num_cols = a.extent(1);
+
+  auto a_t = transposed(a);
+  assert(num_rows == a_t.extent(1));
+  assert(num_cols == a_t.extent(0));
+  assert(a.stride(0) == a_t.stride(1));
+  assert(a.stride(1) == a_t.stride(0));
+
+  for(size_t row = 0; row < num_rows; ++row) {
+    for(size_t col = 0; col < num_cols; ++col) {
+      assert(a[row, col] == a_t[col, row]);
+    }
+  }
+
+  auto a_t_t = transposed(a_t);
+  assert(num_rows == a_t_t.extent(0));
+  assert(num_cols == a_t_t.extent(1));
+  assert(a.stride(0) == a_t_t.stride(0));
+  assert(a.stride(1) == a_t_t.stride(1));
+
+  for(size_t row = 0; row < num_rows; ++row) {
+    for(size_t col = 0; col < num_cols; ++col) {
+      assert(a[row, col] == a_t_t[row, col]);
+    }
+  }
+}
+

end example]

+

28.9.11 Conjugate transpose +in-place transform [linalg.conjtransposed]

+

1 +The conjugate_transposed function returns a conjugate +transpose view of an object. This combines the effects of +transposed and conjugated.

+
template<class ElementType,
+         class Extents,
+         class Layout,
+         class Accessor>
+constexpr auto conjugate_transposed(
+  mdspan<ElementType, Extents, Layout, Accessor> a);
+

2 +Effects: Equivalent to: +return conjugated(transposed(a));

+

[Example:

+
void test_conjugate_transposed(
+  mdspan<complex<double>, extents<size_t, 3, 4>> a)
+{
+  const auto num_rows = a.extent(0);
+  const auto num_cols = a.extent(1);
+
+  auto a_ct = conjugate_transposed(a);
+  assert(num_rows == a_ct.extent(1));
+  assert(num_cols == a_ct.extent(0));
+  assert(a.stride(0) == a_ct.stride(1));
+  assert(a.stride(1) == a_ct.stride(0));
+
+  for(size_t row = 0; row < num_rows; ++row) {
+    for(size_t col = 0; col < num_cols; ++col) {
+      assert(a[row, col] == conj(a_ct[col, row]));
+    }
+  }
+
+  auto a_ct_ct = conjugate_transposed(a_ct);
+  assert(num_rows == a_ct_ct.extent(0));
+  assert(num_cols == a_ct_ct.extent(1));
+  assert(a.stride(0) == a_ct_ct.stride(0));
+  assert(a.stride(1) == a_ct_ct.stride(1));
+
+  for(size_t row = 0; row < num_rows; ++row) {
+    for(size_t col = 0; col < num_cols; ++col) {
+      assert(a[row, col] == a_ct_ct[row, col]);
+      assert(conj(a_ct[col, row]) == a_ct_ct[row, col]);
+    }
+  }
+}
+

end example]

+

28.9.12 Algorithm Requirements +based on template parameter name [linalg.algs.reqs]

+

1 +Throughout [linalg.algs.blas1], [linalg.algs.blas2], and +[linalg.algs.blas3], where the template parameters are not constrained, +the names of template parameters are used to express the following +Constraints.

+
    +
  • (1.1) +is_execution_policy<ExecutionPolicy>::value is +true ([execpol.type]).

  • +
  • (1.2) +Real is any type such that complex<Real> +is specified ([complex.numbers.general]).

  • +
  • (1.3) +Triangle is either upper_triangle_t or +lower_triangle_t.

  • +
  • (1.4) +DiagonalStorage is either +implicit_unit_diagonal_t or +explicit_diagonal_t.

  • +
+

[Note: Function templates that have a template parameter named +ExecutionPolicy are parallel algorithms +([algorithms.parallel.defns]). – end note]

+

28.9.13 BLAS 1 algorithms +[linalg.algs.blas1]

+

28.9.13.1 Complexity +[linalg.algs.blas1.complexity]

+

1 +Complexity: All algorithms in [linalg.algs.blas1] with +mdspan parameters perform a count of mdspan +array accesses and arithmetic operations that is linear in the maximum +product of extents of any mdspan parameter.

+

28.9.13.2 Givens rotations +[linalg.algs.blas1.givens]

+
28.9.13.2.1 Compute Givens rotation +[linalg.algs.blas1.givens.lartg]
+
template<class Real>
+setup_givens_rotation_result<Real>
+setup_givens_rotation(Real a, Real b) noexcept;
+template<class Real>
+setup_givens_rotation_result<complex<Real>>
+setup_givens_rotation(complex<Real> a, complex<Real> b) noexcept;
+

1 +These functions compute the Givens plane rotation represented by the two +values c and s such that the 2 x 2 system of +equations

+\[
+\left[ \begin{matrix} c & s \\ -\overline{s} & c \end{matrix} \right]
+\cdot
+\left[ \begin{matrix} a \\ b \end{matrix} \right]
+=
+\left[ \begin{matrix} r \\ 0 \end{matrix} \right]
+\]

[Note: EDITORIAL NOTE: Preferred rendering would use the LaTeX +code like the following.

+
\left[ \begin{matrix}
+c             & s \\
+-\overline{s} & c \\
+\end{matrix} \right]
+\cdot
+\left[ \begin{matrix}
+a \\
+b \\
+\end{matrix} \right]
+=
+\left[ \begin{matrix}
+r \\
+0 \\
+\end{matrix} \right]
+

– end note]

+ +


holds, where c is always a +real scalar, and c² + |s|² = 1.


+


That is, c and s represent a 2 x 2 matrix that, +when multiplied on the right by the input vector whose components are +a and b, produces a result vector whose +first component r is the +Euclidean norm of the input vector, and whose second component is +zero.


+

[Note: These functions correspond to the LAPACK function +xLARTG. – end note]

+ +

2 +Returns: {c, s, r}, where c and +s form the Givens plane rotation corresponding to the input +a and b, and r is the Euclidean +norm of the two-component vector formed by a and +b.

+
28.9.13.2.2 Apply a computed Givens +rotation to vectors [linalg.algs.blas1.givens.rot]
+
template<inout-vector InOutVec1,
+         inout-vector InOutVec2,
+         class Real>
+void apply_givens_rotation(
+  InOutVec1 x,
+  InOutVec2 y,
+  Real c,
+  Real s);
+template<class ExecutionPolicy,
+         inout-vector InOutVec1,
+         inout-vector InOutVec2,
+         class Real>
+void apply_givens_rotation(
+  ExecutionPolicy&& exec,
+  InOutVec1 x,
+  InOutVec2 y,
+  Real c,
+  Real s);
+
+template<inout-vector InOutVec1,
+         inout-vector InOutVec2,
+         class Real>
+void apply_givens_rotation(
+  InOutVec1 x,
+  InOutVec2 y,
+  Real c,
+  complex<Real> s);
+template<class ExecutionPolicy,
+         inout-vector InOutVec1,
+         inout-vector InOutVec2,
+         class Real>
+void apply_givens_rotation(
+  ExecutionPolicy&& exec,
+  InOutVec1 x,
+  InOutVec2 y,
+  Real c,
+  complex<Real> s);
+

[Note: These functions correspond to the BLAS function +xROT. – end note]

+ +

1 +Mandates: +compatible-static-extents<InOutVec1, InOutVec2>(0,0) +is true.

+

2 +Preconditions: x.extent(0) equals +y.extent(0).

+

3 +Effects: Applies the plane rotation specified by c +and s to the input vectors x and +y, as if the rotation were a 2 x 2 matrix and the input +vectors were successive rows of a matrix with two rows.

+

28.9.13.3 Swap matrix or vector +elements [linalg.algs.blas1.swap]

+
template<inout-object InOutObj1,
+         inout-object InOutObj2>
+void swap_elements(InOutObj1 x,
+                   InOutObj2 y);
+
+template<class ExecutionPolicy,
+         inout-object InOutObj1,
+         inout-object InOutObj2>
+void swap_elements(ExecutionPolicy&& exec,
+                   InOutObj1 x,
+                   InOutObj2 y);
+

[Note: These functions correspond to the BLAS function +xSWAP. – end note]

+

1 +Constraints: x.rank() equals +y.rank().

+

2 +Mandates: For all r in the range [ 0, x.rank()), +compatible-static-extents<InOutObj1, InOutObj2>(r, r) +is true.

+

3 +Preconditions: x.extents() equals +y.extents().

+

4 +Effects: Swaps all corresponding elements of x and +y.

+

28.9.13.4 Multiply the elements of +an object in place by a scalar [linalg.algs.blas1.scal]

+
template<class Scalar,
+         inout-object InOutObj>
+void scale(Scalar alpha,
+           InOutObj x);
+
+template<class ExecutionPolicy,
+         class Scalar,
+         inout-object InOutObj>
+void scale(ExecutionPolicy&& exec,
+           Scalar alpha,
+           InOutObj x);
+

[Note: These functions correspond to the BLAS function +xSCAL. – end note]

+


1 +Effects: Overwrites x +with the result of computing the elementwise multiplication αx, where the scalar α is alpha.


+

28.9.13.5 Copy elements of one +matrix or vector into another [linalg.algs.blas1.copy]

+
template<in-object InObj,
+         out-object OutObj>
+void copy(InObj x,
+          OutObj y);
+
+template<class ExecutionPolicy,
+         in-object InObj,
+         out-object OutObj>
+void copy(ExecutionPolicy&& exec,
+          InObj x,
+          OutObj y);
+

[Note: These functions correspond to the BLAS function +xCOPY. – end note]

+

1 +Constraints: x.rank() equals +y.rank().

+

2 +Mandates: For all r in the range [ 0, x.rank()), +compatible-static-extents<InObj, OutObj>(r, r) +is true.

+

3 +Preconditions: x.extents() equals +y.extents().

+

4 +Effects: Assigns each element of x to the corresponding element of +y.

+

28.9.13.6 Add vectors or matrices +elementwise [linalg.algs.blas1.add]

+
template<in-object InObj1,
+         in-object InObj2,
+         out-object OutObj>
+void add(InObj1 x,
+         InObj2 y,
+         OutObj z);
+
+template<class ExecutionPolicy,
+         in-object InObj1,
+         in-object InObj2,
+         out-object OutObj>
+void add(ExecutionPolicy&& exec,
+         InObj1 x,
+         InObj2 y,
+         OutObj z);
+

[Note: These functions correspond to the BLAS function +xAXPY. – end note]

+

1 +Constraints: x.rank(), y.rank(), and +z.rank() are all equal.

+

2 +Mandates: +possibly-addable<InObj1, InObj2, OutObj>() +is true.

+

3 +Preconditions: +addable(x,y,z) is +true.

+

4 +Effects: Computes z = x + y.

+

5 +Remarks: z may alias x or +y.

+

28.9.13.7 Dot product of two +vectors [linalg.algs.blas1.dot]

+

[Note: The functions in this section correspond to the BLAS +functions xDOT, xDOTU, and xDOTC. +– end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas1.dot].

+

2 +Mandates: +compatible-static-extents<InVec1, InVec2>(0, 0) +is true.

+

3 +Preconditions: v1.extent(0) equals +v2.extent(0).

+
template<in-vector InVec1,
+         in-vector InVec2,
+         class Scalar>
+Scalar dot(InVec1 v1,
+           InVec2 v2,
+           Scalar init);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         class Scalar>
+Scalar dot(ExecutionPolicy&& exec,
+           InVec1 v1,
+           InVec2 v2,
+           Scalar init);
+

4 +These functions compute a non-conjugated dot product with an explicitly +specified result type.

+

5 +Returns: Let N be v1.extent(0).

+
    +
  • (5.1) +init if N is zero;

  • +
  • (5.2) +otherwise, GENERALIZED_SUM(plus<>(), +init, v1[0]*v2[0], …, +v1[N-1]*v2[N-1]).

  • +
+

6 +Remarks: If InVec1::value_type, +InVec2::value_type, and Scalar are all +floating-point types or specializations of complex, and if +Scalar has higher precision than +InVec1::value_type or InVec2::value_type, then +intermediate terms in the sum use Scalar’s precision or +greater.

+
template<in-vector InVec1,
+         in-vector InVec2>
+auto dot(InVec1 v1,
+         InVec2 v2);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2>
+auto dot(ExecutionPolicy&& exec,
+         InVec1 v1,
+         InVec2 v2);
+

7 +These functions compute a non-conjugated dot product with a default +result type.

+

8 +Effects: Let T be +decltype(declval<typename InVec1::value_type>() * declval<typename InVec2::value_type>()). +Then,

+
    +
  • (8.1) the +two-parameter overload is equivalent to +return dot(v1, v2, T{});, and

  • +
  • (8.2) the +three-parameter overload is equivalent to +return dot(std::forward<ExecutionPolicy>(exec), v1, v2, T{});.

  • +
+
template<in-vector InVec1,
+         in-vector InVec2,
+         class Scalar>
+Scalar dotc(InVec1 v1,
+            InVec2 v2,
+            Scalar init);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         class Scalar>
+Scalar dotc(ExecutionPolicy&& exec,
+            InVec1 v1,
+            InVec2 v2,
+            Scalar init);
+


9 +These functions compute a conjugated dot product with an explicitly +specified result type.


+

10 +Effects:

+
    +
  • (10.1) The +three-parameter overload is equivalent to +return dot(conjugated(v1), v2, init);, and

  • +
  • (10.2) the +four-parameter overload is equivalent to +return dot(std::forward<ExecutionPolicy>(exec), conjugated(v1), v2, init);.

  • +
+
template<in-vector InVec1,
+         in-vector InVec2>
+auto dotc(InVec1 v1,
+          InVec2 v2);
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2>
+auto dotc(ExecutionPolicy&& exec,
+          InVec1 v1,
+          InVec2 v2);
+

11 These +functions compute a conjugated dot product with a default result +type.

+

12 +Effects: Let T be +decltype(conj-if-needed(declval<typename InVec1::value_type>()) * declval<typename InVec2::value_type>()). +Then,

+
    +
  • (12.1) the +two-parameter overload is equivalent to +return dotc(v1, v2, T{});, and

  • +
  • (12.2) the +three-parameter overload is equivalent to +return dotc(std::forward<ExecutionPolicy>(exec), v1, v2, T{});.

  • +
+

28.9.13.8 Scaled sum of squares of +a vector’s elements [linalg.algs.blas1.ssq]

+
template<class Scalar>
+struct sum_of_squares_result {
+  Scalar scaling_factor;
+  Scalar scaled_sum_of_squares;
+};
+template<in-vector InVec,
+         class Scalar>
+sum_of_squares_result<Scalar> vector_sum_of_squares(
+  InVec v,
+  sum_of_squares_result<Scalar> init);
+template<class ExecutionPolicy,
+         in-vector InVec,
+         class Scalar>
+sum_of_squares_result<Scalar> vector_sum_of_squares(
+  ExecutionPolicy&& exec,
+  InVec v,
+  sum_of_squares_result<Scalar> init);
+

[Note: These functions correspond to the LAPACK function +xLASSQ. – end note]

+

1 +Mandates: +decltype(abs-if-needed(declval<typename InVec::value_type>())) +is convertible to Scalar.

+

2 +Effects: Returns a value result such that

+
    +
  • (2.1) +result.scaling_factor is the maximum of +init.scaling_factor and +abs-if-needed(v[i]) for all +i in the domain of v; and


  • +
  • (2.2) let +s2init be +init.scaling_factor * init.scaling_factor * init.scaled_sum_of_squares, +then +result.scaling_factor * result.scaling_factor * result.scaled_sum_of_squares +equals the sum of s2init and the squares of +abs-if-needed(v[i]) for all +i in the domain of v.


  • +
+


3 +Remarks: If InVec::value_type and +Scalar are both floating-point types or specializations of +complex, and if Scalar has higher precision +than InVec::value_type, then intermediate terms in the sum +use Scalar’s precision or greater.


+

28.9.13.9 Euclidean norm of a +vector [linalg.algs.blas1.nrm2]

+
template<in-vector InVec,
+         class Scalar>
+Scalar vector_two_norm(InVec v,
+                       Scalar init);
+template<class ExecutionPolicy,
+         in-vector InVec,
+         class Scalar>
+Scalar vector_two_norm(ExecutionPolicy&& exec,
+                       InVec v,
+                       Scalar init);
+

[Note: These functions correspond to the BLAS function +xNRM2. – end note]

+

1 +Mandates: decltype(init + +abs-if-needed(declval<typename InVec::value_type>()) * +abs-if-needed(declval<typename InVec::value_type>())) +is convertible to Scalar.

+

2 +Returns: The square root of the sum of the square of +init and the squares of the absolute values of the elements +of v. [Note: For init equal to zero, +this is the Euclidean norm (also called 2-norm) of the vector +v.– end note]

+


3 +Remarks: If InVec::value_type and +Scalar are both floating-point types or specializations of +complex, and if Scalar has higher precision +than InVec::value_type, then intermediate terms in the sum +use Scalar’s precision or greater.


+

[Note: A possible implementation of this function for +floating-point types T would use the +scaled_sum_of_squares result from +vector_sum_of_squares(x, {.scaling_factor=1.0, .scaled_sum_of_squares=init}). +– end note]

+
template<in-vector InVec>
+auto vector_two_norm(InVec v);
+template<class ExecutionPolicy,
+         in-vector InVec>
+auto vector_two_norm(ExecutionPolicy&& exec, InVec v);
+

4 +Effects: Let T be decltype( +abs-if-needed(declval<typename InVec::value_type>()) * +abs-if-needed(declval<typename InVec::value_type>())). +Then,

+
    +
  • (4.1) the +one-parameter overload is equivalent to +return vector_two_norm(v, T{});, and

  • +
  • (4.2) the +two-parameter overload is equivalent to +return vector_two_norm(std::forward<ExecutionPolicy>(exec), v, T{});.

  • +
+

28.9.13.10 Sum of absolute values +of vector elements [linalg.algs.blas1.asum]

+
template<in-vector InVec,
+         class Scalar>
+Scalar vector_abs_sum(InVec v,
+                      Scalar init);
+template<class ExecutionPolicy,
+         in-vector InVec,
+         class Scalar>
+Scalar vector_abs_sum(ExecutionPolicy&& exec,
+                      InVec v,
+                      Scalar init);
+

[Note: These functions correspond to the BLAS functions +SASUM, DASUM, SCASUM, and +DZASUM.– end note]

+ +

1 +Mandates: decltype(init + +abs-if-needed( +real-if-needed +(declval<typename InVec::value_type>())) + +abs-if-needed( +imag-if-needed +(declval<typename InVec::value_type>()))) is +convertible to Scalar.

+

2 +Returns: Let N be v.extent(0).

+
    +
  • (2.1) +init if N is zero;

  • +
  • (2.2) +otherwise, GENERALIZED_SUM(plus<>(), +init, +abs-if-needed(v[0]), …, +abs-if-needed(v[N-1])), if +InVec::value_type is an arithmetic type;

  • +
  • (2.3) +otherwise, GENERALIZED_SUM(plus<>(), +init, abs-if-needed( +real-if-needed (v[0])) + +abs-if-needed( +imag-if-needed (v[0])), …, +abs-if-needed( +real-if-needed (v[N-1])) + +abs-if-needed( +imag-if-needed (v[N-1]))).

  • +
+


3 +Remarks: If InVec::value_type and +Scalar are both floating-point types or specializations of +complex, and if Scalar has higher precision +than InVec::value_type, then intermediate terms in the sum +use Scalar’s precision or greater.


+
template<in-vector InVec>
+auto vector_abs_sum(InVec v);
+template<class ExecutionPolicy,
+         in-vector InVec>
+auto vector_abs_sum(ExecutionPolicy&& exec, InVec v);
+

4 +Effects: Let T be +typename InVec::value_type. Then,

+
    +
  • (4.1) the +one-parameter overload is equivalent to +return vector_abs_sum(v, T{});, and

  • +
  • (4.2) the +two-parameter overload is equivalent to +return vector_abs_sum(std::forward<ExecutionPolicy>(exec), v, T{});.

  • +
+

28.9.13.11 Index of maximum +absolute value of vector elements [linalg.algs.blas1.iamax]

+
template<in-vector InVec>
+typename InVec::size_type vector_idx_abs_max(InVec v);
+
+template<class ExecutionPolicy,
+         in-vector InVec>
+typename InVec::size_type vector_idx_abs_max(
+  ExecutionPolicy&& exec,
+  InVec v);
+

[Note: These functions correspond to the BLAS function +IxAMAX. – end note]

+

1 +Let T be decltype( +abs-if-needed( +real-if-needed +(declval<typename InVec::value_type>())) + +abs-if-needed( +imag-if-needed +(declval<typename InVec::value_type>()))).

+

2 +Mandates: +declval<T>() < declval<T>() is a valid +expression.

+

3 +Returns:

+
    +
  • (3.1) +numeric_limits<typename InVec::size_type>::max() if +v has zero elements;

  • +
  • (3.2) +otherwise, the index of the first element of v having +largest absolute value, if InVec::value_type is an +arithmetic type;

  • +
  • (3.3) +otherwise, the index of the first element v_e of +v for which +abs-if-needed( +real-if-needed (v_e)) + +abs-if-needed( +imag-if-needed (v_e)) has the largest +value.

  • +
+

28.9.13.12 Frobenius norm of a +matrix [linalg.algs.blas1.matfrobnorm]

+

[Note: These functions exist in the BLAS standard but are not +part of the reference implementation. – end note]

+
template<in-matrix InMat,
+         class Scalar>
+Scalar matrix_frob_norm(
+  InMat A,
+  Scalar init);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Scalar>
+Scalar matrix_frob_norm(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Scalar init);
+

1 +Mandates: decltype(init + +abs-if-needed(declval<typename InMat::value_type>()) * +abs-if-needed(declval<typename InMat::value_type>())) +is convertible to Scalar.

+

2 +Returns: The square root of the sum of squares of +init and the absolute values of the elements of +A.

+

[Note: For init equal to zero, this is the +Frobenius norm of the matrix A. – end note]

+

3 +Remarks: If InMat::value_type and +Scalar are all floating-point types or specializations of +complex, and if Scalar has higher precision +than InMat::value_type, then intermediate terms in the sum +use Scalar’s precision or greater.

+
template<in-matrix InMat>
+auto matrix_frob_norm(InMat A);
+template<class ExecutionPolicy,
+         in-matrix InMat>
+auto matrix_frob_norm(
+  ExecutionPolicy&& exec, InMat A);
+

4 +Effects: Let T be decltype( +abs-if-needed(declval<typename InMat::value_type>()) * +abs-if-needed(declval<typename InMat::value_type>())). +Then,

+
    +
  • (4.1) the +one-parameter overload is equivalent to +return matrix_frob_norm(A, T{});, and

  • +
  • (4.2) the +two-parameter overload is equivalent to +return matrix_frob_norm(std::forward<ExecutionPolicy>(exec), A, T{});.

  • +
+

28.9.13.13 One norm of a matrix +[linalg.algs.blas1.matonenorm]

+

[Note: These functions exist in the BLAS standard but are not +part of the reference implementation. – end note]

+
template<in-matrix InMat,
+         class Scalar>
+Scalar matrix_one_norm(
+  InMat A,
+  Scalar init);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Scalar>
+Scalar matrix_one_norm(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Scalar init);
+

1 +Mandates: +decltype(abs-if-needed(declval<typename InMat::value_type>())) +is convertible to Scalar.

+

2 +Returns:

+
    +
  • (2.1) +init if A.extent(1) is zero;

  • +
  • (2.2) +otherwise, the sum of init and the one norm of the matrix +A.

  • +
+

[Note: The one norm of the matrix A is the +maximum over all columns of A, of the sum of the absolute +values of the elements of the column. – end note]

+

3 +Remarks: If InMat::value_type and +Scalar are all floating-point types or specializations of +complex, and if Scalar has higher precision +than InMat::value_type, then intermediate terms in the sum +use Scalar’s precision or greater.

+
template<in-matrix InMat>
+auto matrix_one_norm(InMat A);
+template<class ExecutionPolicy,
+         in-matrix InMat>
+auto matrix_one_norm(
+  ExecutionPolicy&& exec, InMat A);
+

4 +Effects: Let T be decltype( +abs-if-needed(declval<typename InMat::value_type>())). +Then,

+
    +
  • (4.1) the +one-parameter overload is equivalent to +return matrix_one_norm(A, T{});, and

  • +
  • (4.2) the +two-parameter overload is equivalent to +return matrix_one_norm(std::forward<ExecutionPolicy>(exec), A, T{});.

  • +
+

28.9.13.14 Infinity norm of a +matrix [linalg.algs.blas1.matinfnorm]

+

[Note: These functions exist in the BLAS standard but are not +part of the reference implementation. – end note]

+
template<in-matrix InMat,
+         class Scalar>
+Scalar matrix_inf_norm(
+  InMat A,
+  Scalar init);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Scalar>
+Scalar matrix_inf_norm(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Scalar init);
+

1 +Mandates: +decltype(abs-if-needed(declval<typename InMat::value_type>())) +is convertible to Scalar.

+

2 +Returns:

+
    +
  • (2.1) +init if A.extent(0) is zero;

  • +
  • (2.2) +otherwise, the sum of init and the infinity norm of the +matrix A.

  • +
+

[Note: The infinity norm of the matrix A is the +maximum over all rows of A, of the sum of the absolute +values of the elements of the row. – end note]

+

3 +Remarks: If InMat::value_type and +Scalar are all floating-point types or specializations of +complex, and if Scalar has higher precision +than InMat::value_type, then intermediate terms in the sum +use Scalar’s precision or greater.

+
template<in-matrix InMat>
+auto matrix_inf_norm(InMat A);
+template<class ExecutionPolicy,
+         in-matrix InMat>
+auto matrix_inf_norm(
+  ExecutionPolicy&& exec, InMat A);
+

4 +Effects: Let T be decltype( +abs-if-needed(declval<typename InMat::value_type>())). +Then,

+
    +
  • (4.1) the +one-parameter overload is equivalent to +return matrix_inf_norm(A, T{});, and

  • +
  • (4.2) the +two-parameter overload is equivalent to +return matrix_inf_norm(std::forward<ExecutionPolicy>(exec), A, T{});.

  • +
+

28.9.14 BLAS 2 algorithms +[linalg.algs.blas2]

+

28.9.14.1 General matrix-vector +product [linalg.algs.blas2.gemv]

+

[Note: These functions correspond to the BLAS function +xGEMV. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas2.gemv].

+

2 +Mandates:

+
    +
  • (2.1) +possibly-multipliable<decltype(A), decltype(x), decltype(y)>() +is true, and

  • +
  • (2.2) +possibly-addable<decltype(x),decltype(y),decltype(z)>() +is true for those overloads that take a z +parameter.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +multipliable(A,x,y) is +true, and

  • +
  • (3.2) +addable(x,y,z) is true +for those overloads that take a z parameter.

  • +
+

4 +Complexity: O( +x.extent(0) +A.extent(1) )

+
template<in-matrix InMat,
+         in-vector InVec,
+         out-vector OutVec>
+void matrix_vector_product(InMat A,
+                           InVec x,
+                           OutVec y);
+
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         in-vector InVec,
+         out-vector OutVec>
+void matrix_vector_product(ExecutionPolicy&& exec,
+                           InMat A,
+                           InVec x,
+                           OutVec y);
+

5 +These functions perform an overwriting matrix-vector product.

+

6 +Effects: Computes y = Ax.

+

[Example:

+
constexpr size_t num_rows = 5;
+constexpr size_t num_cols = 6;
+
+// y = 3.0 * A * x
+void scaled_matvec_1(
+  mdspan<double, extents<size_t, num_rows, num_cols>> A,
+  mdspan<double, extents<size_t, num_cols>> x,
+  mdspan<double, extents<size_t, num_rows>> y)
+{
+  matrix_vector_product(scaled(3.0, A), x, y);
+}
+
+// y = 3.0 * A * x + 2.0 * y
+void scaled_matvec_2(
+  mdspan<double, extents<size_t, num_rows, num_cols>> A,
+  mdspan<double, extents<size_t, num_cols>> x,
+  mdspan<double, extents<size_t, num_rows>> y)
+{
+  matrix_vector_product(scaled(3.0, A), x,
+                        scaled(2.0, y), y);
+}
+
+// z = 7.0 times the transpose of A, times y
+void scaled_transposed_matvec(mdspan<double, extents<size_t, num_rows, num_cols>> A,
+  mdspan<double, extents<size_t, num_rows>> y,
+  mdspan<double, extents<size_t, num_cols>> z)
+{
+  matrix_vector_product(scaled(7.0, transposed(A)), y, z);
+}
+

end example]

+
template<in-matrix InMat,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void matrix_vector_product(InMat A,
+                           InVec1 x,
+                           InVec2 y,
+                           OutVec z);
+
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void matrix_vector_product(ExecutionPolicy&& exec,
+                           InMat A,
+                           InVec1 x,
+                           InVec2 y,
+                           OutVec z);
+

7 +These functions perform an updating matrix-vector product.

+

8 +Effects: Computes z = y + Ax.

+

9 +Remarks: z may alias y.

+

28.9.14.2 Symmetric matrix-vector +product [linalg.algs.blas2.symv]

+

[Note: These functions correspond to the BLAS functions +xSYMV and xSPMV. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas2.symv].

+

2 +Mandates:

+
    +
  • (2.1) If +InMat has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (2.2) +compatible-static-extents<decltype(A), decltype(A)>(0, 1) +is true;

  • +
  • (2.3) +possibly-multipliable<decltype(A), decltype(x), decltype(y)>() +is true; and

  • +
  • (2.4) +possibly-addable<decltype(x),decltype(y),decltype(z)>() +is true for those overloads that take a z +parameter.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +A.extent(0) equals A.extent(1),

  • +
  • (3.2) +multipliable(A,x,y) is +true, and

  • +
  • (3.3) +addable(x,y,z) is true +for those overloads that take a z parameter.

  • +
+

4 +Complexity: O( +x.extent(0) +A.extent(1) )

+
template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void symmetric_matrix_vector_product(InMat A,
+                                     Triangle t,
+                                     InVec x,
+                                     OutVec y);
+
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void symmetric_matrix_vector_product(ExecutionPolicy&& exec,
+                                     InMat A,
+                                     Triangle t,
+                                     InVec x,
+                                     OutVec y);
+

5 +These functions perform an overwriting symmetric matrix-vector product, +taking into account the Triangle parameter that applies to +the symmetric matrix A [linalg.general].

+

6 +Effects: Computes y = Ax.

+
template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void symmetric_matrix_vector_product(
+  InMat A,
+  Triangle t,
+  InVec1 x,
+  InVec2 y,
+  OutVec z);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void symmetric_matrix_vector_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  InVec1 x,
+  InVec2 y,
+  OutVec z);
+

7 +These functions perform an updating symmetric matrix-vector product, +taking into account the Triangle parameter that applies to +the symmetric matrix A [linalg.general].

+

8 +Effects: Computes z = y + Ax.

+

9 +Remarks: z may alias y.

+

28.9.14.3 Hermitian matrix-vector +product [linalg.algs.blas2.hemv]

+

[Note: These functions correspond to the BLAS functions +xHEMV and xHPMV. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas2.hemv].

+

2 +Mandates:

+
    +
  • (2.1) If +InMat has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (2.2) +compatible-static-extents<decltype(A), decltype(A)>(0, 1) +is true;

  • +
  • (2.3) +possibly-multipliable<decltype(A), decltype(x), decltype(y)>() +is true; and

  • +
  • (2.4) +possibly-addable<decltype(x),decltype(y),decltype(z)>() +is true for those overloads that take a z +parameter.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +A.extent(0) equals A.extent(1),

  • +
  • (3.2) +multipliable(A,x,y) is +true, and

  • +
  • (3.3) +addable(x,y,z) is true +for those overloads that take a z parameter.

  • +
+

4 +Complexity: O( +x.extent(0) +A.extent(1) )

+
template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void hermitian_matrix_vector_product(InMat A,
+                                     Triangle t,
+                                     InVec x,
+                                     OutVec y);
+
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void hermitian_matrix_vector_product(ExecutionPolicy&& exec,
+                                     InMat A,
+                                     Triangle t,
+                                     InVec x,
+                                     OutVec y);
+

5 +These functions perform an overwriting Hermitian matrix-vector product, +taking into account the Triangle parameter that applies to +the Hermitian matrix A [linalg.general].

+

6 +Effects: Computes y = Ax.

+
template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void hermitian_matrix_vector_product(InMat A,
+                                     Triangle t,
+                                     InVec1 x,
+                                     InVec2 y,
+                                     OutVec z);
+
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void hermitian_matrix_vector_product(ExecutionPolicy&& exec,
+                                     InMat A,
+                                     Triangle t,
+                                     InVec1 x,
+                                     InVec2 y,
+                                     OutVec z);
+

7 +These functions perform an updating Hermitian matrix-vector product, +taking into account the Triangle parameter that applies to +the Hermitian matrix A [linalg.general].

+

8 +Effects: Computes z = y + Ax.

+

9 +Remarks: z may alias y.

+

28.9.14.4 Triangular matrix-vector +product [linalg.algs.blas2.trmv]

+

[Note: These functions correspond to the BLAS functions +xTRMV and xTPMV. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas2.trmv].

+

2 +Mandates:

+
    +
  • (2.1) If +InMat has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (2.2) +compatible-static-extents<decltype(A), decltype(A)>(0, 1) +is true;

  • +
  • (2.3) +compatible-static-extents<decltype(A), decltype(y)>(0, 0) +is true;

  • +
  • (2.4) +compatible-static-extents<decltype(A), decltype(x)>(0, 0) +is true for those overloads that take an x +parameter; and

  • +
  • (2.5) +compatible-static-extents<decltype(A), decltype(z)>(0, 0) +is true for those overloads that take a z +parameter.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +A.extent(0) equals A.extent(1),

  • +
  • (3.2) +A.extent(0) equals y.extent(0),

  • +
  • (3.3) +A.extent(0) equals x.extent(0) for those +overloads that take an x parameter, and

  • +
  • (3.4) +A.extent(0) equals z.extent(0) for those +overloads that take a z parameter.

  • +
+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec>
+void triangular_matrix_vector_product(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec x,
+  OutVec y);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec>
+void triangular_matrix_vector_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec x,
+  OutVec y);
+

4 +These functions perform an overwriting triangular matrix-vector product, +taking into account the Triangle and +DiagonalStorage parameters that apply to the triangular +matrix A [linalg.general].

+

5 +Effects: Computes y = Ax.

+

6 +Complexity: O( +x.extent(0) +A.extent(1) )

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec>
+void triangular_matrix_vector_product(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec y);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec>
+void triangular_matrix_vector_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec y);
+

7 +These functions perform an in-place triangular matrix-vector product, +taking into account the Triangle and +DiagonalStorage parameters that apply to the triangular +matrix A [linalg.general].

+

[Note: Performing this operation in place hinders +parallelization. However, other ExecutionPolicy specific +optimizations, such as vectorization, are still possible. – end +note]

+

8 +Effects: Computes a vector y′ such that y′ = Ay, and +assigns each element of y′ to +the corresponding element of y.

+

9 +Complexity: O( +y.extent(0) +A.extent(1) )

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void triangular_matrix_vector_product(InMat A,
+                                      Triangle t,
+                                      DiagonalStorage d,
+                                      InVec1 x,
+                                      InVec2 y,
+                                      OutVec z);
+
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec1,
+         in-vector InVec2,
+         out-vector OutVec>
+void triangular_matrix_vector_product(ExecutionPolicy&& exec,
+                                      InMat A,
+                                      Triangle t,
+                                      DiagonalStorage d,
+                                      InVec1 x,
+                                      InVec2 y,
+                                      OutVec z);
+

10 These +functions perform an updating triangular matrix-vector product, taking +into account the Triangle and DiagonalStorage +parameters that apply to the triangular matrix A +[linalg.general].

+

11 +Effects: Computes z = y + Ax.

+

12 +Remarks: z may alias y.

+

13 +Complexity: O( +x.extent(0) +A.extent(1) )

+

28.9.14.5 Solve a triangular linear +system [linalg.algs.blas2.trsv]

+

[Note: These functions correspond to the BLAS functions +xTRSV and xTPSV. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas2.trsv].

+

2 +Mandates:

+
    +
  • (2.1) If +InMat has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (2.2) +compatible-static-extents<decltype(A), decltype(A)>(0, 1) +is true;

  • +
  • (2.3) +compatible-static-extents<decltype(A), decltype(b)>(0, 0) +is true; and

  • +
  • (2.4) +compatible-static-extents<decltype(A), decltype(x)>(0, 0) +is true for those overloads that take an x +parameter.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +A.extent(0) equals A.extent(1),

  • +
  • (3.2) +A.extent(0) equals b.extent(0), and

  • +
  • (3.3) +A.extent(0) equals x.extent(0) for those +overloads that take an x parameter.

  • +
+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec,
+         class BinaryDivideOp>
+void triangular_matrix_vector_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec b,
+  OutVec x,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec,
+         class BinaryDivideOp>
+void triangular_matrix_vector_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec b,
+  OutVec x,
+  BinaryDivideOp divide);
+

4 +These functions perform a triangular solve, taking into account the +Triangle and DiagonalStorage parameters that +apply to the triangular matrix A [linalg.general].

+

5 +Effects: Computes a vector x′ such that b = Ax′, and +assigns each element of x′ to +the corresponding element of x. If no such x′ exists, then the elements of +x are valid but unspecified.

+

6 +Complexity: O( +A.extent(1) +b.extent(0) )

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec>
+void triangular_matrix_vector_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec b,
+  OutVec x);
+

7 +Effects: Equivalent to

+
triangular_matrix_vector_solve(A, t, d, b, x, divides<void>{});
+
template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         in-vector InVec,
+         out-vector OutVec>
+void triangular_matrix_vector_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InVec b,
+  OutVec x);
+

8 +Effects: Equivalent to

+
triangular_matrix_vector_solve(std::forward<ExecutionPolicy>(exec),
+  A, t, d, b, x, divides<void>{});
+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec,
+         class BinaryDivideOp>
+void triangular_matrix_vector_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec b,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec,
+         class BinaryDivideOp>
+void triangular_matrix_vector_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec b,
+  BinaryDivideOp divide);
+

9 +These functions perform an in-place triangular solve, taking into +account the Triangle and DiagonalStorage +parameters that apply to the triangular matrix A +[linalg.general].

+

[Note: Performing triangular solve in place hinders +parallelization. However, other ExecutionPolicy specific +optimizations, such as vectorization, are still possible. – end +note]

+

10 +Effects: Computes a vector x′ such that b = Ax′, and +assigns each element of x′ to +the corresponding element of b. If no such x′ exists, then the elements of +b are valid but unspecified.

+

11 +Complexity: O( +A.extent(1) +b.extent(0) )

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec>
+void triangular_matrix_vector_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec b);
+

12 +Effects: Equivalent to:

+
triangular_matrix_vector_solve(A, t, d, b, divides<void>{});
+
template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-vector InOutVec>
+void triangular_matrix_vector_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutVec b);
+

13 +Effects: Equivalent to:

+
triangular_matrix_vector_solve(std::forward<ExecutionPolicy>(exec),
+  A, t, d, b, divides<void>{});
+

28.9.14.6 Rank-1 (outer product) +update of a matrix [linalg.algs.blas2.rank1]

+
template<in-vector InVec1,
+         in-vector InVec2,
+         inout-matrix InOutMat>
+void matrix_rank_1_update(
+  InVec1 x,
+  InVec2 y,
+  InOutMat A);
+
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         inout-matrix InOutMat>
+void matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  InVec1 x,
+  InVec2 y,
+  InOutMat A);
+

1 +These functions perform a nonsymmetric nonconjugated rank-1 update.

+

[Note: These functions correspond to the BLAS functions +xGER (for real element types) and xGERU (for +complex element types). – end note]

+

2 +Mandates: +possibly-multipliable<InOutMat, InVec2, InVec1>() +is true.

+

3 +Preconditions: +multipliable(A,y,x) is +true.

+

4 +Effects: Computes a matrix A′ such that A′ = A + xyT, +and assigns each element of A′ +to the corresponding element of A.

+

5 +Complexity: O( +x.extent(0) +y.extent(0) )

+ +
template<in-vector InVec1,
+         in-vector InVec2,
+         inout-matrix InOutMat>
+void matrix_rank_1_update_c(
+  InVec1 x,
+  InVec2 y,
+  InOutMat A);
+
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         inout-matrix InOutMat>
+void matrix_rank_1_update_c(
+  ExecutionPolicy&& exec,
+  InVec1 x,
+  InVec2 y,
+  InOutMat A);
+

6 +These functions perform a nonsymmetric conjugated rank-1 update.

+

[Note: These functions correspond to the BLAS functions +xGER (for real element types) and xGERC (for +complex element types). – end note]

+

7 +Effects:

+
    +
  • (7.1) For +the overloads without an ExecutionPolicy argument, +equivalent to +matrix_rank_1_update(x, conjugated(y), A);;

  • +
  • (7.2) +otherwise, equivalent to +matrix_rank_1_update(std::forward<ExecutionPolicy>(exec), x, conjugated(y), A);.

  • +
+

28.9.14.7 Symmetric or Hermitian +Rank-1 (outer product) update of a matrix +[linalg.algs.blas2.symherrank1]

+

[Note: These functions correspond to the BLAS functions +xSYR, xSPR, xHER, and +xHPR.

+

They have overloads taking a scaling factor alpha, +because it would be impossible to express the update A = A − xxT +otherwise. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas2.symherrank1].

+

2 +Mandates:

+
    +
  • (2.1) If +InOutMat has layout_blas_packed layout, then +the layout’s Triangle template argument has the same type +as the function’s Triangle template argument;

  • +
  • (2.2) +compatible-static-extents<decltype(A), decltype(A)>(0,1) +is true; and

  • +
  • (2.3) +compatible-static-extents<decltype(A), decltype(x)>(0,0) +is true.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +A.extent(0) equals A.extent(1), and

  • +
  • (3.2) +A.extent(0) equals x.extent(0).

  • +
+

4 +Complexity: O( +x.extent(0) +x.extent(0) )

+
template<in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_1_update(
+  InVec x,
+  InOutMat A,
+  Triangle t);
+
+template<class ExecutionPolicy,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+

5 +These functions perform a symmetric rank-1 update of the symmetric +matrix A, taking into account the Triangle +parameter that applies to A [linalg.general].

+

6 +Effects: Computes a matrix A′ such that A′ = A + xxT +and assigns each element of A′ +to the corresponding element of A.

+
template<class Scalar,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_1_update(
+  Scalar alpha,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+
+template<class ExecutionPolicy,
+         class Scalar,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  Scalar alpha,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+

7 +These functions perform a symmetric rank-1 update of the symmetric +matrix A, taking into account the Triangle +parameter that applies to A [linalg.general].

+

8 +Effects: Computes a matrix A′ such that A′ = A + αxxT, +where the scalar α is +alpha, and assigns each element of A′ to the corresponding element of +A.

+
template<in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_1_update(
+  InVec x,
+  InOutMat A,
+  Triangle t);
+
+template<class ExecutionPolicy,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+

9 +These functions perform a Hermitian rank-1 update of the Hermitian +matrix A, taking into account the Triangle +parameter that applies to A [linalg.general].

+

10 +Effects: Computes a matrix A′ such that A′ = A + xxH +and assigns each element of A′ +to the corresponding element of A.

+
template<class Scalar,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_1_update(
+  Scalar alpha,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+
+template<class ExecutionPolicy,
+         class Scalar,
+         in-vector InVec,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_1_update(
+  ExecutionPolicy&& exec,
+  Scalar alpha,
+  InVec x,
+  InOutMat A,
+  Triangle t);
+

11 These +functions perform a Hermitian rank-1 update of the Hermitian matrix +A, taking into account the Triangle parameter +that applies to A [linalg.general].

+

12 +Effects: Computes A′ +such that A′ = A + αxxH, +where the scalar α is +alpha, and assigns each element of A′ to the corresponding element of +A.

+

28.9.14.8 Symmetric and Hermitian +rank-2 matrix updates [linalg.algs.blas2.rank2]

+

[Note: These functions correspond to the BLAS functions +xSYR2,xSPR2, xHER2 and +xHPR2. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas2.rank2].

+

2 +Mandates:

+
    +
  • (2.1) If +InOutMat has layout_blas_packed layout, then +the layout’s Triangle template argument has the same type +as the function’s Triangle template argument;

  • +
  • (2.2) +compatible-static-extents<decltype(A), decltype(A)>(0, 1) +is true; and

  • +
  • (2.3) +possibly-multipliable<decltype(A), decltype(x), decltype(y)>() +is true.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +A.extent(0) equals A.extent(1), and

  • +
  • (3.2) +multipliable(A,x,y) is +true.

  • +
+

4 +Complexity: O( +x.extent(0) +y.extent(0) )

+
template<in-vector InVec1,
+         in-vector InVec2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_2_update(
+  InVec1 x,
+  InVec2 y,
+  InOutMat A,
+  Triangle t);
+
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_2_update(
+  ExecutionPolicy&& exec,
+  InVec1 x,
+  InVec2 y,
+  InOutMat A,
+  Triangle t);
+

5 +These functions perform a symmetric rank-2 update of the symmetric +matrix A, taking into account the Triangle +parameter that applies to A [linalg.general].

+

6 +Effects: Computes A′ +such that A′ = A + xyT + yxT +and assigns each element of A′ +to the corresponding element of A.

+
template<in-vector InVec1,
+         in-vector InVec2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_2_update(
+  InVec1 x,
+  InVec2 y,
+  InOutMat A,
+  Triangle t);
+
+template<class ExecutionPolicy,
+         in-vector InVec1,
+         in-vector InVec2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_2_update(
+  ExecutionPolicy&& exec,
+  InVec1 x,
+  InVec2 y,
+  InOutMat A,
+  Triangle t);
+

7 +These functions perform a Hermitian rank-2 update of the Hermitian +matrix A, taking into account the Triangle +parameter that applies to A [linalg.general].

+

8 +Effects: Computes A +such that A′ = A + xyH + yxH +and assigns each element of A +to the corresponding element of A.

+

28.9.15 BLAS 3 algorithms +[linalg.algs.blas3]

+

28.9.15.1 General matrix-matrix +product [linalg.algs.blas3.gemm]

+

[Note: These functions correspond to the BLAS function +xGEMM. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas3.gemm] in addition to function-specific elements.

+

2 +Mandates: +possibly-multipliable<decltype(A), decltype(B), decltype(C)>() +is true.

+

3 +Preconditions: +multipliable(A, B, C) is +true.

+

4 +Complexity: O( +A.extent(0) +A.extent(1) +B.extent(1) )

+
template<in-matrix InMat1,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void matrix_product(InMat1 A,
+                    InMat2 B,
+                    OutMat C);
+
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void matrix_product(ExecutionPolicy&& exec,
+                    InMat1 A,
+                    InMat2 B,
+                    OutMat C);
+

5 +Effects: Computes C = AB.

+
template<in-matrix InMat1,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void matrix_product(InMat1 A,
+                    InMat2 B,
+                    InMat3 E,
+                    OutMat C);
+
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void matrix_product(ExecutionPolicy&& exec,
+                    InMat1 A,
+                    InMat2 B,
+                    InMat3 E,
+                    OutMat C);
+

6 +Mandates: +possibly-addable<InMat3, InMat3, OutMat>() +is true.

+

7 +Preconditions: +addable(E, E, C) is +true.

+

8 +Effects: Computes C = E + AB.

+

9 +Remarks: C may alias E.

+

28.9.15.2 Symmetric, Hermitian, and +triangular matrix-matrix product [linalg.algs.blas3.xxmm]

+

[Note: These functions correspond to the BLAS functions +xSYMM, xHEMM, and xTRMM. – end +note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas3.xxmm] in addition to function-specific elements.

+

2 +Mandates:

+
    +
  • (2.1) +possibly-multipliable<decltype(A), decltype(B), decltype(C)>() +is true, and

  • +
  • (2.2) +possibly-addable<decltype(E), decltype(E), decltype(C)>() +is true for those overloads that take an E +parameter.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +multipliable(A, B, C) is +true, and

  • +
  • (3.2) +addable(E, E, C) is true +for those overloads that take an E parameter.

  • +
+

4 +Complexity: O( +A.extent(0) +A.extent(1) +B.extent(1) )

+
template<in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat C);
+

5 +These functions perform a matrix-matrix multiply, taking into account +the Triangle and DiagonalStorage (if +applicable) parameters that apply to the symmetric, Hermitian, or +triangular (respectively) matrix A [linalg.general].

+

6 +Mandates:

+
    +
  • (6.1) If +InMat1 has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument; and

  • +
  • (6.2) +compatible-static-extents<InMat1,InMat1>(0,1).

  • +
+

7 +Preconditions: A.extent(0) == A.extent(1) is +true.

+

8 +Effects: Computes C = AB.

+
template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         class DiagonalStorage,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  DiagonalStorage d,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         class DiagonalStorage,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  DiagonalStorage d,
+  OutMat C);
+

9 +These functions perform a matrix-matrix multiply, taking into account +the Triangle and DiagonalStorage (if +applicable) parameters that apply to the symmetric, Hermitian, or +triangular (respectively) matrix B [linalg.general].

+

10 +Mandates:

+
    +
  • (10.1) If +InMat2 has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument; and

  • +
  • (10.2) +compatible-static-extents<InMat2,InMat2>(0,1).

  • +
+

11 +Preconditions: B.extent(0) == B.extent(1) is +true.

+

12 +Effects: Computes C = AB.

+
template<in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  InMat3 E,
+  OutMat C);
+

13 These +functions perform a potentially overwriting matrix-matrix multiply-add, +taking into account the Triangle and +DiagonalStorage (if applicable) parameters that apply to +the symmetric, Hermitian, or triangular (respectively) matrix +A [linalg.general].

+

14 +Mandates:

+
    +
  • (14.1) If +InMat1 has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument; and

  • +
  • (14.2) +compatible-static-extents<InMat1,InMat1>(0,1).

  • +
+

15 +Preconditions: A.extent(0) == A.extent(1) is +true.

+

16 +Effects: Computes C = E + AB.

+

17 +Remarks: C may alias E.

+
template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void symmetric_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void hermitian_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  InMat3 E,
+  OutMat C);
+
+template<in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  DiagonalStorage d,
+  InMat3 E,
+  OutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat3,
+         out-matrix OutMat>
+void triangular_matrix_product(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  Triangle t,
+  DiagonalStorage d,
+  InMat3 E,
+  OutMat C);
+

18 These +functions perform a potentially overwriting matrix-matrix multiply-add, +taking into account the Triangle and +DiagonalStorage (if applicable) parameters that apply to +the symmetric, Hermitian, or triangular (respectively) matrix +B [linalg.general].

+

19 +Mandates:

+
    +
  • (19.1) If +InMat2 has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument; and

  • +
  • (19.2) +compatible-static-extents<InMat2,InMat2>(0,1).

  • +
+

20 +Preconditions: B.extent(0) == B.extent(1) is +true.

+

21 +Effects: Computes C = E + AB.

+

22 +Remarks: C may alias E.

+

28.9.15.3 In-place triangular +matrix-matrix product [linalg.algs.blas3.trmm]

+

1 +These functions perform an in-place matrix-matrix multiply, taking into +account the Triangle and DiagonalStorage +parameters that apply to the triangular matrix A +[linalg.general].

+

[Note: These functions correspond to the BLAS function +xTRMM. – end note]

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_left_product(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_left_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat C);
+

2 +Mandates:

+
    +
  • (2.1) If +InMat has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (2.2) +possibly-multipliable<InMat, InOutMat, InOutMat>() +is true; and

  • +
  • (2.3) +compatible-static-extents<InMat,InMat>(0,1).

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +multipliable(A, C, C) is +true, and

  • +
  • (3.2) +A.extent(0) == A.extent(1) is true.

  • +
+

4 +Effects: Computes a matrix C such that C′ = AC and +assigns each element of C to +the corresponding element of C.

+

5 +Complexity: O( +A.extent(0) +A.extent(1) +C.extent(0) )

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_right_product(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat C);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_right_product(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat C);
+

6 +Mandates:

+
    +
  • (6.1) If +InMat has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (6.2) +possibly-multipliable<InOutMat, InMat, InOutMat>() +is true; and

  • +
  • (6.3) +compatible-static-extents<InMat, InMat>(0,1).

  • +
+

7 +Preconditions:

+
    +
  • (7.1) +multipliable(C, A, C) is +true, and

  • +
  • (7.2) +A.extent(0) == A.extent(1) is true.

  • +
+

8 +Effects: Computes a matrix C such that C′ = CA and +assigns each element of C to +the corresponding element of C.

+

9 +Complexity: O( +A.extent(0) +A.extent(1) +C.extent(0) )

+

28.9.15.4 Rank-k update of a +symmetric or Hermitian matrix [linalg.algs.blas3.rankk]

+

[Note: These functions correspond to the BLAS functions +xSYRK and xHERK. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas3.rankk].

+

2 +Mandates:

+
    +
  • (2.1) If +InOutMat has layout_blas_packed layout, then +the layout’s Triangle template argument has the same type +as the function’s Triangle template argument;

  • +
  • (2.2) +compatible-static-extents<decltype(A), decltype(A)>(0, 1) +is true;

  • +
  • (2.3) +compatible-static-extents<decltype(C), decltype(C)>(0, 1) +is true; and

  • +
  • (2.4) +compatible-static-extents<decltype(A), decltype(C)>(0, 0) +is true.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +A.extent(0) equals A.extent(1),

  • +
  • (3.2) +C.extent(0) equals C.extent(1), and

  • +
  • (3.3) +A.extent(0) equals C.extent(0).

  • +
+

4 +Complexity: O( +A.extent(0) +A.extent(1) +C.extent(0) )

+
template<class Scalar,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_k_update(
+  Scalar alpha,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         class Scalar,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_k_update(
+  ExecutionPolicy&& exec,
+  Scalar alpha,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+

5 +Effects: Computes a matrix C such that C′ = C + αAAT, +where the scalar α is +alpha, and assigns each element of C to the corresponding element of +C.

+
template<in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_k_update(
+  InMat A,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_k_update(
+  ExecutionPolicy&& exec,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+

6 +Effects: Computes a matrix C such that C′ = C + AAT, +and assigns each element of C +to the corresponding element of C.

+
template<class Scalar,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_k_update(
+  Scalar alpha,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         class Scalar,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_k_update(
+  ExecutionPolicy&& exec,
+  Scalar alpha,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+

7 +Effects: Computes a matrix C such that C′ = C + αAAH, +where the scalar α is +alpha, and assigns each element of C to the corresponding element of +C.

+
template<in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_k_update(
+  InMat A,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_k_update(
+  ExecutionPolicy&& exec,
+  InMat A,
+  InOutMat C,
+  Triangle t);
+

8 +Effects: Computes a matrix C such that C′ = C + AAH, +and assigns each element of C +to the corresponding element of C.

+

28.9.15.5 Rank-2k update of a +symmetric or Hermitian matrix [linalg.algs.blas3.rank2k]

+

[Note: These functions correspond to the BLAS functions +xSYR2K and xHER2K. – end note]

+

1 +The following elements apply to all functions in +[linalg.algs.blas3.rank2k].

+

2 +Mandates:

+
    +
  • (2.1) If +InOutMat has layout_blas_packed layout, then +the layout’s Triangle template argument has the same type +as the function’s Triangle template argument;

  • +
  • (2.2) +possibly-addable<decltype(A), decltype(B), decltype(C)>() +is true; and

  • +
  • (2.3) +compatible-static-extents<decltype(A), decltype(A)>(0, 1) +is true.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +addable(A, B, C) is +true, and

  • +
  • (3.2) +A.extent(0) equals A.extent(1).

  • +
+

4 +Complexity: O( +A.extent(0) +A.extent(1) +C.extent(0) )

+
template<in-matrix InMat1,
+         in-matrix InMat2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_2k_update(
+  InMat1 A,
+  InMat2 B,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void symmetric_matrix_rank_2k_update(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  InOutMat C,
+  Triangle t);
+

5 +Effects: Computes a matrix C such that C′ = C + ABT + BAT, +and assigns each element of C +to the corresponding element of C.

+
template<in-matrix InMat1,
+         in-matrix InMat2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_2k_update(
+  InMat1 A,
+  InMat2 B,
+  InOutMat C,
+  Triangle t);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         in-matrix InMat2,
+         possibly-packed-inout-matrix InOutMat,
+         class Triangle>
+void hermitian_matrix_rank_2k_update(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  InMat2 B,
+  InOutMat C,
+  Triangle t);
+

6 +Effects: Computes a matrix C such that C′ = C + ABH + BAH, +and assigns each element of C +to the corresponding element of C.

+

28.9.15.6 Solve multiple triangular +linear systems [linalg.algs.blas3.trsm]

+

[Note: These functions correspond to the BLAS function +xTRSM. + – end +note]

+
template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_left_solve(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_left_solve(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X,
+  BinaryDivideOp divide);
+

1 +These functions perform multiple matrix solves, taking into account the +Triangle and DiagonalStorage parameters that +apply to the triangular matrix A [linalg.general].

+

2 +Mandates:

+
    +
  • (2.1) If +InMat1 has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (2.2) +possibly-multipliable<InMat1, OutMat, InMat2>() +is true; and

  • +
  • (2.3) +compatible-static-extents<InMat1, InMat1>(0,1) +is true.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +multipliable(A,X,B) is +true, and

  • +
  • (3.2) +A.extent(0) == A.extent(1) is true.

  • +
+

4 +Effects: Computes X +such that AX′ = B, and +assigns each element of X to +the corresponding element of X. If no such X exists, then the elements of +X are valid but unspecified.

+

5 +Complexity: O( +A.extent(0) +A.extent(1) +X.extent(1) )

+

[Note: Since the triangular matrix is on the left, the desired +divide implementation in the case of noncommutative +multiplication would be mathematically equivalent to y−1x, where +x is the first argument and +y is the second argument, and +y−1 denotes the +multiplicative inverse of y. +– end note]

+
template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_matrix_left_solve(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X);
+

6 +Effects: Equivalent to:

+
triangular_matrix_matrix_left_solve(A, t, d, B, X, divides<void>{});
+
template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_matrix_left_solve(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X);
+

7 +Effects: Equivalent to:

+
triangular_matrix_matrix_left_solve(std::forward<ExecutionPolicy>(exec),
+  A, t, d, B, X, divides<void>{});
+
template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_right_solve(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_right_solve(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X,
+  BinaryDivideOp divide);
+

8 +These functions perform multiple matrix solves, taking into account the +Triangle and DiagonalStorage parameters that +apply to the triangular matrix A [linalg.general].

+

9 +Mandates:

+
    +
  • (9.1) If +InMat1 has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (9.2) +possibly-multipliable<OutMat, InMat1, InMat2>() +is true; and

  • +
  • (9.3) +compatible-static-extents<InMat1, InMat1>(0,1) +is true.

  • +
+

10 +Preconditions:

+
    +
  • (10.1) +multipliable(X,A,B) is +true, and

  • +
  • (10.2) +A.extent(0) == A.extent(1) is true.

  • +
+

11 +Effects: Computes X +such that XA = B, and +assigns each element of X to +the corresponding element of X. If no such X exists, then the elements of +X are valid but unspecified.

+

12 +Complexity: O( +B.extent(0) +B.extent(1) +A.extent(1) )

+

[Note: Since the triangular matrix is on the right, the +desired divide implementation in the case of noncommutative +multiplication would be mathematically equivalent to xy−1, where +x is the first argument and +y is the second argument, and +y−1 denotes the +multiplicative inverse of y. +– end note]

+
template<in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_matrix_right_solve(
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X);
+

13 +Effects: Equivalent to:

+
triangular_matrix_matrix_right_solve(A, t, d, B, X, divides<void>{});
+
template<class ExecutionPolicy,
+         in-matrix InMat1,
+         class Triangle,
+         class DiagonalStorage,
+         in-matrix InMat2,
+         out-matrix OutMat>
+void triangular_matrix_matrix_right_solve(
+  ExecutionPolicy&& exec,
+  InMat1 A,
+  Triangle t,
+  DiagonalStorage d,
+  InMat2 B,
+  OutMat X);
+

14 +Effects: Equivalent to:

+
triangular_matrix_matrix_right_solve(std::forward<ExecutionPolicy>(exec),
+  A, t, d, B, X, divides<void>{});
+

28.9.15.7 Solve multiple triangular +linear systems in-place [linalg.algs.blas3.inplacetrsm]

+

[Note: These functions correspond to the BLAS function +xTRSM. + – end +note]

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_left_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_left_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B,
+  BinaryDivideOp divide);
+

1 +These functions perform multiple in-place matrix solves, taking into +account the Triangle and DiagonalStorage +parameters that apply to the triangular matrix A +[linalg.general].

+

[Note: This algorithm makes it possible to compute +factorizations like Cholesky and LU in place.

+

Performing triangular solve in place hinders parallelization. +However, other ExecutionPolicy specific optimizations, such +as vectorization, are still possible. – end note]

+

2 +Mandates:

+
    +
  • (2.1) If +InMat has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (2.2) +possibly-multipliable<InMat, InOutMat, InOutMat>() +is true; and

  • +
  • (2.3) +compatible-static-extents<InMat, InMat>(0,1) +is true.

  • +
+

3 +Preconditions:

+
    +
  • (3.1) +multipliable(A,B,B) is +true, and

  • +
  • (3.2) +A.extent(0) == A.extent(1) is true.

  • +
+

4 +Effects: Computes X +such that AX′ = B, and +assigns each element of X to +the corresponding element of B. If no such X exists, then the elements of +B are valid but unspecified.

+

5 +Complexity: O( +A.extent(0) +A.extent(1) +B.extent(1) )

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_matrix_left_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B);
+

6 +Effects: Equivalent to:

+
triangular_matrix_matrix_left_solve(A, t, d, B, divides<void>{});
+
template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_matrix_left_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B);
+

7 +Effects: Equivalent to:

+
triangular_matrix_matrix_left_solve(std::forward<ExecutionPolicy>(exec),
+  A, t, d, B, divides<void>{});
+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_right_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B,
+  BinaryDivideOp divide);
+template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat,
+         class BinaryDivideOp>
+void triangular_matrix_matrix_right_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B,
+  BinaryDivideOp divide);  
+

8 +These functions perform multiple in-place matrix solves, taking into +account the Triangle and DiagonalStorage +parameters that apply to the triangular matrix A +[linalg.general].

+

[Note: This algorithm makes it possible to compute +factorizations like Cholesky and LU in place.

+

Performing triangular solve in place hinders parallelization. +However, other ExecutionPolicy specific optimizations, such +as vectorization, are still possible. – end note]

+

9 +Mandates:

+
    +
  • (9.1) If +InMat has layout_blas_packed layout, then the +layout’s Triangle template argument has the same type as +the function’s Triangle template argument;

  • +
  • (9.2) +possibly-multipliable<InOutMat, InMat, InOutMat>() +is true; and

  • +
  • (9.3) +compatible-static-extents<InMat, InMat>(0,1) +is true.

  • +
+

10 +Preconditions:

+
    +
  • (10.1) +multipliable(B,A,B) is +true, and

  • +
  • (10.2) +A.extent(0) == A.extent(1) is true.

  • +
+

11 +Effects: Computes X +such that XA = B, and +assigns each element of X to +the corresponding element of B. If no such X exists, then the elements of +B are valid but unspecified.

+

12 +Complexity: O( +A.extent(0) +A.extent(1) +B.extent(1) )

+
template<in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_matrix_right_solve(
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B);
+

13 +Effects: Equivalent to:

+
triangular_matrix_matrix_right_solve(A, t, d, B, divides<void>{});
+
template<class ExecutionPolicy,
+         in-matrix InMat,
+         class Triangle,
+         class DiagonalStorage,
+         inout-matrix InOutMat>
+void triangular_matrix_matrix_right_solve(
+  ExecutionPolicy&& exec,
+  InMat A,
+  Triangle t,
+  DiagonalStorage d,
+  InOutMat B);
+

14 +Effects: Equivalent to:

+
triangular_matrix_matrix_right_solve(std::forward<ExecutionPolicy>(exec),
+  A, t, d, B, divides<void>{});
+

29 Examples

+

29.1 Cholesky factorization

+

1 +This example shows how to compute the Cholesky factorization of a real +symmetric positive definite matrix A stored as an mdspan +with a unique non-packed layout. The algorithm imitates +DPOTRF2 in LAPACK 3.9.0. If Triangle is +upper_triangle_t, then it computes the factorization A = UTU +in place, with U stored in the upper triangle of A on output. Otherwise, +it computes the factorization A = LLT +in place, with L stored in the lower triangle of A on output. The +function returns 0 if success, else k+1 if row/column k has a zero or +NaN (not a number) diagonal entry.

+
#include <linalg>
+#include <cmath>
+
+// Flip upper to lower, and lower to upper
+lower_triangle_t opposite_triangle(upper_triangle_t) {
+  return {};
+}
+upper_triangle_t opposite_triangle(lower_triangle_t) {
+  return {};
+}
+
+// Returns nullopt if no bad pivots,
+// else the index of the first bad pivot.
+// A "bad" pivot is zero or NaN.
+template<inout-matrix InOutMat,
+         class Triangle>
+std::optional<typename InOutMat::size_type>
+cholesky_factor(InOutMat A, Triangle t)
+{
+  using std::submdspan;
+  using std::tuple;
+  using value_type = typename InOutMat::value_type;
+  using size_type = typename InOutMat::size_type;
+
+  constexpr value_type ZERO {};
+  constexpr value_type ONE (1.0);
+  const size_type n = A.extent(0);
+
+  if (n == 0) {
+    return std::nullopt;
+  }
+  else if (n == 1) {
+    if (A[0,0] <= ZERO || std::isnan(A[0,0])) {
+      return {size_type(1)};
+    }
+    A[0,0] = std::sqrt(A[0,0]);
+  }
+  else {
+    // Partition A into [A11, A12,
+    //                   A21, A22],
+    // where A21 is the transpose of A12.
+    const size_type n1 = n / 2;
+    const size_type n2 = n - n1;
+    auto A11 = submdspan(A, pair{0, n1}, pair{0, n1});
+    auto A22 = submdspan(A, pair{n1, n}, pair{n1, n});
+
+    // Factor A11
+    const auto info1 = cholesky_factor(A11, t);
+    if (info1.has_value()) {
+      return info1;
+    }
+
+    using std::linalg::explicit_diagonal;
+    using std::linalg::symmetric_matrix_rank_k_update;
+    using std::linalg::transposed;
+    if constexpr (std::is_same_v<Triangle, upper_triangle_t>) {
+      // Update and scale A12
+      auto A12 = submdspan(A, tuple{0, n1}, tuple{n1, n});
+      using std::linalg::triangular_matrix_matrix_left_solve;
+      // BLAS would use original triangle; we need to flip it
+      triangular_matrix_matrix_left_solve(transposed(A11),
+        opposite_triangle(t), explicit_diagonal, A12);
+      // A22 = A22 - A12^T * A12
+      //
+      // The Triangle argument applies to A22,
+      // not transposed(A12), so we don't flip it.
+      symmetric_matrix_rank_k_update(-ONE, transposed(A12),
+                                     A22, t);
+    }
+    else {
+      //
+      // Compute the Cholesky factorization A = L * L^T
+      //
+      // Update and scale A21
+      auto A21 = submdspan(A, tuple{n1, n}, tuple{0, n1});
+      using std::linalg::triangular_matrix_matrix_right_solve;
+      // BLAS would use original triangle; we need to flip it
+      triangular_matrix_matrix_right_solve(transposed(A11),
+        opposite_triangle(t), explicit_diagonal, A21);
+      // A22 = A22 - A21 * A21^T
+      symmetric_matrix_rank_k_update(-ONE, A21, A22, t);
+    }
+
+    // Factor A22
+    const auto info2 = cholesky_factor(A22, t);
+    if (info2.has_value()) {
+      return {info2.value() + n1};
+    }
+  }
+
+  return std::nullopt;
+}
+

29.2 Solve linear system using +Cholesky factorization

+

1 +This example shows how to solve a symmetric positive definite linear +system Ax = b, using the +Cholesky factorization computed in the previous example in-place in the +matrix A. The example assumes that +cholesky_factor(A, t) returned nullopt, +indicating no zero or NaN pivots.

+
template<in-matrix InMat,
+         class Triangle,
+         in-vector InVec,
+         out-vector OutVec>
+void cholesky_solve(
+  InMat A,
+  Triangle t,
+  InVec b,
+  OutVec x)
+{
+  using std::linalg::explicit_diagonal;
+  using std::linalg::transposed;
+  using std::linalg::triangular_matrix_vector_solve;
+
+  if constexpr (std::is_same_v<Triangle, upper_triangle_t>) {
+    // Solve Ax=b where A = U^T U
+    //
+    // Solve U^T c = b, using x to store c.
+    triangular_matrix_vector_solve(transposed(A),
+      opposite_triangle(t), explicit_diagonal, b, x);
+    // Solve U x = c, overwriting x with result.
+    triangular_matrix_vector_solve(A, t, explicit_diagonal, x);
+  }
+  else {
+    // Solve Ax=b where A = L L^T
+    //
+    // Solve L c = b, using x to store c.
+    triangular_matrix_vector_solve(A, t, explicit_diagonal, b, x);
+    // Solve L^T x = c, overwriting x with result.
+    triangular_matrix_vector_solve(transposed(A),
+      opposite_triangle(t), explicit_diagonal, x);
+  }
+}
+

29.3 Compute QR factorization of a +tall skinny matrix

+

1 +This example shows how to compute the QR factorization of a “tall and +skinny” matrix V, using a cache-blocked algorithm based on +rank-k symmetric matrix update and Cholesky factorization. “Tall and +skinny” means that the matrix has many more rows than columns.

+
// Compute QR factorization A = Q R, with A storing Q.
+template<inout-matrix InOutMat,
+         out-matrix OutMat>
+std::optional<typename InOutMat::size_type>
+cholesky_tsqr_one_step(
+  InOutMat A, // A on input, Q on output
+  OutMat R)
+{
+  using std::full_extent;
+  using std::submdspan;
+  using std::tuple;
+  using size_type = typename InOutMat::size_type;
+
+  // One might use cache size, sizeof(element_type), and A.extent(1)
+  // to pick the number of rows per block.  For now, we just pick
+  // some constant.
+  constexpr size_type max_num_rows_per_block = 500;
+
+  using R_element_type = typename OutMat::value_type;
+  constexpr R_element_type ZERO {};
+  for(size_type j = 0; j < R.extent(1); ++j) {
+    for(size_type i = 0; i < R.extent(0); ++i) {
+      R[i,j] = ZERO;
+    }
+  }
+
+  // Cache-blocked version of R = R + A^T * A.
+  const auto num_rows = A.extent(0);
+  auto rest_num_rows = num_rows;
+  auto A_rest = A;
+  while(A_rest.extent(0) > 0) {
+    const size_type num_rows_per_block =
+      std::min(A_rest.extent(0), max_num_rows_per_block);
+    auto A_cur = submdspan(A_rest,
+      tuple{0, num_rows_per_block},
+      full_extent);
+    A_rest = submdspan(A_rest,
+      tuple{num_rows_per_block, A_rest.extent(0)},
+      full_extent);
+    // R = R + A_cur^T * A_cur
+    using std::linalg::symmetric_matrix_rank_k_update;
+    constexpr R_element_type ONE(1.0);
+    // The Triangle argument applies to R,
+    // not transposed(A_cur), so we don't flip it.
+    symmetric_matrix_rank_k_update(ONE,
+      std::linalg::transposed(A_cur),
+      R, upper_triangle);
+  }
+
+  const auto info = cholesky_factor(R, upper_triangle);
+  if(info.has_value()) {
+    return info;
+  }
+  using std::linalg::triangular_matrix_matrix_left_solve;
+  triangular_matrix_matrix_left_solve(R, upper_triangle, explicit_diagonal, A);
+  return std::nullopt;
+}
+
+// Compute QR factorization A = Q R.
+// Use R_tmp as temporary R factor storage
+// for iterative refinement.
+template<in-matrix InMat,
+         out-matrix OutMat1,
+         out-matrix OutMat2,
+         out-matrix OutMat3>
+std::optional<typename OutMat1::size_type>
+cholesky_tsqr(
+  InMat A,
+  OutMat1 Q,
+  OutMat2 R_tmp,
+  OutMat3 R)
+{
+  assert(R.extent(0) == R.extent(1));
+  assert(A.extent(1) == R.extent(0));
+  assert(R_tmp.extent(0) == R_tmp.extent(1));
+  assert(A.extent(0) == Q.extent(0));
+  assert(A.extent(1) == Q.extent(1));
+
+  std::linalg::copy(A, Q);
+  const auto info1 = cholesky_tsqr_one_step(Q, R);
+  if(info1.has_value()) {
+    return info1;
+  }
+  // Use one step of iterative refinement to improve accuracy.
+  const auto info2 = cholesky_tsqr_one_step(Q, R_tmp);
+  if(info2.has_value()) {
+    return info2;
+  }
+  // R = R_tmp * R
+  using std::linalg::triangular_matrix_product;
+  triangular_matrix_product(R_tmp, upper_triangle,
+                            explicit_diagonal, R);
+  return std::nullopt;
+}
+
+
+ +