From a453a6a23ab1b6ceaca561bd88110df9940b221a Mon Sep 17 00:00:00 2001
From: dovgopoly <69435717+dovgopoly@users.noreply.github.com>
Date: Wed, 11 Dec 2024 13:39:17 +0200
Subject: [PATCH] Feat/affine (#123)

* init affine

* fixed & opt

* 2.7.14

* rm debug

* opt & fixed warning

* fix comments

* removed unused functions

* added quad

* opt 8.1kk

* fixed comment & rm debug

* remove unused functions

* rollback version

---------

Co-authored-by: Artem Chystiakov <artem.ch31@gmail.com>
---
 contracts/libs/crypto/ECDSA384.sol | 467 ++++++++++++-----------------
 1 file changed, 190 insertions(+), 277 deletions(-)

diff --git a/contracts/libs/crypto/ECDSA384.sol b/contracts/libs/crypto/ECDSA384.sol
index 940718ff..7fa6e9fb 100644
--- a/contracts/libs/crypto/ECDSA384.sol
+++ b/contracts/libs/crypto/ECDSA384.sol
@@ -7,9 +7,12 @@ import {MemoryUtils} from "../utils/MemoryUtils.sol";
  * @notice Cryptography module
  *
  * This library provides functionality for ECDSA verification over any 384-bit curve. Currently,
- * this is the most efficient implementation out there, consuming ~9 million gas per call.
+ * this is the most efficient implementation out there, consuming ~8.1 million gas per call.
  *
- * The approach is Strauss-Shamir double scalar multiplication with 4 bits of precompute + projective points.
+ * The approach is Strauss-Shamir double scalar multiplication with 4 bits of precompute + affine coordinates.
+ * For reference, a naive implementation uses ~400 billion gas, which is roughly 48000 times more expensive.
+ *
+ * We also tried using projective coordinates; however, the gas consumption rose to ~9 million.
  */
 library ECDSA384 {
     using MemoryUtils for *;
@@ -117,7 +120,7 @@ library ECDSA384 {
                 uint256 three = U384.init(3);
 
                 /// We use 4-bit masks where the first 2 bits refer to `scalar1` and the last 2 bits refer to `scalar2`.
-                uint256[3][16] memory points_ = _precomputePointsTable(
+                uint256[2][16] memory points_ = _precomputePointsTable(
                     call,
                     params_.p,
                     three,
@@ -128,7 +131,7 @@ library ECDSA384 {
                     inputs_.y
                 );
 
-                (scalar1, , scalar2) = _doubleScalarMultiplication(
+                (scalar1, ) = _doubleScalarMultiplication(
                     call,
                     params_.p,
                     three,
@@ -139,7 +142,7 @@ library ECDSA384 {
                 );
             }
 
-            return U384.eq(U384.moddiv(call, scalar1, scalar2, params_.p), inputs_.r);
+            return U384.eq(scalar1, inputs_.r);
         }
     }
 
@@ -182,10 +185,10 @@ library ECDSA384 {
         uint256 p,
         uint256 three,
         uint256 a,
-        uint256[3][16] memory points,
+        uint256[2][16] memory points,
         uint256 scalar1,
         uint256 scalar2
-    ) private view returns (uint256 x, uint256 y, uint256 z) {
+    ) private view returns (uint256 x, uint256 y) {
         unchecked {
             uint256 mask_;
             uint256 scalar1Bits_;
@@ -196,33 +199,15 @@ library ECDSA384 {
                 scalar2Bits_ := mload(scalar2)
             }
 
-            x = U384.init(0);
-            y = U384.init(0);
-            z = U384.init(1);
-
             for (uint256 word = 2; word <= 184; word += 2) {
-                (x, y, z) = _twiceProj(call, p, three, a, x, y, z);
-                (x, y, z) = _twiceProj(call, p, three, a, x, y, z);
+                (x, y) = _qaudAffine(call, p, three, a, x, y);
 
                 mask_ =
                     (((scalar1Bits_ >> (184 - word)) & 0x03) << 2) |
                     ((scalar2Bits_ >> (184 - word)) & 0x03);
 
                 if (mask_ != 0) {
-                    uint256[3] memory maskedPoints_ = points[mask_];
-
-                    (x, y, z) = _addProj(
-                        call,
-                        p,
-                        three,
-                        a,
-                        maskedPoints_[0],
-                        maskedPoints_[1],
-                        maskedPoints_[2],
-                        x,
-                        y,
-                        z
-                    );
+                    (x, y) = _addAffine(call, p, points[mask_][0], points[mask_][1], x, y);
                 }
             }
 
@@ -232,177 +217,150 @@ library ECDSA384 {
             }
 
             for (uint256 word = 2; word <= 256; word += 2) {
-                (x, y, z) = _twiceProj(call, p, three, a, x, y, z);
-                (x, y, z) = _twiceProj(call, p, three, a, x, y, z);
+                (x, y) = _qaudAffine(call, p, three, a, x, y);
 
                 mask_ =
                     (((scalar1Bits_ >> (256 - word)) & 0x03) << 2) |
                     ((scalar2Bits_ >> (256 - word)) & 0x03);
 
                 if (mask_ != 0) {
-                    uint256[3] memory maskedPoints_ = points[mask_];
-
-                    (x, y, z) = _addProj(
-                        call,
-                        p,
-                        three,
-                        a,
-                        maskedPoints_[0],
-                        maskedPoints_[1],
-                        maskedPoints_[2],
-                        x,
-                        y,
-                        z
-                    );
+                    (x, y) = _addAffine(call, p, points[mask_][0], points[mask_][1], x, y);
                 }
             }
-
-            return (x, y, z);
         }
     }
 
     /**
-     * @dev Double an elliptic curve point in projective coordinates. See
-     * https://www.nayuki.io/page/elliptic-curve-point-addition-in-projective-coordinates
+     * @dev Double an elliptic curve point in affine coordinates. A zero `x1` handle or `y1 == 0` yields the point at infinity, encoded as `(0, 0)`.
      */
-    function _twiceProj(
+    function _twiceAffine(
         uint256 call,
         uint256 p,
         uint256 three,
         uint256 a,
-        uint256 x0,
-        uint256 y0,
-        uint256 z0
-    ) private view returns (uint256 x1, uint256 y1, uint256 z1) {
+        uint256 x1,
+        uint256 y1
+    ) private view returns (uint256 x2, uint256 y2) {
         unchecked {
-            if (U384.eqInteger(x0, 0) && U384.eqInteger(y0, 0)) {
-                return (U384.init(0), U384.init(0), U384.init(1)); // zero proj
+            if (x1 == 0) {
+                return (0, 0);
             }
 
-            uint256 u = U384.modmul(call, y0, z0);
-            U384.modshl1Assign(u, p);
-
-            x1 = U384.modmul(call, u, x0);
-            U384.modmulAssign(call, x1, y0);
-            U384.modshl1Assign(x1, p);
-
-            x0 = U384.modexp(call, x0, 2);
-
-            y1 = U384.modmul(call, x0, three);
-
-            z0 = U384.modexp(call, z0, 2);
-            U384.modmulAssign(call, z0, a);
-            U384.modaddAssign(y1, z0, p);
-
-            z1 = U384.modexp(call, y1, 2);
-            U384.modshl1AssignTo(x0, x1, p);
-
-            uint256 diff = U384.sub(p, x0);
-            U384.modaddAssign(z1, diff, p);
-
-            U384.subAssignTo(diff, p, z1);
-            U384.modaddAssignTo(x0, x1, diff, p);
-            U384.modmulAssign(call, x0, y1);
+            if (U384.eqInteger(y1, 0)) {
+                return (0, 0);
+            }
 
-            y0 = U384.modmul(call, y0, u);
-            U384.modexpAssign(call, y0, 2);
-            U384.modshl1Assign(y0, p);
+            uint256 m1 = U384.modexp(call, x1, 2);
+            U384.modmulAssign(call, m1, three);
+            U384.modaddAssign(m1, a, p);
 
-            U384.subAssignTo(diff, p, y0);
-            U384.modaddAssignTo(y1, x0, diff, p);
+            uint256 m2 = U384.modshl1(y1, p);
+            U384.moddivAssign(call, m1, m2);
 
-            U384.modmulAssignTo(call, x1, u, z1);
+            x2 = U384.modexp(call, m1, 2);
+            U384.modsubAssign(x2, x1, p);
+            U384.modsubAssign(x2, x1, p);
 
-            U384.modexpAssignTo(call, z1, u, 2);
-            U384.modmulAssign(call, z1, u);
+            y2 = U384.modsub(x1, x2, p);
+            U384.modmulAssign(call, y2, m1);
+            U384.modsubAssign(y2, y1, p);
         }
     }
 
     /**
-     * @dev Add two elliptic curve points in projective coordinates. See
-     * https://www.nayuki.io/page/elliptic-curve-point-addition-in-projective-coordinates
+     * @dev Quadruple an elliptic curve point in affine coordinates, i.e. double it twice (computes 4P).
      */
-    function _addProj(
+    function _qaudAffine(
         uint256 call,
         uint256 p,
         uint256 three,
         uint256 a,
-        uint256 x0,
-        uint256 y0,
-        uint256 z0,
         uint256 x1,
-        uint256 y1,
-        uint256 z1
-    ) private view returns (uint256 x2, uint256 y2, uint256 z2) {
+        uint256 y1
+    ) private view returns (uint256 x2, uint256 y2) {
         unchecked {
-            if (U384.eqInteger(x0, 0) && U384.eqInteger(y0, 0)) {
-                return (x1.copy(), y1.copy(), z1.copy());
-            } else if (U384.eqInteger(x1, 0) && U384.eqInteger(y1, 0)) {
-                return (x0.copy(), y0.copy(), z0.copy());
+            if (x1 == 0) {
+                return (0, 0);
             }
 
-            x2 = U384.modmul(call, y0, z1);
-            y2 = U384.modmul(call, y1, z0);
-            z2 = U384.modmul(call, x0, z1);
-            y1 = U384.modmul(call, x1, z0);
+            if (U384.eqInteger(y1, 0)) {
+                return (0, 0);
+            }
 
-            if (U384.eq(z2, y1)) {
-                if (U384.eq(x2, y2)) {
-                    return _twiceProj(call, p, three, a, x0, y0, z0);
-                } else {
-                    return (U384.init(0), U384.init(0), U384.init(1)); // zero proj
-                }
+            uint256 m1 = U384.modexp(call, x1, 2);
+            U384.modmulAssign(call, m1, three);
+            U384.modaddAssign(m1, a, p);
+
+            uint256 m2 = U384.modshl1(y1, p);
+            U384.moddivAssign(call, m1, m2);
+
+            x2 = U384.modexp(call, m1, 2);
+            U384.modsubAssign(x2, x1, p);
+            U384.modsubAssign(x2, x1, p);
+
+            y2 = U384.modsub(x1, x2, p);
+            U384.modmulAssign(call, y2, m1);
+            U384.modsubAssign(y2, y1, p);
+
+            if (U384.eqInteger(y2, 0)) {
+                return (0, 0);
             }
 
-            a = U384.modmul(call, z0, z1);
+            U384.modexpAssignTo(call, m1, x2, 2);
+            U384.modmulAssign(call, m1, three);
+            U384.modaddAssign(m1, a, p);
+
+            U384.modshl1AssignTo(m2, y2, p);
+            U384.moddivAssign(call, m1, m2);
+
+            U384.modexpAssignTo(call, x1, m1, 2);
+            U384.modsubAssign(x1, x2, p);
+            U384.modsubAssign(x1, x2, p);
+
+            U384.modsubAssignTo(y1, x2, x1, p);
+            U384.modmulAssign(call, y1, m1);
+            U384.modsubAssign(y1, y2, p);
 
-            return _addProj2(call, a, z2, p, y1, y2, x2);
+            return (x1, y1);
         }
     }
 
     /**
-     * @dev Helper function that splits addProj to avoid too many local variables.
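+     * @dev Add two elliptic curve points in affine coordinates. NOTE(review): any pair with equal x-coordinates, including P == Q, returns the point at infinity `(0, 0)`; doubling is not handled here.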
      */
-    function _addProj2(
+    function _addAffine(
         uint256 call,
-        uint256 v,
-        uint256 u0,
         uint256 p,
-        uint256 u1,
-        uint256 t1,
-        uint256 t0
-    ) private view returns (uint256 x2, uint256 y2, uint256 z2) {
+        uint256 x1,
+        uint256 y1,
+        uint256 x2,
+        uint256 y2
+    ) private view returns (uint256 x3, uint256 y3) {
         unchecked {
-            uint256 diff = U384.sub(p, t1);
-            y2 = U384.modadd(t0, diff, p);
-
-            U384.subAssignTo(diff, p, u1);
-            x2 = U384.modadd(u0, diff, p);
-            uint256 u2 = U384.modexp(call, x2, 2);
-
-            z2 = U384.modexp(call, y2, 2);
-
-            U384.modmulAssign(call, z2, v);
-            u1 = U384.modadd(u1, u0, p);
-            U384.modmulAssign(call, u1, u2);
-            U384.subAssignTo(diff, p, u1);
-            U384.modaddAssign(z2, diff, p);
+            if (x1 == 0 || x2 == 0) {
+                if (x1 == 0 && x2 == 0) {
+                    return (0, 0);
+                }
 
-            uint256 u3 = U384.modmul(call, u2, x2);
+                return x1 == 0 ? (x2.copy(), y2.copy()) : (x1.copy(), y1.copy());
+            }
 
-            U384.modmulAssign(call, x2, z2);
+            if (U384.eq(x1, x2)) {
+                return (0, 0);
+            }
 
-            u0 = U384.modmul(call, u0, u2);
+            uint256 m1 = U384.modsub(y1, y2, p);
+            uint256 m2 = U384.modsub(x1, x2, p);
 
-            U384.subAssignTo(diff, p, z2);
-            U384.modaddAssign(u0, diff, p);
-            U384.modmulAssign(call, y2, u0);
-            t0 = U384.modmul(call, t0, u3);
+            U384.moddivAssign(call, m1, m2);
 
-            U384.subAssignTo(diff, p, t0);
-            U384.modaddAssign(y2, diff, p);
+            x3 = U384.modexp(call, m1, 2);
+            U384.modsubAssign(x3, x1, p);
+            U384.modsubAssign(x3, x2, p);
 
-            U384.modmulAssignTo(call, z2, u3, v);
+            y3 = U384.modsub(x1, x3, p);
+            U384.modmulAssign(call, y3, m1);
+            U384.modsubAssign(y3, y1, p);
         }
     }
 
@@ -415,181 +373,127 @@ library ECDSA384 {
         uint256 gy,
         uint256 hx,
         uint256 hy
-    ) private view returns (uint256[3][16] memory points_) {
+    ) private view returns (uint256[2][16] memory points_) {
         /// 0b0100: 1G + 0H
-        (points_[0x04][0], points_[0x04][1], points_[0x04][2]) = (
-            gx.copy(),
-            gy.copy(),
-            U384.init(1)
-        );
+        (points_[0x04][0], points_[0x04][1]) = (gx.copy(), gy.copy());
         /// 0b1000: 2G + 0H
-        (points_[0x08][0], points_[0x08][1], points_[0x08][2]) = _twiceProj(
+        (points_[0x08][0], points_[0x08][1]) = _twiceAffine(
             call,
             p,
             three,
             a,
             points_[0x04][0],
-            points_[0x04][1],
-            points_[0x04][2]
+            points_[0x04][1]
         );
         /// 0b1100: 3G + 0H
-        (points_[0x0C][0], points_[0x0C][1], points_[0x0C][2]) = _addProj(
+        (points_[0x0C][0], points_[0x0C][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x04][0],
             points_[0x04][1],
-            points_[0x04][2],
             points_[0x08][0],
-            points_[0x08][1],
-            points_[0x08][2]
+            points_[0x08][1]
         );
         /// 0b0001: 0G + 1H
-        (points_[0x01][0], points_[0x01][1], points_[0x01][2]) = (
-            hx.copy(),
-            hy.copy(),
-            U384.init(1)
-        );
+        (points_[0x01][0], points_[0x01][1]) = (hx.copy(), hy.copy());
         /// 0b0010: 0G + 2H
-        (points_[0x02][0], points_[0x02][1], points_[0x02][2]) = _twiceProj(
+        (points_[0x02][0], points_[0x02][1]) = _twiceAffine(
             call,
             p,
             three,
             a,
             points_[0x01][0],
-            points_[0x01][1],
-            points_[0x01][2]
+            points_[0x01][1]
         );
         /// 0b0011: 0G + 3H
-        (points_[0x03][0], points_[0x03][1], points_[0x03][2]) = _addProj(
+        (points_[0x03][0], points_[0x03][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x01][0],
             points_[0x01][1],
-            points_[0x01][2],
             points_[0x02][0],
-            points_[0x02][1],
-            points_[0x02][2]
+            points_[0x02][1]
         );
         /// 0b0101: 1G + 1H
-        (points_[0x05][0], points_[0x05][1], points_[0x05][2]) = _addProj(
+        (points_[0x05][0], points_[0x05][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x04][0],
             points_[0x04][1],
-            points_[0x04][2],
             points_[0x01][0],
-            points_[0x01][1],
-            points_[0x01][2]
+            points_[0x01][1]
         );
         /// 0b0110: 1G + 2H
-        (points_[0x06][0], points_[0x06][1], points_[0x06][2]) = _addProj(
+        (points_[0x06][0], points_[0x06][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x04][0],
             points_[0x04][1],
-            points_[0x04][2],
             points_[0x02][0],
-            points_[0x02][1],
-            points_[0x02][2]
+            points_[0x02][1]
         );
         /// 0b0111: 1G + 3H
-        (points_[0x07][0], points_[0x07][1], points_[0x07][2]) = _addProj(
+        (points_[0x07][0], points_[0x07][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x04][0],
             points_[0x04][1],
-            points_[0x04][2],
             points_[0x03][0],
-            points_[0x03][1],
-            points_[0x03][2]
+            points_[0x03][1]
         );
         /// 0b1001: 2G + 1H
-        (points_[0x09][0], points_[0x09][1], points_[0x09][2]) = _addProj(
+        (points_[0x09][0], points_[0x09][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x08][0],
             points_[0x08][1],
-            points_[0x08][2],
             points_[0x01][0],
-            points_[0x01][1],
-            points_[0x01][2]
+            points_[0x01][1]
         );
         /// 0b1010: 2G + 2H
-        (points_[0x0A][0], points_[0x0A][1], points_[0x0A][2]) = _addProj(
+        (points_[0x0A][0], points_[0x0A][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x08][0],
             points_[0x08][1],
-            points_[0x08][2],
             points_[0x02][0],
-            points_[0x02][1],
-            points_[0x02][2]
+            points_[0x02][1]
         );
         /// 0b1011: 2G + 3H
-        (points_[0x0B][0], points_[0x0B][1], points_[0x0B][2]) = _addProj(
+        (points_[0x0B][0], points_[0x0B][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x08][0],
             points_[0x08][1],
-            points_[0x08][2],
             points_[0x03][0],
-            points_[0x03][1],
-            points_[0x03][2]
+            points_[0x03][1]
         );
         /// 0b1101: 3G + 1H
-        (points_[0x0D][0], points_[0x0D][1], points_[0x0D][2]) = _addProj(
+        (points_[0x0D][0], points_[0x0D][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x0C][0],
             points_[0x0C][1],
-            points_[0x0C][2],
             points_[0x01][0],
-            points_[0x01][1],
-            points_[0x01][2]
+            points_[0x01][1]
         );
         /// 0b1110: 3G + 2H
-        (points_[0x0E][0], points_[0x0E][1], points_[0x0E][2]) = _addProj(
+        (points_[0x0E][0], points_[0x0E][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x0C][0],
             points_[0x0C][1],
-            points_[0x0C][2],
             points_[0x02][0],
-            points_[0x02][1],
-            points_[0x02][2]
+            points_[0x02][1]
         );
         /// 0b1111: 3G + 3H
-        (points_[0x0F][0], points_[0x0F][1], points_[0x0F][2]) = _addProj(
+        (points_[0x0F][0], points_[0x0F][1]) = _addAffine(
             call,
             p,
-            three,
-            a,
             points_[0x0C][0],
             points_[0x0C][1],
-            points_[0x0C][2],
             points_[0x03][0],
-            points_[0x03][1],
-            points_[0x03][2]
+            points_[0x03][1]
         );
     }
 }
@@ -601,12 +505,12 @@ library ECDSA384 {
  */
 library U384 {
     uint256 private constant SHORT_ALLOCATION = 64;
-    uint256 private constant LONG_ALLOCATION = 96;
 
-    uint256 private constant CALL_ALLOCATION = 3 * 288;
+    uint256 private constant CALL_ALLOCATION = 4 * 288;
 
     uint256 private constant MUL_OFFSET = 288;
     uint256 private constant EXP_OFFSET = 2 * 288;
+    uint256 private constant INV_OFFSET = 3 * 288;
 
     function init(uint256 from_) internal pure returns (uint256 handler_) {
         unchecked {
@@ -664,6 +568,8 @@ library U384 {
         unchecked {
             handler_ = _allocate(CALL_ALLOCATION);
 
+            _sub(m_, init(2), handler_ + INV_OFFSET + 0xA0);
+
             assembly {
                 let call_ := add(handler_, MUL_OFFSET)
 
@@ -681,6 +587,14 @@ library U384 {
                 mstore(add(0x40, call_), 0x40)
                 mstore(add(0xC0, call_), mload(m_))
                 mstore(add(0xE0, call_), mload(add(m_, 0x20)))
+
+                call_ := add(handler_, INV_OFFSET)
+
+                mstore(call_, 0x40)
+                mstore(add(0x20, call_), 0x40)
+                mstore(add(0x40, call_), 0x40)
+                mstore(add(0xE0, call_), mload(m_))
+                mstore(add(0x0100, call_), mload(add(m_, 0x20)))
             }
         }
     }
@@ -765,18 +679,6 @@ library U384 {
         }
     }
 
-    function modexpAssign(uint256 call_, uint256 b_, uint256 eInteger_) internal view {
-        assembly {
-            call_ := add(call_, EXP_OFFSET)
-
-            mstore(add(0x60, call_), mload(b_))
-            mstore(add(0x80, call_), mload(add(b_, 0x20)))
-            mstore(add(0xA0, call_), eInteger_)
-
-            pop(staticcall(gas(), 0x5, call_, 0x0100, b_, 0x40))
-        }
-    }
-
     function modexpAssignTo(
         uint256 call_,
         uint256 to_,
@@ -818,16 +720,6 @@ library U384 {
         }
     }
 
-    function modaddAssignTo(uint256 to_, uint256 a_, uint256 b_, uint256 m_) internal pure {
-        unchecked {
-            _add(a_, b_, to_);
-
-            if (cmp(to_, m_) >= 0) {
-                return _subFrom(to_, m_);
-            }
-        }
-    }
-
     function modmul(uint256 call_, uint256 a_, uint256 b_) internal view returns (uint256 r_) {
         unchecked {
             r_ = _allocate(SHORT_ALLOCATION);
@@ -856,41 +748,55 @@ library U384 {
         }
     }
 
-    function modmulAssignTo(uint256 call_, uint256 to_, uint256 a_, uint256 b_) internal view {
+    function modsub(uint256 a_, uint256 b_, uint256 m_) internal pure returns (uint256 r_) {
         unchecked {
-            _mul(a_, b_, call_ + MUL_OFFSET + 0x60);
-
-            assembly {
-                call_ := add(call_, MUL_OFFSET)
+            r_ = _allocate(SHORT_ALLOCATION);
 
-                pop(staticcall(gas(), 0x5, call_, 0x0120, to_, 0x40))
+            if (cmp(a_, b_) >= 0) {
+                _sub(a_, b_, r_);
+                return r_;
             }
+
+            _add(a_, m_, r_);
+            _subFrom(r_, b_);
         }
     }
 
-    function sub(uint256 a_, uint256 b_) internal pure returns (uint256 r_) {
+    function modsubAssign(uint256 a_, uint256 b_, uint256 m_) internal pure {
         unchecked {
-            r_ = _allocate(SHORT_ALLOCATION);
-
-            _sub(a_, b_, r_);
+            if (cmp(a_, b_) >= 0) {
+                _subFrom(a_, b_);
+                return;
+            }
 
-            return r_;
+            _addTo(a_, m_);
+            _subFrom(a_, b_);
         }
     }
 
-    function subAssignTo(uint256 to_, uint256 a_, uint256 b_) internal pure {
+    function modsubAssignTo(uint256 to_, uint256 a_, uint256 b_, uint256 m_) internal pure {
         unchecked {
-            _sub(a_, b_, to_);
+            if (cmp(a_, b_) >= 0) {
+                _sub(a_, b_, to_);
+                return;
+            }
+
+            _add(a_, m_, to_);
+            _subFrom(to_, b_);
         }
     }
 
-    function modshl1Assign(uint256 a_, uint256 m_) internal pure {
+    function modshl1(uint256 a_, uint256 m_) internal pure returns (uint256 r_) {
         unchecked {
-            _shl1To(a_);
+            r_ = _allocate(SHORT_ALLOCATION);
 
-            if (cmp(a_, m_) >= 0) {
-                _subFrom(a_, m_);
+            _shl1(a_, r_);
+
+            if (cmp(r_, m_) >= 0) {
+                _subFrom(r_, m_);
             }
+
+            return r_;
         }
     }
 
@@ -904,6 +810,22 @@ library U384 {
         }
     }
 
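+    /// @dev Overwrites `b_` with its modular inverse (`b_^(m - 2) mod m` via the modexp precompile; presumably requires a prime modulus) and then `a_` with `a_ / b_ mod m`.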
+    function moddivAssign(uint256 call_, uint256 a_, uint256 b_) internal view {
+        unchecked {
+            assembly {
+                call_ := add(call_, INV_OFFSET)
+
+                mstore(add(0x60, call_), mload(b_))
+                mstore(add(0x80, call_), mload(add(b_, 0x20)))
+
+                pop(staticcall(gas(), 0x5, call_, 0x0120, b_, 0x40))
+            }
+
+            modmulAssign(call_ - INV_OFFSET, a_, b_);
+        }
+    }
+
     function moddiv(
         uint256 call_,
         uint256 a_,
@@ -957,15 +879,6 @@ library U384 {
         }
     }
 
-    function _shl1To(uint256 a_) internal pure {
-        assembly {
-            let a1_ := mload(add(a_, 0x20))
-
-            mstore(a_, or(shl(1, mload(a_)), shr(255, a1_)))
-            mstore(add(a_, 0x20), shl(1, a1_))
-        }
-    }
-
     function _add(uint256 a_, uint256 b_, uint256 r_) private pure {
         assembly {
             let aWord_ := mload(add(a_, 0x20))
@@ -1022,7 +935,7 @@ library U384 {
         }
     }
 
-    function _mul(uint256 a_, uint256 b_, uint256 r_) private view {
+    function _mul(uint256 a_, uint256 b_, uint256 r_) private pure {
         assembly {
             let a0_ := mload(a_)
             let a1_ := shr(128, mload(add(a_, 0x20)))