From 1690355bf27c2cbba685ba0cd70486275c7620b8 Mon Sep 17 00:00:00 2001
From: Adam Getchell <acgetchell@ucdavis.edu>
Date: Mon, 8 Jun 2026 19:58:12 -0700
Subject: [PATCH] perf(solve): optimize exact and factorized solve kernels

- Split LU and LDLT solve paths so tiny matrices keep the direct kernels while larger fixed dimensions avoid extra substitution work.
- Convert dyadic exact solve results directly to finite f64 and preserve UnrepresentableReason recovery semantics on strict conversion failures.
- Modernize release branch commands and keep just recipes sorted.
---
 docs/RELEASING.md |   6 +--
 justfile          | 130 ++++++++++++++++++++++-----------------------
 src/exact.rs      | 132 +++++++++++++++++++++++++++++++++-------------
 src/ldlt.rs       |  71 ++++++++++++++++++++-----
 src/lu.rs         |  79 ++++++++++++++++++++-------
 5 files changed, 281 insertions(+), 137 deletions(-)

diff --git a/docs/RELEASING.md b/docs/RELEASING.md
index 7a6aff0..afd9712 100644
--- a/docs/RELEASING.md
+++ b/docs/RELEASING.md
@@ -32,7 +32,7 @@ git remote -v
 Ensure your local `main` is up to date before beginning:
 
 ```bash
-git checkout main
+git switch main
 git pull --ff-only
 ```
 
@@ -50,7 +50,7 @@ but keep them minimal and release-critical.
 1. Create the release branch
 
 ```bash
-git checkout -b "release/$TAG"
+git switch -c "release/$TAG"
 ```
 
 2. Bump versions
@@ -200,7 +200,7 @@ If you discover issues after generating the changelog:
 1. Sync your local `main` to the merge commit
 
 ```bash
-git checkout main
+git switch main
 git pull --ff-only
 ```
 
diff --git a/justfile b/justfile
index 7bbb61b..4bc416d 100644
--- a/justfile
+++ b/justfile
@@ -165,19 +165,6 @@ action-lint: _ensure-actionlint
 bench:
     cargo bench --features bench
 
-# Compile benchmarks without running them, treating warnings as errors.
-# This catches bench/release-profile-only warnings that won't show up in normal debug-profile runs.
-bench-compile:
-    RUSTFLAGS='-D warnings' cargo bench --no-run --features bench
-    RUSTFLAGS='-D warnings' cargo bench --no-run --features bench,exact --bench exact
-
-# Run the cheaper latest measurements used for latest-vs-last reports.
-bench-latest: bench-vs-linalg-la-stack bench-exact
-
-# Run latest measurements and render the latest-vs-last performance report.
-bench-latest-vs-last baseline="last": bench-latest python-sync
-    uv run bench-compare {{baseline}}
-
 # Compare latest measurements against a saved baseline.
 # Defaults to the `last` full-release baseline.
 bench-compare baseline="last" suite="all" scope="release-signal": python-sync
@@ -186,53 +173,22 @@ bench-compare baseline="last" suite="all" scope="release-signal": python-sync
     baseline="{{baseline}}"
     uv run bench-compare "$baseline" --suite "{{suite}}" --scope "{{scope}}"
 
-# Backward-compatible alias for the GitHub Actions release-asset comparison.
-performance-archive-published current_tag="" baseline_tag="":
-    just performance-github-assets "{{current_tag}}" "{{baseline_tag}}"
-
-# Compare stored GitHub Actions release benchmark assets without local cargo runs.
-performance-github-assets current_tag="" baseline_tag="": python-sync
-    #!/usr/bin/env bash
-    set -euo pipefail
-    current_tag="{{current_tag}}"
-    baseline_tag="{{baseline_tag}}"
-    if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then
-        if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then
-            echo "current_tag and baseline_tag must be provided together" >&2
-            exit 2
-        fi
-        uv run archive-performance "$current_tag" "$baseline_tag" --github-assets --generate-in-temp-worktree --worktree-ref "$current_tag" --output-only --output target/bench-reports/github-assets-performance.md
-    else
-        uv run archive-performance --published-latest --github-assets --generate-in-temp-worktree --output-only --output target/bench-reports/github-assets-performance.md
-    fi
-
-# Compare the current tree against the latest published release locally.
-performance-local: python-sync
-    uv run archive-performance --current-vs-latest --generate-in-temp-worktree --output-only --output target/bench-reports/performance.md
-
-# Generate local release-signal measurements in a temp worktree, then promote/archive docs.
-performance-release current_tag="" baseline_tag="": python-sync
-    #!/usr/bin/env bash
-    set -euo pipefail
-    current_tag="{{current_tag}}"
-    baseline_tag="{{baseline_tag}}"
-    if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then
-        if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then
-            echo "current_tag and baseline_tag must be provided together" >&2
-            exit 2
-        fi
-        uv run archive-performance "$current_tag" "$baseline_tag" --generate-in-temp-worktree --worktree-ref HEAD
-    else
-        uv run archive-performance --infer-release --generate-in-temp-worktree --worktree-ref HEAD
-    fi
+# Compile benchmarks without running them, treating warnings as errors.
+# This catches bench/release-profile-only warnings that won't show up in normal debug-profile runs.
+bench-compile:
+    RUSTFLAGS='-D warnings' cargo bench --no-run --features bench
+    RUSTFLAGS='-D warnings' cargo bench --no-run --features bench,exact --bench exact
 
 # Run the exact-arithmetic benchmark suite.
 bench-exact:
     cargo bench --features bench,exact --bench exact
 
-# Save a full Criterion baseline for the previous release signal.
-bench-save-last:
-    just bench-save-baseline last
+# Run the cheaper latest measurements used for latest-vs-last reports.
+bench-latest: bench-vs-linalg-la-stack bench-exact
+
+# Run latest measurements and render the latest-vs-last performance report.
+bench-latest-vs-last baseline="last": bench-latest python-sync
+    uv run bench-compare {{baseline}}
 
 # Save a Criterion baseline. Defaults to all release-signal benchmark suites.
 bench-save-baseline tag suite="all":
@@ -256,6 +212,10 @@ bench-save-baseline tag suite="all":
             ;;
     esac
 
+# Save a full Criterion baseline for the previous release signal.
+bench-save-last:
+    just bench-save-baseline last
+
 # Bench the la-stack vs nalgebra/faer comparison suite.
 bench-vs-linalg filter="":
     #!/usr/bin/env bash
@@ -340,6 +300,10 @@ check-fast:
 ci: check bench-compile test-all examples
     @echo "🎯 CI checks complete!"
 
+# Validate CITATION.cff against the Citation File Format schema.
+citation-check: _ensure-uv
+    uvx --from cffconvert==2.0.0 cffconvert --validate -i CITATION.cff
+
 # Clean build artifacts
 clean:
     cargo clean
@@ -502,6 +466,46 @@ markdown-fix: _ensure-rumdl
 
 markdown-lint: markdown-check
 
+# Backward-compatible alias for the GitHub Actions release-asset comparison.
+performance-archive-published current_tag="" baseline_tag="":
+    just performance-github-assets "{{current_tag}}" "{{baseline_tag}}"
+
+# Compare stored GitHub Actions release benchmark assets without local cargo runs.
+performance-github-assets current_tag="" baseline_tag="": python-sync
+    #!/usr/bin/env bash
+    set -euo pipefail
+    current_tag="{{current_tag}}"
+    baseline_tag="{{baseline_tag}}"
+    if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then
+        if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then
+            echo "current_tag and baseline_tag must be provided together" >&2
+            exit 2
+        fi
+        uv run archive-performance "$current_tag" "$baseline_tag" --github-assets --generate-in-temp-worktree --worktree-ref "$current_tag" --output-only --output target/bench-reports/github-assets-performance.md
+    else
+        uv run archive-performance --published-latest --github-assets --generate-in-temp-worktree --output-only --output target/bench-reports/github-assets-performance.md
+    fi
+
+# Compare the current tree against the latest published release locally.
+performance-local: python-sync
+    uv run archive-performance --current-vs-latest --generate-in-temp-worktree --output-only --output target/bench-reports/performance.md
+
+# Generate local release-signal measurements in a temp worktree, then promote/archive docs.
+performance-release current_tag="" baseline_tag="": python-sync
+    #!/usr/bin/env bash
+    set -euo pipefail
+    current_tag="{{current_tag}}"
+    baseline_tag="{{baseline_tag}}"
+    if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then
+        if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then
+            echo "current_tag and baseline_tag must be provided together" >&2
+            exit 2
+        fi
+        uv run archive-performance "$current_tag" "$baseline_tag" --generate-in-temp-worktree --worktree-ref HEAD
+    else
+        uv run archive-performance --infer-release --generate-in-temp-worktree --worktree-ref HEAD
+    fi
+
 # Plot: generate a single time-vs-dimension SVG from Criterion results.
 plot-vs-linalg metric="lu_solve" stat="median" sample="new" log_y="false": python-sync
     #!/usr/bin/env bash
@@ -690,6 +694,8 @@ shell-check:
         echo "No shell files found to check."
     fi
 
+shell-fix: shell-fmt
+
 shell-fmt: _ensure-shfmt
     #!/usr/bin/env bash
     set -euo pipefail
@@ -706,8 +712,6 @@ shell-fmt: _ensure-shfmt
 
 shell-lint: shell-check
 
-shell-fix: shell-fmt
-
 # Spell check (typos)
 spell-check: _ensure-typos
     #!/usr/bin/env bash
@@ -777,6 +781,10 @@ test-python: python-sync
     uv run pytest -q
 
 # TOML
+toml-check: toml-fmt-check toml-lint
+
+toml-fix: toml-fmt
+
 toml-fmt: _ensure-taplo
     #!/usr/bin/env bash
     set -euo pipefail
@@ -816,10 +824,6 @@ toml-lint: _ensure-taplo
         echo "No TOML files found to lint."
     fi
 
-toml-check: toml-fmt-check toml-lint
-
-toml-fix: toml-fmt
-
 # File validation
 validate-json: _ensure-jq
     #!/usr/bin/env bash
@@ -863,10 +867,6 @@ yaml-fix: _ensure-dprint
 
 yaml-lint: yaml-check
 
-# Validate CITATION.cff against the Citation File Format schema.
-citation-check: _ensure-uv
-    uvx --from cffconvert==2.0.0 cffconvert --validate -i CITATION.cff
-
 # GitHub Actions security analysis
 zizmor: _ensure-zizmor
     zizmor .github
diff --git a/src/exact.rs b/src/exact.rs
index f2aecdb..173f26a 100644
--- a/src/exact.rs
+++ b/src/exact.rs
@@ -172,42 +172,59 @@ fn bigint_exp_to_bigrational(mut value: BigInt, mut exp: i32) -> BigRational {
     }
 }
 
-/// Convert a finite `f64` back to the exact rational value it represents.
-fn finite_f64_to_bigrational(value: f64) -> Result<BigRational, LaError> {
-    let Some((mantissa, exponent, is_negative)) = f64_decompose(value)? else {
-        return Ok(BigRational::from_integer(BigInt::from(0)));
-    };
-
-    let value = BigInt::from(mantissa.get());
-    let value = if is_negative { -value } else { value };
-    Ok(bigint_exp_to_bigrational(value, exponent))
-}
-
 /// Convert an exact rational result to `f64` only when the conversion is exact.
+///
+/// This supports the strict `*_exact_f64` public APIs by accepting only dyadic
+/// rational values that fit in finite binary64. The optional `index` is attached
+/// to [`LaError::Unrepresentable`] for vector-valued solve components.
+///
+/// # Errors
+/// Returns [`LaError::Unrepresentable`] with
+/// [`UnrepresentableReason::RequiresRounding`] when the rational denominator is
+/// not a power of two and the rounded value would still be finite.
 fn exact_rational_to_finite_f64(exact: &BigRational, index: Option<usize>) -> Result<f64, LaError> {
-    let Some(value) = exact.to_f64() else {
+    let numerator = exact.numer();
+    if numerator.sign() == Sign::NoSign {
+        return Ok(0.0);
+    }
+
+    let denominator = exact.denom();
+    let Some(denominator_exp) = denominator.trailing_zeros() else {
         cold_path();
         return Err(LaError::unrepresentable(
             index,
-            UnrepresentableReason::NotFinite,
+            rounded_rational_unrepresentable_reason(exact),
         ));
     };
-    if !value.is_finite() {
+
+    if denominator.bits().checked_sub(1) != Some(denominator_exp) {
         cold_path();
         return Err(LaError::unrepresentable(
             index,
-            UnrepresentableReason::NotFinite,
+            rounded_rational_unrepresentable_reason(exact),
         ));
     }
 
-    if finite_f64_to_bigrational(value)? == *exact {
-        Ok(value)
-    } else {
+    let Ok(denominator_exp) = i32::try_from(denominator_exp) else {
         cold_path();
-        Err(LaError::unrepresentable(
+        return Err(LaError::unrepresentable(
             index,
-            UnrepresentableReason::RequiresRounding,
-        ))
+            rounded_rational_unrepresentable_reason(exact),
+        ));
+    };
+
+    bigint_exp_to_finite_f64(numerator.clone(), -denominator_exp, index)
+}
+
+/// Classify a failed exact-rational-to-`f64` conversion by the rounded result.
+///
+/// Strict exact conversion has already failed when this helper is called. It
+/// preserves the [`UnrepresentableReason`] recovery contract: callers may retry
+/// with a rounded API only when that rounded result would still be finite.
+fn rounded_rational_unrepresentable_reason(exact: &BigRational) -> UnrepresentableReason {
+    match exact.to_f64() {
+        Some(value) if value.is_finite() => UnrepresentableReason::RequiresRounding,
+        _ => UnrepresentableReason::NotFinite,
     }
 }
 
@@ -234,9 +251,26 @@ fn exact_rational_to_rounded_f64(
     }
 }
 
-/// Convert a `BigInt × 2^exp` determinant pair to an exactly represented finite
-/// `f64` without allocating a `BigRational`.
-fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result<f64, LaError> {
+/// Convert a `BigInt × 2^exp` pair to an exactly represented finite `f64`.
+///
+/// This avoids allocating a [`BigRational`] when determinant and solve paths
+/// already have an integer significand plus binary exponent. The optional
+/// `index` is forwarded to [`LaError::Unrepresentable`] for vector-valued solve
+/// components; determinant callers pass `None`.
+///
+/// # Errors
+/// Returns [`LaError::Unrepresentable`] with
+/// [`UnrepresentableReason::RequiresRounding`] when the exact nonzero value
+/// would need rounding or underflows below the smallest positive subnormal.
+///
+/// Returns [`LaError::Unrepresentable`] with
+/// [`UnrepresentableReason::NotFinite`] when the exact value cannot be
+/// represented by any finite `f64`.
+fn bigint_exp_to_finite_f64(
+    mut value: BigInt,
+    exp: i32,
+    index: Option<usize>,
+) -> Result<f64, LaError> {
     if value == BigInt::from(0) {
         return Ok(0.0);
     }
@@ -252,14 +286,14 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result<f64, LaError>
         let Ok(tz) = i64::try_from(tz) else {
             cold_path();
             return Err(LaError::unrepresentable(
-                None,
+                index,
                 UnrepresentableReason::NotFinite,
             ));
         };
         let Some(updated_exp) = exp.checked_add(tz) else {
             cold_path();
             return Err(LaError::unrepresentable(
-                None,
+                index,
                 UnrepresentableReason::NotFinite,
             ));
         };
@@ -270,35 +304,35 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result<f64, LaError>
     let Ok(bit_len) = i64::try_from(bit_len) else {
         cold_path();
         return Err(LaError::unrepresentable(
-            None,
+            index,
             UnrepresentableReason::NotFinite,
         ));
     };
     let Some(top_bit_exp) = exp.checked_add(bit_len - 1) else {
         cold_path();
         return Err(LaError::unrepresentable(
-            None,
+            index,
             UnrepresentableReason::NotFinite,
         ));
     };
     if exp < F64_MIN_BINARY_EXPONENT {
         cold_path();
         return Err(LaError::unrepresentable(
-            None,
+            index,
             UnrepresentableReason::RequiresRounding,
         ));
     }
     if top_bit_exp > F64_MAX_BINARY_EXPONENT {
         cold_path();
         return Err(LaError::unrepresentable(
-            None,
+            index,
             UnrepresentableReason::NotFinite,
         ));
     }
     if bit_len > F64_SIGNIFICAND_BITS {
         cold_path();
         return Err(LaError::unrepresentable(
-            None,
+            index,
             UnrepresentableReason::RequiresRounding,
         ));
     }
@@ -306,7 +340,7 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result<f64, LaError>
     let Some(mantissa) = value.to_u64() else {
         cold_path();
         return Err(LaError::unrepresentable(
-            None,
+            index,
             UnrepresentableReason::NotFinite,
         ));
     };
@@ -316,7 +350,7 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result<f64, LaError>
         let Ok(shift) = u32::try_from(exp - F64_MIN_BINARY_EXPONENT) else {
             cold_path();
             return Err(LaError::unrepresentable(
-                None,
+                index,
                 UnrepresentableReason::RequiresRounding,
             ));
         };
@@ -325,14 +359,14 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result<f64, LaError>
         let Ok(biased_exp) = u64::try_from(top_bit_exp + F64_EXPONENT_BIAS) else {
             cold_path();
             return Err(LaError::unrepresentable(
-                None,
+                index,
                 UnrepresentableReason::NotFinite,
             ));
         };
         let Ok(shift) = u32::try_from(F64_FRACTION_BITS - (bit_len - 1)) else {
             cold_path();
             return Err(LaError::unrepresentable(
-                None,
+                index,
                 UnrepresentableReason::RequiresRounding,
             ));
         };
@@ -794,7 +828,7 @@ fn det_exact_finite<const D: usize>(m: &Matrix<D>) -> Result<BigRational, LaErro
 #[inline]
 fn det_exact_f64_finite<const D: usize>(m: &Matrix<D>) -> Result<f64, LaError> {
     let (det_int, total_exp) = bareiss_det_int_finite(m)?;
-    bigint_exp_to_finite_f64(det_int, total_exp)
+    bigint_exp_to_finite_f64(det_int, total_exp, None)
 }
 
 /// Exact determinant rounded to finite `f64`.
@@ -2931,6 +2965,20 @@ mod tests {
         );
     }
 
+    #[test]
+    fn solve_exact_f64_huge_non_dyadic_component_returns_not_finite() {
+        let a = Matrix::<1>::try_from_rows([[3.0 * f64::MIN_POSITIVE]]).unwrap();
+        let b = Vector::<1>::new([f64::MAX]);
+
+        assert_eq!(
+            a.solve_exact_f64(b),
+            Err(LaError::Unrepresentable {
+                index: Some(0),
+                reason: UnrepresentableReason::NotFinite,
+            })
+        );
+    }
+
     #[test]
     fn solve_exact_rounded_f64_overflow_returns_err() {
         let big = f64::MAX / 2.0;
@@ -2961,6 +3009,18 @@ mod tests {
         );
     }
 
+    #[test]
+    fn solve_exact_f64_accepts_smallest_subnormal_result() {
+        let tiny = f64::from_bits(1);
+        let a = Matrix::<1>::identity();
+        let b = Vector::<1>::new([tiny]);
+
+        assert_eq!(
+            a.solve_exact_f64(b).unwrap().into_array()[0].to_bits(),
+            tiny.to_bits()
+        );
+    }
+
     #[test]
     fn solve_exact_f64_rejects_non_dyadic_component() {
         let a = Matrix::<1>::try_from_rows([[3.0]]).unwrap();
diff --git a/src/ldlt.rs b/src/ldlt.rs
index 55458c9..4f49a49 100644
--- a/src/ldlt.rs
+++ b/src/ldlt.rs
@@ -268,22 +268,46 @@ impl<const D: usize> Ldlt<D> {
             i += 1;
         }
 
-        // Back substitution: Lᵀ x = z.
-        let mut ii = 0;
-        while ii < D {
-            let i = D - 1 - ii;
-            let mut sum = x[i];
-            let mut j = i + 1;
-            while j < D {
-                sum = (-self.factors.entry(j, i)).mul_add(x[j], sum);
-                j += 1;
+        if D <= 4 {
+            // Tiny matrices benchmark better with the direct textbook dot
+            // product for each row of Lᵀ.
+            let mut ii = 0;
+            while ii < D {
+                let i = D - 1 - ii;
+                let mut sum = x[i];
+                let mut j = i + 1;
+                while j < D {
+                    sum = (-self.factors.entry(j, i)).mul_add(x[j], sum);
+                    j += 1;
+                }
+                if !sum.is_finite() {
+                    cold_path();
+                    return Err(LaError::non_finite_at(i));
+                }
+                x[i] = sum;
+                ii += 1;
             }
-            if !sum.is_finite() {
-                cold_path();
-                return Err(LaError::non_finite_at(i));
+        } else {
+            // Larger fixed dimensions benchmark better by walking finalized
+            // rows downward and scattering contributions into the remaining
+            // contiguous lower-triangular row prefix.
+            let mut jj = D;
+            while jj > 0 {
+                jj -= 1;
+
+                let x_j = x[jj];
+                if !x_j.is_finite() {
+                    cold_path();
+                    return Err(LaError::non_finite_at(jj));
+                }
+
+                let row = self.factors.row(jj);
+                let mut i = 0;
+                while i < jj {
+                    x[i] = (-row[i]).mul_add(x_j, x[i]);
+                    i += 1;
+                }
             }
-            x[i] = sum;
-            ii += 1;
         }
 
         Ok(Vector::new_unchecked(x))
@@ -586,6 +610,25 @@ mod tests {
         assert_eq!(err, LaError::NonFinite { row: None, col: 1 });
     }
 
+    #[test]
+    fn nonfinite_solve_back_substitution_overflow_scatter_branch_5d() {
+        // Exercises the D >= 5 row-prefix scatter branch with the same
+        // bottom-right 2x2 SPD block used by the D3 back-substitution test.
+        let a = Matrix::<5>::try_from_rows([
+            [1.0, 0.0, 0.0, 0.0, 0.0],
+            [0.0, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 2.0],
+            [0.0, 0.0, 0.0, 2.0, 5.0],
+        ])
+        .unwrap();
+        let ldlt = a.ldlt(DEFAULT_SINGULAR_TOL).unwrap();
+
+        let b = Vector::<5>::new([0.0, 0.0, 0.0, 0.0, 1e308]);
+        let err = ldlt.solve(b).unwrap_err();
+        assert_eq!(err, LaError::NonFinite { row: None, col: 3 });
+    }
+
     #[test]
     fn nonfinite_solve_diagonal_solve_overflow() {
         // Diagonal SPD matrix with a tiny diagonal entry just above the
diff --git a/src/lu.rs b/src/lu.rs
index 11c1976..0300a52 100644
--- a/src/lu.rs
+++ b/src/lu.rs
@@ -162,29 +162,51 @@ impl<const D: usize> Lu<D> {
     #[inline]
     pub(crate) const fn solve_finite(&self, b: Vector<D>) -> Result<Vector<D>, LaError> {
         let mut x = [0.0; D];
-        let mut i = 0;
         let b = b.as_array();
-        while i < D {
-            x[i] = b[self.piv[i]];
-            i += 1;
-        }
-
-        // Forward substitution for L (unit diagonal).
         let mut i = 0;
-        while i < D {
-            let mut sum = x[i];
-            let row = self.factors.row(i);
-            let mut j = 0;
-            while j < i {
-                sum = (-row[j]).mul_add(x[j], sum);
-                j += 1;
+
+        if D <= 4 {
+            while i < D {
+                x[i] = b[self.piv[i]];
+                i += 1;
             }
-            if !sum.is_finite() {
-                cold_path();
-                return Err(LaError::non_finite_at(i));
+
+            // Tiny matrices benchmark better when pivoted RHS materialization
+            // stays separate from forward substitution.
+            i = 0;
+            while i < D {
+                let mut sum = x[i];
+                let row = self.factors.row(i);
+                let mut j = 0;
+                while j < i {
+                    sum = (-row[j]).mul_add(x[j], sum);
+                    j += 1;
+                }
+                if !sum.is_finite() {
+                    cold_path();
+                    return Err(LaError::non_finite_at(i));
+                }
+                x[i] = sum;
+                i += 1;
+            }
+        } else {
+            // Larger fixed dimensions avoid an extra pass by reading the
+            // pivoted right-hand side directly into forward substitution.
+            while i < D {
+                let mut sum = b[self.piv[i]];
+                let row = self.factors.row(i);
+                let mut j = 0;
+                while j < i {
+                    sum = (-row[j]).mul_add(x[j], sum);
+                    j += 1;
+                }
+                if !sum.is_finite() {
+                    cold_path();
+                    return Err(LaError::non_finite_at(i));
+                }
+                x[i] = sum;
+                i += 1;
             }
-            x[i] = sum;
-            i += 1;
         }
 
         // Back substitution for U.
@@ -546,6 +568,25 @@ mod tests {
         assert_eq!(err, LaError::NonFinite { row: None, col: 1 });
     }
 
+    #[test]
+    fn solve_nonfinite_forward_substitution_overflow_fused_branch_5d() {
+        // Exercises the D >= 5 fused pivot/forward-substitution branch with the
+        // same overflowing L multiplier as the D3 test.
+        let a = Matrix::<5>::try_from_rows([
+            [1.0, 0.0, 0.0, 0.0, 0.0],
+            [-1.0, 1.0, 0.0, 0.0, 0.0],
+            [0.0, 0.0, 1.0, 0.0, 0.0],
+            [0.0, 0.0, 0.0, 1.0, 0.0],
+            [0.0, 0.0, 0.0, 0.0, 1.0],
+        ])
+        .unwrap();
+        let lu = a.lu(DEFAULT_SINGULAR_TOL).unwrap();
+
+        let b = Vector::<5>::new([1.0e308, 1.0e308, 0.0, 0.0, 0.0]);
+        let err = lu.solve(b).unwrap_err();
+        assert_eq!(err, LaError::NonFinite { row: None, col: 1 });
+    }
+
     #[test]
     fn solve_nonfinite_back_substitution_overflow() {
         // Make x[1] overflow during back substitution, then ensure it is detected on the next row.