From 1690355bf27c2cbba685ba0cd70486275c7620b8 Mon Sep 17 00:00:00 2001 From: Adam Getchell Date: Mon, 8 Jun 2026 19:58:12 -0700 Subject: [PATCH] perf(solve): optimize exact and factorized solve kernels - Split LU and LDLT solve paths so tiny matrices keep the direct kernels while larger fixed dimensions avoid extra substitution work. - Convert dyadic exact solve results directly to finite f64 and preserve UnrepresentableReason recovery semantics on strict conversion failures. - Modernize release branch commands and keep just recipes sorted. --- docs/RELEASING.md | 6 +-- justfile | 130 ++++++++++++++++++++++----------------------- src/exact.rs | 132 +++++++++++++++++++++++++++++++++------------- src/ldlt.rs | 71 ++++++++++++++++++++----- src/lu.rs | 79 ++++++++++++++++++++------- 5 files changed, 281 insertions(+), 137 deletions(-) diff --git a/docs/RELEASING.md b/docs/RELEASING.md index 7a6aff0..afd9712 100644 --- a/docs/RELEASING.md +++ b/docs/RELEASING.md @@ -32,7 +32,7 @@ git remote -v Ensure your local `main` is up to date before beginning: ```bash -git checkout main +git switch main git pull --ff-only ``` @@ -50,7 +50,7 @@ but keep them minimal and release-critical. 1. Create the release branch ```bash -git checkout -b "release/$TAG" +git switch -c "release/$TAG" ``` 2. Bump versions @@ -200,7 +200,7 @@ If you discover issues after generating the changelog: 1. Sync your local `main` to the merge commit ```bash -git checkout main +git switch main git pull --ff-only ``` diff --git a/justfile b/justfile index 7bbb61b..4bc416d 100644 --- a/justfile +++ b/justfile @@ -165,19 +165,6 @@ action-lint: _ensure-actionlint bench: cargo bench --features bench -# Compile benchmarks without running them, treating warnings as errors. -# This catches bench/release-profile-only warnings that won't show up in normal debug-profile runs. -bench-compile: - RUSTFLAGS='-D warnings' cargo bench --no-run --features bench - RUSTFLAGS='-D warnings' cargo bench --no-run --features bench,exact --bench exact - -# Run the cheaper latest measurements used for latest-vs-last reports. -bench-latest: bench-vs-linalg-la-stack bench-exact - -# Run latest measurements and render the latest-vs-last performance report. -bench-latest-vs-last baseline="last": bench-latest python-sync - uv run bench-compare {{baseline}} - # Compare latest measurements against a saved baseline. # Defaults to the `last` full-release baseline. bench-compare baseline="last" suite="all" scope="release-signal": python-sync @@ -186,53 +173,22 @@ bench-compare baseline="last" suite="all" scope="release-signal": python-sync baseline="{{baseline}}" uv run bench-compare "$baseline" --suite "{{suite}}" --scope "{{scope}}" -# Backward-compatible alias for the GitHub Actions release-asset comparison. -performance-archive-published current_tag="" baseline_tag="": - just performance-github-assets "{{current_tag}}" "{{baseline_tag}}" - -# Compare stored GitHub Actions release benchmark assets without local cargo runs. -performance-github-assets current_tag="" baseline_tag="": python-sync - #!/usr/bin/env bash - set -euo pipefail - current_tag="{{current_tag}}" - baseline_tag="{{baseline_tag}}" - if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then - if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then - echo "current_tag and baseline_tag must be provided together" >&2 - exit 2 - fi - uv run archive-performance "$current_tag" "$baseline_tag" --github-assets --generate-in-temp-worktree --worktree-ref "$current_tag" --output-only --output target/bench-reports/github-assets-performance.md - else - uv run archive-performance --published-latest --github-assets --generate-in-temp-worktree --output-only --output target/bench-reports/github-assets-performance.md - fi - -# Compare the current tree against the latest published release locally. -performance-local: python-sync - uv run archive-performance --current-vs-latest --generate-in-temp-worktree --output-only --output target/bench-reports/performance.md - -# Generate local release-signal measurements in a temp worktree, then promote/archive docs. -performance-release current_tag="" baseline_tag="": python-sync - #!/usr/bin/env bash - set -euo pipefail - current_tag="{{current_tag}}" - baseline_tag="{{baseline_tag}}" - if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then - if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then - echo "current_tag and baseline_tag must be provided together" >&2 - exit 2 - fi - uv run archive-performance "$current_tag" "$baseline_tag" --generate-in-temp-worktree --worktree-ref HEAD - else - uv run archive-performance --infer-release --generate-in-temp-worktree --worktree-ref HEAD - fi +# Compile benchmarks without running them, treating warnings as errors. +# This catches bench/release-profile-only warnings that won't show up in normal debug-profile runs. +bench-compile: + RUSTFLAGS='-D warnings' cargo bench --no-run --features bench + RUSTFLAGS='-D warnings' cargo bench --no-run --features bench,exact --bench exact # Run the exact-arithmetic benchmark suite. bench-exact: cargo bench --features bench,exact --bench exact -# Save a full Criterion baseline for the previous release signal. -bench-save-last: - just bench-save-baseline last +# Run the cheaper latest measurements used for latest-vs-last reports. +bench-latest: bench-vs-linalg-la-stack bench-exact + +# Run latest measurements and render the latest-vs-last performance report. +bench-latest-vs-last baseline="last": bench-latest python-sync + uv run bench-compare {{baseline}} # Save a Criterion baseline. Defaults to all release-signal benchmark suites. bench-save-baseline tag suite="all": @@ -256,6 +212,10 @@ bench-save-baseline tag suite="all": ;; esac +# Save a full Criterion baseline for the previous release signal. +bench-save-last: + just bench-save-baseline last + # Bench the la-stack vs nalgebra/faer comparison suite. bench-vs-linalg filter="": #!/usr/bin/env bash @@ -340,6 +300,10 @@ check-fast: ci: check bench-compile test-all examples @echo "🎯 CI checks complete!" +# Validate CITATION.cff against the Citation File Format schema. +citation-check: _ensure-uv + uvx --from cffconvert==2.0.0 cffconvert --validate -i CITATION.cff + # Clean build artifacts clean: cargo clean @@ -502,6 +466,46 @@ markdown-fix: _ensure-rumdl markdown-lint: markdown-check +# Backward-compatible alias for the GitHub Actions release-asset comparison. +performance-archive-published current_tag="" baseline_tag="": + just performance-github-assets "{{current_tag}}" "{{baseline_tag}}" + +# Compare stored GitHub Actions release benchmark assets without local cargo runs. +performance-github-assets current_tag="" baseline_tag="": python-sync + #!/usr/bin/env bash + set -euo pipefail + current_tag="{{current_tag}}" + baseline_tag="{{baseline_tag}}" + if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then + if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then + echo "current_tag and baseline_tag must be provided together" >&2 + exit 2 + fi + uv run archive-performance "$current_tag" "$baseline_tag" --github-assets --generate-in-temp-worktree --worktree-ref "$current_tag" --output-only --output target/bench-reports/github-assets-performance.md + else + uv run archive-performance --published-latest --github-assets --generate-in-temp-worktree --output-only --output target/bench-reports/github-assets-performance.md + fi + +# Compare the current tree against the latest published release locally. +performance-local: python-sync + uv run archive-performance --current-vs-latest --generate-in-temp-worktree --output-only --output target/bench-reports/performance.md + +# Generate local release-signal measurements in a temp worktree, then promote/archive docs. +performance-release current_tag="" baseline_tag="": python-sync + #!/usr/bin/env bash + set -euo pipefail + current_tag="{{current_tag}}" + baseline_tag="{{baseline_tag}}" + if [[ -n "$current_tag" || -n "$baseline_tag" ]]; then + if [[ -z "$current_tag" || -z "$baseline_tag" ]]; then + echo "current_tag and baseline_tag must be provided together" >&2 + exit 2 + fi + uv run archive-performance "$current_tag" "$baseline_tag" --generate-in-temp-worktree --worktree-ref HEAD + else + uv run archive-performance --infer-release --generate-in-temp-worktree --worktree-ref HEAD + fi + # Plot: generate a single time-vs-dimension SVG from Criterion results. plot-vs-linalg metric="lu_solve" stat="median" sample="new" log_y="false": python-sync #!/usr/bin/env bash @@ -690,6 +694,8 @@ shell-check: echo "No shell files found to check." fi +shell-fix: shell-fmt + shell-fmt: _ensure-shfmt #!/usr/bin/env bash set -euo pipefail @@ -706,8 +712,6 @@ shell-fmt: _ensure-shfmt shell-lint: shell-check -shell-fix: shell-fmt - # Spell check (typos) spell-check: _ensure-typos #!/usr/bin/env bash @@ -777,6 +781,10 @@ test-python: python-sync uv run pytest -q # TOML +toml-check: toml-fmt-check toml-lint + +toml-fix: toml-fmt + toml-fmt: _ensure-taplo #!/usr/bin/env bash set -euo pipefail @@ -816,10 +824,6 @@ toml-lint: _ensure-taplo echo "No TOML files found to lint." fi -toml-check: toml-fmt-check toml-lint - -toml-fix: toml-fmt - # File validation validate-json: _ensure-jq #!/usr/bin/env bash @@ -863,10 +867,6 @@ yaml-fix: _ensure-dprint yaml-lint: yaml-check -# Validate CITATION.cff against the Citation File Format schema. -citation-check: _ensure-uv - uvx --from cffconvert==2.0.0 cffconvert --validate -i CITATION.cff - # GitHub Actions security analysis zizmor: _ensure-zizmor zizmor .github diff --git a/src/exact.rs b/src/exact.rs index f2aecdb..173f26a 100644 --- a/src/exact.rs +++ b/src/exact.rs @@ -172,42 +172,59 @@ fn bigint_exp_to_bigrational(mut value: BigInt, mut exp: i32) -> BigRational { } } -/// Convert a finite `f64` back to the exact rational value it represents. -fn finite_f64_to_bigrational(value: f64) -> Result { - let Some((mantissa, exponent, is_negative)) = f64_decompose(value)? else { - return Ok(BigRational::from_integer(BigInt::from(0))); - }; - - let value = BigInt::from(mantissa.get()); - let value = if is_negative { -value } else { value }; - Ok(bigint_exp_to_bigrational(value, exponent)) -} - /// Convert an exact rational result to `f64` only when the conversion is exact. +/// +/// This supports the strict `*_exact_f64` public APIs by accepting only dyadic +/// rational values that fit in finite binary64. The optional `index` is attached +/// to [`LaError::Unrepresentable`] for vector-valued solve components. +/// +/// # Errors +/// Returns [`LaError::Unrepresentable`] with +/// [`UnrepresentableReason::RequiresRounding`] when the rational denominator is +/// not a power of two and the rounded value would still be finite. fn exact_rational_to_finite_f64(exact: &BigRational, index: Option) -> Result { - let Some(value) = exact.to_f64() else { + let numerator = exact.numer(); + if numerator.sign() == Sign::NoSign { + return Ok(0.0); + } + + let denominator = exact.denom(); + let Some(denominator_exp) = denominator.trailing_zeros() else { cold_path(); return Err(LaError::unrepresentable( index, - UnrepresentableReason::NotFinite, + rounded_rational_unrepresentable_reason(exact), )); }; - if !value.is_finite() { + + if denominator.bits().checked_sub(1) != Some(denominator_exp) { cold_path(); return Err(LaError::unrepresentable( index, - UnrepresentableReason::NotFinite, + rounded_rational_unrepresentable_reason(exact), )); } - if finite_f64_to_bigrational(value)? == *exact { - Ok(value) - } else { + let Ok(denominator_exp) = i32::try_from(denominator_exp) else { cold_path(); - Err(LaError::unrepresentable( + return Err(LaError::unrepresentable( index, - UnrepresentableReason::RequiresRounding, - )) + rounded_rational_unrepresentable_reason(exact), + )); + }; + + bigint_exp_to_finite_f64(numerator.clone(), -denominator_exp, index) +} + +/// Classify a failed exact-rational-to-`f64` conversion by the rounded result. +/// +/// Strict exact conversion has already failed when this helper is called. It +/// preserves the [`UnrepresentableReason`] recovery contract: callers may retry +/// with a rounded API only when that rounded result would still be finite. +fn rounded_rational_unrepresentable_reason(exact: &BigRational) -> UnrepresentableReason { + match exact.to_f64() { + Some(value) if value.is_finite() => UnrepresentableReason::RequiresRounding, + _ => UnrepresentableReason::NotFinite, } } @@ -234,9 +251,26 @@ fn exact_rational_to_rounded_f64( } } -/// Convert a `BigInt × 2^exp` determinant pair to an exactly represented finite -/// `f64` without allocating a `BigRational`. -fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result { +/// Convert a `BigInt × 2^exp` pair to an exactly represented finite `f64`. +/// +/// This avoids allocating a [`BigRational`] when determinant and solve paths +/// already have an integer significand plus binary exponent. The optional +/// `index` is forwarded to [`LaError::Unrepresentable`] for vector-valued solve +/// components; determinant callers pass `None`. +/// +/// # Errors +/// Returns [`LaError::Unrepresentable`] with +/// [`UnrepresentableReason::RequiresRounding`] when the exact nonzero value +/// would need rounding or underflows below the smallest positive subnormal. +/// +/// Returns [`LaError::Unrepresentable`] with +/// [`UnrepresentableReason::NotFinite`] when the exact value cannot be +/// represented by any finite `f64`. +fn bigint_exp_to_finite_f64( + mut value: BigInt, + exp: i32, + index: Option, +) -> Result { if value == BigInt::from(0) { return Ok(0.0); } @@ -252,14 +286,14 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result let Ok(tz) = i64::try_from(tz) else { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::NotFinite, )); }; let Some(updated_exp) = exp.checked_add(tz) else { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::NotFinite, )); }; @@ -270,35 +304,35 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result let Ok(bit_len) = i64::try_from(bit_len) else { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::NotFinite, )); }; let Some(top_bit_exp) = exp.checked_add(bit_len - 1) else { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::NotFinite, )); }; if exp < F64_MIN_BINARY_EXPONENT { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::RequiresRounding, )); } if top_bit_exp > F64_MAX_BINARY_EXPONENT { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::NotFinite, )); } if bit_len > F64_SIGNIFICAND_BITS { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::RequiresRounding, )); } @@ -306,7 +340,7 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result let Some(mantissa) = value.to_u64() else { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::NotFinite, )); }; @@ -316,7 +350,7 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result let Ok(shift) = u32::try_from(exp - F64_MIN_BINARY_EXPONENT) else { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::RequiresRounding, )); }; @@ -325,14 +359,14 @@ fn bigint_exp_to_finite_f64(mut value: BigInt, exp: i32) -> Result let Ok(biased_exp) = u64::try_from(top_bit_exp + F64_EXPONENT_BIAS) else { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::NotFinite, )); }; let Ok(shift) = u32::try_from(F64_FRACTION_BITS - (bit_len - 1)) else { cold_path(); return Err(LaError::unrepresentable( - None, + index, UnrepresentableReason::RequiresRounding, )); }; @@ -794,7 +828,7 @@ fn det_exact_finite(m: &Matrix) -> Result(m: &Matrix) -> Result { let (det_int, total_exp) = bareiss_det_int_finite(m)?; - bigint_exp_to_finite_f64(det_int, total_exp) + bigint_exp_to_finite_f64(det_int, total_exp, None) } /// Exact determinant rounded to finite `f64`. @@ -2931,6 +2965,20 @@ mod tests { ); } + #[test] + fn solve_exact_f64_huge_non_dyadic_component_returns_not_finite() { + let a = Matrix::<1>::try_from_rows([[3.0 * f64::MIN_POSITIVE]]).unwrap(); + let b = Vector::<1>::new([f64::MAX]); + + assert_eq!( + a.solve_exact_f64(b), + Err(LaError::Unrepresentable { + index: Some(0), + reason: UnrepresentableReason::NotFinite, + }) + ); + } + #[test] fn solve_exact_rounded_f64_overflow_returns_err() { let big = f64::MAX / 2.0; @@ -2961,6 +3009,18 @@ mod tests { ); } + #[test] + fn solve_exact_f64_accepts_smallest_subnormal_result() { + let tiny = f64::from_bits(1); + let a = Matrix::<1>::identity(); + let b = Vector::<1>::new([tiny]); + + assert_eq!( + a.solve_exact_f64(b).unwrap().into_array()[0].to_bits(), + tiny.to_bits() + ); + } + #[test] fn solve_exact_f64_rejects_non_dyadic_component() { let a = Matrix::<1>::try_from_rows([[3.0]]).unwrap(); diff --git a/src/ldlt.rs b/src/ldlt.rs index 55458c9..4f49a49 100644 --- a/src/ldlt.rs +++ b/src/ldlt.rs @@ -268,22 +268,46 @@ impl Ldlt { i += 1; } - // Back substitution: Lᵀ x = z. - let mut ii = 0; - while ii < D { - let i = D - 1 - ii; - let mut sum = x[i]; - let mut j = i + 1; - while j < D { - sum = (-self.factors.entry(j, i)).mul_add(x[j], sum); - j += 1; + if D <= 4 { + // Tiny matrices benchmark better with the direct textbook dot + // product for each row of Lᵀ. + let mut ii = 0; + while ii < D { + let i = D - 1 - ii; + let mut sum = x[i]; + let mut j = i + 1; + while j < D { + sum = (-self.factors.entry(j, i)).mul_add(x[j], sum); + j += 1; + } + if !sum.is_finite() { + cold_path(); + return Err(LaError::non_finite_at(i)); + } + x[i] = sum; + ii += 1; } - if !sum.is_finite() { - cold_path(); - return Err(LaError::non_finite_at(i)); + } else { + // Larger fixed dimensions benchmark better by walking finalized + // rows downward and scattering contributions into the remaining + // contiguous lower-triangular row prefix. + let mut jj = D; + while jj > 0 { + jj -= 1; + + let x_j = x[jj]; + if !x_j.is_finite() { + cold_path(); + return Err(LaError::non_finite_at(jj)); + } + + let row = self.factors.row(jj); + let mut i = 0; + while i < jj { + x[i] = (-row[i]).mul_add(x_j, x[i]); + i += 1; + } } - x[i] = sum; - ii += 1; } Ok(Vector::new_unchecked(x)) @@ -586,6 +610,25 @@ mod tests { assert_eq!(err, LaError::NonFinite { row: None, col: 1 }); } + #[test] + fn nonfinite_solve_back_substitution_overflow_scatter_branch_5d() { + // Exercises the D >= 5 row-prefix scatter branch with the same + // bottom-right 2x2 SPD block used by the D3 back-substitution test. + let a = Matrix::<5>::try_from_rows([ + [1.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.0, 2.0], + [0.0, 0.0, 0.0, 2.0, 5.0], + ]) + .unwrap(); + let ldlt = a.ldlt(DEFAULT_SINGULAR_TOL).unwrap(); + + let b = Vector::<5>::new([0.0, 0.0, 0.0, 0.0, 1e308]); + let err = ldlt.solve(b).unwrap_err(); + assert_eq!(err, LaError::NonFinite { row: None, col: 3 }); + } + #[test] fn nonfinite_solve_diagonal_solve_overflow() { // Diagonal SPD matrix with a tiny diagonal entry just above the diff --git a/src/lu.rs b/src/lu.rs index 11c1976..0300a52 100644 --- a/src/lu.rs +++ b/src/lu.rs @@ -162,29 +162,51 @@ impl Lu { #[inline] pub(crate) const fn solve_finite(&self, b: Vector) -> Result, LaError> { let mut x = [0.0; D]; - let mut i = 0; let b = b.as_array(); - while i < D { - x[i] = b[self.piv[i]]; - i += 1; - } - - // Forward substitution for L (unit diagonal). let mut i = 0; - while i < D { - let mut sum = x[i]; - let row = self.factors.row(i); - let mut j = 0; - while j < i { - sum = (-row[j]).mul_add(x[j], sum); - j += 1; + + if D <= 4 { + while i < D { + x[i] = b[self.piv[i]]; + i += 1; } - if !sum.is_finite() { - cold_path(); - return Err(LaError::non_finite_at(i)); + + // Tiny matrices benchmark better when pivoted RHS materialization + // stays separate from forward substitution. + i = 0; + while i < D { + let mut sum = x[i]; + let row = self.factors.row(i); + let mut j = 0; + while j < i { + sum = (-row[j]).mul_add(x[j], sum); + j += 1; + } + if !sum.is_finite() { + cold_path(); + return Err(LaError::non_finite_at(i)); + } + x[i] = sum; + i += 1; + } + } else { + // Larger fixed dimensions avoid an extra pass by reading the + // pivoted right-hand side directly into forward substitution. + while i < D { + let mut sum = b[self.piv[i]]; + let row = self.factors.row(i); + let mut j = 0; + while j < i { + sum = (-row[j]).mul_add(x[j], sum); + j += 1; + } + if !sum.is_finite() { + cold_path(); + return Err(LaError::non_finite_at(i)); + } + x[i] = sum; + i += 1; } - x[i] = sum; - i += 1; } // Back substitution for U. @@ -546,6 +568,25 @@ mod tests { assert_eq!(err, LaError::NonFinite { row: None, col: 1 }); } + #[test] + fn solve_nonfinite_forward_substitution_overflow_fused_branch_5d() { + // Exercises the D >= 5 fused pivot/forward-substitution branch with the + // same overflowing L multiplier as the D3 test. + let a = Matrix::<5>::try_from_rows([ + [1.0, 0.0, 0.0, 0.0, 0.0], + [-1.0, 1.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 1.0], + ]) + .unwrap(); + let lu = a.lu(DEFAULT_SINGULAR_TOL).unwrap(); + + let b = Vector::<5>::new([1.0e308, 1.0e308, 0.0, 0.0, 0.0]); + let err = lu.solve(b).unwrap_err(); + assert_eq!(err, LaError::NonFinite { row: None, col: 1 }); + } + #[test] fn solve_nonfinite_back_substitution_overflow() { // Make x[1] overflow during back substitution, then ensure it is detected on the next row.