From 30f7e8b51f3fb2efa6c2b65e0ed72da0f8ce87dd Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 1 Jun 2026 21:06:27 -0400
Subject: [PATCH 1/3] feat: add --fast-build dev-iteration build mode
 (prototype)

Add a 'Fast' build type for quick edit-rebuild-run iteration (e.g. GPU print
debugging). It matches none of the Release-only (IPO, -march=native) or
Debug/RelDebug-only (MFC_DEBUG, -gpu=debug) conditional blocks, so it inherits
none of them and adds only a light -O1. On NVHPC GPU builds it autodetects the
node's compute capability (via nvidia-smi) and uses that single arch in place
of the multi-arch MFC_CUDA_CC; set MFC_FAST_ARCH to override the detection,
e.g. on a login node with no visible GPU.

Measured (NVHPC 24.5, RTX 6000 cc75, generic simulation, 8 cores):
  clean build  641s (Release fat 5-arch) -> 170s  (3.8x)
  hot-module   385s (Release fat 5-arch) ->  79s  (4.9x)
Verified: builds with no IPO/MFC_DEBUG, runs a 1D case on GPU to exit 0.

Adds fast_build to MFCConfig (auto --fast-build/--no-fast-build, own slug);
bumps lock version to 9 for the new config field.
---
 CMakeLists.txt         | 19 ++++++++++++--
 toolchain/mfc/build.py | 57 +++++++++++++++++++++++++++++++++++++++---
 toolchain/mfc/lock.py  |  2 +-
 toolchain/mfc/state.py |  1 +
 4 files changed, 73 insertions(+), 6 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83bbb8fe0e..72c8fb1236 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,9 +40,9 @@ if (MFC_ALL)
 endif()
 
 # Validate CMAKE_BUILD_TYPE to catch typos (CMake is case-sensitive).
-set(_VALID_BUILD_TYPES "Debug" "Release" "RelDebug" "")
+set(_VALID_BUILD_TYPES "Debug" "Release" "RelDebug" "Fast" "")
 if (NOT CMAKE_BUILD_TYPE IN_LIST _VALID_BUILD_TYPES)
-    message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE '${CMAKE_BUILD_TYPE}'. Valid: Debug, RelDebug, Release")
+    message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE '${CMAKE_BUILD_TYPE}'. Valid: Debug, RelDebug, Release, Fast")
 endif()
 
 # RelDebug: a lighter debug mode for CI. Compiler-specific blocks below add the
@@ -51,6 +51,13 @@ set(CMAKE_C_FLAGS_RELDEBUG       "-g" CACHE STRING "")
 set(CMAKE_CXX_FLAGS_RELDEBUG     "-g" CACHE STRING "")
 set(CMAKE_Fortran_FLAGS_RELDEBUG "-g" CACHE STRING "")
 
+# Fast: fast-iteration dev builds (e.g. GPU print-debugging). Deliberately matches
+# none of the Release-only (IPO, -march=native) or Debug/RelDebug-only (MFC_DEBUG,
+# -gpu=debug) conditional blocks below, so it inherits none of them - just a light -O1.
+set(CMAKE_C_FLAGS_FAST       "-O1" CACHE STRING "")
+set(CMAKE_CXX_FLAGS_FAST     "-O1" CACHE STRING "")
+set(CMAKE_Fortran_FLAGS_FAST "-O1" CACHE STRING "")
+
 if (MFC_SINGLE_PRECISION)
     add_compile_definitions(MFC_SINGLE_PRECISION)
 else()
@@ -330,6 +337,14 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelDebug")
     add_compile_definitions(MFC_DEBUG)
 endif()
 
+# Fast: light optimization for dev iteration. Like Debug/RelDebug, the real opt
+# flag is injected here (the CMAKE_*_FLAGS_FAST cache vars are placeholders).
+# -O1 keeps compile time low while giving acceptable runtime; no MFC_DEBUG, so
+# device routines stay free of host-only debug aborts and compile without IPO.
+if (CMAKE_BUILD_TYPE STREQUAL "Fast")
+    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-O1>)
+endif()
+
 
 
 # HANDLE_SOURCES: Given a target (herein <target>):
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 01efb1a9b1..096859a193 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -27,6 +27,53 @@
 _MAKE_PROGRESS_RE = re.compile(r"^\[\s*(\d+)%\]\s+(.*)$")
 
 
+def _cmake_build_type() -> str:
+    """Return the CMAKE_BUILD_TYPE implied by the CLI build-mode flags.
+
+    Precedence is --debug, then --reldebug, then --fast-build; "Release" is
+    the fallback when none of them is set.
+    """
+    if ARG("debug"):
+        return "Debug"
+    return "RelDebug" if ARG("reldebug") else "Fast" if ARG("fast_build", False) else "Release"
+
+
+def _apply_fast_build_gpu_arch() -> None:
+    """Pin NVHPC device codegen to one compute capability under --fast-build.
+
+    Overwrites the multi-arch MFC_CUDA_CC set by the module files (CMake reads
+    $ENV{MFC_CUDA_CC}) with MFC_FAST_ARCH if set, else with the first capability
+    nvidia-smi reports. No-op without --fast-build, for non-GPU builds, or when
+    MFC_CUDA_CC is unset (Cray/AMD GPU builds are already single-arch).
+
+    Raises:
+        MFCException: no usable capability was detected and MFC_FAST_ARCH is unset.
+    """
+    if not ARG("fast_build", False) or ARG("gpu") == gpuConfigOptions.NONE.value:
+        return
+    if not os.environ.get("MFC_CUDA_CC"):  # not an NVHPC node; nothing to do
+        return
+
+    override = os.environ.get("MFC_FAST_ARCH")  # escape hatch for login nodes
+    if override:
+        os.environ["MFC_CUDA_CC"] = override
+        return
+
+    query = ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"]
+    try:
+        result = subprocess.run(query, capture_output=True, text=True, timeout=10, check=False)
+        # A failing nvidia-smi may still print text: trust only exit 0 and numeric values.
+        reported = result.stdout.splitlines() if result.returncode == 0 else []
+        caps = [cc for cc in (ln.strip().replace(".", "") for ln in reported) if cc.isdigit()]
+    except (OSError, subprocess.SubprocessError):
+        caps = []
+
+    if not caps:
+        raise MFCException("--fast-build: could not detect a local GPU compute capability (no GPU visible via nvidia-smi). Run on a GPU node, or set MFC_FAST_ARCH=<cc> (e.g. MFC_FAST_ARCH=90).")
+
+    os.environ["MFC_CUDA_CC"] = caps[0]
+
+
 def _run_build_with_progress(command: typing.List[str], target_name: str, streaming: bool = False) -> subprocess.CompletedProcess:
     """
     Run a build command with a progress bar that parses ninja output.
@@ -367,6 +414,10 @@ def is_buildable(self) -> bool:
     def configure(self, case: Case):
         if ARG("debug") and ARG("reldebug"):
             raise MFCException("--debug and --reldebug are mutually exclusive.")
+        if ARG("fast_build", False) and (ARG("debug") or ARG("reldebug")):
+            raise MFCException("--fast-build is mutually exclusive with --debug/--reldebug.")
+
+        _apply_fast_build_gpu_arch()
 
         build_dirpath = self.get_staging_dirpath(case)
         cmake_dirpath = self.get_cmake_dirpath()
@@ -386,9 +437,9 @@ def configure(self, case: Case):
             # build the configured targets. This is mostly useful for debugging.
             # See: https://cmake.org/cmake/help/latest/variable/CMAKE_EXPORT_COMPILE_COMMANDS.html.
             "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON",
-            # Set build type (Debug, RelDebug, or Release).
+            # Set build type (Debug, RelDebug, Fast, or Release).
             # See: https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html
-            f"-DCMAKE_BUILD_TYPE={'Debug' if ARG('debug') else 'RelDebug' if ARG('reldebug') else 'Release'}",
+            f"-DCMAKE_BUILD_TYPE={_cmake_build_type()}",
             # Used by FIND_PACKAGE (/FindXXX) to search for packages, with the
             # second highest level of priority, still letting users manually
             # specify <PackageName>_ROOT, which has precedence over CMAKE_PREFIX_PATH.
@@ -468,7 +519,7 @@ def build(self, case: input.MFCInputFile):
             "--parallel",
             ARG("jobs"),
             "--config",
-            "Debug" if ARG("debug") else "RelDebug" if ARG("reldebug") else "Release",
+            _cmake_build_type(),
         ]
 
         verbosity = ARG("verbose")
diff --git a/toolchain/mfc/lock.py b/toolchain/mfc/lock.py
index 02a8732f9b..0e33c8ccbc 100644
--- a/toolchain/mfc/lock.py
+++ b/toolchain/mfc/lock.py
@@ -5,7 +5,7 @@
 from .printer import cons
 from .state import MFCConfig
 
-MFC_LOCK_CURRENT_VERSION: int = 8
+MFC_LOCK_CURRENT_VERSION: int = 9
 
 
 @dataclasses.dataclass
diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py
index 94a37be947..a14393ebde 100644
--- a/toolchain/mfc/state.py
+++ b/toolchain/mfc/state.py
@@ -16,6 +16,7 @@ class MFCConfig:
     gpu: str = gpuConfigOptions.NONE.value
     debug: bool = False
     reldebug: bool = False
+    fast_build: bool = False
     gcov: bool = False
     unified: bool = False
     single: bool = False

From f1ad00030aa0b5f36f8e3ff5d8a248f81e90766e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Mon, 1 Jun 2026 21:20:12 -0400
Subject: [PATCH 2/3] docs: fast-build mode design + AMD/LLVMFlang link-time
 plan

Documents the --fast-build dev-iteration mode: motivation, usage, the new Fast
build type, measured NVHPC results (clean 3.8x, hot-module 4.9x), and the
proposed AMD/LLVMFlang path (device-LTO diagnosis, -fopenmp-target-jit + -O1,
high-j partitions lever) with steps to validate on an AMD GPU + AMD compiler
node. The AMD path is analysis only and unverified (no LLVMFlang on the dev box).
---
 docs/documentation/fast_build.md | 139 +++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 docs/documentation/fast_build.md

diff --git a/docs/documentation/fast_build.md b/docs/documentation/fast_build.md
new file mode 100644
index 0000000000..396b12cc19
--- /dev/null
+++ b/docs/documentation/fast_build.md
@@ -0,0 +1,139 @@
+# Fast Dev Builds (`--fast-build`)
+
+Status: **prototype / work in progress.** The NVHPC path is implemented and measured;
+the AMD (LLVMFlang) path described below is an analysis with a proposed change that
+still needs validation on an AMD GPU + AMD-compiler machine.
+
+## Motivation
+
+GPU builds are slow to iterate on when you just want to add a `print` and re-run.
+Two different compilers, two different bottlenecks:
+
+- **NVHPC** — the per-iteration cost is the two-pass IPO link (`-Mextract` + `-Minline`),
+  plus device codegen for every targeted compute capability. Editing one hot file and
+  rebuilding takes minutes because the IPO passes re-run and device code is generated for
+  every arch in `MFC_CUDA_CC`.
+- **AMD / LLVMFlang** — the per-iteration cost is the **device LTO link**. The link step
+  can take 20+ minutes *every* build, because the OpenMP offload device link does
+  whole-program LTO regardless of what changed.
+
+`--fast-build` is a dedicated build mode that strips the expensive, optimization-oriented
+machinery that is pointless during print-debugging.
+
+## Usage
+
+```bash
+./mfc.sh build -t simulation --gpu acc --fast-build -j 8     # NVHPC (OpenACC)
+./mfc.sh build -t simulation --gpu mp  --fast-build -j 8     # AMD/Cray (OpenMP offload)
+```
+
+`--fast-build` is mutually exclusive with `--debug` / `--reldebug`. It is **not** a
+correctness build: no bounds checking, no `MFC_DEBUG` asserts. Add your own `print`/`write`
+statements; pair with `--debug` when you need runtime checks.
+
+## What it does
+
+`--fast-build` selects a new CMake build type, `Fast`, that deliberately matches **none** of
+the existing conditional flag blocks in `CMakeLists.txt`:
+
+- Not `Release`, so no IPO/LTO and no `-march=native`.
+- Not `Debug`/`RelDebug`, so no `MFC_DEBUG` and no `-gpu=debug`.
+
+It then adds a light `-O1` (via `add_compile_options`, since the `CMAKE_*_FLAGS_FAST` cache
+variables do not inject flags in this codebase). Because `MFC_DEBUG` is off, device routines
+contain no host-only debug aborts, so the binary compiles cleanly **without** IPO.
+
+On NVHPC GPU builds it also restricts device codegen to a **single** compute capability —
+the GPU on the build node, detected via `nvidia-smi` — overriding the multi-arch
+`MFC_CUDA_CC` that the module files set. Set `MFC_FAST_ARCH=<cc>` (e.g. `MFC_FAST_ARCH=90`)
+to override the detection on a login node with no visible GPU.
+
+## NVHPC results (measured)
+
+NVHPC 24.5, Quadro RTX 6000 (cc75), generic `simulation` build, 8 cores:
+
+| Scenario | Release (fat 5-arch) | `--fast-build` (single-arch) |
+| --- | --- | --- |
+| Clean full build | 641 s | 170 s (3.8x) |
+| Hot-module incremental (`m_riemann_solvers`) | 385 s | 79 s (4.9x) |
+
+Verified: builds with no IPO (`-Mextract` absent), no `MFC_DEBUG`, single `-gpu=cc75`, and
+the resulting binary runs a 1D case on the GPU to exit code 0 with finite output.
+
+## AMD / LLVMFlang: the device-LTO link (proposed, needs validation)
+
+The AMD GPU offload flags live in `CMakeLists.txt` (`MFC_SETUP_TARGET`):
+
+```cmake
+# compile
+target_compile_options(${a_target} PRIVATE
+    -fopenmp --offload-arch=gfx90a -O3
+    -fopenmp-assume-threads-oversubscription
+    -fopenmp-assume-teams-oversubscription)
+# link
+target_link_options(${a_target} PRIVATE
+    -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS})
+```
+
+The `-flto-partitions` at link is the tell: the OpenMP offload **device link runs
+whole-program LTO every time**, so even a one-file edit re-LTOs all device code. Single-arch
+is not a lever here (already a single `gfx90a`).
+
+### Levers, best first
+
+1. **JIT the device code: `-fopenmp-target-jit`.** Instead of AOT-compiling and LTO-linking
+   device code into the binary at link time, embed device LLVM-IR and JIT each kernel at
+   runtime on first launch. The device LTO link essentially disappears, so the link drops to
+   roughly host-link time. Cost: a one-time JIT warmup on the *run* (tunable with
+   `LIBOMPTARGET_JIT_OPT_LEVEL`). This is the real fix for AMD iteration.
+
+2. **Build with a high `-j` (no code change — try this first).** `-flto-partitions` is set to
+   `MFC_BUILD_JOBS`, which is your `-j`. Building with `-j 8` on a 64-core node runs the
+   device LTO link only 8-way parallel. Use `-j 32`/`-j 64` to give the LTO link more
+   partitions; this alone may cut the link time substantially with no toolchain change.
+
+3. **Lower device optimization `-O3` -> `-O1`/`-O0` for dev builds.** The `-O3` drives the
+   heavy LTO optimization; lowering it cuts link work (slower runtime, fine for debugging).
+
+4. **`-fno-lto` (AOT, non-LTO device link).** Links per-translation-unit device objects
+   instead of whole-program LTO. Potentially faster, but less certain across ROCm/flang
+   versions — only if JIT does not pan out.
+
+### Proposed `Fast` branch for LLVMFlang
+
+Make the flags above build-type-aware so `--fast-build` emits, for `LLVMFlang`:
+
+```cmake
+# compile
+-fopenmp --offload-arch=gfx90a -O1 -fopenmp-target-jit \
+    -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription
+# link (no -flto-partitions; JIT removes the whole-program device LTO)
+-fopenmp --offload-arch=gfx90a -fopenmp-target-jit
+```
+
+### How to validate on an AMD machine
+
+On a Frontier AMD / AFAR-style node (`source ./mfc.sh load -c famd -m g` or equivalent):
+
+1. **Baseline** — time the current link:
+   `./mfc.sh build -t simulation --gpu mp -j 8` and note the link duration.
+2. **Free lever** — rebuild with a high `-j` (more LTO partitions) and compare the link time:
+   `./mfc.sh build -t simulation --gpu mp -j 64`.
+3. **JIT lever** — once the `Fast` LLVMFlang branch is wired in, build with `--fast-build`
+   and confirm: (a) the link time collapses, (b) a small case runs to exit 0 (expect a
+   one-time JIT warmup on first launch). `OMP_TARGET_OFFLOAD=MANDATORY` is already set.
+
+### Caveat
+
+None of the AMD numbers are measured — LLVMFlang is not available on the development machine
+used so far. The diagnosis follows directly from the build flags and from how LLVM OpenMP
+offload works, but the exact flag spelling and ROCm/flang-version behavior must be confirmed
+on real hardware before the LLVMFlang `Fast` branch is trusted.
+
+## Implementation notes / TODO
+
+- Implemented: `Fast` CMake build type, `fast_build` field in `MFCConfig` (auto
+  `--fast-build`/`--no-fast-build`, own build slug), NVHPC single-arch autodetect, lock-file
+  version bump for the new config field.
+- Not yet: a `--gpu-arch` CLI flag (only the `MFC_FAST_ARCH` env escape hatch exists), the
+  LLVMFlang `Fast` branch above, Cray-on-AMD validation, and `--help`/docs polish.

From 9d9a7749660781bc4b7797ec92a0b04826129fd7 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Tue, 2 Jun 2026 18:05:18 -0500
Subject: [PATCH 3/3] cmake: LLVMFlang Fast build path + fix AFAR hipfort/lib
 paths

- Add Fast build type branch for LLVMFlang OpenMP: -O1 -fno-lto at
  compile and link (replaces -O3 -flto-partitions). Note: this gives no
  measurable speedup on AFAR 23.2.1 -- clang-linker-wrapper still runs a
  whole-program device link regardless; the real fix awaits a libomptarget
  that ships with the AMDGPU JIT backend enabled.
- Fix hipfort-amdgcn find_library HINTS to also search lib/llvm/lib
  (AFAR drops place the .a there, not in lib/).
- Fix hipfort include path: check lib/llvm/include/hipfort/amdgcn first
  (AFAR layout), fall back to include/hipfort/amdgcn (Frontier layout).
- Delete fast_build.md (docs folded into BUILD_NOTES).
---
 CMakeLists.txt                   |  22 +++--
 docs/documentation/fast_build.md | 139 -------------------------------
 2 files changed, 17 insertions(+), 144 deletions(-)
 delete mode 100644 docs/documentation/fast_build.md

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 72c8fb1236..4c67194f20 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -691,8 +691,15 @@ exit 0
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-                    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
-                    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS})
+                    if(CMAKE_BUILD_TYPE STREQUAL "Fast")
+                        # Fast dev-iteration: -O1 + -fno-lto requests a non-LTO device link (no measurable speedup seen on AFAR 23.2.1).
+                        # (-fopenmp-target-jit is not yet supported by AMD flang as of 23.x.)
+                        target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O1 -fno-lto -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
+                        target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -fno-lto)
+                    else()
+                        target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription)
+                        target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS})
+                    endif()
                 endif()
             endif()
 
@@ -761,9 +768,14 @@ exit 0
 		        find_library(HIP_LIB amdhip64
                     HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
                 find_library(HIPFORT_AMDGCN_LIB hipfort-amdgcn
-                    HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED)
-                target_include_directories(${a_target} PRIVATE
-                    "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn")
+                    HINTS "$ENV{OLCF_AFAR_ROOT}/lib" "$ENV{OLCF_AFAR_ROOT}/lib/llvm/lib" REQUIRED)
+                if(EXISTS "$ENV{OLCF_AFAR_ROOT}/lib/llvm/include/hipfort/amdgcn")
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{OLCF_AFAR_ROOT}/lib/llvm/include/hipfort/amdgcn")
+                else()
+                    target_include_directories(${a_target} PRIVATE
+                        "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn")
+                endif()
                 target_link_libraries(${a_target} PRIVATE
                     ${HIP_LIB} ${HIPFORT_AMDGCN_LIB})
 
diff --git a/docs/documentation/fast_build.md b/docs/documentation/fast_build.md
deleted file mode 100644
index 396b12cc19..0000000000
--- a/docs/documentation/fast_build.md
+++ /dev/null
@@ -1,139 +0,0 @@
-# Fast Dev Builds (`--fast-build`)
-
-Status: **prototype / work in progress.** The NVHPC path is implemented and measured;
-the AMD (LLVMFlang) path described below is an analysis with a proposed change that
-still needs validation on an AMD GPU + AMD-compiler machine.
-
-## Motivation
-
-GPU builds are slow to iterate on when you just want to add a `print` and re-run.
-Two different compilers, two different bottlenecks:
-
-- **NVHPC** — the per-iteration cost is the two-pass IPO link (`-Mextract` + `-Minline`),
-  plus device codegen for every targeted compute capability. Editing one hot file and
-  rebuilding takes minutes because the IPO passes re-run and device code is generated for
-  every arch in `MFC_CUDA_CC`.
-- **AMD / LLVMFlang** — the per-iteration cost is the **device LTO link**. The link step
-  can take 20+ minutes *every* build, because the OpenMP offload device link does
-  whole-program LTO regardless of what changed.
-
-`--fast-build` is a dedicated build mode that strips the expensive, optimization-oriented
-machinery that is pointless during print-debugging.
-
-## Usage
-
-```bash
-./mfc.sh build -t simulation --gpu acc --fast-build -j 8     # NVHPC (OpenACC)
-./mfc.sh build -t simulation --gpu mp  --fast-build -j 8     # AMD/Cray (OpenMP offload)
-```
-
-`--fast-build` is mutually exclusive with `--debug` / `--reldebug`. It is **not** a
-correctness build: no bounds checking, no `MFC_DEBUG` asserts. Add your own `print`/`write`
-statements; pair with `--debug` when you need runtime checks.
-
-## What it does
-
-`--fast-build` selects a new CMake build type, `Fast`, that deliberately matches **none** of
-the existing conditional flag blocks in `CMakeLists.txt`:
-
-- Not `Release`, so no IPO/LTO and no `-march=native`.
-- Not `Debug`/`RelDebug`, so no `MFC_DEBUG` and no `-gpu=debug`.
-
-It then adds a light `-O1` (via `add_compile_options`, since the `CMAKE_*_FLAGS_FAST` cache
-variables do not inject flags in this codebase). Because `MFC_DEBUG` is off, device routines
-contain no host-only debug aborts, so the binary compiles cleanly **without** IPO.
-
-On NVHPC GPU builds it also restricts device codegen to a **single** compute capability —
-the GPU on the build node, detected via `nvidia-smi` — overriding the multi-arch
-`MFC_CUDA_CC` that the module files set. Set `MFC_FAST_ARCH=<cc>` (e.g. `MFC_FAST_ARCH=90`)
-to override the detection on a login node with no visible GPU.
-
-## NVHPC results (measured)
-
-NVHPC 24.5, Quadro RTX 6000 (cc75), generic `simulation` build, 8 cores:
-
-| Scenario | Release (fat 5-arch) | `--fast-build` (single-arch) |
-| --- | --- | --- |
-| Clean full build | 641 s | 170 s (3.8x) |
-| Hot-module incremental (`m_riemann_solvers`) | 385 s | 79 s (4.9x) |
-
-Verified: builds with no IPO (`-Mextract` absent), no `MFC_DEBUG`, single `-gpu=cc75`, and
-the resulting binary runs a 1D case on the GPU to exit code 0 with finite output.
-
-## AMD / LLVMFlang: the device-LTO link (proposed, needs validation)
-
-The AMD GPU offload flags live in `CMakeLists.txt` (`MFC_SETUP_TARGET`):
-
-```cmake
-# compile
-target_compile_options(${a_target} PRIVATE
-    -fopenmp --offload-arch=gfx90a -O3
-    -fopenmp-assume-threads-oversubscription
-    -fopenmp-assume-teams-oversubscription)
-# link
-target_link_options(${a_target} PRIVATE
-    -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS})
-```
-
-The `-flto-partitions` at link is the tell: the OpenMP offload **device link runs
-whole-program LTO every time**, so even a one-file edit re-LTOs all device code. Single-arch
-is not a lever here (already a single `gfx90a`).
-
-### Levers, best first
-
-1. **JIT the device code: `-fopenmp-target-jit`.** Instead of AOT-compiling and LTO-linking
-   device code into the binary at link time, embed device LLVM-IR and JIT each kernel at
-   runtime on first launch. The device LTO link essentially disappears, so the link drops to
-   roughly host-link time. Cost: a one-time JIT warmup on the *run* (tunable with
-   `LIBOMPTARGET_JIT_OPT_LEVEL`). This is the real fix for AMD iteration.
-
-2. **Build with a high `-j` (no code change — try this first).** `-flto-partitions` is set to
-   `MFC_BUILD_JOBS`, which is your `-j`. Building with `-j 8` on a 64-core node runs the
-   device LTO link only 8-way parallel. Use `-j 32`/`-j 64` to give the LTO link more
-   partitions; this alone may cut the link time substantially with no toolchain change.
-
-3. **Lower device optimization `-O3` -> `-O1`/`-O0` for dev builds.** The `-O3` drives the
-   heavy LTO optimization; lowering it cuts link work (slower runtime, fine for debugging).
-
-4. **`-fno-lto` (AOT, non-LTO device link).** Links per-translation-unit device objects
-   instead of whole-program LTO. Potentially faster, but less certain across ROCm/flang
-   versions — only if JIT does not pan out.
-
-### Proposed `Fast` branch for LLVMFlang
-
-Make the flags above build-type-aware so `--fast-build` emits, for `LLVMFlang`:
-
-```cmake
-# compile
--fopenmp --offload-arch=gfx90a -O1 -fopenmp-target-jit \
-    -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription
-# link (no -flto-partitions; JIT removes the whole-program device LTO)
--fopenmp --offload-arch=gfx90a -fopenmp-target-jit
-```
-
-### How to validate on an AMD machine
-
-On a Frontier AMD / AFAR-style node (`source ./mfc.sh load -c famd -m g` or equivalent):
-
-1. **Baseline** — time the current link:
-   `./mfc.sh build -t simulation --gpu mp -j 8` and note the link duration.
-2. **Free lever** — rebuild with a high `-j` (more LTO partitions) and compare the link time:
-   `./mfc.sh build -t simulation --gpu mp -j 64`.
-3. **JIT lever** — once the `Fast` LLVMFlang branch is wired in, build with `--fast-build`
-   and confirm: (a) the link time collapses, (b) a small case runs to exit 0 (expect a
-   one-time JIT warmup on first launch). `OMP_TARGET_OFFLOAD=MANDATORY` is already set.
-
-### Caveat
-
-None of the AMD numbers are measured — LLVMFlang is not available on the development machine
-used so far. The diagnosis follows directly from the build flags and from how LLVM OpenMP
-offload works, but the exact flag spelling and ROCm/flang-version behavior must be confirmed
-on real hardware before the LLVMFlang `Fast` branch is trusted.
-
-## Implementation notes / TODO
-
-- Implemented: `Fast` CMake build type, `fast_build` field in `MFCConfig` (auto
-  `--fast-build`/`--no-fast-build`, own build slug), NVHPC single-arch autodetect, lock-file
-  version bump for the new config field.
-- Not yet: a `--gpu-arch` CLI flag (only the `MFC_FAST_ARCH` env escape hatch exists), the
-  LLVMFlang `Fast` branch above, Cray-on-AMD validation, and `--help`/docs polish.