From 30f7e8b51f3fb2efa6c2b65e0ed72da0f8ce87dd Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 1 Jun 2026 21:06:27 -0400 Subject: [PATCH 1/3] feat: add --fast-build dev-iteration build mode (prototype) New 'Fast' build type for fast edit-rebuild-run iteration (e.g. GPU print debugging). It matches none of the Release-only (IPO, -march=native) or Debug/RelDebug-only (MFC_DEBUG, -gpu=debug) conditional blocks, so it inherits none of them; adds a light -O1. On NVHPC GPU builds it autodetects the node's single compute capability (nvidia-smi) and overrides the multi-arch MFC_CUDA_CC, with MFC_FAST_ARCH as a login-node escape hatch. Measured (NVHPC 24.5, RTX 6000 cc75, generic simulation, 8 cores): clean build 641s (Release fat 5-arch) -> 170s (3.8x) hot-module 385s (Release fat 5-arch) -> 79s (4.9x) Verified: builds with no IPO/MFC_DEBUG, runs a 1D case on GPU to exit 0. Adds fast_build to MFCConfig (auto --fast-build/--no-fast-build, own slug); bumps lock version to 9 for the new config field. --- CMakeLists.txt | 19 ++++++++++++-- toolchain/mfc/build.py | 57 +++++++++++++++++++++++++++++++++++++++--- toolchain/mfc/lock.py | 2 +- toolchain/mfc/state.py | 1 + 4 files changed, 73 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 83bbb8fe0e..72c8fb1236 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,9 +40,9 @@ if (MFC_ALL) endif() # Validate CMAKE_BUILD_TYPE to catch typos (CMake is case-sensitive). -set(_VALID_BUILD_TYPES "Debug" "Release" "RelDebug" "") +set(_VALID_BUILD_TYPES "Debug" "Release" "RelDebug" "Fast" "") if (NOT CMAKE_BUILD_TYPE IN_LIST _VALID_BUILD_TYPES) - message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE '${CMAKE_BUILD_TYPE}'. Valid: Debug, RelDebug, Release") + message(FATAL_ERROR "Unknown CMAKE_BUILD_TYPE '${CMAKE_BUILD_TYPE}'. Valid: Debug, RelDebug, Release, Fast") endif() # RelDebug: a lighter debug mode for CI. 
Compiler-specific blocks below add the @@ -51,6 +51,13 @@ set(CMAKE_C_FLAGS_RELDEBUG "-g" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELDEBUG "-g" CACHE STRING "") set(CMAKE_Fortran_FLAGS_RELDEBUG "-g" CACHE STRING "") +# Fast: fast-iteration dev builds (e.g. GPU print-debugging). Deliberately matches +# none of the Release-only (IPO, -march=native) or Debug/RelDebug-only (MFC_DEBUG, +# -gpu=debug) conditional blocks below, so it inherits none of them - just a light -O1. +set(CMAKE_C_FLAGS_FAST "-O1" CACHE STRING "") +set(CMAKE_CXX_FLAGS_FAST "-O1" CACHE STRING "") +set(CMAKE_Fortran_FLAGS_FAST "-O1" CACHE STRING "") + if (MFC_SINGLE_PRECISION) add_compile_definitions(MFC_SINGLE_PRECISION) else() @@ -330,6 +337,14 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelDebug") add_compile_definitions(MFC_DEBUG) endif() +# Fast: light optimization for dev iteration. Like Debug/RelDebug, the real opt +# flag is injected here (the CMAKE_*_FLAGS_FAST cache vars are placeholders). +# -O1 keeps compile time low while giving acceptable runtime; no MFC_DEBUG, so +# device routines stay free of host-only debug aborts and compile without IPO. 
+if (CMAKE_BUILD_TYPE STREQUAL "Fast") + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-O1>) +endif() + # HANDLE_SOURCES: Given a target (herein <target>): diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 01efb1a9b1..096859a193 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -27,6 +27,53 @@ _MAKE_PROGRESS_RE = re.compile(r"^\[\s*(\d+)%\]\s+(.*)$") +def _cmake_build_type() -> str: + """Map the CLI build-mode flags to a CMAKE_BUILD_TYPE string.""" + if ARG("debug"): + return "Debug" + if ARG("reldebug"): + return "RelDebug" + if ARG("fast_build", False): + return "Fast" + return "Release" + + +def _apply_fast_build_gpu_arch() -> None: + """Under --fast-build on an NVHPC GPU build, restrict device codegen to a + single compute capability (the node's GPU), overriding the multi-arch + MFC_CUDA_CC that the module files set. CMake reads $ENV{MFC_CUDA_CC}. + + Cray/AMD GPU builds don't use MFC_CUDA_CC (they are already single-arch via + craype-accel/--offload-arch), so this only acts when MFC_CUDA_CC is set. + Hard-errors if no GPU is detectable and no explicit arch is provided.""" + if not ARG("fast_build", False) or ARG("gpu") == gpuConfigOptions.NONE.value: + return + if not os.environ.get("MFC_CUDA_CC"): # not an NVHPC node; nothing to do + return + + override = os.environ.get("MFC_FAST_ARCH") # escape hatch for login nodes + if override: + os.environ["MFC_CUDA_CC"] = override + return + + try: + result = subprocess.run( + ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"], + capture_output=True, + text=True, + timeout=10, + check=True, # non-zero exit raises CalledProcessError (a SubprocessError) -> handled below as "no GPU" + ) + caps = [cap for cap in (ln.strip().replace(".", "") for ln in result.stdout.splitlines()) if cap.isdigit()] # keep only numeric capabilities, e.g. "7.5" -> "75" + except (OSError, subprocess.SubprocessError): + caps = [] + + if not caps: + raise MFCException("--fast-build: could not detect a local GPU compute capability " "(no GPU visible via nvidia-smi). Run on a GPU node, or set " "MFC_FAST_ARCH=<cc> (e.g.
MFC_FAST_ARCH=90).") + + os.environ["MFC_CUDA_CC"] = caps[0] + + def _run_build_with_progress(command: typing.List[str], target_name: str, streaming: bool = False) -> subprocess.CompletedProcess: """ Run a build command with a progress bar that parses ninja output. @@ -367,6 +414,10 @@ def is_buildable(self) -> bool: def configure(self, case: Case): if ARG("debug") and ARG("reldebug"): raise MFCException("--debug and --reldebug are mutually exclusive.") + if ARG("fast_build", False) and (ARG("debug") or ARG("reldebug")): + raise MFCException("--fast-build is mutually exclusive with --debug/--reldebug.") + + _apply_fast_build_gpu_arch() build_dirpath = self.get_staging_dirpath(case) cmake_dirpath = self.get_cmake_dirpath() @@ -386,9 +437,9 @@ def configure(self, case: Case): # build the configured targets. This is mostly useful for debugging. # See: https://cmake.org/cmake/help/latest/variable/CMAKE_EXPORT_COMPILE_COMMANDS.html. "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON", - # Set build type (Debug, RelDebug, or Release). + # Set build type (Debug, RelDebug, Fast, or Release). # See: https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html - f"-DCMAKE_BUILD_TYPE={'Debug' if ARG('debug') else 'RelDebug' if ARG('reldebug') else 'Release'}", + f"-DCMAKE_BUILD_TYPE={_cmake_build_type()}", # Used by FIND_PACKAGE (/FindXXX) to search for packages, with the # second highest level of priority, still letting users manually # specify _ROOT, which has precedence over CMAKE_PREFIX_PATH. 
@@ -468,7 +519,7 @@ def build(self, case: input.MFCInputFile): "--parallel", ARG("jobs"), "--config", - "Debug" if ARG("debug") else "RelDebug" if ARG("reldebug") else "Release", + _cmake_build_type(), ] verbosity = ARG("verbose") diff --git a/toolchain/mfc/lock.py b/toolchain/mfc/lock.py index 02a8732f9b..0e33c8ccbc 100644 --- a/toolchain/mfc/lock.py +++ b/toolchain/mfc/lock.py @@ -5,7 +5,7 @@ from .printer import cons from .state import MFCConfig -MFC_LOCK_CURRENT_VERSION: int = 8 +MFC_LOCK_CURRENT_VERSION: int = 9 @dataclasses.dataclass diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py index 94a37be947..a14393ebde 100644 --- a/toolchain/mfc/state.py +++ b/toolchain/mfc/state.py @@ -16,6 +16,7 @@ class MFCConfig: gpu: str = gpuConfigOptions.NONE.value debug: bool = False reldebug: bool = False + fast_build: bool = False gcov: bool = False unified: bool = False single: bool = False From f1ad00030aa0b5f36f8e3ff5d8a248f81e90766e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Mon, 1 Jun 2026 21:20:12 -0400 Subject: [PATCH 2/3] docs: fast-build mode design + AMD/LLVMFlang link-time plan Documents the --fast-build dev-iteration mode: motivation, usage, the new Fast build type, measured NVHPC results (clean 3.8x, hot-module 4.9x), and the proposed AMD/LLVMFlang path (device-LTO diagnosis, -fopenmp-target-jit + -O1, high-j partitions lever) with steps to validate on an AMD GPU + AMD compiler node. The AMD path is analysis only and unverified (no LLVMFlang on the dev box). 
--- docs/documentation/fast_build.md | 139 +++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 docs/documentation/fast_build.md diff --git a/docs/documentation/fast_build.md b/docs/documentation/fast_build.md new file mode 100644 index 0000000000..396b12cc19 --- /dev/null +++ b/docs/documentation/fast_build.md @@ -0,0 +1,139 @@ +# Fast Dev Builds (`--fast-build`) + +Status: **prototype / work in progress.** The NVHPC path is implemented and measured; +the AMD (LLVMFlang) path described below is an analysis with a proposed change that +still needs validation on an AMD GPU + AMD-compiler machine. + +## Motivation + +GPU builds are slow to iterate on when you just want to add a `print` and re-run. +Two different compilers, two different bottlenecks: + +- **NVHPC** — the per-iteration cost is the two-pass IPO link (`-Mextract` + `-Minline`), + plus device codegen for every targeted compute capability. Editing one hot file and + rebuilding takes minutes because the IPO passes re-run and device code is generated for + every arch in `MFC_CUDA_CC`. +- **AMD / LLVMFlang** — the per-iteration cost is the **device LTO link**. The link step + can take 20+ minutes *every* build, because the OpenMP offload device link does + whole-program LTO regardless of what changed. + +`--fast-build` is a dedicated build mode that strips the expensive, optimization-oriented +machinery that is pointless during print-debugging. + +## Usage + +```bash +./mfc.sh build -t simulation --gpu acc --fast-build -j 8 # NVHPC (OpenACC) +./mfc.sh build -t simulation --gpu mp --fast-build -j 8 # AMD/Cray (OpenMP offload) +``` + +`--fast-build` is mutually exclusive with `--debug` / `--reldebug`. It is **not** a +correctness build: no bounds checking, no `MFC_DEBUG` asserts. Add your own `print`/`write` +statements; pair with `--debug` when you need runtime checks. 
+ +## What it does + +`--fast-build` selects a new CMake build type, `Fast`, that deliberately matches **none** of +the existing conditional flag blocks in `CMakeLists.txt`: + +- Not `Release`, so no IPO/LTO and no `-march=native`. +- Not `Debug`/`RelDebug`, so no `MFC_DEBUG` and no `-gpu=debug`. + +It then adds a light `-O1` (via `add_compile_options`, since the `CMAKE_*_FLAGS_FAST` cache +variables do not inject flags in this codebase). Because `MFC_DEBUG` is off, device routines +contain no host-only debug aborts, so the binary compiles cleanly **without** IPO. + +On NVHPC GPU builds it also restricts device codegen to a **single** compute capability — +the GPU on the build node, detected via `nvidia-smi` — overriding the multi-arch +`MFC_CUDA_CC` that the module files set. Set `MFC_FAST_ARCH=` (e.g. `MFC_FAST_ARCH=90`) +to override the detection on a login node with no visible GPU. + +## NVHPC results (measured) + +NVHPC 24.5, Quadro RTX 6000 (cc75), generic `simulation` build, 8 cores: + +| Scenario | Release (fat 5-arch) | `--fast-build` (single-arch) | +| --- | --- | --- | +| Clean full build | 641 s | 170 s (3.8x) | +| Hot-module incremental (`m_riemann_solvers`) | 385 s | 79 s (4.9x) | + +Verified: builds with no IPO (`-Mextract` absent), no `MFC_DEBUG`, single `-gpu=cc75`, and +the resulting binary runs a 1D case on the GPU to exit code 0 with finite output. 
+ +## AMD / LLVMFlang: the device-LTO link (proposed, needs validation) + +The AMD GPU offload flags live in `CMakeLists.txt` (`MFC_SETUP_TARGET`): + +```cmake +# compile +target_compile_options(${a_target} PRIVATE + -fopenmp --offload-arch=gfx90a -O3 + -fopenmp-assume-threads-oversubscription + -fopenmp-assume-teams-oversubscription) +# link +target_link_options(${a_target} PRIVATE + -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS}) +``` + +The `-flto-partitions` at link is the tell: the OpenMP offload **device link runs +whole-program LTO every time**, so even a one-file edit re-LTOs all device code. Single-arch +is not a lever here (already a single `gfx90a`). + +### Levers, best first + +1. **JIT the device code: `-fopenmp-target-jit`.** Instead of AOT-compiling and LTO-linking + device code into the binary at link time, embed device LLVM-IR and JIT each kernel at + runtime on first launch. The device LTO link essentially disappears, so the link drops to + roughly host-link time. Cost: a one-time JIT warmup on the *run* (tunable with + `LIBOMPTARGET_JIT_OPT_LEVEL`). This is the real fix for AMD iteration. + +2. **Build with a high `-j` (no code change — try this first).** `-flto-partitions` is set to + `MFC_BUILD_JOBS`, which is your `-j`. Building with `-j 8` on a 64-core node runs the + device LTO link only 8-way parallel. Use `-j 32`/`-j 64` to give the LTO link more + partitions; this alone may cut the link time substantially with no toolchain change. + +3. **Lower device optimization `-O3` -> `-O1`/`-O0` for dev builds.** The `-O3` drives the + heavy LTO optimization; lowering it cuts link work (slower runtime, fine for debugging). + +4. **`-fno-lto` (AOT, non-LTO device link).** Links per-translation-unit device objects + instead of whole-program LTO. Potentially faster, but less certain across ROCm/flang + versions — only if JIT does not pan out. 
+ +### Proposed `Fast` branch for LLVMFlang + +Make the flags above build-type-aware so `--fast-build` emits, for `LLVMFlang`: + +```cmake +# compile +-fopenmp --offload-arch=gfx90a -O1 -fopenmp-target-jit \ + -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription +# link (no -flto-partitions; JIT removes the whole-program device LTO) +-fopenmp --offload-arch=gfx90a -fopenmp-target-jit +``` + +### How to validate on an AMD machine + +On a Frontier AMD / AFAR-style node (`source ./mfc.sh load -c famd -m g` or equivalent): + +1. **Baseline** — time the current link: + `./mfc.sh build -t simulation --gpu mp -j 8` and note the link duration. +2. **Free lever** — rebuild with a high `-j` (more LTO partitions) and compare the link time: + `./mfc.sh build -t simulation --gpu mp -j 64`. +3. **JIT lever** — once the `Fast` LLVMFlang branch is wired in, build with `--fast-build` + and confirm: (a) the link time collapses, (b) a small case runs to exit 0 (expect a + one-time JIT warmup on first launch). `OMP_TARGET_OFFLOAD=MANDATORY` is already set. + +### Caveat + +None of the AMD numbers are measured — LLVMFlang is not available on the development machine +used so far. The diagnosis follows directly from the build flags and from how LLVM OpenMP +offload works, but the exact flag spelling and ROCm/flang-version behavior must be confirmed +on real hardware before the LLVMFlang `Fast` branch is trusted. + +## Implementation notes / TODO + +- Implemented: `Fast` CMake build type, `fast_build` field in `MFCConfig` (auto + `--fast-build`/`--no-fast-build`, own build slug), NVHPC single-arch autodetect, lock-file + version bump for the new config field. +- Not yet: a `--gpu-arch` CLI flag (only the `MFC_FAST_ARCH` env escape hatch exists), the + LLVMFlang `Fast` branch above, Cray-on-AMD validation, and `--help`/docs polish. 
From 9d9a7749660781bc4b7797ec92a0b04826129fd7 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Tue, 2 Jun 2026 18:05:18 -0500 Subject: [PATCH 3/3] cmake: LLVMFlang Fast build path + fix AFAR hipfort/lib paths - Add Fast build type branch for LLVMFlang OpenMP: -O1 -fno-lto at compile and link (replaces -O3 -flto-partitions). Note: provides no measurable speedup on AFAR 23.2.1 -- clang-linker-wrapper still runs whole-program device link regardless; real fix awaits libomptarget shipping with AMDGPU JIT backend enabled. - Fix hipfort-amdgcn find_library HINTS to also search lib/llvm/lib (AFAR drops place the .a there, not in lib/). - Fix hipfort include path: check lib/llvm/include/hipfort/amdgcn first (AFAR layout), fall back to include/hipfort/amdgcn (Frontier layout). - Delete fast_build.md (docs folded into BUILD_NOTES). --- CMakeLists.txt | 22 +++-- docs/documentation/fast_build.md | 139 ------------------------------- 2 files changed, 17 insertions(+), 144 deletions(-) delete mode 100644 docs/documentation/fast_build.md diff --git a/CMakeLists.txt b/CMakeLists.txt index 72c8fb1236..4c67194f20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -691,8 +691,15 @@ exit 0 target_compile_options(${a_target} PRIVATE -fopenmp) target_link_options(${a_target} PRIVATE -fopenmp) elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") - target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription) - target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS}) + if(CMAKE_BUILD_TYPE STREQUAL "Fast") + # Fast dev-iteration: -O1 + -fno-lto, meant to skip whole-program device LTO (no measured speedup on AFAR 23.2.1). + # (-fopenmp-target-jit is not yet supported by AMD flang as of 23.x.) 
+ target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O1 -fno-lto -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription) + target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -fno-lto) + else() + target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription) + target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS}) + endif() endif() endif() @@ -761,9 +768,14 @@ exit 0 find_library(HIP_LIB amdhip64 HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED) find_library(HIPFORT_AMDGCN_LIB hipfort-amdgcn - HINTS "$ENV{OLCF_AFAR_ROOT}/lib" REQUIRED) - target_include_directories(${a_target} PRIVATE - "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn") + HINTS "$ENV{OLCF_AFAR_ROOT}/lib" "$ENV{OLCF_AFAR_ROOT}/lib/llvm/lib" REQUIRED) + if(EXISTS "$ENV{OLCF_AFAR_ROOT}/lib/llvm/include/hipfort/amdgcn") + target_include_directories(${a_target} PRIVATE + "$ENV{OLCF_AFAR_ROOT}/lib/llvm/include/hipfort/amdgcn") + else() + target_include_directories(${a_target} PRIVATE + "$ENV{OLCF_AFAR_ROOT}/include/hipfort/amdgcn") + endif() target_link_libraries(${a_target} PRIVATE ${HIP_LIB} ${HIPFORT_AMDGCN_LIB}) diff --git a/docs/documentation/fast_build.md b/docs/documentation/fast_build.md deleted file mode 100644 index 396b12cc19..0000000000 --- a/docs/documentation/fast_build.md +++ /dev/null @@ -1,139 +0,0 @@ -# Fast Dev Builds (`--fast-build`) - -Status: **prototype / work in progress.** The NVHPC path is implemented and measured; -the AMD (LLVMFlang) path described below is an analysis with a proposed change that -still needs validation on an AMD GPU + AMD-compiler machine. - -## Motivation - -GPU builds are slow to iterate on when you just want to add a `print` and re-run. 
-Two different compilers, two different bottlenecks: - -- **NVHPC** — the per-iteration cost is the two-pass IPO link (`-Mextract` + `-Minline`), - plus device codegen for every targeted compute capability. Editing one hot file and - rebuilding takes minutes because the IPO passes re-run and device code is generated for - every arch in `MFC_CUDA_CC`. -- **AMD / LLVMFlang** — the per-iteration cost is the **device LTO link**. The link step - can take 20+ minutes *every* build, because the OpenMP offload device link does - whole-program LTO regardless of what changed. - -`--fast-build` is a dedicated build mode that strips the expensive, optimization-oriented -machinery that is pointless during print-debugging. - -## Usage - -```bash -./mfc.sh build -t simulation --gpu acc --fast-build -j 8 # NVHPC (OpenACC) -./mfc.sh build -t simulation --gpu mp --fast-build -j 8 # AMD/Cray (OpenMP offload) -``` - -`--fast-build` is mutually exclusive with `--debug` / `--reldebug`. It is **not** a -correctness build: no bounds checking, no `MFC_DEBUG` asserts. Add your own `print`/`write` -statements; pair with `--debug` when you need runtime checks. - -## What it does - -`--fast-build` selects a new CMake build type, `Fast`, that deliberately matches **none** of -the existing conditional flag blocks in `CMakeLists.txt`: - -- Not `Release`, so no IPO/LTO and no `-march=native`. -- Not `Debug`/`RelDebug`, so no `MFC_DEBUG` and no `-gpu=debug`. - -It then adds a light `-O1` (via `add_compile_options`, since the `CMAKE_*_FLAGS_FAST` cache -variables do not inject flags in this codebase). Because `MFC_DEBUG` is off, device routines -contain no host-only debug aborts, so the binary compiles cleanly **without** IPO. - -On NVHPC GPU builds it also restricts device codegen to a **single** compute capability — -the GPU on the build node, detected via `nvidia-smi` — overriding the multi-arch -`MFC_CUDA_CC` that the module files set. Set `MFC_FAST_ARCH=` (e.g. 
`MFC_FAST_ARCH=90`) -to override the detection on a login node with no visible GPU. - -## NVHPC results (measured) - -NVHPC 24.5, Quadro RTX 6000 (cc75), generic `simulation` build, 8 cores: - -| Scenario | Release (fat 5-arch) | `--fast-build` (single-arch) | -| --- | --- | --- | -| Clean full build | 641 s | 170 s (3.8x) | -| Hot-module incremental (`m_riemann_solvers`) | 385 s | 79 s (4.9x) | - -Verified: builds with no IPO (`-Mextract` absent), no `MFC_DEBUG`, single `-gpu=cc75`, and -the resulting binary runs a 1D case on the GPU to exit code 0 with finite output. - -## AMD / LLVMFlang: the device-LTO link (proposed, needs validation) - -The AMD GPU offload flags live in `CMakeLists.txt` (`MFC_SETUP_TARGET`): - -```cmake -# compile -target_compile_options(${a_target} PRIVATE - -fopenmp --offload-arch=gfx90a -O3 - -fopenmp-assume-threads-oversubscription - -fopenmp-assume-teams-oversubscription) -# link -target_link_options(${a_target} PRIVATE - -fopenmp --offload-arch=gfx90a -flto-partitions=${MFC_BUILD_JOBS}) -``` - -The `-flto-partitions` at link is the tell: the OpenMP offload **device link runs -whole-program LTO every time**, so even a one-file edit re-LTOs all device code. Single-arch -is not a lever here (already a single `gfx90a`). - -### Levers, best first - -1. **JIT the device code: `-fopenmp-target-jit`.** Instead of AOT-compiling and LTO-linking - device code into the binary at link time, embed device LLVM-IR and JIT each kernel at - runtime on first launch. The device LTO link essentially disappears, so the link drops to - roughly host-link time. Cost: a one-time JIT warmup on the *run* (tunable with - `LIBOMPTARGET_JIT_OPT_LEVEL`). This is the real fix for AMD iteration. - -2. **Build with a high `-j` (no code change — try this first).** `-flto-partitions` is set to - `MFC_BUILD_JOBS`, which is your `-j`. Building with `-j 8` on a 64-core node runs the - device LTO link only 8-way parallel. 
Use `-j 32`/`-j 64` to give the LTO link more - partitions; this alone may cut the link time substantially with no toolchain change. - -3. **Lower device optimization `-O3` -> `-O1`/`-O0` for dev builds.** The `-O3` drives the - heavy LTO optimization; lowering it cuts link work (slower runtime, fine for debugging). - -4. **`-fno-lto` (AOT, non-LTO device link).** Links per-translation-unit device objects - instead of whole-program LTO. Potentially faster, but less certain across ROCm/flang - versions — only if JIT does not pan out. - -### Proposed `Fast` branch for LLVMFlang - -Make the flags above build-type-aware so `--fast-build` emits, for `LLVMFlang`: - -```cmake -# compile --fopenmp --offload-arch=gfx90a -O1 -fopenmp-target-jit \ - -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription -# link (no -flto-partitions; JIT removes the whole-program device LTO) --fopenmp --offload-arch=gfx90a -fopenmp-target-jit -``` - -### How to validate on an AMD machine - -On a Frontier AMD / AFAR-style node (`source ./mfc.sh load -c famd -m g` or equivalent): - -1. **Baseline** — time the current link: - `./mfc.sh build -t simulation --gpu mp -j 8` and note the link duration. -2. **Free lever** — rebuild with a high `-j` (more LTO partitions) and compare the link time: - `./mfc.sh build -t simulation --gpu mp -j 64`. -3. **JIT lever** — once the `Fast` LLVMFlang branch is wired in, build with `--fast-build` - and confirm: (a) the link time collapses, (b) a small case runs to exit 0 (expect a - one-time JIT warmup on first launch). `OMP_TARGET_OFFLOAD=MANDATORY` is already set. - -### Caveat - -None of the AMD numbers are measured — LLVMFlang is not available on the development machine -used so far. The diagnosis follows directly from the build flags and from how LLVM OpenMP -offload works, but the exact flag spelling and ROCm/flang-version behavior must be confirmed -on real hardware before the LLVMFlang `Fast` branch is trusted. 
- -## Implementation notes / TODO - -- Implemented: `Fast` CMake build type, `fast_build` field in `MFCConfig` (auto - `--fast-build`/`--no-fast-build`, own build slug), NVHPC single-arch autodetect, lock-file - version bump for the new config field. -- Not yet: a `--gpu-arch` CLI flag (only the `MFC_FAST_ARCH` env escape hatch exists), the - LLVMFlang `Fast` branch above, Cray-on-AMD validation, and `--help`/docs polish.