Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
184 changes: 184 additions & 0 deletions docs/hooks/llms_txt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""Generate llms.txt, llms-full.txt, and per-page markdown (https://llmstxt.org/).

The hook publishes three artifacts into the built site:

- `llms.txt`: a markdown index of the documentation, one link per page,
grouped by nav section.
- a `.md` rendition of every prose page next to its HTML (e.g.
`tutorial/tools/index.md`), which is what the llms.txt links point at.
- `llms-full.txt`: every prose page concatenated for single-fetch consumption.

Page markdown is the source markdown with `--8<--` snippet includes resolved
(so the `docs_src/` code examples appear inline) and relative links rewritten
to absolute URLs. The API reference pages under `api/` are mkdocstrings stubs
with no markdown source, so they are linked as rendered HTML from an Optional
section instead of being embedded.

Incremental builds (`mkdocs build --dirty`) are rejected: they skip unmodified
pages, which would silently truncate the generated artifacts.
"""

from __future__ import annotations

import posixpath
import re
from dataclasses import dataclass, field
from pathlib import Path

from mkdocs.config.defaults import MkDocsConfig
from mkdocs.exceptions import PluginError
from mkdocs.structure.files import File, Files
from mkdocs.structure.nav import Navigation, Section
from mkdocs.structure.pages import Page

# Pages with no markdown source, linked as HTML under "## Optional".
_OPTIONAL_PAGES = [
("api/mcp/index.md", "mcp API reference", "Auto-generated API reference for the mcp package (rendered HTML)"),
(
"api/mcp_types/index.md",
"mcp-types API reference",
"Auto-generated API reference for the mcp-types package (rendered HTML)",
),
]

_SNIPPET_LINE = re.compile(r'^(?P<indent>[ \t]*)--8<-- "(?P<path>[^"\n]+)"$', flags=re.MULTILINE)

@cubic-dev-ai cubic-dev-ai Bot Jun 29, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: Snippet parsing is narrower than pymdownx snippets syntax, so valid include directives can fail the build. Expand the regex to accept documented inline marker variants.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At docs/hooks/llms_txt.py, line 43:

<comment>Snippet parsing is narrower than pymdownx snippets syntax, so valid include directives can fail the build. Expand the regex to accept documented inline marker variants.</comment>

<file context>
@@ -0,0 +1,172 @@
+    ),
+]
+
+_SNIPPET_LINE = re.compile(r'^(?P<indent>[ \t]*)--8<-- "(?P<path>[^"\n]+)"$', flags=re.MULTILINE)
+_MD_LINK = re.compile(r'(\]\()([^)\s]+\.md)(#[^)\s]*)?( +"[^"]*")?(\))')
+
</file context>
Fix with cubic

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also deliberate — the docs only use whole-file includes of complete runnable examples from docs_src/ (the convention described in mkdocs.yml), so the hook supports exactly that pattern and fails the build with an error naming the page for anything else. Section syntax and the block form both die loudly rather than rendering wrong. If we ever adopt other variants, extending the hook then beats carrying an implementation of pymdownx's full semantics that nothing exercises.

AI Disclaimer

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That whole-file-only boundary is intentional, so the parent comment was too broad here. This hook should keep failing loud on section/block variants rather than growing full pymdownx snippet semantics.

_MD_LINK = re.compile(r'(\]\()([^)\s]+\.md)(#[^)\s]*)?( +"[^"]*")?(\))')


@dataclass
class _State:
page_markdown: dict[str, str] = field(default_factory=dict)
rendition_uris: set[str] = field(default_factory=set)
nav: Navigation | None = None
files: Files | None = None


_state = _State()


def _site_url(config: MkDocsConfig) -> str:
assert config.site_url is not None
return config.site_url.rstrip("/") + "/"


def _md_uri(file: File) -> str:
return re.sub(r"\.html$", ".md", file.dest_uri)


def on_config(config: MkDocsConfig) -> None:
    """Reset the shared state so every build starts from scratch.

    `mkdocs serve` rebuilds reuse the already-imported module, so whatever the
    previous build collected has to be dropped here.
    """
    for collected in (_state.page_markdown, _state.rendition_uris):
        collected.clear()
    _state.nav = None
    _state.files = None


def on_nav(nav: Navigation, config: MkDocsConfig, files: Files) -> None:
    """Remember the navigation and files, and register the pages to render.

    Every nav page whose source path does not start with `api/` is marked as
    receiving a markdown rendition.
    """
    _state.nav, _state.files = nav, files
    for nav_page in nav.pages:
        src_uri = nav_page.file.src_uri
        if not src_uri.startswith("api/"):
            _state.rendition_uris.add(src_uri)


def on_page_markdown(markdown: str, page: Page, config: MkDocsConfig, files: Files) -> str | None:
if page.file.src_uri not in _state.rendition_uris:
return None

# Same anchor as the pymdownx.snippets `base_path` in mkdocs.yml.
repo_root = Path(config.config_file_path).parent

def include(match: re.Match[str]) -> str:
    """Return the text of the snippet file named by one include line.

    The file is looked up relative to `repo_root`. A `.py` snippet gets a
    leading `# <path>` comment, and every non-empty line is prefixed with the
    include line's own indentation.

    Raises:
        PluginError: if the path resolves outside the repo root, or the file
            cannot be read or is not valid UTF-8.
    """
    indent, path = match["indent"], match["path"]
    # Mirror the snippets extension's restrict_base_path: reject paths
    # that resolve outside the repo root.
    resolved_path = (repo_root / path).resolve()
    if not resolved_path.is_relative_to(repo_root.resolve()):
        raise PluginError(f"llms_txt: snippet path {path!r} in {page.file.src_uri} escapes the repo root")
    try:
        content = resolved_path.read_text(encoding="utf-8").rstrip("\n")
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError; without it a
        # mis-encoded snippet would escape as a traceback that names no page.
        raise PluginError(f"llms_txt: cannot read snippet {path!r} in {page.file.src_uri}") from exc
    # Keep a pointer to the embedded file so readers can find it on disk.
    if path.endswith(".py"):
        content = f"# {path}\n{content}"
    if indent:
        # Blank lines stay empty so the include adds no trailing whitespace.
        content = "\n".join(indent + line if line else line for line in content.split("\n"))
    return content

resolved, substitutions = _SNIPPET_LINE.subn(include, markdown)
if substitutions != sum("--8<--" in line for line in markdown.splitlines()):

@cubic-dev-ai cubic-dev-ai Bot Jun 29, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2: The unresolved-include guard treats literal/escaped --8<-- text as failures. Check for remaining real, unescaped include directives instead of raw substring counts.

Prompt for AI agents
Check if this issue is valid — if so, understand the root cause and fix it. At docs/hooks/llms_txt.py, line 99:

<comment>The unresolved-include guard treats literal/escaped `--8<--` text as failures. Check for remaining real, unescaped include directives instead of raw substring counts.</comment>

<file context>
@@ -0,0 +1,172 @@
+        return content
+
+    resolved, substitutions = _SNIPPET_LINE.subn(include, markdown)
+    if substitutions != sum("--8<--" in line for line in markdown.splitlines()):
+        raise PluginError(f"llms_txt: unresolved snippet include in {page.file.src_uri}")
+
</file context>
Suggested change
if substitutions != sum("--8<--" in line for line in markdown.splitlines()):
if re.search(r'^[ \t]*(?!;)-{1,}8<-{1,}(?:$|[ \t]+)', resolved, flags=re.MULTILINE):
Fix with cubic

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one's deliberate. The guard's job is to make any marker-bearing line the hook didn't consume fail the build rather than ship as junk in the renditions — including malformed directives that a "does it look like a directive" regex would miss. The cost is that a page mentioning --8<-- literally fails the build, but no page does today, and the error names the page, so whoever hits it first can adjust the hook with the actual case in front of them. A separate detection regex can drift from the consume regex, which is exactly the silent gap this avoids.

AI Disclaimer

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parent comment was wrong here: the guard is intentionally checking for any marker-bearing line the hook didn’t consume, including malformed directives and literal --8<-- text. That keeps the detection in lockstep with the consume regex and avoids a silent drift gap.

Thanks for the feedback! I've saved this as a new learning to improve future reviews.

raise PluginError(f"llms_txt: unresolved snippet include in {page.file.src_uri}")

site_url = _site_url(config)
src_dir = posixpath.dirname(page.file.src_uri)

def rewrite(match: re.Match[str]) -> str:
    """Rewrite one relative `.md` link into an absolute URL on the built site.

    Raises:
        PluginError: if the link target is not among the site's files.
    """
    opening, target, anchor, title, closing = match.groups()
    # A target that carries a scheme is already absolute; leave it untouched.
    if "://" in target:
        return match.group(0)

    # Resolve the target against the directory of the page being processed.
    source_path = posixpath.normpath(posixpath.join(src_dir, target))
    linked = files.get_file_from_path(source_path)
    if linked is None:
        raise PluginError(f"llms_txt: cannot resolve link target {target!r} in {page.file.src_uri}")

    if linked.src_uri in _state.rendition_uris:
        url = _md_uri(linked)
    else:
        # Pages without a markdown rendition (the api/ stubs) link to their HTML instead.
        url = linked.url
    return f"{opening}{site_url}{url}{anchor or ''}{title or ''}{closing}"

_state.page_markdown[page.file.src_uri] = _MD_LINK.sub(rewrite, resolved)
return None


def _section_pages(section: Section) -> list[Page]:
    """Collect the rendition pages under *section*, in nav order.

    Nested sections are walked recursively and flattened into one list. Pages
    that are not registered for a markdown rendition are left out.
    """
    collected: list[Page] = []
    for entry in section.children:
        if isinstance(entry, Section):
            collected += _section_pages(entry)
        elif isinstance(entry, Page) and entry.file.src_uri in _state.rendition_uris:
            collected.append(entry)
    return collected


def on_post_build(config: MkDocsConfig) -> None:
    """Write the `.md` renditions, `llms.txt` and `llms-full.txt` into the built site.

    Raises:
        PluginError: if a page registered for a rendition was never processed
            by on_page_markdown (as happens in a `--dirty` build), if a page's
            markdown does not start with an H1, or if one of the
            `_OPTIONAL_PAGES` is missing from the site's files.
    """
    # Both are set by on_nav.
    assert _state.nav is not None and _state.files is not None
    missing = _state.rendition_uris - _state.page_markdown.keys()
    if missing:
        raise PluginError(f"llms_txt: pages skipped this build (is this a --dirty build?): {sorted(missing)}")

    site_dir = Path(config.site_dir)
    site_url = _site_url(config)

    # Top-level pages are grouped under a synthetic "Docs" heading. Each
    # top-level nav section then gets its own heading, nested sections included.
    top_level = [
        item for item in _state.nav.items if isinstance(item, Page) and item.file.src_uri in _state.rendition_uris
    ]
    sections: list[tuple[str, list[Page]]] = [("Docs", top_level)] if top_level else []
    for item in _state.nav.items:
        if isinstance(item, Section):
            pages = _section_pages(item)
            if pages:
                sections.append((item.title, pages))

    index = [f"# {config.site_name}", ""]
    # The summary blockquote is optional in llms.txt. Leave it out when no
    # site_description is configured instead of publishing a literal "> None".
    if config.site_description:
        index += [f"> {config.site_description}", ""]

    full: list[str] = []
    for title, pages in sections:
        index += [f"## {title}", ""]
        for page in pages:
            markdown = _state.page_markdown[page.file.src_uri]
            (site_dir / _md_uri(page.file)).write_text(markdown, encoding="utf-8")

            description = page.meta.get("description")
            tail = f": {description}" if description else ""
            index.append(f"- [{page.title}]({site_url}{_md_uri(page.file)}){tail}")

            # llms-full.txt supplies its own H1 per page, so the page's leading
            # H1 is stripped before the body is appended.
            body, h1_found = re.subn(r"\A\s*# .+\n", "", markdown)
            if not h1_found:
                raise PluginError(f"llms_txt: page {page.file.src_uri} does not start with an H1")
            full += [f"# {page.title}", "", f"Source: {page.canonical_url}", "", body.strip(), ""]
        index.append("")

    index += ["## Optional", ""]
    for src_uri, title, description in _OPTIONAL_PAGES:
        linked = _state.files.get_file_from_path(src_uri)
        if linked is None:
            raise PluginError(f"llms_txt: optional page {src_uri} not found")
        index.append(f"- [{title}]({site_url}{linked.url}): {description}")
    index.append("")

    (site_dir / "llms.txt").write_text("\n".join(index), encoding="utf-8")
    (site_dir / "llms-full.txt").write_text("\n".join(full), encoding="utf-8")
3 changes: 3 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,6 @@ You wrote two Python functions with type hints and a docstring. The SDK does the
* The **[Tutorial](tutorial/index.md)** walks through everything a server can do, one small step at a time.
* Migrating from v1? Start with the **[Migration Guide](migration.md)**.
* Hunting for an exact signature? The **[API Reference](api/mcp/index.md)** is generated from the source.
* Reading with an LLM? This documentation is also published in the [llms.txt](https://llmstxt.org/) format:
[llms.txt](https://py.sdk.modelcontextprotocol.io/v2/llms.txt) is an index of the pages, and
[llms-full.txt](https://py.sdk.modelcontextprotocol.io/v2/llms-full.txt) contains every page in a single file.
7 changes: 6 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,12 @@ markdown_extensions:
- pymdownx.superfences
# Code examples live as complete, importable, tested files under `docs_src/`
# and are included into pages with `--8<-- "docs_src/<chapter>/tutorialNNN.py"`
# (resolved against the repo root, the extension's default base_path).
# (resolved against the repo root regardless of the build's working
# directory; the extension's default base_path is the CWD).
# `check_paths: true` + `strict: true` turn a renamed/deleted example into a
# build failure instead of a silently empty code block.
- pymdownx.snippets:
base_path: !relative $config_dir
check_paths: true
- pymdownx.tilde
- pymdownx.inlinehilite
Expand Down Expand Up @@ -146,6 +148,9 @@ watch:
- src
- docs_src

hooks:
- docs/hooks/llms_txt.py

plugins:
- search
- social:
Expand Down
Loading