From 4f2147af345141d97ad691fef9aed8a616034042 Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 27 Apr 2026 11:37:53 +0800 Subject: [PATCH 01/22] support generating release notes by ai --- scripts/release_notes_ai/__init__.py | 1 + scripts/release_notes_ai/ai_client.py | 296 +++++++ scripts/release_notes_ai/cli.py | 283 ++++++ scripts/release_notes_ai/constants.py | 98 +++ scripts/release_notes_ai/excel_workbook.py | 906 ++++++++++++++++++++ scripts/release_notes_ai/github_client.py | 321 +++++++ scripts/release_notes_ai/markdown_writer.py | 121 +++ scripts/release_notes_ai/models.py | 101 +++ scripts/release_notes_ai/requirements.txt | 3 + scripts/release_notes_ai/scope_filter.py | 366 ++++++++ scripts/release_notes_ai/utils.py | 87 ++ scripts/release_notes_generate_ai.py | 10 + 12 files changed, 2593 insertions(+) create mode 100644 scripts/release_notes_ai/__init__.py create mode 100644 scripts/release_notes_ai/ai_client.py create mode 100644 scripts/release_notes_ai/cli.py create mode 100644 scripts/release_notes_ai/constants.py create mode 100644 scripts/release_notes_ai/excel_workbook.py create mode 100644 scripts/release_notes_ai/github_client.py create mode 100644 scripts/release_notes_ai/markdown_writer.py create mode 100644 scripts/release_notes_ai/models.py create mode 100644 scripts/release_notes_ai/requirements.txt create mode 100644 scripts/release_notes_ai/scope_filter.py create mode 100644 scripts/release_notes_ai/utils.py create mode 100644 scripts/release_notes_generate_ai.py diff --git a/scripts/release_notes_ai/__init__.py b/scripts/release_notes_ai/__init__.py new file mode 100644 index 0000000000000..65f7e128c779b --- /dev/null +++ b/scripts/release_notes_ai/__init__.py @@ -0,0 +1 @@ +"""Helpers for generating TiDB release notes with AI.""" diff --git a/scripts/release_notes_ai/ai_client.py b/scripts/release_notes_ai/ai_client.py new file mode 100644 index 0000000000000..503e28b63023b --- /dev/null +++ b/scripts/release_notes_ai/ai_client.py 
@@ -0,0 +1,296 @@ +from __future__ import annotations + +import dataclasses +from functools import lru_cache +import json +import os +import shlex +import shutil +import subprocess +import tempfile +import textwrap +from pathlib import Path +from typing import Any + +from .constants import BUG_FIXES_REFERENCE, IMPROVEMENTS_REFERENCE +from .models import GeneratedNote, RowContext + + +class AIClient: + def __init__(self, command: str, model: str | None, timeout: int): + self.command = shlex.split(command) + self.model = model + self.timeout = timeout + + def generate(self, prompt: str, expected_links: list[str], contributors: list[str]) -> GeneratedNote: + result, errors = self._run_and_validate(prompt, expected_links, contributors) + if result: + return result + + repair_prompt = build_repair_prompt(prompt, errors) + result, repair_errors = self._run_and_validate(repair_prompt, expected_links, contributors) + if result: + return result + raise ValueError("; ".join(repair_errors)) + + def _run_and_validate( + self, prompt: str, expected_links: list[str], contributors: list[str] + ) -> tuple[GeneratedNote | None, list[str]]: + output = self._run(prompt) + try: + data = extract_json_object(output) + except ValueError as exc: + return None, [str(exc)] + return validate_ai_response(data, expected_links, contributors) + + def _run(self, prompt: str) -> str: + command = list(self.command) + if not command: + raise ValueError("AI command is empty. Pass a command with --ai-command.") + if not is_executable_available(command[0]): + raise FileNotFoundError( + f"AI command executable not found: {command[0]!r}. " + "Install it or pass a custom command with --ai-command." 
+ ) + + with tempfile.TemporaryDirectory() as temp_dir: + output_path: Path | None = None + if self._is_codex_exec(command): + if self.model: + command.extend(["-m", self.model]) + temp_path = Path(temp_dir) + schema_path = temp_path / "ai-output-schema.json" + output_path = temp_path / "ai-output.txt" + schema_path.write_text(json.dumps(ai_output_schema()), encoding="utf-8") + output_path.touch() + command.extend(["--output-schema", str(schema_path)]) + command.extend(["--output-last-message", str(output_path)]) + + completed = subprocess.run( + command, + input=prompt, + text=True, + capture_output=True, + timeout=self.timeout, + check=False, + ) + if completed.returncode != 0: + raise RuntimeError( + "AI command failed with exit code " + f"{completed.returncode}: {summarize_process_output(completed)}" + ) + if output_path and output_path.exists(): + last_message = output_path.read_text(encoding="utf-8").strip() + if last_message: + return last_message + return completed.stdout.strip() + + @staticmethod + def _is_codex_exec(command: list[str]) -> bool: + if not command: + return False + executable = Path(command[0]).name + return executable == "codex" and "exec" in command[1:] + + +def is_executable_available(executable: str) -> bool: + if os.sep in executable or (os.altsep and os.altsep in executable): + return Path(executable).exists() + return shutil.which(executable) is not None + + +def ai_output_schema() -> dict[str, Any]: + return { + "type": "object", + "additionalProperties": False, + "required": ["type", "release_note", "needs_review", "reason"], + "properties": { + "type": {"type": "string", "enum": ["improvement", "bug_fix"]}, + "release_note": {"type": "string"}, + "needs_review": {"type": "boolean"}, + "reason": {"type": "string"}, + }, + } + + +def summarize_process_output(completed: subprocess.CompletedProcess[str]) -> str: + parts = [] + if completed.stderr.strip(): + parts.append("stderr:\n" + tail_output(completed.stderr)) + if 
completed.stdout.strip(): + parts.append("stdout:\n" + tail_output(completed.stdout)) + return "\n\n".join(parts) or "no output" + + +def tail_output(text: str, max_lines: int = 40, max_chars: int = 4000) -> str: + tail = "\n".join(text.strip().splitlines()[-max_lines:]) + if len(tail) > max_chars: + tail = "...[truncated]\n" + tail[-max_chars:] + return tail + + +def build_generation_prompt( + row_context: RowContext, + expected_links: list[str], + contributors: list[str], +) -> str: + improvements_reference = load_reference_file(IMPROVEMENTS_REFERENCE) + bug_fixes_reference = load_reference_file(BUG_FIXES_REFERENCE) + context = { + "row_number": row_context.row_number, + "component": row_context.component, + "raw_component_from_excel": row_context.raw_component, + "issue_type_from_excel": row_context.issue_type, + "pr_title_from_excel": row_context.pr_title, + "formatted_release_note_from_excel": row_context.formatted_release_note, + "expected_links": expected_links, + "contributors": contributors, + "issues": [dataclasses.asdict(issue) for issue in row_context.issues], + "pull_requests": [dataclasses.asdict(pull) for pull in row_context.pulls], + } + return textwrap.dedent( + f""" + You write exactly one English TiDB release note entry. + + Return only a JSON object with exactly these keys: + - type: "improvement" or "bug_fix" + - release_note: one Markdown bullet that starts with "- " + - needs_review: true or false + - reason: a short reason for the type and wording + + Rules: + - Write from the user's perspective. + - Use the Excel issue_type as a strong signal, but decide the final type from the issue, + PR description, and code changes. + - For improvements, follow the Improvements reference below. + - For bug fixes, follow the Bug fixes reference below. + - Do not end the release note with a period. + - Include every expected link in Markdown release-note style. + - Include every contributor as @[user](https://github.com/user). 
+ - If there is no issue URL, use the PR link as the suffix link. + - Do not expose internal function names unless they are the user-visible behavior. + - If the available context is insufficient, still draft the best note and set needs_review + to true. + + Expected links: + {json.dumps(expected_links, ensure_ascii=False, indent=2)} + + Contributors: + {json.dumps(contributors, ensure_ascii=False, indent=2)} + + Row context: + {json.dumps(context, ensure_ascii=False, indent=2)} + + Improvements reference: + {improvements_reference} + + Bug fixes reference: + {bug_fixes_reference} + """ + ).strip() + + +def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: + return textwrap.dedent( + f""" + Your previous answer did not satisfy the required JSON schema or release-note rules. + + Validation errors: + {json.dumps(errors, ensure_ascii=False, indent=2)} + + Rewrite the answer. Return only the corrected JSON object. + + Original task: + {original_prompt} + """ + ).strip() + + +@lru_cache(maxsize=None) +def load_reference_file(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except FileNotFoundError as exc: + raise FileNotFoundError( + f"Cannot find release-note reference file: {path}. " + "Make sure the repo-local write-review-translate-release-notes skill is present." 
+ ) from exc + + +def extract_json_object(output: str) -> dict[str, Any]: + output = output.strip() + if not output: + raise ValueError("AI command returned no output") + try: + data = json.loads(output) + except json.JSONDecodeError: + candidates = extract_json_object_candidates(output) + if not candidates: + raise ValueError("AI output did not contain a JSON object") from None + required_keys = {"type", "release_note", "needs_review", "reason"} + data = next( + (candidate for candidate in candidates if required_keys <= candidate.keys()), + candidates[0], + ) + if not isinstance(data, dict): + raise ValueError("AI output JSON is not an object") + return data + + +def extract_json_object_candidates(output: str) -> list[dict[str, Any]]: + decoder = json.JSONDecoder() + candidates: list[dict[str, Any]] = [] + for index, char in enumerate(output): + if char != "{": + continue + try: + data, _end = decoder.raw_decode(output[index:]) + except json.JSONDecodeError: + continue + if isinstance(data, dict): + candidates.append(data) + return candidates + + +def validate_ai_response( + data: dict[str, Any], + expected_links: list[str], + contributors: list[str], +) -> tuple[GeneratedNote | None, list[str]]: + errors: list[str] = [] + note_type = data.get("type") + release_note = data.get("release_note") + needs_review = data.get("needs_review") + reason = data.get("reason") + + if note_type not in {"improvement", "bug_fix"}: + errors.append('type must be "improvement" or "bug_fix"') + if not isinstance(release_note, str) or not release_note.startswith("- "): + errors.append('release_note must be a string that starts with "- "') + if isinstance(release_note, str) and release_note.rstrip().endswith("."): + errors.append("release_note must not end with a period") + if not isinstance(needs_review, bool): + errors.append("needs_review must be a boolean") + if not isinstance(reason, str): + errors.append("reason must be a string") + + if isinstance(release_note, str): + for link 
in expected_links: + if link and link not in release_note: + errors.append(f"release_note is missing expected link: {link}") + for contributor in contributors: + expected = f"@[{contributor}](https://github.com/{contributor})" + if contributor and expected not in release_note: + errors.append(f"release_note is missing contributor: {contributor}") + + if errors: + return None, errors + return ( + GeneratedNote( + note_type=str(note_type), + release_note=str(release_note).strip(), + needs_review=bool(needs_review), + reason=str(reason).strip(), + ), + [], + ) diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py new file mode 100644 index 0000000000000..ee1d79a074c4a --- /dev/null +++ b/scripts/release_notes_ai/cli.py @@ -0,0 +1,283 @@ +from __future__ import annotations + +import argparse +import os +import tempfile +from pathlib import Path + +import openpyxl + +from .ai_client import AIClient +from .excel_workbook import ( + clear_output_columns, + generate_notes_without_ai, + generate_notes_for_sheet, + merge_rows_by_issue_and_component, + prepare_sheet_columns, + sort_sheet_rows_by_component, + store_existing_release_notes, + update_pr_authors_and_dup_notes, +) +from .github_client import GitHubClient +from .markdown_writer import write_release_file +from .scope_filter import move_prs_not_in_scope, parse_date_value + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate English release notes with AI from a tirelease workbook." 
+ ) + parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.") + parser.add_argument("--excel", required=True, help="Path to the release note Excel workbook.") + parser.add_argument( + "--releases-dir", + required=True, + help="Path to the existing English release notes directory.", + ) + parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.") + parser.add_argument("--github-token-file", help="Path to a GitHub token file.") + parser.add_argument( + "--ai-command", + default="codex --ask-for-approval never exec --sandbox read-only --ephemeral", + help="Command-line AI command. The prompt is passed through stdin.", + ) + parser.add_argument( + "--ai-model", + default="gpt-5.4", + help="Model name passed to codex exec with -m.", + ) + parser.add_argument( + "--involve-ai-generation", + type=parse_on_off, + default="ON", + help=( + "Whether to use AI for non-dup release notes. Use ON to generate with AI, " + "or OFF to output the original formated_release_note values. Default: ON." + ), + ) + parser.add_argument( + "--output-release-file", + help="Output Markdown file. Defaults to release-{version}-updated-by-ai.md.", + ) + parser.add_argument( + "--ai-timeout", + type=int, + default=600, + help="Timeout in seconds for each AI command invocation.", + ) + parser.add_argument( + "--ai-workers", + type=int, + default=3, + help=( + "Number of concurrent AI command invocations. The default is conservative " + "for codex exec subprocesses." + ), + ) + parser.add_argument( + "--github-workers", + type=int, + default=8, + help="Number of concurrent GitHub API prefetch workers.", + ) + parser.add_argument( + "--author-workers", + type=int, + default=3, + help="Number of concurrent workers used to resolve bot-authored cherry-pick PR authors.", + ) + parser.add_argument( + "--checkpoint-interval", + type=int, + default=1, + help=( + "Save the Excel workbook after every N completed AI rows. " + "Default: 1. 
Use 0 to disable." + ), + ) + parser.add_argument( + "--force-regenerate", + action="store_true", + help="Clear existing AI release notes and regenerate all non-dup rows.", + ) + parser.add_argument( + "--release-date", + default="TBD", + help='Release date text for the Markdown header, for example "August 14, 2025".', + ) + parser.add_argument( + "--skip-scope-preprocess", + action="store_true", + help="Skip moving not-in-scope PR rows to the PRs_not_in_scope sheet.", + ) + parser.add_argument( + "--scope-base-branch-start-date", + help=( + "Override the estimated release-m.n branch start date for x.y.0 scope " + "preprocessing, in YYYY-MM-DD format." + ), + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + validate_positive_int("--ai-workers", args.ai_workers) + validate_positive_int("--github-workers", args.github_workers) + validate_positive_int("--author-workers", args.author_workers) + if args.checkpoint_interval < 0: + raise ValueError("--checkpoint-interval must be greater than or equal to 0") + base_branch_start_date = None + if args.scope_base_branch_start_date: + base_branch_start_date = parse_date_value(args.scope_base_branch_start_date) + if not base_branch_start_date: + raise ValueError("--scope-base-branch-start-date must use YYYY-MM-DD format") + + token = load_github_token(args.github_token_file) + github = GitHubClient(token) + involve_ai_generation = args.involve_ai_generation == "ON" + ai = AIClient(args.ai_command, args.ai_model, args.ai_timeout) if involve_ai_generation else None + + output_file = ( + Path(args.output_release_file) + if args.output_release_file + else Path(args.releases_dir) / f"release-{args.version}-updated-by-ai.md" + ) + + excel_path = Path(args.excel) + processed_excel_path = default_processed_excel_path(excel_path) + workbook = openpyxl.load_workbook(excel_path) + if args.sheet not in workbook.sheetnames: + raise ValueError(f"Cannot find sheet {args.sheet!r} in {args.excel}") + sheet = 
workbook[args.sheet] + if not args.skip_scope_preprocess: + move_prs_not_in_scope( + workbook, + sheet, + args.version, + Path(args.releases_dir), + github, + base_branch_start_date=base_branch_start_date, + ) + sort_sheet_rows_by_component(sheet) + header = prepare_sheet_columns(sheet) + clear_output_columns(sheet, header, clear_ai=args.force_regenerate) + + existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version) + update_pr_authors_and_dup_notes( + sheet, + header, + existing_notes, + github, + author_workers=args.author_workers, + ) + merge_rows_by_issue_and_component(sheet, header) + + if involve_ai_generation: + checkpoint_callback = build_checkpoint_callback( + workbook, + processed_excel_path, + args.checkpoint_interval, + ) + markdown_entries = generate_notes_for_sheet( + sheet, + header, + github, + ai, + ai_workers=args.ai_workers, + github_workers=args.github_workers, + checkpoint_callback=checkpoint_callback, + ) + else: + markdown_entries = generate_notes_without_ai(sheet, header) + save_workbook_safely(workbook, processed_excel_path) + write_release_file(output_file, args.version, args.release_date, markdown_entries) + + print(f"Original Excel workbook unchanged: {excel_path}", flush=True) + print(f"Processed Excel workbook: {processed_excel_path}", flush=True) + print(f"Generated release note file: {output_file}", flush=True) + return 0 + + +def validate_positive_int(name: str, value: int) -> None: + if value < 1: + raise ValueError(f"{name} must be greater than or equal to 1") + + +def parse_on_off(value: str) -> str: + normalized = value.strip().upper() + if normalized not in {"ON", "OFF"}: + raise argparse.ArgumentTypeError("value must be ON or OFF") + return normalized + + +def default_processed_excel_path(excel_path: Path) -> Path: + return excel_path.with_name(f"{excel_path.stem}_processed{excel_path.suffix}") + + +def build_checkpoint_callback( + workbook: openpyxl.Workbook, + excel_path: Path, + 
checkpoint_interval: int, +): + if checkpoint_interval <= 0: + return None + + def checkpoint(completed: int, total: int) -> None: + if completed % checkpoint_interval != 0 and completed != total: + return + save_workbook_safely(workbook, excel_path) + print( + f"Checkpoint saved after {completed}/{total} AI row(s): {excel_path}", + flush=True, + ) + + return checkpoint + + +def save_workbook_safely(workbook: openpyxl.Workbook, excel_path: Path) -> None: + excel_path = excel_path.resolve() + temp_file = tempfile.NamedTemporaryFile( + prefix=f".{excel_path.stem}.", + suffix=excel_path.suffix, + dir=excel_path.parent, + delete=False, + ) + temp_path = Path(temp_file.name) + temp_file.close() + saved_temp = False + try: + workbook.save(temp_path) + saved_temp = True + os.replace(temp_path, excel_path) + except Exception as exc: + if saved_temp and temp_path.exists(): + raise RuntimeError( + f"Failed to replace {excel_path}: {exc}. " + f"A complete temporary workbook remains at {temp_path}." 
+ ) from exc + temp_path.unlink(missing_ok=True) + raise RuntimeError(f"Failed to save workbook {excel_path}: {exc}") from exc + + +def load_github_token(token_file: str | None) -> str | None: + import shutil + import subprocess + + if token_file: + return Path(token_file).read_text(encoding="utf-8").strip() + if os.environ.get("GITHUB_TOKEN"): + return os.environ["GITHUB_TOKEN"].strip() + gh = shutil.which("gh") + if not gh: + return None + completed = subprocess.run( + [gh, "auth", "token"], + text=True, + capture_output=True, + timeout=10, + check=False, + ) + if completed.returncode == 0 and completed.stdout.strip(): + return completed.stdout.strip() + return None diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py new file mode 100644 index 0000000000000..c3e947167a23b --- /dev/null +++ b/scripts/release_notes_ai/constants.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import re +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] +IMPROVEMENTS_REFERENCE = ( + REPO_ROOT + / ".ai" + / "skills" + / "write-review-translate-release-notes" + / "references" + / "improvements.md" +) +BUG_FIXES_REFERENCE = ( + REPO_ROOT + / ".ai" + / "skills" + / "write-review-translate-release-notes" + / "references" + / "bug-fixes.md" +) + +BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"} +# Keep the misspelled source column name because tirelease exports it this way. 
+REQUIRED_HEADERS = { + "pr_author", + "pr_link", + "pr_title", + "formated_release_note", + "issue_type", +} +COMPONENT_HEADERS = ("component", "components") + +GITHUB_ITEM_URL_RE = re.compile( + r"https://github\.com/(?P[^/\s]+)/(?P[\w.-]+)/" + r"(?Pissues|pull)/(?P\d+)" +) +ISSUE_URL_RE = re.compile( + r"https://github\.com/(?P[^/\s]+)/(?P[\w.-]+)/issues/(?P\d+)" +) +PR_URL_RE = re.compile( + r"https://github\.com/(?P[^/\s]+)/(?P[\w.-]+)/pull/(?P\d+)" +) +AUTHOR_RE = re.compile(r"@\[([^\]]+)\]") + +TOP_LEVEL_COMPONENTS = ["TiDB", "TiKV", "PD", "TiFlash", "TiProxy"] +TOOL_COMPONENTS = [ + "Backup & Restore (BR)", + "TiCDC", + "TiDB Data Migration (DM)", + "TiDB Lightning", + "Dumpling", + "TiUP", + "TiDB Binlog", + "sync-diff-inspector", +] +COMPONENT_ALIASES = { + "tidb": "TiDB", + "tikv": "TiKV", + "pd": "PD", + "tiflash": "TiFlash", + "tiproxy": "TiProxy", + "br": "Backup & Restore (BR)", + "backup & restore": "Backup & Restore (BR)", + "backup & restore (br)": "Backup & Restore (BR)", + "cdc": "TiCDC", + "ticdc": "TiCDC", + "dm": "TiDB Data Migration (DM)", + "tidb data migration": "TiDB Data Migration (DM)", + "tidb data migration (dm)": "TiDB Data Migration (DM)", + "tidb lightning": "TiDB Lightning", + "lightning": "TiDB Lightning", + "dumpling": "Dumpling", + "tiup": "TiUP", + "tidb binlog": "TiDB Binlog", + "ng monitoring": "TiDB", + "sync_diff": "sync-diff-inspector", + "sync-diff-inspector": "sync-diff-inspector", + "sync diff inspector": "sync-diff-inspector", + "planner": "TiDB", + "execution": "TiDB", + "sql-infra": "TiDB", + "transaction": "TiDB", + "engine": "TiDB", + "observability": "TiDB", + "dxf": "TiDB", + "storage": "TiDB", + "tidb-dashboard": "TiDB", + "tidb dashboard": "TiDB", + "ddl": "TiDB", + "coprocessor": "TiDB", + "compute": "TiDB", + "scheduling": "TiDB", + "spm": "TiDB", + "ng-monitoring": "TiDB", +} diff --git a/scripts/release_notes_ai/excel_workbook.py b/scripts/release_notes_ai/excel_workbook.py new file mode 100644 index 
0000000000000..260b4b807d04e --- /dev/null +++ b/scripts/release_notes_ai/excel_workbook.py @@ -0,0 +1,906 @@ +from __future__ import annotations + +import copy +import re +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from collections import OrderedDict +from pathlib import Path +from typing import Any, Callable + +from openpyxl.styles import PatternFill + +from .ai_client import build_generation_prompt +from .constants import ( + AUTHOR_RE, + BOT_AUTHORS, + COMPONENT_HEADERS, + GITHUB_ITEM_URL_RE, + REQUIRED_HEADERS, + TOOL_COMPONENTS, + TOP_LEVEL_COMPONENTS, +) +from .models import ( + ExistingNote, + GitHubDataCache, + MarkdownEntry, + RowContext, + RowGenerationResult, + RowInput, +) +from .utils import ( + extract_issue_urls, + extract_pr_urls, + normalize_component, + normalize_raw_component, + normalized_release_component, + replace_author_markdown, + split_lines, + split_multi_value, + str_value, + unique_ordered, +) + + +GRAY_FILL = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid") + + +def prepare_sheet_columns(sheet: Any) -> dict[str, int]: + header = get_header(sheet) + missing = sorted(REQUIRED_HEADERS - set(header)) + if missing: + raise ValueError(f"Missing required Excel columns: {', '.join(missing)}") + get_component_col(header) + + ai_col = header.get("release_notes_written_by_ai") + formatted_col = header["formated_release_note"] + if not ai_col: + sheet.insert_cols(formatted_col + 1) + sheet.cell(row=1, column=formatted_col + 1, value="release_notes_written_by_ai") + header = get_header(sheet) + + if "published_release_notes" not in header: + last_col = sheet.max_column + sheet.cell(row=1, column=last_col + 1, value="published_release_notes") + header = get_header(sheet) + return header + + +def get_header(sheet: Any) -> dict[str, int]: + header: dict[str, int] = {} + for index, cell in enumerate(sheet[1], start=1): + if cell.value: + header[str(cell.value).strip()] = index + return header + 
+ +def clear_output_columns(sheet: Any, header: dict[str, int], clear_ai: bool = True) -> None: + for row_number in range(2, sheet.max_row + 1): + if clear_ai: + sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None + sheet.cell(row=row_number, column=header["published_release_notes"]).value = None + + +def sort_sheet_rows_by_component(sheet: Any) -> None: + header = get_header(sheet) + component_col = get_component_col(header) + if sheet.max_row <= 2: + return + + snapshots = [ + (row_number, component_sort_key(sheet.cell(row=row_number, column=component_col).value), snapshot_row(sheet, row_number)) + for row_number in range(2, sheet.max_row + 1) + ] + sorted_snapshots = sorted(snapshots, key=lambda item: item[1]) + if [row_number for row_number, _key, _snapshot in snapshots] == [ + row_number for row_number, _key, _snapshot in sorted_snapshots + ]: + return + + for target_row, (_source_row, _key, snapshot) in enumerate(sorted_snapshots, start=2): + restore_row(sheet, target_row, snapshot) + + print("Sorted worksheet rows by component before release-note generation", flush=True) + + +def component_sort_key(value: Any) -> tuple[int, str]: + component = normalize_raw_component(value) + if not component: + return (1, "") + return (0, component.casefold()) + + +def snapshot_row(sheet: Any, row_number: int) -> dict[str, Any]: + row_dimension = sheet.row_dimensions[row_number] + return { + "height": row_dimension.height, + "hidden": row_dimension.hidden, + "outline_level": row_dimension.outlineLevel, + "collapsed": row_dimension.collapsed, + "cells": [snapshot_cell(sheet.cell(row=row_number, column=column)) for column in range(1, sheet.max_column + 1)], + } + + +def snapshot_cell(cell: Any) -> dict[str, Any]: + return { + "value": cell.value, + "style": copy.copy(cell._style), + "number_format": cell.number_format, + "hyperlink": copy.copy(cell.hyperlink) if cell.hyperlink else None, + "comment": copy.copy(cell.comment) if cell.comment 
else None, + } + + +def restore_row(sheet: Any, row_number: int, snapshot: dict[str, Any]) -> None: + row_dimension = sheet.row_dimensions[row_number] + row_dimension.height = snapshot["height"] + row_dimension.hidden = snapshot["hidden"] + row_dimension.outlineLevel = snapshot["outline_level"] + row_dimension.collapsed = snapshot["collapsed"] + for column, cell_snapshot in enumerate(snapshot["cells"], start=1): + cell = sheet.cell(row=row_number, column=column) + cell.value = cell_snapshot["value"] + cell._style = copy.copy(cell_snapshot["style"]) + cell.number_format = cell_snapshot["number_format"] + cell._hyperlink = copy.copy(cell_snapshot["hyperlink"]) if cell_snapshot["hyperlink"] else None + cell.comment = copy.copy(cell_snapshot["comment"]) if cell_snapshot["comment"] else None + + +def get_component_col(header: dict[str, int]) -> int: + for name in COMPONENT_HEADERS: + if name in header: + return header[name] + raise ValueError("Missing required Excel column: component or components") + + +def issue_urls_for_row(sheet: Any, header: dict[str, int], row_number: int) -> list[str]: + candidates: list[str] = [] + if "issue_url" in header: + candidates.append(str_value(sheet.cell(row=row_number, column=header["issue_url"]).value)) + candidates.append(str_value(sheet.cell(row=row_number, column=header["formated_release_note"]).value)) + return unique_ordered(url for text in candidates for url in extract_issue_urls(text)) + + +def first_issue_url_for_row(sheet: Any, header: dict[str, int], row_number: int) -> str | None: + issue_urls = issue_urls_for_row(sheet, header, row_number) + return issue_urls[0] if issue_urls else None + + +def store_existing_release_notes(releases_dir: Path, version: str) -> list[ExistingNote]: + existing_notes: list[ExistingNote] = [] + seen: set[tuple[str, tuple[str, ...]]] = set() + target_version = parse_semver_tuple(version) + + for file_path in sorted(releases_dir.rglob("*.md")): + if should_skip_release_file(file_path, 
target_version): + continue + level1 = level2 = level3 = "" + with file_path.open("r", encoding="utf-8") as file: + for raw_line in file: + line = raw_line.strip() + authors = AUTHOR_RE.findall(line) + item_url = GITHUB_ITEM_URL_RE.search(line) + if item_url: + key = (item_url.group(), tuple(authors)) + if key in seen: + continue + seen.add(key) + note_level = level1 + level2 + level3 + note_type, component = classify_note_level(note_level) + existing_notes.append( + ExistingNote( + url=item_url.group(), + line=line, + file_name=file_path.name, + note_level=note_level, + authors=authors, + note_type=note_type, + component=component, + ) + ) + continue + + heading = parse_release_note_heading(raw_line) + if not heading: + continue + heading_level, label = heading + if heading_level == 1: + level1 = "> " + label + level2 = level3 = "" + elif heading_level == 2: + level2 = "> " + label + level3 = "" + elif heading_level == 3: + level3 = "> " + label + return existing_notes + + +def should_skip_release_file(file_path: Path, target_version: tuple[int, int, int]) -> bool: + if "updated-by-ai" in file_path.stem: + return True + file_version = release_file_semver_tuple(file_path) + if not file_version: + return False + return file_version >= target_version + + +def parse_semver_tuple(version: str) -> tuple[int, int, int]: + match = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)", version) + if not match: + raise ValueError(f"Invalid TiDB version: {version}") + return ( + int(match.group("major")), + int(match.group("minor")), + int(match.group("patch")), + ) + + +def release_file_semver_tuple(file_path: Path) -> tuple[int, int, int] | None: + match = re.match( + r"^release-(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)", + file_path.stem, + ) + if not match: + return None + return ( + int(match.group("major")), + int(match.group("minor")), + int(match.group("patch")), + ) + + +def parse_release_note_heading(raw_line: str) -> tuple[int, str] | None: + line = raw_line.rstrip() + section = 
re.match(r"^##\s+(.+?)\s*$", line) + if section: + return 1, section.group(1).strip() + + top_component = re.match(r"^[+-]\s+(.+?)\s*$", line) + if top_component: + label = top_component.group(1).strip() + if label.lower() == "tools" or normalized_release_component(label): + return 2, label + + tool_component = re.match(r"^ {4}[+-]\s+(.+?)\s*$", line) + if tool_component: + label = tool_component.group(1).strip() + if normalized_release_component(label): + return 3, label + return None + + +def update_pr_authors_and_dup_notes( + sheet: Any, + header: dict[str, int], + existing_notes: list[ExistingNote], + github: Any, + author_workers: int = 1, +) -> None: + apply_bot_author_replacements(sheet, header, github, author_workers) + existing_notes_by_url = index_existing_notes_by_url(existing_notes) + + for row_number in range(2, sheet.max_row + 1): + author_cell = sheet.cell(row=row_number, column=header["pr_author"]) + current_author = str_value(author_cell.value) + + issue_url = first_issue_url_for_row(sheet, header, row_number) + if not issue_url: + continue + + current_authors = split_multi_value(current_author) + dup_notes = [] + for existing in existing_notes_by_url.get(issue_url, []): + if existing.authors and not set(current_authors).intersection(existing.authors): + continue + dup_notes.append(existing.dup_text) + + if dup_notes: + dup_col = header["published_release_notes"] + sheet.cell(row=row_number, column=dup_col, value="\n".join(unique_ordered(dup_notes))) + fill_row(sheet, row_number) + print(f"Row {row_number}: found duplicated release note for {issue_url}", flush=True) + + +def apply_bot_author_replacements( + sheet: Any, + header: dict[str, int], + github: Any, + author_workers: int, +) -> None: + requests = bot_author_requests(sheet, header) + if not requests: + return + print( + f"Resolving {len(requests)} bot-authored PR row(s) with {author_workers} worker(s)", + flush=True, + ) + + replacements = resolve_bot_author_replacements(requests, github, 
def resolve_bot_author_replacements(
    requests: list[tuple[int, str, str, str]],
    github: Any,
    author_workers: int,
) -> dict[int, tuple[str, str]]:
    """Resolve the human author behind each bot-authored PR row.

    Args:
        requests: ``(row_number, pr_link, pr_title, current_author)`` tuples.
        github: Client exposing ``get_original_author_for_cherry_pick``.
        author_workers: Number of worker threads. Any value of 1 or less runs
            the lookups sequentially in the calling thread.

    Returns:
        ``{row_number: (current_author, actual_author)}`` for the rows whose
        resolved author differs from the current one.
    """
    replacements: dict[int, tuple[str, str]] = {}
    total = len(requests)

    def record(completed: int, request: tuple[int, str, str, str], actual_author: str) -> None:
        # Report progress and remember the row only when the author really changed.
        row_number, _pr_link, _pr_title, current_author = request
        print_bot_author_progress(completed, total, row_number, current_author, actual_author)
        if actual_author != current_author:
            replacements[row_number] = (current_author, actual_author)

    # ThreadPoolExecutor rejects max_workers <= 0, so every non-parallel
    # setting (not just exactly 1) takes the sequential path.
    if author_workers <= 1:
        for completed, request in enumerate(requests, start=1):
            record(completed, request, resolve_bot_author(github, request))
        return replacements

    with ThreadPoolExecutor(max_workers=author_workers) as executor:
        futures = {
            executor.submit(resolve_bot_author, github, request): request
            for request in requests
        }
        for completed, future in enumerate(as_completed(futures), start=1):
            record(completed, futures[future], future.result())
    return replacements


def print_bot_author_progress(
    completed: int,
    total: int,
    row_number: int,
    current_author: str,
    actual_author: str,
) -> None:
    """Print one progress line for a resolved bot-author lookup."""
    status = "unchanged" if actual_author == current_author else f"{current_author} -> {actual_author}"
    print(
        f"Resolved bot author {completed}/{total}: row {row_number} ({status})",
        flush=True,
    )


def resolve_bot_author(github: Any, request: tuple[int, str, str, str]) -> str:
    """Return the original author for one request, or the current author on failure.

    Lookup errors are reported on stderr and deliberately not re-raised, so a
    single failing PR cannot abort the whole batch.
    """
    row_number, pr_link, pr_title, current_author = request
    try:
        return github.get_original_author_for_cherry_pick(
            row_number,
            pr_link,
            pr_title,
            current_author,
        )
    except Exception as exc:  # noqa: BLE001
        print(
            f"Row {row_number}: failed to resolve bot author for {pr_link}: {exc}",
            file=sys.stderr,
            flush=True,
        )
        return current_author
def merge_pr_links(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None:
    """Write the de-duplicated PR links of ``rows`` into ``keep_row``."""
    collected = [
        link
        for source_row in rows
        for link in split_multi_value(sheet.cell(row=source_row, column=header["pr_link"]).value)
    ]
    merged = ", ".join(unique_ordered(collected))
    sheet.cell(row=keep_row, column=header["pr_link"], value=merged)


def merge_authors(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None:
    """Write the de-duplicated PR authors of ``rows`` into ``keep_row``."""
    collected = [
        author
        for source_row in rows
        for author in split_multi_value(sheet.cell(row=source_row, column=header["pr_author"]).value)
    ]
    merged = ", ".join(unique_ordered(collected))
    sheet.cell(row=keep_row, column=header["pr_author"], value=merged)


def merge_dup_notes(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None:
    """Write the de-duplicated published notes of ``rows`` into ``keep_row``.

    The target cell is left untouched when none of the rows carries a note.
    """
    collected = [
        note
        for source_row in rows
        for note in split_lines(
            sheet.cell(row=source_row, column=header["published_release_notes"]).value
        )
    ]
    if not collected:
        return
    sheet.cell(
        row=keep_row,
        column=header["published_release_notes"],
        value="\n".join(unique_ordered(collected)),
    )
def generate_notes_for_sheet(
    sheet: Any,
    header: dict[str, int],
    github: Any,
    ai: Any,
    ai_workers: int = 1,
    github_workers: int = 1,
    checkpoint_callback: Callable[[int, int], None] | None = None,
) -> list[MarkdownEntry]:
    """Produce Markdown entries for every data row, calling the AI where needed.

    Rows are handled in three ways:

    * rows with published (duplicate) notes reuse those notes and have their
      AI column cleared;
    * rows whose AI column already holds a reusable note are kept as they are;
    * all remaining rows with at least one issue or PR URL are sent to the AI.

    Args:
        sheet: Worksheet whose rows start at row 2.
        header: Mapping from column name to column index.
        github: GitHub client passed to the prefetch step.
        ai: AI client passed to ``generate_note_for_row``.
        ai_workers: Thread count for AI generation; values below 1 are
            treated as 1.
        github_workers: Thread count for the GitHub prefetch.
        checkpoint_callback: Optional ``(completed, total)`` hook invoked after
            each generated row has been applied to the sheet.

    Returns:
        The entries of all rows, in sheet order.
    """
    entries_by_row: dict[int, list[MarkdownEntry]] = {}
    row_inputs = [
        build_row_input(sheet, header, row_number)
        for row_number in range(2, sheet.max_row + 1)
    ]
    rows_to_generate: list[RowInput] = []

    for row_input in row_inputs:
        row_number = row_input.row_number
        component = row_input.component
        dup_text = str_value(sheet.cell(row=row_number, column=header["published_release_notes"]).value)
        if dup_text:
            sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None
            entries_by_row[row_number] = dup_entries_for_row(row_input, dup_text)
            continue

        ai_cell = sheet.cell(row=row_number, column=header["release_notes_written_by_ai"])
        expected_links = row_input.issue_urls or row_input.pr_urls
        if not expected_links:
            ai_cell.value = "AI_GENERATION_FAILED: missing issue URL and PR URL"
            continue

        existing_note = str_value(ai_cell.value)
        if is_reusable_ai_note(existing_note):
            note_type = classify_note_type_from_text(existing_note, row_input.issue_type)
            entries_by_row[row_number] = [
                MarkdownEntry(
                    note_type or "improvement",
                    component,
                    existing_note,
                    row_input.raw_component,
                )
            ]
            print(f"Row {row_number}: skipped existing AI release note", flush=True)
            continue

        rows_to_generate.append(row_input)

    github_cache = prefetch_github_data(rows_to_generate, github, github_workers)
    total_to_generate = len(rows_to_generate)
    if total_to_generate:
        # ThreadPoolExecutor raises ValueError for max_workers <= 0.
        worker_count = max(1, ai_workers)
        print(
            f"Generating AI release notes for {total_to_generate} row(s) "
            f"with {worker_count} worker(s)",
            flush=True,
        )

        completed = 0
        with ThreadPoolExecutor(max_workers=worker_count) as executor:
            futures = [
                executor.submit(generate_note_for_row, row_input, github_cache, ai)
                for row_input in rows_to_generate
            ]
            # Results are applied in the calling thread, so the sheet is only
            # ever mutated from one thread.
            for future in as_completed(futures):
                result = future.result()
                apply_generation_result(sheet, header, result, entries_by_row)
                completed += 1
                if checkpoint_callback:
                    checkpoint_callback(completed, total_to_generate)

    entries: list[MarkdownEntry] = []
    for row_input in row_inputs:
        entries.extend(entries_by_row.get(row_input.row_number, []))
    return entries
def build_row_input(sheet: Any, header: dict[str, int], row_number: int) -> RowInput:
    """Collect the values of one worksheet row into a ``RowInput``."""

    def raw_cell(column: int) -> Any:
        # Raw cell value of the current row at the given column index.
        return sheet.cell(row=row_number, column=column).value

    raw_component = normalize_raw_component(raw_cell(get_component_col(header)))
    component = release_component_for_row(sheet, header, row_number)
    issue_type = str_value(raw_cell(header["issue_type"]))
    pr_title = str_value(raw_cell(header["pr_title"]))
    pr_authors = split_multi_value(raw_cell(header["pr_author"]))
    pr_urls = extract_pr_urls(str_value(raw_cell(header["pr_link"])))
    issue_urls = issue_urls_for_row(sheet, header, row_number)
    formatted_note = str_value(raw_cell(header["formated_release_note"]))

    return RowInput(
        row_number=row_number,
        component=component,
        raw_component=raw_component,
        issue_type=issue_type,
        pr_title=pr_title,
        pr_authors=pr_authors,
        pr_urls=pr_urls,
        issue_urls=issue_urls,
        formatted_release_note=formatted_note,
    )


def is_reusable_ai_note(note: str) -> bool:
    """Return True for a non-empty note that is not a recorded generation failure."""
    if not note:
        return False
    return not note.startswith("AI_GENERATION_FAILED:")
def generate_note_for_row(
    row_input: RowInput,
    github_cache: GitHubDataCache,
    ai: Any,
) -> RowGenerationResult:
    """Generate the release note for one row and wrap the outcome.

    Prompt building and the AI call are guarded: any exception is turned into
    a result whose ``error`` holds the exception text, with ``note_type`` and
    ``note`` set to ``None``.
    """
    expected_links = row_input.issue_urls or row_input.pr_urls
    row_context = build_row_context_from_cache(row_input, github_cache)
    # Bot accounts are never credited as contributors.
    human_authors = [name for name in row_context.pr_authors if name not in BOT_AUTHORS]
    contributors = unique_ordered(human_authors)

    try:
        prompt = build_generation_prompt(row_context, expected_links, contributors)
        generated = ai.generate(prompt, expected_links, contributors)
        return RowGenerationResult(
            row_number=row_input.row_number,
            component=row_input.component,
            raw_component=row_input.raw_component,
            note_type=generated.note_type,
            note=generated.release_note,
            error=None,
            needs_review=generated.needs_review,
            reason=generated.reason,
        )
    except Exception as failure:  # noqa: BLE001
        failed = RowGenerationResult(
            row_number=row_input.row_number,
            component=row_input.component,
            raw_component=row_input.raw_component,
            note_type=None,
            note=None,
            error=str(failure),
        )
        return failed
def apply_generation_result(
    sheet: Any,
    header: dict[str, int],
    result: RowGenerationResult,
    entries_by_row: dict[int, list[MarkdownEntry]],
) -> None:
    """Write one generation result into the sheet and the per-row entry map.

    A result with an error, or without both a note and a note type, is
    recorded in the AI column as ``AI_GENERATION_FAILED: ...`` and reported on
    stderr. Otherwise the note is stored in the AI column and a single
    ``MarkdownEntry`` is registered for the row.
    """
    ai_cell = sheet.cell(row=result.row_number, column=header["release_notes_written_by_ai"])

    failure = None
    if result.error:
        failure = result.error
    elif not (result.note and result.note_type):
        failure = "empty AI generation result"

    if failure is not None:
        ai_cell.value = f"AI_GENERATION_FAILED: {failure}"
        print(
            f"Row {result.row_number}: AI generation failed: {failure}",
            file=sys.stderr,
            flush=True,
        )
        return

    ai_cell.value = result.note
    entry = MarkdownEntry(result.note_type, result.component, result.note, result.raw_component)
    entries_by_row[result.row_number] = [entry]
    if result.needs_review:
        review_marker = " (needs review)"
    else:
        review_marker = ""
    print(
        f"Row {result.row_number}: generated {result.note_type}{review_marker}: {result.reason}",
        flush=True,
    )


def release_component_for_row(sheet: Any, header: dict[str, int], row_number: int) -> str:
    """Decide which release-note component a row belongs to.

    The raw component cell wins when it maps to a known release component.
    Otherwise the repositories named in the row's issue and PR URLs decide,
    checked in a fixed priority order, with the raw component text used to
    tell apart products that share a repository.
    """
    raw_component = normalize_raw_component(
        sheet.cell(row=row_number, column=get_component_col(header)).value
    )
    raw_lower = raw_component.lower()
    explicit = release_component_from_raw(raw_component)
    if explicit:
        return explicit

    urls = issue_urls_for_row(sheet, header, row_number)
    urls.extend(extract_pr_urls(str_value(sheet.cell(row=row_number, column=header["pr_link"]).value)))
    repos = set()
    for url in urls:
        found = GITHUB_ITEM_URL_RE.search(url)
        if found:
            repos.add(found.group("repo").lower())

    # Repositories that map to exactly one component, in priority order.
    direct_mapping = (
        ("pd", "PD"),
        ("tikv", "TiKV"),
        ("tiflash", "TiFlash"),
        ("ng-monitoring", "TiDB"),
        ("tiup", "TiUP"),
    )
    for repo_name, label in direct_mapping:
        if repo_name in repos:
            return label

    if not repos.isdisjoint({"tiflow", "ticdc"}):
        dm_only = "dm" in raw_lower and "cdc" not in raw_lower
        return "TiDB Data Migration (DM)" if dm_only else "TiCDC"

    if "tidb" in repos:
        # Tools living in the tidb repository are recognised by the raw component text.
        tidb_tools = (
            ("br", "Backup & Restore (BR)"),
            ("lightning", "TiDB Lightning"),
            ("dumpling", "Dumpling"),
        )
        for keyword, label in tidb_tools:
            if keyword in raw_lower:
                return label
        return "TiDB"

    if "tidb-dashboard" in repos:
        return "TiDB"
    return normalize_component(raw_component)


def release_component_from_raw(raw_component: str) -> str:
    """Map a raw component cell to a release component, or return ``""``.

    The whole value is tried first. Failing that, the value is split into
    tokens and the first match in priority order wins: the listed tools, then
    the top-level components, then DM, then TiCDC.
    """
    whole = normalize_component(raw_component)
    if whole in TOP_LEVEL_COMPONENTS or whole in TOOL_COMPONENTS:
        return whole

    tokens = [normalize_component(token) for token in split_multi_value(raw_component)]
    if not tokens:
        return ""

    priority = (
        "Backup & Restore (BR)",
        "TiDB Lightning",
        "Dumpling",
        "TiUP",
        "sync-diff-inspector",
        *TOP_LEVEL_COMPONENTS,
        "TiDB Data Migration (DM)",
        "TiCDC",
    )
    return next((candidate for candidate in priority if candidate in tokens), "")
def classify_note_type_from_text(note: str, issue_type: str) -> str | None:
    """Classify a note as ``"bug_fix"`` or ``"improvement"``.

    Section markers inside the note (English or Chinese) are checked first,
    then keywords in the issue type, and finally whether the note starts with
    ``"- Fix "``. Anything left over is treated as an improvement.
    """
    lowered_note = note.lower()
    lowered_type = issue_type.lower()

    if any(marker in lowered_note for marker in ("> bug fixes", "> 错误修复")):
        return "bug_fix"
    if any(marker in lowered_note for marker in ("> improvements", "> 改进提升")):
        return "improvement"
    if any(keyword in lowered_type for keyword in ("bug", "fix")):
        return "bug_fix"
    if any(keyword in lowered_type for keyword in ("improvement", "enhancement")):
        return "improvement"
    return "bug_fix" if note.strip().startswith("- Fix ") else "improvement"


def parse_component_from_dup(note: str) -> str | None:
    """Extract the component from the last ``>``-separated label of a dup note.

    Each label is cut at its first ``" - "``. ``None`` is returned when fewer
    than two labels are present.
    """
    cleaned = [
        label.strip().partition(" - ")[0].strip()
        for label in re.findall(r">\s*([^>]+)", note)
    ]
    if len(cleaned) < 2:
        return None
    return normalized_release_component(cleaned[-1])


def fill_row(sheet: Any, row_number: int) -> None:
    """Apply a copy of ``GRAY_FILL`` to every cell of the given row."""
    last_column = sheet.max_column
    column = 1
    while column <= last_column:
        sheet.cell(row=row_number, column=column).fill = copy.copy(GRAY_FILL)
        column += 1
class GitHubClient:
    """Small GitHub REST API client with per-thread sessions and rate-limit retries."""

    def __init__(
        self,
        token: str | None,
        max_rate_limit_retries: int = 3,
        max_rate_limit_sleep: int = 600,
    ):
        """Store the retry limits and build the default request headers.

        Args:
            token: Optional API token, sent as a bearer token when given.
            max_rate_limit_retries: How many times a rate-limited request is retried.
            max_rate_limit_sleep: Upper bound, in seconds, for one retry wait.
        """
        self.max_rate_limit_retries = max_rate_limit_retries
        self.max_rate_limit_sleep = max_rate_limit_sleep
        self.headers = {
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }
        if token:
            self.headers["Authorization"] = f"Bearer {token}"
        self._thread_local = threading.local()

    def get_session(self) -> requests.Session:
        """Return the calling thread's session, creating it on first use."""
        session = getattr(self._thread_local, "session", None)
        if session is None:
            session = requests.Session()
            session.headers.update(self.headers)
            adapter = HTTPAdapter(max_retries=create_retry_policy())
            session.mount("https://", adapter)
            self._thread_local.session = session
        return session

    def get_json(self, api_path: str) -> dict[str, Any]:
        """Fetch an API path and require the response to be a JSON object.

        Raises:
            ValueError: If the decoded response is not a dict.
        """
        data = self.get_api_json(api_path)
        if not isinstance(data, dict):
            raise ValueError(f"Expected object response from {api_path}")
        return data

    def get_api_json(self, api_path: str, params: dict[str, Any] | None = None) -> Any:
        """Fetch ``api_path`` relative to ``https://api.github.com``."""
        return self.get_url_json(f"https://api.github.com{api_path}", params=params)

    def get_url_json(self, url: str, params: dict[str, Any] | None = None) -> Any:
        """GET ``url`` and return the decoded JSON, retrying on rate limits.

        A rate-limited response is retried after a wait, up to
        ``max_rate_limit_retries`` times. Any other response, and the last
        rate-limited one, goes through ``raise_for_status``.

        Raises:
            RuntimeError: If the loop ends without a response to report.
        """
        last_response: requests.Response | None = None
        for attempt in range(self.max_rate_limit_retries + 1):
            response = self.get_session().get(url, params=params, timeout=30)
            last_response = response
            if self.is_rate_limited(response) and attempt < self.max_rate_limit_retries:
                sleep_seconds = self.rate_limit_sleep_seconds(response, attempt)
                print(
                    "GitHub API rate limit reached; retrying in "
                    f"{sleep_seconds} seconds: {url}",
                    file=sys.stderr,
                    flush=True,
                )
                time.sleep(sleep_seconds)
                continue
            response.raise_for_status()
            return response.json()
        if last_response is not None:
            last_response.raise_for_status()
        raise RuntimeError(f"GitHub API request failed: {url}")

    def is_rate_limited(self, response: requests.Response) -> bool:
        """Return True when the response looks like a rate-limit rejection.

        That is a 429, or a 403 whose ``x-ratelimit-remaining`` header is
        ``"0"`` or whose body mentions a rate limit or abuse detection.
        """
        if response.status_code == 429:
            return True
        if response.status_code != 403:
            return False
        if response.headers.get("x-ratelimit-remaining") == "0":
            return True
        message = response.text.lower()
        return "rate limit" in message or "abuse detection" in message

    def rate_limit_sleep_seconds(self, response: requests.Response, attempt: int) -> int:
        """Return how long to wait before retrying a rate-limited request.

        A numeric ``retry-after`` header is preferred, then
        ``x-ratelimit-reset`` (plus five seconds), then ``2 ** attempt``. The
        header-based waits are at least one second, and every wait is capped
        at ``max_rate_limit_sleep``.
        """
        retry_after = response.headers.get("retry-after")
        if retry_after and retry_after.isdigit():
            return min(max(int(retry_after), 1), self.max_rate_limit_sleep)
        reset = response.headers.get("x-ratelimit-reset")
        if reset and reset.isdigit():
            wait_seconds = int(reset) - int(time.time()) + 5
            return min(max(wait_seconds, 1), self.max_rate_limit_sleep)
        return min(2 ** attempt, self.max_rate_limit_sleep)

    def get_pull(self, pr_url: str) -> PullInfo:
        """Fetch a pull request and its changed-file summary."""
        owner, repo, number = parse_github_url(pr_url, "pull")
        pull = self.get_json(f"/repos/{owner}/{repo}/pulls/{number}")
        files_summary = self.get_pull_files_summary(owner, repo, number)
        return PullInfo(
            url=pr_url,
            title=str(pull.get("title") or ""),
            body=str(pull.get("body") or ""),
            author=str((pull.get("user") or {}).get("login") or ""),
            head_ref=str((pull.get("head") or {}).get("ref") or ""),
            base_ref=str((pull.get("base") or {}).get("ref") or ""),
            files_summary=files_summary,
            merged_at=str(pull.get("merged_at") or ""),
            created_at=str(pull.get("created_at") or ""),
        )

    def get_issue(self, issue_url: str) -> IssueInfo:
        """Fetch an issue's title, body and label names."""
        owner, repo, number = parse_github_url(issue_url, "issues")
        issue = self.get_json(f"/repos/{owner}/{repo}/issues/{number}")
        labels = [
            str(label.get("name"))
            for label in issue.get("labels", [])
            if isinstance(label, dict) and label.get("name")
        ]
        return IssueInfo(
            url=issue_url,
            title=str(issue.get("title") or ""),
            body=str(issue.get("body") or ""),
            labels=labels,
        )

    def get_pull_files_summary(
        self,
        owner: str,
        repo: str,
        number: str,
        max_files: int = 80,
        max_patch_chars: int = 1200,
        max_total_chars: int = 60000,
    ) -> str:
        """Summarise the changed files of a pull request as plain text.

        Args:
            owner: Repository owner.
            repo: Repository name.
            number: Pull request number.
            max_files: Maximum number of files to include.
            max_patch_chars: Per-file patch length before it is truncated.
            max_total_chars: Budget for the summed length of all file blocks.

        Returns:
            One block per file separated by blank lines, followed by a
            truncation marker when a limit cut the listing short, or a fixed
            message when no file information was returned.
        """
        per_page = 100
        blocks: list[str] = []
        total_chars = 0
        page = 1
        truncated = False
        while not truncated and len(blocks) < max_files:
            files = self.get_api_json(
                f"/repos/{owner}/{repo}/pulls/{number}/files",
                params={"per_page": per_page, "page": page},
            )
            if not isinstance(files, list) or not files:
                break
            for item in files:
                if len(blocks) >= max_files or total_chars >= max_total_chars:
                    # A limit was hit while files were still pending: stop
                    # paginating instead of requesting pages that are discarded.
                    truncated = True
                    break
                if not isinstance(item, dict):
                    continue
                patch = str(item.get("patch") or "")
                if len(patch) > max_patch_chars:
                    patch = patch[:max_patch_chars] + "\n...[patch truncated]"
                block = "\n".join(
                    [
                        f"file: {item.get('filename', '')}",
                        f"status: {item.get('status', '')}",
                        f"additions: {item.get('additions', 0)}",
                        f"deletions: {item.get('deletions', 0)}",
                        "patch:",
                        patch,
                    ]
                )
                blocks.append(block)
                total_chars += len(block)
            if len(files) < per_page:
                # A short page is the last page; no further request is needed.
                break
            page += 1
        if not blocks:
            return "No changed-file information is available."
        if truncated or len(blocks) >= max_files:
            blocks.append("...[file list truncated]")
        return "\n\n".join(blocks)

    def list_pulls_for_base(
        self,
        owner: str,
        repo: str,
        base: str,
        state: str = "closed",
        max_pages: int = 10,
    ) -> list[PullInfo]:
        """List pull requests targeting ``base``, oldest first.

        At most ``max_pages`` pages of 100 are read. The returned entries
        carry an empty ``files_summary``.
        """
        pulls: list[PullInfo] = []
        for page in range(1, max_pages + 1):
            data = self.get_api_json(
                f"/repos/{owner}/{repo}/pulls",
                params={
                    "state": state,
                    "base": base,
                    "sort": "created",
                    "direction": "asc",
                    "per_page": 100,
                    "page": page,
                },
            )
            if not isinstance(data, list) or not data:
                break
            for pull in data:
                if not isinstance(pull, dict):
                    continue
                pulls.append(
                    PullInfo(
                        url=str(pull.get("html_url") or ""),
                        title=str(pull.get("title") or ""),
                        body=str(pull.get("body") or ""),
                        author=str((pull.get("user") or {}).get("login") or ""),
                        head_ref=str((pull.get("head") or {}).get("ref") or ""),
                        base_ref=str((pull.get("base") or {}).get("ref") or ""),
                        files_summary="",
                        merged_at=str(pull.get("merged_at") or ""),
                        created_at=str(pull.get("created_at") or ""),
                    )
                )
            if len(data) < 100:
                break
        return pulls

    def get_original_author_for_cherry_pick(
        self, row_number: int, cp_pr_link: str, cp_pr_title: str, current_author: str
    ) -> str:
        """Return the author of the PR that a cherry-pick PR was created from.

        The original PR reference is searched in the given title first, then
        in the cherry-pick PR's head branch, title and body. Every failure is
        reported on stderr and falls back to ``current_author``.
        """
        default_owner, default_repo, _cp_number = parse_github_url(cp_pr_link, "pull")
        target_ref = find_original_pr_reference(cp_pr_title, default_owner, default_repo)
        if not target_ref:
            try:
                cp_info = self.get_pull(cp_pr_link)
                target_ref = (
                    find_original_pr_reference(cp_info.head_ref, default_owner, default_repo)
                    or find_original_pr_reference(cp_info.title, default_owner, default_repo)
                    or find_original_pr_reference(cp_info.body, default_owner, default_repo)
                )
            except Exception as exc:  # noqa: BLE001
                print(
                    f"Row {row_number}: failed to inspect cherry-pick PR "
                    f"{cp_pr_link}: {exc}",
                    file=sys.stderr,
                )
                return current_author

        if not target_ref:
            print(
                f"Row {row_number}: failed to find the original PR for "
                f"{cp_pr_link} created by {current_author}.",
                file=sys.stderr,
            )
            return current_author

        target_owner, target_repo, target_number = target_ref
        target_pr_link = f"https://github.com/{target_owner}/{target_repo}/pull/{target_number}"
        try:
            return self.get_pull(target_pr_link).author or current_author
        except Exception as exc:  # noqa: BLE001
            print(
                f"Row {row_number}: failed to find the non-bot author for "
                f"{cp_pr_link}: {exc}",
                file=sys.stderr,
            )
            return current_author


def find_original_pr_reference(
    text: str,
    default_owner: str,
    default_repo: str,
) -> tuple[str, str, str] | None:
    """Find the ``(owner, repo, number)`` of the PR that ``text`` was picked from.

    Lines mentioning backport/cherry-pick/original/source/from are searched
    first. After that a trailing ``(#N)`` and a ``cherry-pick-N`` branch
    fragment are tried against the default repository. A short single-line
    text is finally searched as a whole.
    """
    text = text or ""
    marker_lines = [
        line
        for line in text.splitlines()
        if re.search(r"\b(backport|cherry[- ]?pick|original|source|from)\b", line, re.I)
    ]
    for line in marker_lines:
        reference = find_pr_reference_in_text(line, default_owner, default_repo)
        if reference:
            return reference

    same_repo = re.search(r"\(#(?P<number>\d+)\)\s*$", text)
    if same_repo:
        return default_owner, default_repo, same_repo.group("number")

    branch = re.search(r"(?:^|[/_-])cherry-pick-(?P<number>\d+)(?:\D|$)", text)
    if branch:
        return default_owner, default_repo, branch.group("number")

    if "\n" not in text and len(text) <= 300:
        return find_pr_reference_in_text(text, default_owner, default_repo)

    return None


def find_pr_reference_in_text(
    text: str,
    default_owner: str,
    default_repo: str,
) -> tuple[str, str, str] | None:
    """Return the first PR reference in ``text`` as ``(owner, repo, number)``.

    Full pull-request URLs are preferred, then ``owner/repo#N`` references,
    then a trailing ``(#N)`` resolved against the default repository.
    """
    for full_url in GITHUB_ITEM_URL_RE.finditer(text or ""):
        if full_url.group("kind") == "pull":
            return full_url.group("owner"), full_url.group("repo"), full_url.group("number")

    # The lookbehind keeps the match from starting in the middle of a longer
    # word or path segment.
    cross_repo = re.search(
        r"(?<![\w./-])(?P<owner>[\w.-]+)/(?P<repo>[\w.-]+)#(?P<number>\d+)\b",
        text or "",
    )
    if cross_repo:
        return cross_repo.group("owner"), cross_repo.group("repo"), cross_repo.group("number")

    same_repo = re.search(r"\(#(?P<number>\d+)\)\s*$", text or "")
    if same_repo:
        return default_owner, default_repo, same_repo.group("number")

    return None
def write_release_file(
    output_file: Path,
    version: str,
    release_date: str,
    entries: list[MarkdownEntry],
) -> None:
    """Render the release-notes Markdown file for ``version``.

    The file consists of front matter, a title, the release date, quick-access
    links built from the ``major.minor`` part of the version, and the
    "Improvements" and "Bug fixes" sections. Missing parent directories are
    created and the file is written as UTF-8 with one trailing newline.
    """
    series = ".".join(version.split(".")[:2])
    grouped = group_markdown_entries(entries)

    docs_base = f"https://docs.pingcap.com/tidb/v{series}"
    quick_access = (
        "Quick access: "
        f"[Quick start]({docs_base}/quick-start-with-tidb) | "
        f"[Production deployment]({docs_base}/production-deployment-using-tiup)"
    )
    document: list[str] = [
        "---",
        f"title: TiDB {version} Release Notes",
        f"summary: Learn about the improvements and bug fixes in TiDB {version}.",
        "---",
        "",
        f"# TiDB {version} Release Notes",
        "",
        f"Release date: {release_date}",
        "",
        f"TiDB version: {version}",
        "",
        quick_access,
        "",
    ]

    document += render_section("## Improvements", grouped["improvement"])
    document.append("")
    document += render_section("## Bug fixes", grouped["bug_fix"])
    document.append("")

    # Drop trailing blank lines so the file ends with exactly one newline.
    while document and document[-1] == "":
        del document[-1]

    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text("\n".join(document) + "\n", encoding="utf-8")
grouped[entry.note_type][component].append(entry) + return grouped + + +def render_section(title: str, entries_by_component: dict[str, list[MarkdownEntry]]) -> list[str]: + lines = [title, ""] + top_components = [ + component + for component in TOP_LEVEL_COMPONENTS + if component in entries_by_component and entries_by_component[component] + ] + unknown_top_components = sorted( + component + for component in entries_by_component + if component not in TOP_LEVEL_COMPONENTS + and component not in TOOL_COMPONENTS + and entries_by_component[component] + ) + tool_components = [ + component + for component in TOOL_COMPONENTS + if component in entries_by_component and entries_by_component[component] + ] + + for component in top_components + unknown_top_components: + lines.append(f"+ {component}") + lines.append("") + for entry in entries_by_component[component]: + lines.append(f" {note_with_component_marker(entry)}") + lines.append("") + + if tool_components: + lines.append("+ Tools") + lines.append("") + for component in tool_components: + lines.append(f" + {component}") + lines.append("") + for entry in entries_by_component[component]: + lines.append(f" {note_with_component_marker(entry)}") + lines.append("") + + while lines and lines[-1] == "": + lines.pop() + return lines + + +def note_with_component_marker(entry: MarkdownEntry) -> str: + note = ensure_release_note_bullet(entry.note) + raw_component = sanitize_component_marker(entry.raw_component) + if not raw_component or "" + + +def ensure_release_note_bullet(note: str) -> str: + note = str_value(note) + if note.startswith("- "): + return note + if note.startswith(("+ ", "* ")): + return "- " + note[2:].lstrip() + return f"- {note}" + + +def sanitize_component_marker(component: str) -> str: + return " ".join(str_value(component).replace("--", "- -").split()) diff --git a/scripts/release_notes_ai/models.py b/scripts/release_notes_ai/models.py new file mode 100644 index 0000000000000..7e89853cb3202 --- /dev/null +++ 
b/scripts/release_notes_ai/models.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import dataclasses + + +@dataclasses.dataclass +class ExistingNote: + url: str + line: str + file_name: str + note_level: str + authors: list[str] + note_type: str | None + component: str | None + + @property + def dup_text(self) -> str: + return f"- (dup): {self.file_name} {self.note_level} {self.line}" + + +@dataclasses.dataclass +class PullInfo: + url: str + title: str + body: str + author: str + head_ref: str + base_ref: str + files_summary: str + merged_at: str = "" + created_at: str = "" + + +@dataclasses.dataclass +class IssueInfo: + url: str + title: str + body: str + labels: list[str] + + +@dataclasses.dataclass +class GeneratedNote: + note_type: str + release_note: str + needs_review: bool + reason: str + + +@dataclasses.dataclass +class RowContext: + row_number: int + component: str + raw_component: str + issue_type: str + pr_title: str + pr_authors: list[str] + pr_urls: list[str] + issue_urls: list[str] + formatted_release_note: str + issues: list[IssueInfo] + pulls: list[PullInfo] + + +@dataclasses.dataclass +class RowInput: + row_number: int + component: str + raw_component: str + issue_type: str + pr_title: str + pr_authors: list[str] + pr_urls: list[str] + issue_urls: list[str] + formatted_release_note: str + + +@dataclasses.dataclass +class GitHubDataCache: + issues: dict[str, IssueInfo] + pulls: dict[str, PullInfo] + + +@dataclasses.dataclass +class MarkdownEntry: + note_type: str + component: str + note: str + raw_component: str = "" + + +@dataclasses.dataclass +class RowGenerationResult: + row_number: int + component: str + raw_component: str + note_type: str | None + note: str | None + error: str | None + needs_review: bool = False + reason: str = "" diff --git a/scripts/release_notes_ai/requirements.txt b/scripts/release_notes_ai/requirements.txt new file mode 100644 index 0000000000000..89cfc13a2a578 --- /dev/null +++ 
b/scripts/release_notes_ai/requirements.txt @@ -0,0 +1,3 @@ +openpyxl>=3.1 +requests>=2.31 +urllib3>=1.26 diff --git a/scripts/release_notes_ai/scope_filter.py b/scripts/release_notes_ai/scope_filter.py new file mode 100644 index 0000000000000..019824068d6e1 --- /dev/null +++ b/scripts/release_notes_ai/scope_filter.py @@ -0,0 +1,366 @@ +from __future__ import annotations + +import copy +import re +from dataclasses import dataclass +from datetime import date, datetime +from pathlib import Path +from typing import Any + +from .excel_workbook import get_header +from .models import PullInfo +from .utils import parse_github_url, str_value + + +OUT_OF_SCOPE_SHEET = "PRs_not_in_scope" +REASON_HEADER = "Reason" +SCOPE_REQUIRED_HEADERS = {"pr_status", "pr_merge_time", "pr_link"} + + +@dataclass(frozen=True) +class Version: + major: int + minor: int + patch: int + + @property + def release_branch(self) -> str: + return f"release-{self.major}.{self.minor}" + + @property + def text(self) -> str: + return f"{self.major}.{self.minor}.{self.patch}" + + @property + def previous_patch_text(self) -> str: + return f"{self.major}.{self.minor}.{self.patch - 1}" + + +@dataclass(frozen=True) +class TimelineRelease: + version: Version + display_version: str + release_date: date + + +@dataclass +class ScopeContext: + version: Version + releases_dir: Path + github: Any + base_branch_start_date: date | None = None + timeline: list[TimelineRelease] | None = None + release_branch_pulls: dict[str, list[PullInfo]] | None = None + + def __post_init__(self) -> None: + if self.timeline is None: + self.timeline = parse_release_timeline(self.releases_dir / "release-timeline.md") + if self.release_branch_pulls is None: + self.release_branch_pulls = {} + + +def move_prs_not_in_scope( + workbook: Any, + sheet: Any, + version: str, + releases_dir: Path, + github: Any, + base_branch_start_date: date | None = None, + target_sheet_name: str = OUT_OF_SCOPE_SHEET, +) -> int: + header = get_header(sheet) + 
missing = sorted(SCOPE_REQUIRED_HEADERS - set(header)) + if missing: + raise ValueError( + "Missing required Excel columns for scope preprocessing: " + + ", ".join(missing) + ) + + context = ScopeContext( + version=parse_version(version), + releases_dir=releases_dir, + github=github, + base_branch_start_date=base_branch_start_date, + ) + target = ensure_out_of_scope_sheet(workbook, sheet, target_sheet_name) + + rows_to_move: list[tuple[int, str]] = [] + for row_number in range(2, sheet.max_row + 1): + reason = out_of_scope_reason(sheet, header, row_number, context) + if reason: + rows_to_move.append((row_number, reason)) + + for row_number, reason in rows_to_move: + append_row_with_reason(sheet, target, row_number, reason) + + for row_number, _reason in reversed(rows_to_move): + sheet.delete_rows(row_number, 1) + + if rows_to_move: + print( + f"Moved {len(rows_to_move)} row(s) to {target_sheet_name} before release-note generation", + flush=True, + ) + return len(rows_to_move) + + +def ensure_out_of_scope_sheet(workbook: Any, source_sheet: Any, target_sheet_name: str) -> Any: + if target_sheet_name in workbook.sheetnames: + target = workbook[target_sheet_name] + if target.max_row == 0 or not target.cell(row=1, column=1).value: + copy_header(source_sheet, target) + else: + ensure_reason_header(source_sheet, target) + return target + + target = workbook.create_sheet(target_sheet_name) + copy_header(source_sheet, target) + return target + + +def copy_header(source_sheet: Any, target_sheet: Any) -> None: + for column in range(1, source_sheet.max_column + 1): + copy_cell(source_sheet.cell(row=1, column=column), target_sheet.cell(row=1, column=column)) + ensure_reason_header(source_sheet, target_sheet) + + +def ensure_reason_header(source_sheet: Any, target_sheet: Any) -> None: + target_sheet.cell(row=1, column=source_sheet.max_column + 1, value=REASON_HEADER) + + +def append_row_with_reason(source_sheet: Any, target_sheet: Any, row_number: int, reason: str) -> None: + 
target_row = target_sheet.max_row + 1 + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=row_number, column=column), + target_sheet.cell(row=target_row, column=column), + ) + target_sheet.cell(row=target_row, column=source_sheet.max_column + 1, value=reason) + + +def copy_cell(source_cell: Any, target_cell: Any) -> None: + target_cell.value = source_cell.value + if source_cell.has_style: + target_cell._style = copy.copy(source_cell._style) + if source_cell.number_format: + target_cell.number_format = source_cell.number_format + if source_cell.hyperlink: + target_cell._hyperlink = copy.copy(source_cell.hyperlink) + if source_cell.comment: + target_cell.comment = copy.copy(source_cell.comment) + + +def out_of_scope_reason( + sheet: Any, + header: dict[str, int], + row_number: int, + context: ScopeContext, +) -> str | None: + status = str_value(sheet.cell(row=row_number, column=header["pr_status"]).value).lower() + if status != "merged": + return f"PR status is {status or 'empty'}, not merged" + + merge_date = parse_date_value(sheet.cell(row=row_number, column=header["pr_merge_time"]).value) + if not merge_date: + return None + + if context.version.patch >= 1: + previous_date = release_date_for_version(context.timeline or [], context.version.previous_patch_text) + if not previous_date: + raise ValueError( + f"Cannot find release date for previous version {context.version.previous_patch_text} " + "in releases/release-timeline.md" + ) + if merge_date < previous_date: + return ( + f"PR merged on {merge_date.isoformat()}, before previous release " + f"{context.version.previous_patch_text} date {previous_date.isoformat()}" + ) + return None + + return major_release_out_of_scope_reason(sheet, header, row_number, merge_date, context) + + +def major_release_out_of_scope_reason( + sheet: Any, + header: dict[str, int], + row_number: int, + merge_date: date, + context: ScopeContext, +) -> str | None: + latest_zero = 
latest_released_zero_patch(context.timeline or [], context.version.text) + if not latest_zero: + raise ValueError("Cannot find a previously released x.y.0 version in releases/release-timeline.md") + + if merge_date >= latest_zero.release_date: + return None + + branch_start = context.base_branch_start_date or estimated_release_branch_start_date(context, latest_zero) + if not branch_start: + return None + if merge_date < branch_start: + return ( + f"PR merged on {merge_date.isoformat()}, before estimated {latest_zero.version.release_branch} " + f"branch start date {branch_start.isoformat()}" + ) + + pr_link = str_value(sheet.cell(row=row_number, column=header["pr_link"]).value) + cherry_pick = find_release_branch_cherry_pick(context, latest_zero, pr_link) + if not cherry_pick: + return None + cherry_pick_date = parse_date_value(cherry_pick.merged_at) + if cherry_pick_date and cherry_pick_date < latest_zero.release_date: + return ( + f"Cherry-pick PR {cherry_pick.url} merged on {cherry_pick_date.isoformat()} " + f"before {latest_zero.display_version} release date {latest_zero.release_date.isoformat()}" + ) + return None + + +def estimated_release_branch_start_date( + context: ScopeContext, + latest_zero: TimelineRelease, +) -> date | None: + branch_pulls = release_branch_pulls(context, latest_zero.version.release_branch) + created_dates = [parse_date_value(pull.created_at) for pull in branch_pulls] + created_dates = [value for value in created_dates if value] + return min(created_dates) if created_dates else None + + +def find_release_branch_cherry_pick( + context: ScopeContext, + latest_zero: TimelineRelease, + pr_link: str, +) -> PullInfo | None: + try: + owner, repo, number = parse_github_url(pr_link, "pull") + except ValueError: + return None + if (owner, repo) != ("pingcap", "tidb"): + return None + + candidates = [] + for pull in release_branch_pulls(context, latest_zero.version.release_branch): + haystack = "\n".join([pull.title, pull.body, pull.head_ref, 
pull.url]) + if references_original_pr(haystack, owner, repo, number, pr_link): + candidates.append(pull) + + merged_candidates = [ + pull for pull in candidates if parse_date_value(pull.merged_at) + ] + if not merged_candidates: + return None + return min( + merged_candidates, + key=lambda pull: parse_date_value(pull.merged_at) or date.max, + ) + + +def references_original_pr( + text: str, + owner: str, + repo: str, + number: str, + pr_link: str, +) -> bool: + text = text or "" + patterns = [ + re.escape(pr_link), + rf"(? list[PullInfo]: + assert context.release_branch_pulls is not None + if branch not in context.release_branch_pulls: + context.release_branch_pulls[branch] = context.github.list_pulls_for_base( + "pingcap", + "tidb", + branch, + state="closed", + ) + return context.release_branch_pulls[branch] + + +def parse_release_timeline(path: Path) -> list[TimelineRelease]: + releases: list[TimelineRelease] = [] + if not path.exists(): + raise FileNotFoundError(f"Cannot find release timeline: {path}") + pattern = re.compile( + r"\|\s*\[(?P[^\]]+)\]\([^)]+\)\s*\|\s*(?P\d{4}-\d{2}-\d{2})\s*\|" + ) + for line in path.read_text(encoding="utf-8").splitlines(): + match = pattern.search(line) + if not match: + continue + try: + version = parse_version(match.group("version")) + except ValueError: + continue + release_date = date.fromisoformat(match.group("date")) + releases.append(TimelineRelease(version, match.group("version"), release_date)) + return releases + + +def release_date_for_version(timeline: list[TimelineRelease], version_text: str) -> date | None: + for release in timeline: + if release.version.text == version_text: + return release.release_date + return None + + +def latest_released_zero_patch( + timeline: list[TimelineRelease], + target_version_text: str, +) -> TimelineRelease | None: + zero_patch_releases = [ + release + for release in timeline + if release.version.patch == 0 and release.version.text != target_version_text + ] + if not 
zero_patch_releases: + return None + return max(zero_patch_releases, key=lambda release: release.release_date) + + +def parse_version(version: str) -> Version: + match = re.match(r"^(?P\d+)\.(?P\d+)\.(?P\d+)", version) + if not match: + raise ValueError(f"Invalid TiDB version: {version}") + return Version( + major=int(match.group("major")), + minor=int(match.group("minor")), + patch=int(match.group("patch")), + ) + + +def parse_date_value(value: Any) -> date | None: + if value is None: + return None + if isinstance(value, datetime): + return value.date() + if isinstance(value, date): + return value + text = str_value(value) + if not text: + return None + text = text.replace("Z", "+00:00") + try: + return datetime.fromisoformat(text).date() + except ValueError: + pass + match = re.search(r"\d{4}-\d{2}-\d{2}", text) + if match: + return date.fromisoformat(match.group()) + return None diff --git a/scripts/release_notes_ai/utils.py b/scripts/release_notes_ai/utils.py new file mode 100644 index 0000000000000..1c0641787019c --- /dev/null +++ b/scripts/release_notes_ai/utils.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +from typing import Any, Iterable + +from .constants import ( + COMPONENT_ALIASES, + GITHUB_ITEM_URL_RE, + ISSUE_URL_RE, + PR_URL_RE, + TOOL_COMPONENTS, + TOP_LEVEL_COMPONENTS, +) + + +def parse_github_url(url: str, expected_kind: str) -> tuple[str, str, str]: + match = GITHUB_ITEM_URL_RE.search(url) + if not match: + raise ValueError(f"Invalid GitHub URL: {url}") + if match.group("kind") != expected_kind: + raise ValueError(f"Expected a GitHub {expected_kind} URL, got: {url}") + return match.group("owner"), match.group("repo"), match.group("number") + + +def extract_issue_urls(text: str) -> list[str]: + return unique_ordered(match.group() for match in ISSUE_URL_RE.finditer(text or "")) + + +def extract_pr_urls(text: str) -> list[str]: + return unique_ordered(match.group() for match in PR_URL_RE.finditer(text or "")) + + +def 
replace_author_markdown(text: str, old_author: str, new_author: str) -> str: + text = text or "" + return text.replace( + f"[{old_author}](https://github.com/{old_author}", + f"[{new_author}](https://github.com/{new_author}", + ) + + +def normalize_component(component: str) -> str: + cleaned = " ".join(str_value(component).split()) + if not cleaned: + return "" + return COMPONENT_ALIASES.get(cleaned.lower(), cleaned) + + +def normalize_raw_component(component: Any) -> str: + return " ".join(str_value(component).split()) + + +def normalized_release_component(component: str) -> str | None: + normalized = normalize_component(component) + if normalized in TOP_LEVEL_COMPONENTS or normalized in TOOL_COMPONENTS: + return normalized + return None + + +def split_multi_value(value: Any) -> list[str]: + text = str_value(value) + if not text: + return [] + return [item.strip() for item in text.replace("\n", ",").split(",") if item.strip()] + + +def split_lines(value: Any) -> list[str]: + text = str_value(value) + if not text: + return [] + return [line.strip() for line in text.splitlines() if line.strip()] + + +def unique_ordered(values: Iterable[str]) -> list[str]: + result: list[str] = [] + seen: set[str] = set() + for value in values: + cleaned = str_value(value) + if not cleaned or cleaned in seen: + continue + seen.add(cleaned) + result.append(cleaned) + return result + + +def str_value(value: Any) -> str: + if value is None: + return "" + return str(value).strip() diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py new file mode 100644 index 0000000000000..5d1e701f56cec --- /dev/null +++ b/scripts/release_notes_generate_ai.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""CLI entry point for generating English TiDB release notes with AI.""" + +from release_notes_ai.cli import main + + +if __name__ == "__main__": + raise SystemExit(main()) From 4583453ca945f965df345d18bbc7941e4ae045fb Mon Sep 17 00:00:00 2001 
From: qiancai Date: Wed, 29 Apr 2026 17:55:48 +0800 Subject: [PATCH 02/22] improve the filter logic and move the prompt to an independent file --- scripts/release_notes_ai/ai_client.py | 83 +++--- scripts/release_notes_ai/cli.py | 8 + scripts/release_notes_ai/constants.py | 3 + scripts/release_notes_ai/excel_workbook.py | 240 ++++++++++++++++-- .../release_notes_ai/prompts/generation.md | 40 +++ scripts/release_notes_generate_ai.py | 37 ++- 6 files changed, 345 insertions(+), 66 deletions(-) create mode 100644 scripts/release_notes_ai/prompts/generation.md diff --git a/scripts/release_notes_ai/ai_client.py b/scripts/release_notes_ai/ai_client.py index 503e28b63023b..d2770e3fbc56c 100644 --- a/scripts/release_notes_ai/ai_client.py +++ b/scripts/release_notes_ai/ai_client.py @@ -12,7 +12,11 @@ from pathlib import Path from typing import Any -from .constants import BUG_FIXES_REFERENCE, IMPROVEMENTS_REFERENCE +from .constants import ( + BUG_FIXES_REFERENCE, + GENERATION_PROMPT_TEMPLATE, + IMPROVEMENTS_REFERENCE, +) from .models import GeneratedNote, RowContext @@ -134,6 +138,7 @@ def build_generation_prompt( expected_links: list[str], contributors: list[str], ) -> str: + prompt_template = load_prompt_template(GENERATION_PROMPT_TEMPLATE) improvements_reference = load_reference_file(IMPROVEMENTS_REFERENCE) bug_fixes_reference = load_reference_file(BUG_FIXES_REFERENCE) context = { @@ -148,46 +153,16 @@ def build_generation_prompt( "issues": [dataclasses.asdict(issue) for issue in row_context.issues], "pull_requests": [dataclasses.asdict(pull) for pull in row_context.pulls], } - return textwrap.dedent( - f""" - You write exactly one English TiDB release note entry. - - Return only a JSON object with exactly these keys: - - type: "improvement" or "bug_fix" - - release_note: one Markdown bullet that starts with "- " - - needs_review: true or false - - reason: a short reason for the type and wording - - Rules: - - Write from the user's perspective. 
- - Use the Excel issue_type as a strong signal, but decide the final type from the issue, - PR description, and code changes. - - For improvements, follow the Improvements reference below. - - For bug fixes, follow the Bug fixes reference below. - - Do not end the release note with a period. - - Include every expected link in Markdown release-note style. - - Include every contributor as @[user](https://github.com/user). - - If there is no issue URL, use the PR link as the suffix link. - - Do not expose internal function names unless they are the user-visible behavior. - - If the available context is insufficient, still draft the best note and set needs_review - to true. - - Expected links: - {json.dumps(expected_links, ensure_ascii=False, indent=2)} - - Contributors: - {json.dumps(contributors, ensure_ascii=False, indent=2)} - - Row context: - {json.dumps(context, ensure_ascii=False, indent=2)} - - Improvements reference: - {improvements_reference} - - Bug fixes reference: - {bug_fixes_reference} - """ - ).strip() + return render_prompt_template( + prompt_template, + { + "EXPECTED_LINKS": json.dumps(expected_links, ensure_ascii=False, indent=2), + "CONTRIBUTORS": json.dumps(contributors, ensure_ascii=False, indent=2), + "ROW_CONTEXT": json.dumps(context, ensure_ascii=False, indent=2), + "IMPROVEMENTS_REFERENCE": improvements_reference, + "BUG_FIXES_REFERENCE": bug_fixes_reference, + }, + ) def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: @@ -206,6 +181,32 @@ def build_repair_prompt(original_prompt: str, errors: list[str]) -> str: ).strip() +def render_prompt_template(template: str, values: dict[str, str]) -> str: + for key, value in values.items(): + template = template.replace(f"{{{{{key}}}}}", value) + return template.strip() + + +@lru_cache(maxsize=None) +def load_prompt_template(path: Path) -> str: + try: + return strip_prompt_template_heading(path.read_text(encoding="utf-8")) + except FileNotFoundError as exc: + raise 
FileNotFoundError( + f"Cannot find release-note prompt template: {path}. " + "Make sure scripts/release_notes_ai/prompts/generation.md exists." + ) from exc + + +def strip_prompt_template_heading(template: str) -> str: + lines = template.splitlines() + if lines and lines[0].startswith("# "): + lines = lines[1:] + if lines and not lines[0].strip(): + lines = lines[1:] + return "\n".join(lines) + + @lru_cache(maxsize=None) def load_reference_file(path: Path) -> str: try: diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py index ee1d79a074c4a..fdeaccfda3efb 100644 --- a/scripts/release_notes_ai/cli.py +++ b/scripts/release_notes_ai/cli.py @@ -13,6 +13,7 @@ generate_notes_without_ai, generate_notes_for_sheet, merge_rows_by_issue_and_component, + move_rows_with_issues_already_in_same_series, prepare_sheet_columns, sort_sheet_rows_by_component, store_existing_release_notes, @@ -164,6 +165,13 @@ def main() -> int: clear_output_columns(sheet, header, clear_ai=args.force_regenerate) existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version) + move_rows_with_issues_already_in_same_series( + workbook, + sheet, + header, + existing_notes, + args.version, + ) update_pr_authors_and_dup_notes( sheet, header, diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py index c3e947167a23b..89cca90e52d2b 100644 --- a/scripts/release_notes_ai/constants.py +++ b/scripts/release_notes_ai/constants.py @@ -21,6 +21,9 @@ / "references" / "bug-fixes.md" ) +GENERATION_PROMPT_TEMPLATE = ( + REPO_ROOT / "scripts" / "release_notes_ai" / "prompts" / "generation.md" +) BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"} # Keep the misspelled source column name because tirelease exports it this way. 
diff --git a/scripts/release_notes_ai/excel_workbook.py b/scripts/release_notes_ai/excel_workbook.py index 260b4b807d04e..177f28fd64c9f 100644 --- a/scripts/release_notes_ai/excel_workbook.py +++ b/scripts/release_notes_ai/excel_workbook.py @@ -43,6 +43,7 @@ GRAY_FILL = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid") +SAME_SERIES_REASON_HEADER = "reason" def prepare_sheet_columns(sheet: Any) -> dict[str, int]: @@ -168,7 +169,7 @@ def first_issue_url_for_row(sheet: Any, header: dict[str, int], row_number: int) def store_existing_release_notes(releases_dir: Path, version: str) -> list[ExistingNote]: existing_notes: list[ExistingNote] = [] - seen: set[tuple[str, tuple[str, ...]]] = set() + seen: set[tuple[str, tuple[str, ...], str]] = set() target_version = parse_semver_tuple(version) for file_path in sorted(releases_dir.rglob("*.md")): @@ -179,25 +180,26 @@ def store_existing_release_notes(releases_dir: Path, version: str) -> list[Exist for raw_line in file: line = raw_line.strip() authors = AUTHOR_RE.findall(line) - item_url = GITHUB_ITEM_URL_RE.search(line) - if item_url: - key = (item_url.group(), tuple(authors)) - if key in seen: - continue - seen.add(key) + item_urls = [match.group() for match in GITHUB_ITEM_URL_RE.finditer(line)] + if item_urls: note_level = level1 + level2 + level3 note_type, component = classify_note_level(note_level) - existing_notes.append( - ExistingNote( - url=item_url.group(), - line=line, - file_name=file_path.name, - note_level=note_level, - authors=authors, - note_type=note_type, - component=component, + for item_url in item_urls: + key = (item_url, tuple(authors), file_path.name) + if key in seen: + continue + seen.add(key) + existing_notes.append( + ExistingNote( + url=item_url, + line=line, + file_name=file_path.name, + note_level=note_level, + authors=authors, + note_type=note_type, + component=component, + ) ) - ) continue heading = parse_release_note_heading(raw_line) @@ -283,22 +285,207 @@ def 
update_pr_authors_and_dup_notes( author_cell = sheet.cell(row=row_number, column=header["pr_author"]) current_author = str_value(author_cell.value) - issue_url = first_issue_url_for_row(sheet, header, row_number) - if not issue_url: + issue_urls = issue_urls_for_row(sheet, header, row_number) + if not issue_urls: continue current_authors = split_multi_value(current_author) dup_notes = [] - for existing in existing_notes_by_url.get(issue_url, []): - if existing.authors and not set(current_authors).intersection(existing.authors): - continue - dup_notes.append(existing.dup_text) + for issue_url in issue_urls: + for existing in existing_notes_by_url.get(issue_url, []): + if existing.authors and not set(current_authors).intersection(existing.authors): + continue + dup_notes.append(existing.dup_text) if dup_notes: dup_col = header["published_release_notes"] sheet.cell(row=row_number, column=dup_col, value="\n".join(unique_ordered(dup_notes))) fill_row(sheet, row_number) - print(f"Row {row_number}: found duplicated release note for {issue_url}", flush=True) + print( + f"Row {row_number}: found duplicated release note for {', '.join(issue_urls)}", + flush=True, + ) + + +def move_rows_with_issues_already_in_same_series( + workbook: Any, + sheet: Any, + header: dict[str, int], + existing_notes: list[ExistingNote], + version: str, +) -> int: + files_by_issue_url = same_series_release_files_by_issue_url(existing_notes, version) + if not files_by_issue_url: + return 0 + + target_sheet_name = same_series_issues_sheet_name(version) + target, reason_col = ensure_sheet_with_reason(workbook, sheet, target_sheet_name) + rows_to_move: list[tuple[int, str]] = [] + + for row_number in range(2, sheet.max_row + 1): + issue_urls = issue_urls_for_row(sheet, header, row_number) + reason = same_series_issue_reason(issue_urls, files_by_issue_url) + if reason: + rows_to_move.append((row_number, reason)) + + for row_number, reason in rows_to_move: + append_row_with_reason(sheet, target, 
row_number, reason, reason_col) + + for row_number, _reason in reversed(rows_to_move): + sheet.delete_rows(row_number, 1) + + if rows_to_move: + print( + f"Moved {len(rows_to_move)} row(s) to {target_sheet_name} because their issues " + "already appear in earlier release notes from the same major.minor series", + flush=True, + ) + return len(rows_to_move) + + +def same_series_release_files_by_issue_url( + existing_notes: list[ExistingNote], + version: str, +) -> dict[str, list[str]]: + target_version = parse_semver_tuple(version) + files_by_issue_url: dict[str, list[str]] = {} + + for existing in existing_notes: + match = GITHUB_ITEM_URL_RE.search(existing.url) + if not match or match.group("kind") != "issues": + continue + + file_version = release_file_semver_tuple(Path(existing.file_name)) + if not file_version: + continue + if file_version[:2] != target_version[:2] or file_version >= target_version: + continue + + files = files_by_issue_url.setdefault(existing.url, []) + if existing.file_name not in files: + files.append(existing.file_name) + + for issue_url, files in list(files_by_issue_url.items()): + files_by_issue_url[issue_url] = sorted(files, key=release_file_name_sort_key) + return files_by_issue_url + + +def same_series_issues_sheet_name(version: str) -> str: + major, minor, _patch = parse_semver_tuple(version) + return f"issues_already_in_earlier_v{major}.{minor}_notes" + + +def same_series_issue_reason( + issue_urls: list[str], + files_by_issue_url: dict[str, list[str]], +) -> str | None: + reasons = [] + for issue_url in issue_urls: + files = files_by_issue_url.get(issue_url) + if files: + reasons.append(f"{issue_url} appears in {', '.join(files)}") + return "; ".join(reasons) if reasons else None + + +def release_file_name_sort_key(file_name: str) -> tuple[int, int, int, str]: + version = release_file_semver_tuple(Path(file_name)) + if not version: + return (sys.maxsize, sys.maxsize, sys.maxsize, file_name) + return (*version, file_name) + + +def 
ensure_sheet_with_reason( + workbook: Any, + source_sheet: Any, + target_sheet_name: str, +) -> tuple[Any, int]: + if target_sheet_name in workbook.sheetnames: + target = workbook[target_sheet_name] + if not str_value(target.cell(row=1, column=1).value): + reason_col = copy_header_with_reason(source_sheet, target) + else: + reason_col = ensure_same_series_reason_header(source_sheet, target) + return target, reason_col + + target = workbook.create_sheet(target_sheet_name) + reason_col = copy_header_with_reason(source_sheet, target) + return target, reason_col + + +def copy_header_with_reason(source_sheet: Any, target_sheet: Any) -> int: + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=1, column=column), + target_sheet.cell(row=1, column=column), + ) + return ensure_same_series_reason_header(source_sheet, target_sheet) + + +def ensure_same_series_reason_header(source_sheet: Any, target_sheet: Any) -> int: + reason_col = find_header_column(target_sheet, SAME_SERIES_REASON_HEADER) + if not reason_col: + reason_col = max(source_sheet.max_column, target_sheet.max_column) + 1 + copy_missing_header_cells(source_sheet, target_sheet) + target_sheet.cell(row=1, column=reason_col, value=SAME_SERIES_REASON_HEADER) + return reason_col + + while reason_col <= source_sheet.max_column: + target_sheet.insert_cols(reason_col) + reason_col += 1 + + copy_missing_header_cells(source_sheet, target_sheet) + return reason_col + + +def copy_missing_header_cells(source_sheet: Any, target_sheet: Any) -> None: + for column in range(1, source_sheet.max_column + 1): + if not str_value(target_sheet.cell(row=1, column=column).value): + copy_cell( + source_sheet.cell(row=1, column=column), + target_sheet.cell(row=1, column=column), + ) + + +def find_header_column(sheet: Any, header_name: str) -> int | None: + for column in range(1, sheet.max_column + 1): + if str_value(sheet.cell(row=1, column=column).value) == header_name: + return column + return None + 
+ +def append_row_with_reason( + source_sheet: Any, + target_sheet: Any, + row_number: int, + reason: str, + reason_col: int, +) -> None: + target_row = target_sheet.max_row + 1 + source_dimension = source_sheet.row_dimensions[row_number] + target_dimension = target_sheet.row_dimensions[target_row] + target_dimension.height = source_dimension.height + target_dimension.hidden = source_dimension.hidden + target_dimension.outlineLevel = source_dimension.outlineLevel + target_dimension.collapsed = source_dimension.collapsed + + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=row_number, column=column), + target_sheet.cell(row=target_row, column=column), + ) + target_sheet.cell(row=target_row, column=reason_col, value=reason) + + +def copy_cell(source_cell: Any, target_cell: Any) -> None: + target_cell.value = source_cell.value + if source_cell.has_style: + target_cell._style = copy.copy(source_cell._style) + if source_cell.number_format: + target_cell.number_format = source_cell.number_format + if source_cell.hyperlink: + target_cell._hyperlink = copy.copy(source_cell.hyperlink) + if source_cell.comment: + target_cell.comment = copy.copy(source_cell.comment) def apply_bot_author_replacements( @@ -407,7 +594,12 @@ def resolve_bot_author(github: Any, request: tuple[int, str, str, str]) -> str: def index_existing_notes_by_url(existing_notes: list[ExistingNote]) -> dict[str, list[ExistingNote]]: indexed: dict[str, list[ExistingNote]] = {} + seen: set[tuple[str, tuple[str, ...]]] = set() for existing in existing_notes: + key = (existing.url, tuple(existing.authors)) + if key in seen: + continue + seen.add(key) indexed.setdefault(existing.url, []).append(existing) return indexed diff --git a/scripts/release_notes_ai/prompts/generation.md b/scripts/release_notes_ai/prompts/generation.md new file mode 100644 index 0000000000000..8eb5b1e993381 --- /dev/null +++ b/scripts/release_notes_ai/prompts/generation.md @@ -0,0 +1,40 @@ +# 
Generation Prompt + +You are a senior technical writer who has profound knowledge of TiDB. + +Your task is to write exactly one English release note entry for a TiDB issue or PR. + +Return only a JSON object with exactly these keys: + +- type: "improvement" or "bug_fix" +- release_note: one Markdown bullet that starts with "- " +- needs_review: true or false +- reason: a short reason for the type and wording + +Rules: + +- Write from the user's perspective. +- Use the Excel issue_type as a strong signal, but decide the final type from the issue, PR description, and code changes. +- For improvements, follow the Improvements reference below. +- For bug fixes, follow the Bug fixes reference below. +- Do not end the release note with a period. +- Include every expected link in Markdown release-note style. +- Include every contributor as @[user](https://github.com/user). +- If there is no issue URL, use the PR link as the suffix link. +- Do not expose internal function names unless they are the user-visible behavior. +- If the available context is insufficient, still draft the best note and set needs_review to true. + +Expected links: +{{EXPECTED_LINKS}} + +Contributors: +{{CONTRIBUTORS}} + +Row context: +{{ROW_CONTEXT}} + +Improvements reference: +{{IMPROVEMENTS_REFERENCE}} + +Bug fixes reference: +{{BUG_FIXES_REFERENCE}} diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py index 5d1e701f56cec..bdcb30ba8433b 100644 --- a/scripts/release_notes_generate_ai.py +++ b/scripts/release_notes_generate_ai.py @@ -1,7 +1,42 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -"""CLI entry point for generating English TiDB release notes with AI.""" +""" This script generates English TiDB release notes from a workbook with PR links and issue links of a specific release. + +What does this script do? + + - Filter out the PRs and issues that are not in the target release scope. For example, PRs that were merged before the previous patch release.
+ - Move the issues that already appeared in earlier notes from the same major.minor series to a separate worksheet. + - Mark the release notes that are already published in other series as ``(dup)`` and reuse the release notes for the same issue. + - Generate the English release note using AI according to the release note draft provided in the PR, the description and code changes of the PR, the descriptions of the issue + - Map components in the workbook to the corresponding release note components. + - Generate the release note file for the target release according to the release note template file. + +Typical usage: + + python3 scripts/release_notes_generate_ai.py \ + --version 8.5.7 \ + --excel /path/to/tirelease.xlsx \ + --releases-dir releases \ + --github-token-file /path/to/github-token.txt + +Useful options: + + --involve-ai-generation OFF + Skip AI generation and use the source ``formated_release_note`` values + for non-duplicate rows. + + --force-regenerate + Clear existing AI-generated notes in the processed workbook and generate + them again. + + --output-release-file /path/to/release-8.5.7.md + Write the generated Markdown to a custom path. By default, the output is + ``release--updated-by-ai.md`` under ``--releases-dir``. + +Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option +list. 
+""" from release_notes_ai.cli import main From 195da0b95b759c8511ec1130213d4ee50ec68224 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 6 May 2026 14:59:38 +0800 Subject: [PATCH 03/22] update the naming rule of the release note file --- scripts/release_notes_ai/cli.py | 14 ++++++++++++-- scripts/release_notes_generate_ai.py | 6 ++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py index fdeaccfda3efb..7aea9b9ee43db 100644 --- a/scripts/release_notes_ai/cli.py +++ b/scripts/release_notes_ai/cli.py @@ -58,7 +58,10 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--output-release-file", - help="Output Markdown file. Defaults to release-{version}-updated-by-ai.md.", + help=( + "Output Markdown file. Defaults to release-{version}-updated-by-ai.md " + "if release-{version}.md already exists, otherwise release-{version}.md." + ), ) parser.add_argument( "--ai-timeout", @@ -142,7 +145,7 @@ def main() -> int: output_file = ( Path(args.output_release_file) if args.output_release_file - else Path(args.releases_dir) / f"release-{args.version}-updated-by-ai.md" + else default_output_release_file(Path(args.releases_dir), args.version) ) excel_path = Path(args.excel) @@ -219,6 +222,13 @@ def parse_on_off(value: str) -> str: return normalized +def default_output_release_file(releases_dir: Path, version: str) -> Path: + release_file = releases_dir / f"release-{version}.md" + if release_file.is_file(): + return releases_dir / f"release-{version}-updated-by-ai.md" + return release_file + + def default_processed_excel_path(excel_path: Path) -> Path: return excel_path.with_name(f"{excel_path.stem}_processed{excel_path.suffix}") diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py index bdcb30ba8433b..03d1ec7f8a59b 100644 --- a/scripts/release_notes_generate_ai.py +++ b/scripts/release_notes_generate_ai.py @@ -31,8 +31,10 @@ them again. 
--output-release-file /path/to/release-8.5.7.md - Write the generated Markdown to a custom path. By default, the output is - ``release--updated-by-ai.md`` under ``--releases-dir``. + Write the generated Markdown to a custom path. By default, the output + under ``--releases-dir`` is ``release--updated-by-ai.md`` if + ``release-.md`` already exists, otherwise + ``release-.md``. Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option list. From 259870d8186173c3a079e96f33c9e28a169d75e1 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 6 May 2026 16:24:10 +0800 Subject: [PATCH 04/22] add the usage descriptions for the scripts --- scripts/release-notes-generator-readme.md | 79 +++++++++++++++++++++++ scripts/release_notes_ai/__init__.py | 1 - scripts/release_notes_ai/cli.py | 34 +++------- scripts/release_notes_ai/constants.py | 2 +- scripts/release_notes_generate_ai.py | 35 ++-------- 5 files changed, 94 insertions(+), 57 deletions(-) create mode 100644 scripts/release-notes-generator-readme.md delete mode 100644 scripts/release_notes_ai/__init__.py diff --git a/scripts/release-notes-generator-readme.md b/scripts/release-notes-generator-readme.md new file mode 100644 index 0000000000000..d966eaf3b83c4 --- /dev/null +++ b/scripts/release-notes-generator-readme.md @@ -0,0 +1,79 @@ +# Release notes generator + +`scripts/release_notes_generate_ai.py` generates English TiDB release notes according to PRs and issues in a specified excel file. + +## What it does + +**Scope filtering:** + +- Filters out PRs and issues that are not in the target release scope. For example, it filters out PRs that were merged before the previous patch release. +- Moves issues that already appeared in earlier notes from the same major.minor series to a separate worksheet. + +**Duplicate handling:** + +- Marks release notes that are already published in other series as `(dup)` and reuses the release notes for the same issue. 
+ +**Release note generation:** + +- Generates English release notes using AI according to the release note draft provided in the PR, the PR description and code changes, and the issue description. +- Maps components in the workbook to the corresponding release note components. + +**File output in Markdown:** + +- Generates the release note file for the target release according to the release note template file. +- Add the improvements and bug fixes of each component to the corresponding sections of the release note file. + +## Prerequisites + +- Install Python dependencies: + + ```bash + python3 -m pip install -r scripts/release_notes_ai/requirements.txt + ``` + +- Prepare a GitHub token with access to the public repositories and set the GitHub token in the `GITHUB_TOKEN` environment variable: + + ```bash + export GITHUB_TOKEN= + ``` + +- Install and log in to Codex CLI. The default `--ai-command` uses `codex exec`, so the installed Codex CLI must support `exec`, `--sandbox read-only`, `--ephemeral`, `--output-schema`, `--output-last-message`, and `-m `. + +## Typical usage + +```bash +python3 scripts/release_notes_generate_ai.py \ + --version 8.5.7 \ + --excel /path/to/release-note-excel.xlsx \ + --releases-dir releases +``` + +## Option descriptions + +| Option | Required | Default value | Usage example | Description | +| --- | --- | --- | --- | --- | +| `--version ` | Yes | None | `--version 8.5.7` | Target TiDB version. This value is used for scope filtering, existing release-note lookup, generated Markdown front matter, and the default output file name. | +| `--excel ` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx`. | +| `--releases-dir ` | Yes | None | `--releases-dir releases` | Path to the existing English release notes directory. 
The script scans this directory for historical release notes and writes the generated Markdown under this directory unless `--output-release-file` is specified. | +| `--sheet ` | No | `pr_for_release_note` | `--sheet pr_for_release_note` | Workbook sheet to process. | +| `--ai-command ` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | `--ai-command "codex --ask-for-approval never exec --sandbox read-only --ephemeral"` | Command used to invoke the AI generator. The prompt is passed through standard input. When the command is `codex exec`, the script also passes `--output-schema` and `--output-last-message`. | +| `--ai-model ` | No | `gpt-5.4` | `--ai-model gpt-5.4` | Model name passed to `codex exec` with `-m`. | +| `--involve-ai-generation ` | No | `ON` | `--involve-ai-generation OFF` | Whether to generate non-duplicate release notes with AI. Use `ON` to invoke AI, or `OFF` to use the source `formated_release_note` values. | +| `--output-release-file ` | No | Conditional | `--output-release-file /path/to/release-8.5.7.md` | Write the generated Markdown to a custom path. By default, the output under `--releases-dir` is `release--updated-by-ai.md` if `release-.md` already exists, otherwise `release-.md`. | +| `--ai-timeout ` | No | `600` | `--ai-timeout 600` | Timeout in seconds for each AI command invocation. | +| `--ai-workers ` | No | `3` | `--ai-workers 3` | Number of concurrent AI command invocations. | +| `--github-workers ` | No | `8` | `--github-workers 8` | Number of concurrent GitHub API prefetch workers. | +| `--author-workers ` | No | `3` | `--author-workers 3` | Number of concurrent workers used to resolve bot-authored cherry-pick PR authors. | +| `--checkpoint-interval ` | No | `1` | `--checkpoint-interval 1` | Save the processed workbook after every N completed AI rows. Use `0` to disable checkpoint saves. 
| +| `--force-regenerate` | No | Disabled | `--force-regenerate` | Clear existing AI-generated notes in the processed workbook and generate all non-duplicate rows again. | +| `--release-date ` | No | `TBD` | `--release-date "August 14, 2025"` | Release date text for the generated Markdown header. | +| `--skip-scope-preprocess` | No | Disabled | `--skip-scope-preprocess` | Skip moving not-in-scope PR rows to the `PRs_not_in_scope` sheet. | +| `--scope-base-branch-start-date ` | No | Estimated from release history | `--scope-base-branch-start-date 2025-01-01` | Override the estimated release-m.n branch start date for x.y.0 scope preprocessing. The value must use the `YYYY-MM-DD` format. | + +## Generated files + +- The source excel file passed to `--excel` is not overwritten. +- The processed excel file is written to `_processed.xlsx` next to the source workbook. +- The generated Markdown file is written to `--output-release-file` when that option is specified. +- If `--output-release-file` is omitted and `release-.md` already exists under `--releases-dir`, the generated Markdown file is written to `release--updated-by-ai.md`. +- If `--output-release-file` is omitted and `release-.md` does not exist under `--releases-dir`, the generated Markdown file is written to `release-.md`. diff --git a/scripts/release_notes_ai/__init__.py b/scripts/release_notes_ai/__init__.py deleted file mode 100644 index 65f7e128c779b..0000000000000 --- a/scripts/release_notes_ai/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Helpers for generating TiDB release notes with AI.""" diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py index 7aea9b9ee43db..b1e913a7bca0d 100644 --- a/scripts/release_notes_ai/cli.py +++ b/scripts/release_notes_ai/cli.py @@ -26,7 +26,7 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Generate English release notes with AI from a tirelease workbook." 
+ description="Generate English release notes with AI according to PRs and issues in a specified excel file." ) parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.") parser.add_argument("--excel", required=True, help="Path to the release note Excel workbook.") @@ -36,7 +36,6 @@ def parse_args() -> argparse.Namespace: help="Path to the existing English release notes directory.", ) parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.") - parser.add_argument("--github-token-file", help="Path to a GitHub token file.") parser.add_argument( "--ai-command", default="codex --ask-for-approval never exec --sandbox read-only --ephemeral", @@ -137,7 +136,10 @@ def main() -> int: if not base_branch_start_date: raise ValueError("--scope-base-branch-start-date must use YYYY-MM-DD format") - token = load_github_token(args.github_token_file) + try: + token = load_github_token() + except ValueError as exc: + raise SystemExit(f"error: {exc}") from None github = GitHubClient(token) involve_ai_generation = args.involve_ai_generation == "ON" ai = AIClient(args.ai_command, args.ai_model, args.ai_timeout) if involve_ai_generation else None @@ -278,24 +280,8 @@ def save_workbook_safely(workbook: openpyxl.Workbook, excel_path: Path) -> None: raise RuntimeError(f"Failed to save workbook {excel_path}: {exc}") from exc -def load_github_token(token_file: str | None) -> str | None: - import shutil - import subprocess - - if token_file: - return Path(token_file).read_text(encoding="utf-8").strip() - if os.environ.get("GITHUB_TOKEN"): - return os.environ["GITHUB_TOKEN"].strip() - gh = shutil.which("gh") - if not gh: - return None - completed = subprocess.run( - [gh, "auth", "token"], - text=True, - capture_output=True, - timeout=10, - check=False, - ) - if completed.returncode == 0 and completed.stdout.strip(): - return completed.stdout.strip() - return None +def load_github_token() -> str: + token = 
os.environ.get("GITHUB_TOKEN", "").strip() + if not token: + raise ValueError("GITHUB_TOKEN environment variable is required") + return token diff --git a/scripts/release_notes_ai/constants.py b/scripts/release_notes_ai/constants.py index 89cca90e52d2b..12d0d9ca8dcee 100644 --- a/scripts/release_notes_ai/constants.py +++ b/scripts/release_notes_ai/constants.py @@ -26,7 +26,7 @@ ) BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"} -# Keep the misspelled source column name because tirelease exports it this way. +# Keep the misspelled source column name because the release note Excel file exports it this way. REQUIRED_HEADERS = { "pr_author", "pr_link", diff --git a/scripts/release_notes_generate_ai.py b/scripts/release_notes_generate_ai.py index 03d1ec7f8a59b..0a6d3eb761268 100644 --- a/scripts/release_notes_generate_ai.py +++ b/scripts/release_notes_generate_ai.py @@ -1,43 +1,16 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -""" This script generates English TiDB release notes from a workbook with PR links and issue links of a specific release. - -What does this script do? - - - Filter out the PRs and issues that are not in the target release scope. For example, PRs that were merged before this previous path release. - - Move the issues that already appeared in earlier notes from the same major.minor series to a separate worksheet. - - Mark the release notes that are already published in other series as ``(dup)`` and reuse the release notes for the same issue. - - Generate the English release note using AI according to the release note draft provided in the PR, the description and code changes of the PR, the descriptions of the issue - - Map components in the workbook to the corresponding release note components. - - Generate the release note file for the target release according to the release note template file. +"""Generate TiDB improvements and bug fixes for release notes according to PRs and issues in a specified excel file. 
Typical usage: python3 scripts/release_notes_generate_ai.py \ --version 8.5.7 \ - --excel /path/to/tirelease.xlsx \ - --releases-dir releases \ - --github-token-file /path/to/github-token.txt - -Useful options: - - --involve-ai-generation OFF - Skip AI generation and use the source ``formated_release_note`` values - for non-duplicate rows. - - --force-regenerate - Clear existing AI-generated notes in the processed workbook and generate - them again. - - --output-release-file /path/to/release-8.5.7.md - Write the generated Markdown to a custom path. By default, the output - under ``--releases-dir`` is ``release--updated-by-ai.md`` if - ``release-.md`` already exists, otherwise - ``release-.md``. + --excel /path/to/release-note-excel.xlsx \ + --releases-dir releases -Run ``python3 scripts/release_notes_generate_ai.py --help`` for the full option -list. +For detailed usage and options, see scripts/release-notes-generator-readme.md. """ from release_notes_ai.cli import main From c47192b1936baf717e6ff1e6b04fef71b9bb1305 Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 11 May 2026 16:55:36 +0800 Subject: [PATCH 05/22] Update release-notes-generator-readme.md --- scripts/release-notes-generator-readme.md | 327 +++++++++++++++++++++- 1 file changed, 312 insertions(+), 15 deletions(-) diff --git a/scripts/release-notes-generator-readme.md b/scripts/release-notes-generator-readme.md index d966eaf3b83c4..d377f4da3f689 100644 --- a/scripts/release-notes-generator-readme.md +++ b/scripts/release-notes-generator-readme.md @@ -1,27 +1,38 @@ # Release notes generator -`scripts/release_notes_generate_ai.py` generates English TiDB release notes according to PRs and issues in a specified excel file. +`scripts/release_notes_generate_ai.py` generates English TiDB release notes for the `Improvements` and `Bug fixes` sections according to PRs and issues in an Excel workbook. 
+ +The generator keeps the source workbook unchanged, writes all processing results to a processed workbook, and renders the generated entries to a Markdown release note file. ## What it does -**Scope filtering:** +**Scope filtering** + +- Filters out rows of PRs and issues that are not in the target release scope. +- Moves issues that already appeared in earlier release notes from the same major.minor series to a separate worksheet for review. + +**Author correction** + +- Resolves bot-authored cherry-pick rows to the original PR author when possible. + +**Duplicate handling** + +- Reuses already-published release note entries as `(dup)` entries when appropriate. -- Filters out PRs and issues that are not in the target release scope. For example, it filters out PRs that were merged before the previous patch release. -- Moves issues that already appeared in earlier notes from the same major.minor series to a separate worksheet. +**Release note generation** -**Duplicate handling:** +- Generates English release notes with AI from workbook data, GitHub PR and issue context, changed-file summaries, and repo-local release note writing references. -- Marks release notes that are already published in other series as `(dup)` and reuses the release notes for the same issue. +**Component mapping** -**Release note generation:** +- Maps workbook components to the corresponding release note Markdown components. -- Generates English release notes using AI according to the release note draft provided in the PR, the PR description and code changes, and the issue description. -- Maps components in the workbook to the corresponding release note components. +**Markdown generation** -**File output in Markdown:** +- Writes `Improvements` and `Bug fixes` entries to a Markdown release note draft. + +The generator does not create a complete formal release note. It does not generate sections such as compatibility changes, known issues, deprecations, or upgrade notes. 
-- Generates the release note file for the target release according to the release note template file. -- Add the improvements and bug fixes of each component to the corresponding sections of the release note file. ## Prerequisites @@ -53,7 +64,7 @@ python3 scripts/release_notes_generate_ai.py \ | Option | Required | Default value | Usage example | Description | | --- | --- | --- | --- | --- | | `--version ` | Yes | None | `--version 8.5.7` | Target TiDB version. This value is used for scope filtering, existing release-note lookup, generated Markdown front matter, and the default output file name. | -| `--excel ` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx`. | +| `--excel ` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note Excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx`. | | `--releases-dir ` | Yes | None | `--releases-dir releases` | Path to the existing English release notes directory. The script scans this directory for historical release notes and writes the generated Markdown under this directory unless `--output-release-file` is specified. | | `--sheet ` | No | `pr_for_release_note` | `--sheet pr_for_release_note` | Workbook sheet to process. | | `--ai-command ` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | `--ai-command "codex --ask-for-approval never exec --sandbox read-only --ephemeral"` | Command used to invoke the AI generator. The prompt is passed through standard input. When the command is `codex exec`, the script also passes `--output-schema` and `--output-last-message`. | @@ -72,8 +83,294 @@ python3 scripts/release_notes_generate_ai.py \ ## Generated files -- The source excel file passed to `--excel` is not overwritten. 
-- The processed excel file is written to `_processed.xlsx` next to the source workbook. +- The source Excel file passed to `--excel` is not overwritten. +- The processed Excel file is written to `_processed.xlsx` next to the source workbook. - The generated Markdown file is written to `--output-release-file` when that option is specified. - If `--output-release-file` is omitted and `release-.md` already exists under `--releases-dir`, the generated Markdown file is written to `release--updated-by-ai.md`. - If `--output-release-file` is omitted and `release-.md` does not exist under `--releases-dir`, the generated Markdown file is written to `release-.md`. + +## Reference: processing rules + +The following sections describe the main processing logic and rules used by the generator. + +### Processing pipeline + +| Stage | What happens | Review value | +| --- | --- | --- | +| Scope filtering | Out-of-scope rows are moved to `PRs_not_in_scope` with a reason. | Reviewers can see why a row was excluded. | +| Workbook setup | Rows are sorted by component, and output columns are added or reset. | Related rows are easier to inspect, and generated data stays separate from source data. | +| Historical scan | Existing release notes are indexed by GitHub URL, contributor, section, and component. | The generator can reuse published wording instead of drafting duplicate text. | +| Same-series quarantine | Issues already published in the same major.minor series are moved to a separate sheet. | Repeated issues in the same series are visible for manual review. | +| Duplicate marking | Reusable historical entries are written to `published_release_notes` and rendered as `(dup)` entries. | The output keeps the reviewed published note and its source location. | +| Author replacement | Bot-authored cherry-pick rows are resolved to the original PR author when possible. | Contributor suffixes and duplicate matching use the real author. 
| +| Row merging | Rows with the same first issue URL and raw Excel component are merged. | Multiple PRs for one issue produce one release note entry. | +| Entry generation | Non-duplicate rows are generated by AI or copied from `formated_release_note` in non-AI mode. | The same preprocessing works for both drafting and dry-run workflows. | +| Markdown rendering | Entries are grouped by type and Markdown component. | The draft follows the expected release note structure. | + +### Scope filtering + +Scope filtering removes rows that should not appear in the target release note. Removed rows are copied to `PRs_not_in_scope`, receive a `Reason` value, and are deleted from the main sheet in the processed workbook. + +General rules: + +| Condition | Result | Why | +| --- | --- | --- | +| `pr_status` is not `merged` | Move the row to `PRs_not_in_scope`. | Unmerged changes should not be documented as released. | +| `pr_merge_time` is empty or cannot be parsed | Keep the row. | The generator cannot prove that the row is out of scope. | + +Patch-release rules: + +For a patch release such as `8.5.7`, the generator finds the previous patch release date in `releases/release-timeline.md`. When parsing `release-timeline.md`, the generator skips non-semver entries such as `Pre-GA`. + +| Condition | Result | Why | +| --- | --- | --- | +| The PR was merged before the previous patch release date. | Move the row to `PRs_not_in_scope`. | The PR should already have been considered for the previous patch release. | +| The PR was merged on or after the previous patch release date. | Keep the row. | The PR falls into the target patch-release window. | + +`x.y.0` release rules: + +For an `x.y.0` release, the generator uses `releases/release-timeline.md` and release-branch PR data to avoid including changes that were already shipped in the latest previous major.minor release. 
+ +| Condition | Result | Why | +| --- | --- | --- | +| The PR was merged on or after the latest previously released `x.y.0` date. | Keep the row. | The PR is newer than that previous release boundary. | +| The PR was merged before the estimated start date of the previous release branch. | Move the row to `PRs_not_in_scope`. | The PR is older than the branch window for the previous major.minor release. | +| The PR was merged during the previous release-branch window, and a cherry-pick PR for the previous release branch was merged before that previous release date. | Move the row to `PRs_not_in_scope`. | The change was already included through that cherry-pick. | +| No earlier-release evidence is found. | Keep the row. | The generator keeps the row when it cannot prove that the change is out of scope. | + +The estimated release-branch start date comes from the earliest closed PR that targets the previous release branch. You can override it with `--scope-base-branch-start-date`. + +When matching a cherry-pick PR to the original PR, the generator recognizes: + +- The full original PR URL. +- A cross-repository reference such as `pingcap/tidb#12345`. +- A same-repository suffix such as `(#12345)`. +- A branch or text pattern such as `cherry-pick-12345`. +- A line that contains `backport`, `cherry-pick`, `original`, `source`, or `from` together with `#12345`. + +### Historical release note index + +The generator scans existing Markdown files under `--releases-dir` before it decides whether a workbook row is a duplicate. + +The scanner: + +- Ignores generated drafts whose file name contains `updated-by-ai`. +- Ignores release-note files whose version is greater than or equal to the target version. +- Tracks the current release-note section and component from headings and component bullets. +- Extracts every GitHub issue or PR URL from a release note line. +- Extracts contributors from `@[user](https://github.com/user)` suffixes. 
+- Classifies each historical line as `improvement` or `bug_fix` from its surrounding section. +- Records the surrounding Markdown component when possible. + +Each historical entry can later be reused in this format: + +```markdown +- (dup): +``` + +This preserves the published wording and shows the source file and component path. + +### Repeated issues and duplicates + +The generator handles repeated issues in two different ways: + +- Same-series repeats are moved to a separate worksheet for review. +- Reusable duplicates from other series are rendered as `(dup)` entries. + +This separation is intentional. If the same issue appears again in the same major.minor series, it is often a sign that the row needs human judgment. If the issue has already been documented elsewhere and the author check passes, reusing the published note is usually safer than drafting a new sentence. + +For target version `8.5.7`, the same-series quarantine sheet is named: + +```text +issues_already_in_earlier_v8.5_notes +``` + +A row is moved to this sheet when all of the following are true: + +- The row has an issue URL in `issue_url` or `formated_release_note`. +- The same issue URL appears in an existing release-note file. +- The existing release-note file is from the same major.minor series. +- The existing release-note file version is earlier than the target version. + +Rows in this sheet are not rendered to Markdown. + +After same-series rows are moved out, the generator marks remaining rows as duplicates when their issue URLs match reusable historical entries. + +| Rule | Behavior | +| --- | --- | +| Issue URL source | The generator reads issue URLs from `issue_url`, if present, and from `formated_release_note`. | +| PR URL source | PR URLs are not used for duplicate matching. They are used for AI context and component inference. | +| Author check | If a historical note has contributors, at least one current row author must match a historical contributor. 
If the historical note has no contributors, the URL match is enough. | +| Workbook output | Matching historical notes are written to `published_release_notes`, and the row is filled in gray. | +| Markdown output | Duplicate rows are rendered from `published_release_notes`; they do not go through AI generation. | +| Type selection | The generator uses the historical section when possible. Otherwise, it falls back to the current row `issue_type`. | +| Component selection | The generator uses the historical component path when possible. Otherwise, it falls back to the current row component. | + +### Author and row normalization + +Cherry-pick PRs are often authored by `ti-chi-bot` or `ti-srebot`. For rows with those authors, the generator tries to find the original PR from the cherry-pick PR title, branch name, or body. + +When the original PR is found, the generator: + +- Replaces `pr_author` with the original PR author. +- Updates author Markdown in `formated_release_note` from the bot account to the original author. + +If the original PR cannot be found, the row keeps the bot author. This avoids blocking the whole run because of one incomplete cherry-pick reference. + +Rows are then merged when they have the same first issue URL and the same raw Excel component. For each merged group, the first row is kept. The kept row receives: + +- The union of `pr_link` values. +- The union of `pr_author` values. +- The union of duplicate notes from `published_release_notes`. +- The first available non-empty value for other empty cells. + +Rows are grouped by the raw Excel component, not the normalized Markdown component. This keeps workbook distinctions intact until the final component mapping stage. + +### Entry generation + +With `--involve-ai-generation ON`, the generator calls the configured AI command for non-duplicate rows that do not already have reusable text in `release_notes_written_by_ai`. 
+ +The prompt includes: + +- The raw Excel component and normalized Markdown component. +- Workbook fields such as `issue_type`, `pr_title`, `formated_release_note`, expected links, and contributors. +- GitHub issue titles, bodies, and labels. +- GitHub PR titles, bodies, authors, branches, merge times, and changed-file summaries. +- The repository-local writing references for improvements and bug fixes. +- The prompt template in `scripts/release_notes_ai/prompts/generation.md`. + +The AI command must return a JSON object with these fields: + +| Field | Rule | +| --- | --- | +| `type` | Must be `improvement` or `bug_fix`. | +| `release_note` | Must be one Markdown bullet that starts with a hyphen followed by a space. | +| `needs_review` | Must be a boolean. | +| `reason` | Must explain the type and wording choice. | + +The generator validates that the release note: + +- Starts with a hyphen followed by a space. +- Does not end with a period. +- Includes every expected issue or PR link. +- Includes every non-bot contributor as `@[user](https://github.com/user)`. + +If validation fails, the generator sends one repair prompt. If the repaired output still fails, the row is marked as: + +```text +AI_GENERATION_FAILED: +``` + +Failed rows are not rendered to Markdown. + +If `release_notes_written_by_ai` already contains a value and does not start with `AI_GENERATION_FAILED:`, the generator reuses it instead of calling AI again. Use `--force-regenerate` to clear existing AI output and regenerate all non-duplicate rows. + +With `--involve-ai-generation OFF`, the generator does not call the AI command. For non-duplicate rows, it splits `formated_release_note` into non-empty lines and renders those lines as Markdown entries. The preprocessing pipeline still runs in non-AI mode. + +### Component mapping + +The generator maps each workbook component to a Markdown release-note component before rendering. 
It also keeps the original workbook component in an HTML comment after each generated entry: + +```markdown +- Improve ... [#12345](https://github.com/pingcap/tidb/issues/12345) @[user](https://github.com/user) +``` + +This marker lets reviewers trace the generated component back to the workbook value without changing the visible release-note text. + +The generator resolves components in this order: + +1. If the raw workbook value is already a known release-note component or alias, use that value. +2. If the raw workbook value contains multiple comma-separated or newline-separated values, apply the multi-value priority rules. +3. If the workbook value still cannot be resolved, infer the component from the GitHub repositories in the issue and PR URLs. +4. If no rule matches, use the normalized raw workbook value. +5. If the final value is empty, render the entry under `Other`. + +Direct aliases: + +| Excel component value | Markdown component | +| --- | --- | +| `tidb` | `TiDB` | +| `tikv` | `TiKV` | +| `pd` | `PD` | +| `tiflash` | `TiFlash` | +| `tiproxy` | `TiProxy` | +| `br`, `backup & restore`, `backup & restore (br)` | `Backup & Restore (BR)` | +| `cdc`, `ticdc` | `TiCDC` | +| `dm`, `tidb data migration`, `tidb data migration (dm)` | `TiDB Data Migration (DM)` | +| `tidb lightning`, `lightning` | `TiDB Lightning` | +| `dumpling` | `Dumpling` | +| `tiup` | `TiUP` | +| `tidb binlog` | `TiDB Binlog` | +| `sync_diff`, `sync-diff-inspector`, `sync diff inspector` | `sync-diff-inspector` | + +TiDB subcomponent aliases: + +| Excel component value | Markdown component | +| --- | --- | +| `ng monitoring`, `ng-monitoring` | `TiDB` | +| `planner` | `TiDB` | +| `execution` | `TiDB` | +| `sql-infra` | `TiDB` | +| `transaction` | `TiDB` | +| `engine` | `TiDB` | +| `observability` | `TiDB` | +| `dxf` | `TiDB` | +| `storage` | `TiDB` | +| `tidb-dashboard`, `tidb dashboard` | `TiDB` | +| `ddl` | `TiDB` | +| `coprocessor` | `TiDB` | +| `compute` | `TiDB` | +| `scheduling` | 
`TiDB` | +| `spm` | `TiDB` | + +When a workbook cell contains multiple component values, the generator applies this priority: + +1. Tool components with stronger source meaning: `Backup & Restore (BR)`, `TiDB Lightning`, `Dumpling`, `TiUP`, and `sync-diff-inspector`. +2. Top-level components: `TiDB`, `TiKV`, `PD`, `TiFlash`, and `TiProxy`. +3. `TiDB Data Migration (DM)`. +4. `TiCDC`. + +Repository fallback rules: + +| Repository evidence | Markdown component | +| --- | --- | +| `pd` | `PD` | +| `tikv` | `TiKV` | +| `tiflash` | `TiFlash` | +| `ng-monitoring` | `TiDB` | +| `tiup` | `TiUP` | +| `tiflow` or `ticdc`, and the raw component contains `dm` but not `cdc` | `TiDB Data Migration (DM)` | +| `tiflow` or `ticdc`, otherwise | `TiCDC` | +| `tidb`, and the raw component contains `br` | `Backup & Restore (BR)` | +| `tidb`, and the raw component contains `lightning` | `TiDB Lightning` | +| `tidb`, and the raw component contains `dumpling` | `Dumpling` | +| `tidb`, otherwise | `TiDB` | +| `tidb-dashboard` | `TiDB` | + +### Markdown rendering and safe saving + +The generated file contains front matter, the `# TiDB Release Notes` heading, release metadata, quick access links, `## Improvements`, and `## Bug fixes`. + +Entries are grouped by type and component. Top-level components are rendered in this order: + +```text +TiDB, TiKV, PD, TiFlash, TiProxy +``` + +Tool components are rendered under `+ Tools` in this order: + +```text +Backup & Restore (BR), TiCDC, TiDB Data Migration (DM), TiDB Lightning, Dumpling, TiUP, TiDB Binlog, sync-diff-inspector +``` + +Known top-level components are rendered first. Unknown non-tool components are rendered next in alphabetical order. Tool components are rendered last under `Tools`. + +Before writing an entry, the renderer normalizes its bullet marker to a hyphen followed by a space. If the entry does not already contain a component marker, the renderer appends the raw workbook component as an HTML comment. 
+ +The processed workbook is saved to `_processed.xlsx`. During AI generation, `--checkpoint-interval` controls how often the processed workbook is saved: + +- The default value `1` saves after every completed AI row. +- `0` disables checkpoint saves. + +Workbook saves are atomic. The generator first writes a temporary file in the target directory and then replaces the processed workbook. If replacement fails after a complete temporary workbook has been written, the error message includes the temporary file path. + From b4ea7ce5fb09eb59cacf0dbc47e2dcd31fedecb5 Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 18 May 2026 14:38:26 +0800 Subject: [PATCH 06/22] explain `formatted_release_note_from_excel` --- scripts/release-notes-generator-readme.md | 2 +- scripts/release_notes_ai/excel_workbook.py | 2 +- scripts/release_notes_ai/prompts/generation.md | 6 ++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/release-notes-generator-readme.md b/scripts/release-notes-generator-readme.md index d377f4da3f689..008829c4e7fbd 100644 --- a/scripts/release-notes-generator-readme.md +++ b/scripts/release-notes-generator-readme.md @@ -182,7 +182,7 @@ This separation is intentional. 
If the same issue appears again in the same majo For target version `8.5.7`, the same-series quarantine sheet is named: ```text -issues_already_in_earlier_v8.5_notes +issue_already_in_earlier_v8.5 ``` A row is moved to this sheet when all of the following are true: diff --git a/scripts/release_notes_ai/excel_workbook.py b/scripts/release_notes_ai/excel_workbook.py index 177f28fd64c9f..b3de7e92b2ad3 100644 --- a/scripts/release_notes_ai/excel_workbook.py +++ b/scripts/release_notes_ai/excel_workbook.py @@ -372,7 +372,7 @@ def same_series_release_files_by_issue_url( def same_series_issues_sheet_name(version: str) -> str: major, minor, _patch = parse_semver_tuple(version) - return f"issues_already_in_earlier_v{major}.{minor}_notes" + return f"issue_already_in_earlier_v{major}.{minor}" def same_series_issue_reason( diff --git a/scripts/release_notes_ai/prompts/generation.md b/scripts/release_notes_ai/prompts/generation.md index 8eb5b1e993381..7e57d73318959 100644 --- a/scripts/release_notes_ai/prompts/generation.md +++ b/scripts/release_notes_ai/prompts/generation.md @@ -33,6 +33,12 @@ Contributors: Row context: {{ROW_CONTEXT}} +About `formatted_release_note_from_excel`: + +- This field can be empty, `None`, or a generic placeholder such as `Please refer to [Release Notes Language Style Guide](https://pingcap.github.io/tidb-dev-guide/contribute-to-tidb/release-notes-style-guide.html) to write a quality release note.`. In these cases, treat it as no usable release-note draft. +- This field can also contain a draft release note written by the code PR author. In that case, use the draft as an important reference for the final release note, but verify and refine it against the PR code changes first and the issue description second. +- Do not copy the draft blindly. Preserve its useful user-facing intent, correct unclear or inaccurate wording, and still follow all release-note style rules above.
+ Improvements reference: {{IMPROVEMENTS_REFERENCE}} From d1db0b61c627852267c4abd82f85479c99d1cb8c Mon Sep 17 00:00:00 2001 From: qiancai Date: Tue, 16 Jun 2026 12:27:52 +0800 Subject: [PATCH 07/22] support using Azure OpenAI --- scripts/release-notes-generator-readme.md | 7 +-- scripts/release_notes_ai/ai_client.py | 55 +++++++++++++++++++++-- scripts/release_notes_ai/cli.py | 25 +++++++++-- scripts/release_notes_ai/requirements.txt | 1 + 4 files changed, 77 insertions(+), 11 deletions(-) diff --git a/scripts/release-notes-generator-readme.md b/scripts/release-notes-generator-readme.md index 008829c4e7fbd..f4cf162d57fdd 100644 --- a/scripts/release-notes-generator-readme.md +++ b/scripts/release-notes-generator-readme.md @@ -48,7 +48,7 @@ The generator does not create a complete formal release note. It does not genera export GITHUB_TOKEN= ``` -- Install and log in to Codex CLI. The default `--ai-command` uses `codex exec`, so the installed Codex CLI must support `exec`, `--sandbox read-only`, `--ephemeral`, `--output-schema`, `--output-last-message`, and `-m `. +- Install and log in to Codex CLI. The default `--ai-command` uses `codex exec`, so the installed Codex CLI must support `exec`, `--sandbox read-only`, `--ephemeral`, `--output-schema`, `--output-last-message`, and `-m `. If you use `--ai-provider azure` instead, Codex CLI is not required; set `AZURE_OPENAI_KEY` and `AZURE_OPENAI_BASE_URL` (or `OPENAI_BASE_URL`) environment variables. ## Typical usage @@ -67,8 +67,9 @@ python3 scripts/release_notes_generate_ai.py \ | `--excel ` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note Excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx`. | | `--releases-dir ` | Yes | None | `--releases-dir releases` | Path to the existing English release notes directory. 
The script scans this directory for historical release notes and writes the generated Markdown under this directory unless `--output-release-file` is specified. | | `--sheet ` | No | `pr_for_release_note` | `--sheet pr_for_release_note` | Workbook sheet to process. | -| `--ai-command ` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | `--ai-command "codex --ask-for-approval never exec --sandbox read-only --ephemeral"` | Command used to invoke the AI generator. The prompt is passed through standard input. When the command is `codex exec`, the script also passes `--output-schema` and `--output-last-message`. | -| `--ai-model ` | No | `gpt-5.4` | `--ai-model gpt-5.4` | Model name passed to `codex exec` with `-m`. | +| `--ai-provider ` | No | `codex` | `--ai-provider azure` | AI provider to use. `codex` runs the Codex CLI as a subprocess. `azure` calls Azure OpenAI via the OpenAI Python SDK (requires `AZURE_OPENAI_KEY` and `AZURE_OPENAI_BASE_URL` or `OPENAI_BASE_URL` environment variables). | +| `--ai-command ` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | `--ai-command "codex --ask-for-approval never exec --sandbox read-only --ephemeral"` | Command used to invoke the AI generator (only used with `--ai-provider codex`). The prompt is passed through standard input. When the command is `codex exec`, the script also passes `--output-schema` and `--output-last-message`. | +| `--ai-model ` | No | `gpt-5.4` | `--ai-model gpt-5.4` | Model name. Passed to `codex exec` with `-m`, or used as the model parameter for Azure OpenAI. | | `--involve-ai-generation ` | No | `ON` | `--involve-ai-generation OFF` | Whether to generate non-duplicate release notes with AI. Use `ON` to invoke AI, or `OFF` to use the source `formated_release_note` values. | | `--output-release-file ` | No | Conditional | `--output-release-file /path/to/release-8.5.7.md` | Write the generated Markdown to a custom path. 
By default, the output under `--releases-dir` is `release--updated-by-ai.md` if `release-.md` already exists, otherwise `release-.md`. | | `--ai-timeout ` | No | `600` | `--ai-timeout 600` | Timeout in seconds for each AI command invocation. | diff --git a/scripts/release_notes_ai/ai_client.py b/scripts/release_notes_ai/ai_client.py index d2770e3fbc56c..aaf56ea013e03 100644 --- a/scripts/release_notes_ai/ai_client.py +++ b/scripts/release_notes_ai/ai_client.py @@ -21,10 +21,7 @@ class AIClient: - def __init__(self, command: str, model: str | None, timeout: int): - self.command = shlex.split(command) - self.model = model - self.timeout = timeout + """Base AI client with shared generation and validation logic.""" def generate(self, prompt: str, expected_links: list[str], contributors: list[str]) -> GeneratedNote: result, errors = self._run_and_validate(prompt, expected_links, contributors) @@ -47,6 +44,18 @@ def _run_and_validate( return None, [str(exc)] return validate_ai_response(data, expected_links, contributors) + def _run(self, prompt: str) -> str: + raise NotImplementedError("Subclasses must implement _run") + + +class CodexAIClient(AIClient): + """AI client that invokes the Codex CLI as a subprocess.""" + + def __init__(self, command: str, model: str | None, timeout: int): + self.command = shlex.split(command) + self.model = model + self.timeout = timeout + def _run(self, prompt: str) -> str: command = list(self.command) if not command: @@ -97,6 +106,44 @@ def _is_codex_exec(command: list[str]) -> bool: return executable == "codex" and "exec" in command[1:] +class AzureOpenAIClient(AIClient): + """AI client that calls Azure OpenAI via the OpenAI Python SDK.""" + + DEFAULT_MODEL = "gpt-5.4" + MAX_OUTPUT_TOKENS = 16384 + TEMPERATURE = 0.1 + + def __init__(self, model: str | None, timeout: int): + from openai import OpenAI + + key = os.environ.get("AZURE_OPENAI_KEY", "") + base_url = ( + os.environ.get("AZURE_OPENAI_BASE_URL") + or 
os.environ.get("OPENAI_BASE_URL", "") + ) + if not key: + raise ValueError( + "AZURE_OPENAI_KEY environment variable is required " + "when using --ai-provider azure" + ) + if not base_url: + raise ValueError( + "AZURE_OPENAI_BASE_URL or OPENAI_BASE_URL environment variable " + "is required when using --ai-provider azure" + ) + self.client = OpenAI(api_key=key, base_url=base_url, timeout=timeout) + self.model = model or self.DEFAULT_MODEL + + def _run(self, prompt: str) -> str: + response = self.client.responses.create( + model=self.model, + input=[{"role": "user", "content": prompt}], + temperature=self.TEMPERATURE, + max_output_tokens=self.MAX_OUTPUT_TOKENS, + ) + return response.output_text.strip() + + def is_executable_available(executable: str) -> bool: if os.sep in executable or (os.altsep and os.altsep in executable): return Path(executable).exists() diff --git a/scripts/release_notes_ai/cli.py b/scripts/release_notes_ai/cli.py index b1e913a7bca0d..861011e1fd6b7 100644 --- a/scripts/release_notes_ai/cli.py +++ b/scripts/release_notes_ai/cli.py @@ -7,7 +7,7 @@ import openpyxl -from .ai_client import AIClient +from .ai_client import AzureOpenAIClient, CodexAIClient from .excel_workbook import ( clear_output_columns, generate_notes_without_ai, @@ -36,15 +36,26 @@ def parse_args() -> argparse.Namespace: help="Path to the existing English release notes directory.", ) parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.") + parser.add_argument( + "--ai-provider", + choices=["codex", "azure"], + default="codex", + help=( + "AI provider to use. 'codex' runs the Codex CLI as a subprocess " + "(requires codex to be installed). 'azure' calls Azure OpenAI via the " + "OpenAI Python SDK (requires AZURE_OPENAI_KEY and AZURE_OPENAI_BASE_URL " + "or OPENAI_BASE_URL environment variables). Default: codex." 
+ ), + ) parser.add_argument( "--ai-command", default="codex --ask-for-approval never exec --sandbox read-only --ephemeral", - help="Command-line AI command. The prompt is passed through stdin.", + help="Command-line AI command (only used with --ai-provider codex). The prompt is passed through stdin.", ) parser.add_argument( "--ai-model", default="gpt-5.4", - help="Model name passed to codex exec with -m.", + help="Model name. Passed to codex exec with -m, or used as the model parameter for Azure OpenAI.", ) parser.add_argument( "--involve-ai-generation", @@ -142,7 +153,13 @@ def main() -> int: raise SystemExit(f"error: {exc}") from None github = GitHubClient(token) involve_ai_generation = args.involve_ai_generation == "ON" - ai = AIClient(args.ai_command, args.ai_model, args.ai_timeout) if involve_ai_generation else None + if involve_ai_generation: + if args.ai_provider == "azure": + ai = AzureOpenAIClient(args.ai_model, args.ai_timeout) + else: + ai = CodexAIClient(args.ai_command, args.ai_model, args.ai_timeout) + else: + ai = None output_file = ( Path(args.output_release_file) diff --git a/scripts/release_notes_ai/requirements.txt b/scripts/release_notes_ai/requirements.txt index 89cfc13a2a578..1168030c0fdca 100644 --- a/scripts/release_notes_ai/requirements.txt +++ b/scripts/release_notes_ai/requirements.txt @@ -1,3 +1,4 @@ openpyxl>=3.1 +openai>=1.66 requests>=2.31 urllib3>=1.26 From 2750b3459483250da85a5dc8584fa3f98e6ae4cb Mon Sep 17 00:00:00 2001 From: qiancai Date: Tue, 16 Jun 2026 13:18:50 +0800 Subject: [PATCH 08/22] rename scripts --- .../__main__.py} | 13 ++++--- .../ai_client.py | 2 +- .../cli.py | 0 .../constants.py | 2 +- .../excel_workbook.py | 0 .../github_client.py | 0 .../markdown_writer.py | 0 .../models.py | 0 .../prompts/generation.md | 0 .../release-notes-generator-readme.md | 36 +++++++++++++++---- .../requirements.txt | 0 .../scope_filter.py | 0 .../utils.py | 0 13 files changed, 37 insertions(+), 16 deletions(-) rename 
scripts/{release_notes_generate_ai.py => release-notes-ai-generator/__main__.py} (52%) rename scripts/{release_notes_ai => release-notes-ai-generator}/ai_client.py (99%) rename scripts/{release_notes_ai => release-notes-ai-generator}/cli.py (100%) rename scripts/{release_notes_ai => release-notes-ai-generator}/constants.py (96%) rename scripts/{release_notes_ai => release-notes-ai-generator}/excel_workbook.py (100%) rename scripts/{release_notes_ai => release-notes-ai-generator}/github_client.py (100%) rename scripts/{release_notes_ai => release-notes-ai-generator}/markdown_writer.py (100%) rename scripts/{release_notes_ai => release-notes-ai-generator}/models.py (100%) rename scripts/{release_notes_ai => release-notes-ai-generator}/prompts/generation.md (100%) rename scripts/{ => release-notes-ai-generator}/release-notes-generator-readme.md (94%) rename scripts/{release_notes_ai => release-notes-ai-generator}/requirements.txt (100%) rename scripts/{release_notes_ai => release-notes-ai-generator}/scope_filter.py (100%) rename scripts/{release_notes_ai => release-notes-ai-generator}/utils.py (100%) diff --git a/scripts/release_notes_generate_ai.py b/scripts/release-notes-ai-generator/__main__.py similarity index 52% rename from scripts/release_notes_generate_ai.py rename to scripts/release-notes-ai-generator/__main__.py index 0a6d3eb761268..17ee380d0ea0c 100644 --- a/scripts/release_notes_generate_ai.py +++ b/scripts/release-notes-ai-generator/__main__.py @@ -3,18 +3,17 @@ """Generate TiDB improvements and bug fixes for release notes according to PRs and issues in a specified excel file. -Typical usage: +Typical usage (run from the scripts/ directory): - python3 scripts/release_notes_generate_ai.py \ + python3 -m release-notes-ai-generator \ --version 8.5.7 \ --excel /path/to/release-note-excel.xlsx \ --releases-dir releases + --ai-provider azure -For detailed usage and options, see scripts/release-notes-generator-readme.md. 
+For detailed usage and options, see release-notes-generator-readme.md in this directory. """ -from release_notes_ai.cli import main +from .cli import main - -if __name__ == "__main__": - raise SystemExit(main()) +raise SystemExit(main()) diff --git a/scripts/release_notes_ai/ai_client.py b/scripts/release-notes-ai-generator/ai_client.py similarity index 99% rename from scripts/release_notes_ai/ai_client.py rename to scripts/release-notes-ai-generator/ai_client.py index aaf56ea013e03..ca3a2d4c783ab 100644 --- a/scripts/release_notes_ai/ai_client.py +++ b/scripts/release-notes-ai-generator/ai_client.py @@ -241,7 +241,7 @@ def load_prompt_template(path: Path) -> str: except FileNotFoundError as exc: raise FileNotFoundError( f"Cannot find release-note prompt template: {path}. " - "Make sure scripts/release_notes_ai/prompts/generation.md exists." + "Make sure scripts/release-notes-ai-generator/prompts/generation.md exists." ) from exc diff --git a/scripts/release_notes_ai/cli.py b/scripts/release-notes-ai-generator/cli.py similarity index 100% rename from scripts/release_notes_ai/cli.py rename to scripts/release-notes-ai-generator/cli.py diff --git a/scripts/release_notes_ai/constants.py b/scripts/release-notes-ai-generator/constants.py similarity index 96% rename from scripts/release_notes_ai/constants.py rename to scripts/release-notes-ai-generator/constants.py index 12d0d9ca8dcee..3220aa9fcfc2f 100644 --- a/scripts/release_notes_ai/constants.py +++ b/scripts/release-notes-ai-generator/constants.py @@ -22,7 +22,7 @@ / "bug-fixes.md" ) GENERATION_PROMPT_TEMPLATE = ( - REPO_ROOT / "scripts" / "release_notes_ai" / "prompts" / "generation.md" + REPO_ROOT / "scripts" / "release-notes-ai-generator" / "prompts" / "generation.md" ) BOT_AUTHORS = {"ti-chi-bot", "ti-srebot"} diff --git a/scripts/release_notes_ai/excel_workbook.py b/scripts/release-notes-ai-generator/excel_workbook.py similarity index 100% rename from scripts/release_notes_ai/excel_workbook.py rename to 
scripts/release-notes-ai-generator/excel_workbook.py diff --git a/scripts/release_notes_ai/github_client.py b/scripts/release-notes-ai-generator/github_client.py similarity index 100% rename from scripts/release_notes_ai/github_client.py rename to scripts/release-notes-ai-generator/github_client.py diff --git a/scripts/release_notes_ai/markdown_writer.py b/scripts/release-notes-ai-generator/markdown_writer.py similarity index 100% rename from scripts/release_notes_ai/markdown_writer.py rename to scripts/release-notes-ai-generator/markdown_writer.py diff --git a/scripts/release_notes_ai/models.py b/scripts/release-notes-ai-generator/models.py similarity index 100% rename from scripts/release_notes_ai/models.py rename to scripts/release-notes-ai-generator/models.py diff --git a/scripts/release_notes_ai/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md similarity index 100% rename from scripts/release_notes_ai/prompts/generation.md rename to scripts/release-notes-ai-generator/prompts/generation.md diff --git a/scripts/release-notes-generator-readme.md b/scripts/release-notes-ai-generator/release-notes-generator-readme.md similarity index 94% rename from scripts/release-notes-generator-readme.md rename to scripts/release-notes-ai-generator/release-notes-generator-readme.md index f4cf162d57fdd..27f7897c8c83a 100644 --- a/scripts/release-notes-generator-readme.md +++ b/scripts/release-notes-ai-generator/release-notes-generator-readme.md @@ -1,6 +1,6 @@ # Release notes generator -`scripts/release_notes_generate_ai.py` generates English TiDB release notes for the `Improvements` and `Bug fixes` sections according to PRs and issues in a Excel workbook. +`python3 -m release-notes-ai-generator` (run from the `scripts/` directory) generates English TiDB release notes for the `Improvements` and `Bug fixes` sections according to PRs and issues in an Excel workbook.
The generator keeps the source workbook unchanged, writes all processing results to a processed workbook, and renders the generated entries to a Markdown release note file. @@ -33,13 +33,12 @@ The generator keeps the source workbook unchanged, writes all processing results The generator does not create a complete formal release note. It does not generate sections such as compatibility changes, known issues, deprecations, or upgrade notes. - ## Prerequisites - Install Python dependencies: ```bash - python3 -m pip install -r scripts/release_notes_ai/requirements.txt + python3 -m pip install -r scripts/release-notes-ai-generator/requirements.txt ``` - Prepare a GitHub token with access to the public repositories and set the GitHub token in the `GITHUB_TOKEN` environment variable: @@ -48,17 +47,40 @@ The generator does not create a complete formal release note. It does not genera export GITHUB_TOKEN= ``` -- Install and log in to Codex CLI. The default `--ai-command` uses `codex exec`, so the installed Codex CLI must support `exec`, `--sandbox read-only`, `--ephemeral`, `--output-schema`, `--output-last-message`, and `-m `. If you use `--ai-provider azure` instead, Codex CLI is not required; set `AZURE_OPENAI_KEY` and `AZURE_OPENAI_BASE_URL` (or `OPENAI_BASE_URL`) environment variables. +- Prepare the AI settings in your environment. + + - If you use `--ai-provider azure`, set the following environment variables: + + ```bash + export AZURE_OPENAI_KEY= + export AZURE_OPENAI_BASE_URL= + ``` + + - If you use Codex CLI, install and log in to Codex CLI. The default `--ai-command` uses `codex exec`, so the installed Codex CLI must support `exec`, `--sandbox read-only`, `--ephemeral`, `--output-schema`, `--output-last-message`, and `-m `.
+ +## Typical usage examples -## Typical usage +Use Codex to generate release notes: ```bash -python3 scripts/release_notes_generate_ai.py \ +cd scripts +python3 -m release-notes-ai-generator \ --version 8.5.7 \ --excel /path/to/release-note-excel.xlsx \ --releases-dir releases ``` +Use Azure OpenAI to generate release notes: + +```bash +cd scripts +python3 -m release-notes-ai-generator \ + --version 8.5.7 \ + --excel /path/to/release-note-excel.xlsx \ + --releases-dir releases \ + --ai-provider azure +``` + ## Option descriptions | Option | Required | Default value | Usage example | Description | @@ -238,7 +260,7 @@ The prompt includes: - GitHub issue titles, bodies, and labels. - GitHub PR titles, bodies, authors, branches, merge times, and changed-file summaries. - The repository-local writing references for improvements and bug fixes. -- The prompt template in `scripts/release_notes_ai/prompts/generation.md`. +- The prompt template in `scripts/release-notes-ai-generator/prompts/generation.md`. 
The AI command must return a JSON object with these fields: diff --git a/scripts/release_notes_ai/requirements.txt b/scripts/release-notes-ai-generator/requirements.txt similarity index 100% rename from scripts/release_notes_ai/requirements.txt rename to scripts/release-notes-ai-generator/requirements.txt diff --git a/scripts/release_notes_ai/scope_filter.py b/scripts/release-notes-ai-generator/scope_filter.py similarity index 100% rename from scripts/release_notes_ai/scope_filter.py rename to scripts/release-notes-ai-generator/scope_filter.py diff --git a/scripts/release_notes_ai/utils.py b/scripts/release-notes-ai-generator/utils.py similarity index 100% rename from scripts/release_notes_ai/utils.py rename to scripts/release-notes-ai-generator/utils.py From 9bc33af60a1c05ebd55dfc4db38a0eae627b0993 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 15:21:24 +0800 Subject: [PATCH 09/22] add the logic to determine whether release note is needed for a PR/issue --- .../release-notes-ai-generator/ai_client.py | 37 ++++++---- .../excel_workbook.py | 69 +++++++++++++++++++ .../prompts/generation.md | 43 ++++++++++-- 3 files changed, 130 insertions(+), 19 deletions(-) diff --git a/scripts/release-notes-ai-generator/ai_client.py b/scripts/release-notes-ai-generator/ai_client.py index ca3a2d4c783ab..0bd7b2d3f050c 100644 --- a/scripts/release-notes-ai-generator/ai_client.py +++ b/scripts/release-notes-ai-generator/ai_client.py @@ -156,7 +156,7 @@ def ai_output_schema() -> dict[str, Any]: "additionalProperties": False, "required": ["type", "release_note", "needs_review", "reason"], "properties": { - "type": {"type": "string", "enum": ["improvement", "bug_fix"]}, + "type": {"type": "string", "enum": ["improvement", "bug_fix", "not_needed"]}, "release_note": {"type": "string"}, "needs_review": {"type": "boolean"}, "reason": {"type": "string"}, @@ -311,25 +311,32 @@ def validate_ai_response( needs_review = data.get("needs_review") reason = data.get("reason") - if 
note_type not in {"improvement", "bug_fix"}: - errors.append('type must be "improvement" or "bug_fix"') - if not isinstance(release_note, str) or not release_note.startswith("- "): - errors.append('release_note must be a string that starts with "- "') - if isinstance(release_note, str) and release_note.rstrip().endswith("."): - errors.append("release_note must not end with a period") + if note_type not in {"improvement", "bug_fix", "not_needed"}: + errors.append('type must be "improvement", "bug_fix", or "not_needed"') if not isinstance(needs_review, bool): errors.append("needs_review must be a boolean") if not isinstance(reason, str): errors.append("reason must be a string") - if isinstance(release_note, str): - for link in expected_links: - if link and link not in release_note: - errors.append(f"release_note is missing expected link: {link}") - for contributor in contributors: - expected = f"@[{contributor}](https://github.com/{contributor})" - if contributor and expected not in release_note: - errors.append(f"release_note is missing contributor: {contributor}") + if note_type == "not_needed": + if not isinstance(release_note, str) or not release_note.startswith("Release note is not needed:"): + errors.append( + 'when type is "not_needed", release_note must start with ' + '"Release note is not needed:"' + ) + else: + if not isinstance(release_note, str) or not release_note.startswith("- "): + errors.append('release_note must be a string that starts with "- "') + if isinstance(release_note, str) and release_note.rstrip().endswith("."): + errors.append("release_note must not end with a period") + if isinstance(release_note, str): + for link in expected_links: + if link and link not in release_note: + errors.append(f"release_note is missing expected link: {link}") + for contributor in contributors: + expected = f"@[{contributor}](https://github.com/{contributor})" + if contributor and expected not in release_note: + errors.append(f"release_note is missing 
contributor: {contributor}") if errors: return None, errors diff --git a/scripts/release-notes-ai-generator/excel_workbook.py b/scripts/release-notes-ai-generator/excel_workbook.py index b3de7e92b2ad3..d0149d4ba6885 100644 --- a/scripts/release-notes-ai-generator/excel_workbook.py +++ b/scripts/release-notes-ai-generator/excel_workbook.py @@ -43,6 +43,7 @@ GRAY_FILL = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid") +NOT_NEEDED_PREFIX = "Release note is not needed:" SAME_SERIES_REASON_HEADER = "reason" @@ -343,6 +344,59 @@ def move_rows_with_issues_already_in_same_series( return len(rows_to_move) +def move_not_needed_rows_to_sheet( + workbook: Any, + sheet: Any, + header: dict[str, int], +) -> int: + """Move rows where AI determined no release note is needed to a separate sheet.""" + ai_col = header["release_notes_written_by_ai"] + target_sheet_name = "release_note_not_needed" + + rows_to_move: list[int] = [] + for row_number in range(2, sheet.max_row + 1): + ai_value = str_value(sheet.cell(row=row_number, column=ai_col).value) + if ai_value.startswith(NOT_NEEDED_PREFIX): + rows_to_move.append(row_number) + + if not rows_to_move: + return 0 + + if target_sheet_name in workbook.sheetnames: + target = workbook[target_sheet_name] + if not str_value(target.cell(row=1, column=1).value): + copy_header_row(sheet, target) + else: + target = workbook.create_sheet(target_sheet_name) + copy_header_row(sheet, target) + + for row_number in rows_to_move: + target_row = target.max_row + 1 + for column in range(1, sheet.max_column + 1): + copy_cell( + sheet.cell(row=row_number, column=column), + target.cell(row=target_row, column=column), + ) + + for row_number in reversed(rows_to_move): + sheet.delete_rows(row_number, 1) + + print( + f"Moved {len(rows_to_move)} row(s) to sheet '{target_sheet_name}' " + "(release note not needed)", + flush=True, + ) + return len(rows_to_move) + + +def copy_header_row(source_sheet: Any, target_sheet: Any) -> None: + for 
column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=1, column=column), + target_sheet.cell(row=1, column=column), + ) + + def same_series_release_files_by_issue_url( existing_notes: list[ExistingNote], version: str, @@ -708,6 +762,9 @@ def generate_notes_for_sheet( existing_note = str_value(ai_cell.value) if is_reusable_ai_note(existing_note): + if is_not_needed_note(existing_note): + print(f"Row {row_number}: skipped existing not-needed verdict", flush=True) + continue note_type = classify_note_type_from_text(existing_note, row_input.issue_type) entries_by_row[row_number] = [ MarkdownEntry( @@ -831,6 +888,10 @@ def is_reusable_ai_note(note: str) -> bool: return bool(note) and not note.startswith("AI_GENERATION_FAILED:") +def is_not_needed_note(note: str) -> bool: + return note.startswith(NOT_NEEDED_PREFIX) + + def prefetch_github_data(row_inputs: list[RowInput], github: Any, github_workers: int) -> GitHubDataCache: issue_urls = unique_ordered(url for row_input in row_inputs for url in row_input.issue_urls) pr_urls = unique_ordered(url for row_input in row_inputs for url in row_input.pr_urls) @@ -959,6 +1020,14 @@ def apply_generation_result( ) return + if result.note_type == "not_needed": + ai_cell.value = result.note + print( + f"Row {result.row_number}: {result.note}", + flush=True, + ) + return + ai_cell.value = result.note entries_by_row[result.row_number] = [ MarkdownEntry(result.note_type, result.component, result.note, result.raw_component) diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index 7e57d73318959..66b3526acdbc0 100644 --- a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -2,16 +2,51 @@ You are a senior technical writer who has profound knowledge of TiDB. -Your task is to write exactly one English release note entry for a TiDB issue or PR. 
+Your task is to evaluate whether a TiDB issue or PR needs a release note. + +- If yes, write exactly one English release note entry for it. +- If not, return a "Release note is not needed" verdict and a short reason. + +## Step 1: Determine whether a release note is needed + +Not every PR or change warrants a release note. Before writing, determine whether the change is visible to TiDB users or operators. + +### User-visible changes (write a release note) + +- Bug fixes that change query results, upgrade behavior, privilege checks, error messages, or compatibility +- New features, new SQL syntax or function support, or new configuration options +- Meaningful performance improvements observable in common operations +- Behavior changes that affect upgrade paths, tooling integration, or operational workflows +- Default value changes for system variables or configuration parameters + +### Internal-only changes (no release note needed) + +- Test-only changes: new test cases, flaky test fixes, test infrastructure updates +- Pure refactors or internal data-structure changes with no user-observable effect +- Added or improved debug/internal logs that do not surface in user-facing interfaces +- Internal CI/CD pipeline changes or developer workflow changes +- Code comments or source-code-only documentation changes (not user-facing docs) + +### Borderline cases + +If a PR is mostly internal but the outcome is user-visible, write a release note that describes the outcome and omit the implementation details. If the only user-facing effect is indirect or speculative, lean toward returning a "not_needed" verdict. 
+ +## Step 2: Return your result Return only a JSON object with exactly these keys: -- type: "improvement" or "bug_fix" -- release_note: one Markdown bullet that starts with "- " +- type: "improvement", "bug_fix", or "not_needed" +- release_note: one Markdown bullet that starts with "- " (when type is "improvement" or "bug_fix"), or "Release note is not needed: " (when type is "not_needed") - needs_review: true or false - reason: a short reason for the type and wording -Rules: +When type is "not_needed", use a short reason in the release_note field. Examples: +- "Release note is not needed: test-only change" +- "Release note is not needed: internal refactor, no user-visible effect" +- "Release note is not needed: flaky test fix" +- "Release note is not needed: added internal debug logging" + +## Rules (apply only when writing a release note) - Write from the user's perspective. - Use the Excel issue_type as a strong signal, but decide the final type from the issue, PR description, and code changes. From b298a2f77835d7d7aff11078d6697246c3c06416 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 15:52:13 +0800 Subject: [PATCH 10/22] use a two-phase workflow --- .../release-notes-ai-generator/__main__.py | 14 +- scripts/release-notes-ai-generator/cli.py | 261 ++++++++++++++---- .../excel_workbook.py | 96 ++++++- .../release-notes-generator-readme.md | 133 +++++++-- 4 files changed, 413 insertions(+), 91 deletions(-) diff --git a/scripts/release-notes-ai-generator/__main__.py b/scripts/release-notes-ai-generator/__main__.py index 17ee380d0ea0c..114f4057f1c34 100644 --- a/scripts/release-notes-ai-generator/__main__.py +++ b/scripts/release-notes-ai-generator/__main__.py @@ -3,14 +3,22 @@ """Generate TiDB improvements and bug fixes for release notes according to PRs and issues in a specified excel file. 
-Typical usage (run from the scripts/ directory): +Two-phase workflow (run from the scripts/ directory): - python3 -m release-notes-ai-generator \ + # Phase 1: Process Excel, call AI, write results to Excel + python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ --excel /path/to/release-note-excel.xlsx \ - --releases-dir releases + --releases-dir releases \ --ai-provider azure + # Phase 2: Export Markdown from the processed Excel + python3 -m release-notes-ai-generator export-markdown \ + --version 8.5.7 \ + --excel /path/to/release-note-excel_processed.xlsx \ + --releases-dir releases \ + --release-date "August 14, 2025" + For detailed usage and options, see release-notes-generator-readme.md in this directory. """ diff --git a/scripts/release-notes-ai-generator/cli.py b/scripts/release-notes-ai-generator/cli.py index 861011e1fd6b7..f2d68655e070e 100644 --- a/scripts/release-notes-ai-generator/cli.py +++ b/scripts/release-notes-ai-generator/cli.py @@ -10,9 +10,12 @@ from .ai_client import AzureOpenAIClient, CodexAIClient from .excel_workbook import ( clear_output_columns, + collect_markdown_entries_from_sheet, generate_notes_without_ai, generate_notes_for_sheet, + get_header, merge_rows_by_issue_and_component, + move_not_needed_rows_to_sheet, move_rows_with_issues_already_in_same_series, prepare_sheet_columns, sort_sheet_rows_by_component, @@ -26,8 +29,43 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Generate English release notes with AI according to PRs and issues in a specified excel file." + description=( + "Generate English release notes with AI according to PRs and issues " + "in a specified Excel file. Use subcommands 'generate' and 'export-markdown' " + "to run the two phases independently." 
+ ), + ) + subparsers = parser.add_subparsers(dest="command") + + # --- Phase 1: generate --- + gen_parser = subparsers.add_parser( + "generate", + help=( + "Phase 1: Process the Excel workbook — run preprocessing, call AI to " + "generate release notes, and write results back to Excel. " + "Does NOT produce a Markdown file." + ), + ) + add_generate_args(gen_parser) + + # --- Phase 2: export-markdown --- + export_parser = subparsers.add_parser( + "export-markdown", + help=( + "Phase 2: Read a processed Excel workbook and export a Markdown " + "release-note file. Does NOT call AI or modify the Excel." + ), ) + add_export_markdown_args(export_parser) + + args = parser.parse_args() + if not args.command: + parser.print_help() + raise SystemExit(1) + return args + + +def add_generate_args(parser: argparse.ArgumentParser) -> None: parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.") parser.add_argument("--excel", required=True, help="Path to the release note Excel workbook.") parser.add_argument( @@ -63,14 +101,7 @@ def parse_args() -> argparse.Namespace: default="ON", help=( "Whether to use AI for non-dup release notes. Use ON to generate with AI, " - "or OFF to output the original formated_release_note values. Default: ON." - ), - ) - parser.add_argument( - "--output-release-file", - help=( - "Output Markdown file. Defaults to release-{version}-updated-by-ai.md " - "if release-{version}.md already exists, otherwise release-{version}.md." + "or OFF to skip AI generation and only run preprocessing. Default: ON." 
), ) parser.add_argument( @@ -114,11 +145,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Clear existing AI release notes and regenerate all non-dup rows.", ) - parser.add_argument( - "--release-date", - default="TBD", - help='Release date text for the Markdown header, for example "August 14, 2025".', - ) parser.add_argument( "--skip-scope-preprocess", action="store_true", @@ -131,11 +157,74 @@ def parse_args() -> argparse.Namespace: "preprocessing, in YYYY-MM-DD format." ), ) - return parser.parse_args() + parser.add_argument( + "--start-row", + type=int, + default=None, + help=( + "Excel row number to start processing from (1-indexed, row 1 is the header). " + "Use this to resume from a previous interruption. When specified, " + "preprocessing steps (sort, merge, scope filter, same-series move) are " + "skipped because they should have been completed in the first run. " + "Default: process all data rows." + ), + ) + parser.add_argument( + "--end-row", + type=int, + default=None, + help=( + "Excel row number to stop processing at (inclusive, 1-indexed). " + "Default: last row in the sheet." + ), + ) + parser.add_argument( + "--output-excel", + default=None, + help=( + "Path for the processed Excel output. " + "Default: _processed.xlsx in the same directory." + ), + ) + + +def add_export_markdown_args(parser: argparse.ArgumentParser) -> None: + parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.") + parser.add_argument( + "--excel", + required=True, + help="Path to the processed Excel workbook (output of the 'generate' phase).", + ) + parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.") + parser.add_argument( + "--releases-dir", + required=True, + help="Path to the existing English release notes directory (used for default output path).", + ) + parser.add_argument( + "--output-release-file", + help=( + "Output Markdown file. 
Defaults to release-{version}-updated-by-ai.md " + "if release-{version}.md already exists, otherwise release-{version}.md." + ), + ) + parser.add_argument( + "--release-date", + default="TBD", + help='Release date text for the Markdown header, for example "August 14, 2025".', + ) def main() -> int: args = parse_args() + if args.command == "generate": + return run_generate(args) + if args.command == "export-markdown": + return run_export_markdown(args) + return 1 + + +def run_generate(args: argparse.Namespace) -> int: validate_positive_int("--ai-workers", args.ai_workers) validate_positive_int("--github-workers", args.github_workers) validate_positive_int("--author-workers", args.author_workers) @@ -147,6 +236,16 @@ def main() -> int: if not base_branch_start_date: raise ValueError("--scope-base-branch-start-date must use YYYY-MM-DD format") + row_range_specified = args.start_row is not None or args.end_row is not None + start_row = args.start_row + end_row = args.end_row + if start_row is not None and start_row < 2: + raise ValueError("--start-row must be >= 2 (row 1 is the header)") + if end_row is not None and end_row < 2: + raise ValueError("--end-row must be >= 2 (row 1 is the header)") + if start_row is not None and end_row is not None and start_row > end_row: + raise ValueError("--start-row must be <= --end-row") + try: token = load_github_token() except ValueError as exc: @@ -161,47 +260,59 @@ def main() -> int: else: ai = None - output_file = ( - Path(args.output_release_file) - if args.output_release_file - else default_output_release_file(Path(args.releases_dir), args.version) - ) - excel_path = Path(args.excel) - processed_excel_path = default_processed_excel_path(excel_path) + processed_excel_path = ( + Path(args.output_excel) if args.output_excel + else default_processed_excel_path(excel_path) + ) workbook = openpyxl.load_workbook(excel_path) if args.sheet not in workbook.sheetnames: raise ValueError(f"Cannot find sheet {args.sheet!r} in 
{args.excel}") sheet = workbook[args.sheet] - if not args.skip_scope_preprocess: - move_prs_not_in_scope( + + if row_range_specified: + print( + f"Row range specified: processing rows " + f"{start_row or 2} to {end_row or sheet.max_row} " + f"(skipping preprocessing steps)", + flush=True, + ) + header = prepare_sheet_columns(sheet) + if args.force_regenerate: + clear_output_columns( + sheet, header, clear_ai=True, + start_row=start_row, end_row=end_row, + ) + else: + if not args.skip_scope_preprocess: + move_prs_not_in_scope( + workbook, + sheet, + args.version, + Path(args.releases_dir), + github, + base_branch_start_date=base_branch_start_date, + ) + sort_sheet_rows_by_component(sheet) + header = prepare_sheet_columns(sheet) + clear_output_columns(sheet, header, clear_ai=args.force_regenerate) + + existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version) + move_rows_with_issues_already_in_same_series( workbook, sheet, + header, + existing_notes, args.version, - Path(args.releases_dir), + ) + update_pr_authors_and_dup_notes( + sheet, + header, + existing_notes, github, - base_branch_start_date=base_branch_start_date, + author_workers=args.author_workers, ) - sort_sheet_rows_by_component(sheet) - header = prepare_sheet_columns(sheet) - clear_output_columns(sheet, header, clear_ai=args.force_regenerate) - - existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version) - move_rows_with_issues_already_in_same_series( - workbook, - sheet, - header, - existing_notes, - args.version, - ) - update_pr_authors_and_dup_notes( - sheet, - header, - existing_notes, - github, - author_workers=args.author_workers, - ) - merge_rows_by_issue_and_component(sheet, header) + merge_rows_by_issue_and_component(sheet, header) if involve_ai_generation: checkpoint_callback = build_checkpoint_callback( @@ -209,7 +320,7 @@ def main() -> int: processed_excel_path, args.checkpoint_interval, ) - markdown_entries = generate_notes_for_sheet( + 
generate_notes_for_sheet( sheet, header, github, @@ -217,15 +328,58 @@ def main() -> int: ai_workers=args.ai_workers, github_workers=args.github_workers, checkpoint_callback=checkpoint_callback, + start_row=start_row, + end_row=end_row, ) else: - markdown_entries = generate_notes_without_ai(sheet, header) + generate_notes_without_ai( + sheet, header, start_row=start_row, end_row=end_row, + ) + + move_not_needed_rows_to_sheet( + workbook, sheet, header, start_row=start_row, end_row=end_row, + ) save_workbook_safely(workbook, processed_excel_path) + + print(f"Phase 1 (generate) completed.", flush=True) + print(f" Input Excel: {excel_path}", flush=True) + print(f" Processed Excel: {processed_excel_path}", flush=True) + print( + f" Next step: run 'export-markdown' with --excel {processed_excel_path} " + f"to generate the Markdown file.", + flush=True, + ) + return 0 + + +def run_export_markdown(args: argparse.Namespace) -> int: + excel_path = Path(args.excel) + output_file = ( + Path(args.output_release_file) + if args.output_release_file + else default_output_release_file(Path(args.releases_dir), args.version) + ) + + workbook = openpyxl.load_workbook(excel_path, data_only=True) + if args.sheet not in workbook.sheetnames: + raise ValueError(f"Cannot find sheet {args.sheet!r} in {args.excel}") + sheet = workbook[args.sheet] + header = get_header(sheet) + + if "release_notes_written_by_ai" not in header: + raise ValueError( + f"Sheet {args.sheet!r} does not have a 'release_notes_written_by_ai' column. " + "Make sure you are pointing to the processed Excel from the 'generate' phase." 
+ ) + + markdown_entries = collect_markdown_entries_from_sheet(sheet, header) + workbook.close() + write_release_file(output_file, args.version, args.release_date, markdown_entries) - print(f"Original Excel workbook unchanged: {excel_path}", flush=True) - print(f"Processed Excel workbook: {processed_excel_path}", flush=True) - print(f"Generated release note file: {output_file}", flush=True) + print(f"Phase 2 (export-markdown) completed.", flush=True) + print(f" Input Excel: {excel_path}", flush=True) + print(f" Generated release note file: {output_file}", flush=True) return 0 @@ -249,7 +403,10 @@ def default_output_release_file(releases_dir: Path, version: str) -> Path: def default_processed_excel_path(excel_path: Path) -> Path: - return excel_path.with_name(f"{excel_path.stem}_processed{excel_path.suffix}") + stem = excel_path.stem + if stem.endswith("_processed"): + return excel_path + return excel_path.with_name(f"{stem}_processed{excel_path.suffix}") def build_checkpoint_callback( diff --git a/scripts/release-notes-ai-generator/excel_workbook.py b/scripts/release-notes-ai-generator/excel_workbook.py index d0149d4ba6885..d2ac849a6c3ab 100644 --- a/scripts/release-notes-ai-generator/excel_workbook.py +++ b/scripts/release-notes-ai-generator/excel_workbook.py @@ -76,8 +76,16 @@ def get_header(sheet: Any) -> dict[str, int]: return header -def clear_output_columns(sheet: Any, header: dict[str, int], clear_ai: bool = True) -> None: - for row_number in range(2, sheet.max_row + 1): +def clear_output_columns( + sheet: Any, + header: dict[str, int], + clear_ai: bool = True, + start_row: int | None = None, + end_row: int | None = None, +) -> None: + effective_start = start_row if start_row is not None else 2 + effective_end = end_row if end_row is not None else sheet.max_row + for row_number in range(effective_start, effective_end + 1): if clear_ai: sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None sheet.cell(row=row_number, 
column=header["published_release_notes"]).value = None @@ -348,13 +356,17 @@ def move_not_needed_rows_to_sheet( workbook: Any, sheet: Any, header: dict[str, int], + start_row: int | None = None, + end_row: int | None = None, ) -> int: """Move rows where AI determined no release note is needed to a separate sheet.""" ai_col = header["release_notes_written_by_ai"] target_sheet_name = "release_note_not_needed" + effective_start = start_row if start_row is not None else 2 + effective_end = end_row if end_row is not None else sheet.max_row rows_to_move: list[int] = [] - for row_number in range(2, sheet.max_row + 1): + for row_number in range(effective_start, effective_end + 1): ai_value = str_value(sheet.cell(row=row_number, column=ai_col).value) if ai_value.startswith(NOT_NEEDED_PREFIX): rows_to_move.append(row_number) @@ -737,11 +749,15 @@ def generate_notes_for_sheet( ai_workers: int = 1, github_workers: int = 1, checkpoint_callback: Callable[[int, int], None] | None = None, + start_row: int | None = None, + end_row: int | None = None, ) -> list[MarkdownEntry]: entries_by_row: dict[int, list[MarkdownEntry]] = {} + effective_start = start_row if start_row is not None else 2 + effective_end = end_row if end_row is not None else sheet.max_row row_inputs = [ build_row_input(sheet, header, row_number) - for row_number in range(2, sheet.max_row + 1) + for row_number in range(effective_start, effective_end + 1) ] rows_to_generate: list[RowInput] = [] @@ -807,9 +823,16 @@ def generate_notes_for_sheet( return entries -def generate_notes_without_ai(sheet: Any, header: dict[str, int]) -> list[MarkdownEntry]: +def generate_notes_without_ai( + sheet: Any, + header: dict[str, int], + start_row: int | None = None, + end_row: int | None = None, +) -> list[MarkdownEntry]: entries: list[MarkdownEntry] = [] - for row_number in range(2, sheet.max_row + 1): + effective_start = start_row if start_row is not None else 2 + effective_end = end_row if end_row is not None else sheet.max_row + 
for row_number in range(effective_start, effective_end + 1): row_input = build_row_input(sheet, header, row_number) dup_text = str_value(sheet.cell(row=row_number, column=header["published_release_notes"]).value) if dup_text: @@ -845,6 +868,67 @@ def generate_notes_without_ai(sheet: Any, header: dict[str, int]) -> list[Markdo return entries +def collect_markdown_entries_from_sheet( + sheet: Any, + header: dict[str, int], +) -> list[MarkdownEntry]: + """Collect MarkdownEntry items from a processed Excel sheet (Phase 2). + + Reads published_release_notes (dup) and release_notes_written_by_ai columns + to build the full entry list without calling AI. Falls back to + formated_release_note when the AI column is empty (e.g. when AI generation + was OFF in Phase 1). + """ + entries: list[MarkdownEntry] = [] + has_published_col = "published_release_notes" in header + for row_number in range(2, sheet.max_row + 1): + row_input = build_row_input(sheet, header, row_number) + + if has_published_col: + dup_text = str_value( + sheet.cell(row=row_number, column=header["published_release_notes"]).value + ) + if dup_text: + entries.extend(dup_entries_for_row(row_input, dup_text)) + continue + + ai_note = str_value( + sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value + ) + if ai_note and not ai_note.startswith("AI_GENERATION_FAILED:"): + if is_not_needed_note(ai_note): + continue + note_type = classify_note_type_from_text(ai_note, row_input.issue_type) + entries.append( + MarkdownEntry( + note_type or "improvement", + row_input.component, + ai_note, + row_input.raw_component, + ) + ) + continue + + formatted_notes = split_lines(row_input.formatted_release_note) + if not formatted_notes: + continue + note_type = classify_note_type_from_text( + row_input.formatted_release_note, row_input.issue_type, + ) + for note in formatted_notes: + entries.append( + MarkdownEntry( + note_type or "improvement", + row_input.component, + note, + row_input.raw_component, + 
) + ) + + print(f"Collected {len(entries)} Markdown entry/entries from processed Excel", flush=True) + return entries + + def dup_entries_for_row(row_input: RowInput, dup_text: str) -> list[MarkdownEntry]: entries: list[MarkdownEntry] = [] for dup_note in split_lines(dup_text): diff --git a/scripts/release-notes-ai-generator/release-notes-generator-readme.md b/scripts/release-notes-ai-generator/release-notes-generator-readme.md index 27f7897c8c83a..eca06d286d516 100644 --- a/scripts/release-notes-ai-generator/release-notes-generator-readme.md +++ b/scripts/release-notes-ai-generator/release-notes-generator-readme.md @@ -1,8 +1,13 @@ # Release notes generator -`python3 -m release-notes-ai-generator` (run from the `scripts/` directory) generates English TiDB release notes for the `Improvements` and `Bug fixes` sections according to PRs and issues in a Excel workbook. +`python3 -m release-notes-ai-generator` (run from the `scripts/` directory) generates English TiDB release notes for the `Improvements` and `Bug fixes` sections according to PRs and issues in an Excel workbook. -The generator keeps the source workbook unchanged, writes all processing results to a processed workbook, and renders the generated entries to a Markdown release note file. +The generator uses a two-phase workflow: + +1. **`generate`** (Phase 1): Processes the source Excel workbook — runs preprocessing, calls AI to generate release notes, and writes results back to Excel. Supports row-range arguments (`--start-row` / `--end-row`) for resuming after interruptions. +2. **`export-markdown`** (Phase 2): Reads the processed Excel and exports a Markdown release note file. Does not call AI or modify the Excel. + +The source workbook is never overwritten. All processing results are written to a processed workbook (`_processed.xlsx`). ## What it does @@ -60,57 +65,125 @@ The generator does not create a complete formal release note. 
It does not genera ## Typical usage examples -Use Codex to generate release notes: +The generator uses two subcommands that run independently: + +- `generate` (Phase 1): processes the Excel workbook, calls AI, writes results back to Excel. +- `export-markdown` (Phase 2): reads the processed Excel and outputs a Markdown file. + +### Phase 1: Generate release notes into Excel + +Use Azure OpenAI: ```bash cd scripts -python3 -m release-notes-ai-generator \ +python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ --excel /path/to/release-note-excel.xlsx \ - --releases-dir releases + --releases-dir releases \ + --ai-provider azure ``` -Use Azure OpenAI to generate release notes: +Use Codex CLI: ```bash cd scripts -python3 -m release-notes-ai-generator \ +python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ --excel /path/to/release-note-excel.xlsx \ + --releases-dir releases +``` + +### Phase 1: Resume from interruption + +If the first run is interrupted (e.g. API quota exhausted), resume from where it left off using `--start-row`: + +```bash +cd scripts +python3 -m release-notes-ai-generator generate \ + --version 8.5.7 \ + --excel /path/to/release-note-excel_processed.xlsx \ --releases-dir releases \ - --ai-provider azure + --ai-provider azure \ + --start-row 51 +``` + +You can also limit to a specific range with `--end-row`: + +```bash +python3 -m release-notes-ai-generator generate \ + --version 8.5.7 \ + --excel /path/to/release-note-excel_processed.xlsx \ + --releases-dir releases \ + --ai-provider azure \ + --start-row 51 --end-row 100 +``` + +When `--start-row` or `--end-row` is specified, preprocessing steps (sort, merge, scope filter, same-series move) are skipped because they were completed in the first run. 
+ +### Phase 2: Export Markdown from processed Excel + +After Phase 1 is fully complete, export the Markdown: + +```bash +cd scripts +python3 -m release-notes-ai-generator export-markdown \ + --version 8.5.7 \ + --excel /path/to/release-note-excel_processed.xlsx \ + --releases-dir releases \ + --release-date "August 14, 2025" ``` ## Option descriptions -| Option | Required | Default value | Usage example | Description | -| --- | --- | --- | --- | --- | -| `--version ` | Yes | None | `--version 8.5.7` | Target TiDB version. This value is used for scope filtering, existing release-note lookup, generated Markdown front matter, and the default output file name. | -| `--excel ` | Yes | None | `--excel /path/to/release-note-excel.xlsx` | Path to the source release note Excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx`. | -| `--releases-dir ` | Yes | None | `--releases-dir releases` | Path to the existing English release notes directory. The script scans this directory for historical release notes and writes the generated Markdown under this directory unless `--output-release-file` is specified. | -| `--sheet ` | No | `pr_for_release_note` | `--sheet pr_for_release_note` | Workbook sheet to process. | -| `--ai-provider ` | No | `codex` | `--ai-provider azure` | AI provider to use. `codex` runs the Codex CLI as a subprocess. `azure` calls Azure OpenAI via the OpenAI Python SDK (requires `AZURE_OPENAI_KEY` and `AZURE_OPENAI_BASE_URL` or `OPENAI_BASE_URL` environment variables). | -| `--ai-command ` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | `--ai-command "codex --ask-for-approval never exec --sandbox read-only --ephemeral"` | Command used to invoke the AI generator (only used with `--ai-provider codex`). The prompt is passed through standard input. When the command is `codex exec`, the script also passes `--output-schema` and `--output-last-message`. 
| -| `--ai-model ` | No | `gpt-5.4` | `--ai-model gpt-5.4` | Model name. Passed to `codex exec` with `-m`, or used as the model parameter for Azure OpenAI. | -| `--involve-ai-generation ` | No | `ON` | `--involve-ai-generation OFF` | Whether to generate non-duplicate release notes with AI. Use `ON` to invoke AI, or `OFF` to use the source `formated_release_note` values. | -| `--output-release-file ` | No | Conditional | `--output-release-file /path/to/release-8.5.7.md` | Write the generated Markdown to a custom path. By default, the output under `--releases-dir` is `release--updated-by-ai.md` if `release-.md` already exists, otherwise `release-.md`. | -| `--ai-timeout ` | No | `600` | `--ai-timeout 600` | Timeout in seconds for each AI command invocation. | -| `--ai-workers ` | No | `3` | `--ai-workers 3` | Number of concurrent AI command invocations. | -| `--github-workers ` | No | `8` | `--github-workers 8` | Number of concurrent GitHub API prefetch workers. | -| `--author-workers ` | No | `3` | `--author-workers 3` | Number of concurrent workers used to resolve bot-authored cherry-pick PR authors. | -| `--checkpoint-interval ` | No | `1` | `--checkpoint-interval 1` | Save the processed workbook after every N completed AI rows. Use `0` to disable checkpoint saves. | -| `--force-regenerate` | No | Disabled | `--force-regenerate` | Clear existing AI-generated notes in the processed workbook and generate all non-duplicate rows again. | -| `--release-date ` | No | `TBD` | `--release-date "August 14, 2025"` | Release date text for the generated Markdown header. | -| `--skip-scope-preprocess` | No | Disabled | `--skip-scope-preprocess` | Skip moving not-in-scope PR rows to the `PRs_not_in_scope` sheet. | -| `--scope-base-branch-start-date ` | No | Estimated from release history | `--scope-base-branch-start-date 2025-01-01` | Override the estimated release-m.n branch start date for x.y.0 scope preprocessing. The value must use the `YYYY-MM-DD` format. 
| +### `generate` subcommand options + +| Option | Required | Default value | Description | +| --- | --- | --- | --- | +| `--version ` | Yes | None | Target TiDB version. Used for scope filtering, existing release-note lookup, and the default output file name. | +| `--excel ` | Yes | None | Path to the source release note Excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx` (or the path specified by `--output-excel`). | +| `--releases-dir ` | Yes | None | Path to the existing English release notes directory. Used for historical release note scanning and scope filtering. | +| `--sheet ` | No | `pr_for_release_note` | Workbook sheet to process. | +| `--ai-provider ` | No | `codex` | AI provider to use. `codex` runs the Codex CLI as a subprocess. `azure` calls Azure OpenAI via the OpenAI Python SDK. | +| `--ai-command ` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | Command used to invoke the AI generator (only used with `--ai-provider codex`). | +| `--ai-model ` | No | `gpt-5.4` | Model name. Passed to `codex exec` with `-m`, or used as the model parameter for Azure OpenAI. | +| `--involve-ai-generation ` | No | `ON` | Whether to generate non-duplicate release notes with AI. Use `OFF` to skip AI generation and only run preprocessing. | +| `--ai-timeout ` | No | `600` | Timeout in seconds for each AI command invocation. | +| `--ai-workers ` | No | `3` | Number of concurrent AI command invocations. | +| `--github-workers ` | No | `8` | Number of concurrent GitHub API prefetch workers. | +| `--author-workers ` | No | `3` | Number of concurrent workers used to resolve bot-authored cherry-pick PR authors. | +| `--checkpoint-interval ` | No | `1` | Save the processed workbook after every N completed AI rows. Use `0` to disable. | +| `--force-regenerate` | No | Disabled | Clear existing AI-generated notes and regenerate all non-duplicate rows. 
| +| `--skip-scope-preprocess` | No | Disabled | Skip moving not-in-scope PR rows to the `PRs_not_in_scope` sheet. | +| `--scope-base-branch-start-date ` | No | Estimated from release history | Override the estimated release-m.n branch start date for x.y.0 scope preprocessing. | +| `--start-row ` | No | First data row | Excel row number to start processing from (1-indexed, row 1 is the header). When specified, preprocessing steps are skipped. Use this to resume after an interruption. | +| `--end-row ` | No | Last row | Excel row number to stop processing at (inclusive, 1-indexed). | +| `--output-excel ` | No | `_processed.xlsx` | Path for the processed Excel output. | + +### `export-markdown` subcommand options + +| Option | Required | Default value | Description | +| --- | --- | --- | --- | +| `--version ` | Yes | None | Target TiDB version. Used for the Markdown front matter and default output file name. | +| `--excel ` | Yes | None | Path to the processed Excel workbook (output of the `generate` phase). | +| `--sheet ` | No | `pr_for_release_note` | Workbook sheet to read entries from. | +| `--releases-dir ` | Yes | None | Path to the existing English release notes directory (used to determine the default output path). | +| `--output-release-file ` | No | Conditional | Output Markdown file. Defaults to `release--updated-by-ai.md` if `release-.md` already exists, otherwise `release-.md`. | +| `--release-date ` | No | `TBD` | Release date text for the generated Markdown header. | ## Generated files -- The source Excel file passed to `--excel` is not overwritten. -- The processed Excel file is written to `_processed.xlsx` next to the source workbook. +**Phase 1 (`generate`):** + +- The source Excel file passed to `--excel` is not overwritten (unless `--output-excel` points to the same file, which is useful for resume scenarios). +- The processed Excel file is written to `_processed.xlsx` next to the source workbook, or to the path specified by `--output-excel`. 
+- Rows where AI determines no release note is needed are moved to a separate `release_note_not_needed` sheet in the processed workbook. + +**Phase 2 (`export-markdown`):** + - The generated Markdown file is written to `--output-release-file` when that option is specified. - If `--output-release-file` is omitted and `release-.md` already exists under `--releases-dir`, the generated Markdown file is written to `release--updated-by-ai.md`. - If `--output-release-file` is omitted and `release-.md` does not exist under `--releases-dir`, the generated Markdown file is written to `release-.md`. +- The Excel workbook is not modified during this phase. ## Reference: processing rules From e01d6f4b1e7a06896bac12a2d2ba7cc3f8897bf5 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 16:11:17 +0800 Subject: [PATCH 11/22] fix issues --- .../release-notes-ai-generator/ai_client.py | 21 ++++++---- scripts/release-notes-ai-generator/cli.py | 6 +-- .../excel_workbook.py | 38 +++++++++++++++++-- .../github_client.py | 2 +- 4 files changed, 52 insertions(+), 15 deletions(-) diff --git a/scripts/release-notes-ai-generator/ai_client.py b/scripts/release-notes-ai-generator/ai_client.py index 0bd7b2d3f050c..6f1a58d62771b 100644 --- a/scripts/release-notes-ai-generator/ai_client.py +++ b/scripts/release-notes-ai-generator/ai_client.py @@ -112,6 +112,7 @@ class AzureOpenAIClient(AIClient): DEFAULT_MODEL = "gpt-5.4" MAX_OUTPUT_TOKENS = 16384 TEMPERATURE = 0.1 + REASONING_MODEL_PREFIXES = ("o1", "o3", "o4", "gpt-5") def __init__(self, model: str | None, timeout: int): from openai import OpenAI @@ -134,13 +135,19 @@ def __init__(self, model: str | None, timeout: int): self.client = OpenAI(api_key=key, base_url=base_url, timeout=timeout) self.model = model or self.DEFAULT_MODEL + def _is_reasoning_model(self) -> bool: + model_lower = self.model.lower() + return any(model_lower.startswith(p) for p in self.REASONING_MODEL_PREFIXES) + def _run(self, prompt: str) -> str: - response = 
self.client.responses.create( - model=self.model, - input=[{"role": "user", "content": prompt}], - temperature=self.TEMPERATURE, - max_output_tokens=self.MAX_OUTPUT_TOKENS, - ) + kwargs: dict[str, Any] = { + "model": self.model, + "input": [{"role": "user", "content": prompt}], + "max_output_tokens": self.MAX_OUTPUT_TOKENS, + } + if not self._is_reasoning_model(): + kwargs["temperature"] = self.TEMPERATURE + response = self.client.responses.create(**kwargs) return response.output_text.strip() @@ -261,7 +268,7 @@ def load_reference_file(path: Path) -> str: except FileNotFoundError as exc: raise FileNotFoundError( f"Cannot find release-note reference file: {path}. " - "Make sure the repo-local write-review-translate-release-notes skill is present." + "Make sure .ai/skills/write-review-translate-release-notes/references/ exists." ) from exc diff --git a/scripts/release-notes-ai-generator/cli.py b/scripts/release-notes-ai-generator/cli.py index f2d68655e070e..f5e82ebf48f69 100644 --- a/scripts/release-notes-ai-generator/cli.py +++ b/scripts/release-notes-ai-generator/cli.py @@ -280,7 +280,7 @@ def run_generate(args: argparse.Namespace) -> int: header = prepare_sheet_columns(sheet) if args.force_regenerate: clear_output_columns( - sheet, header, clear_ai=True, + sheet, header, clear_ai=True, clear_published=False, start_row=start_row, end_row=end_row, ) else: @@ -341,7 +341,7 @@ def run_generate(args: argparse.Namespace) -> int: ) save_workbook_safely(workbook, processed_excel_path) - print(f"Phase 1 (generate) completed.", flush=True) + print("Phase 1 (generate) completed.", flush=True) print(f" Input Excel: {excel_path}", flush=True) print(f" Processed Excel: {processed_excel_path}", flush=True) print( @@ -377,7 +377,7 @@ def run_export_markdown(args: argparse.Namespace) -> int: write_release_file(output_file, args.version, args.release_date, markdown_entries) - print(f"Phase 2 (export-markdown) completed.", flush=True) + print("Phase 2 (export-markdown) completed.", 
flush=True) print(f" Input Excel: {excel_path}", flush=True) print(f" Generated release note file: {output_file}", flush=True) return 0 diff --git a/scripts/release-notes-ai-generator/excel_workbook.py b/scripts/release-notes-ai-generator/excel_workbook.py index d2ac849a6c3ab..16d8a1cf25674 100644 --- a/scripts/release-notes-ai-generator/excel_workbook.py +++ b/scripts/release-notes-ai-generator/excel_workbook.py @@ -61,6 +61,12 @@ def prepare_sheet_columns(sheet: Any) -> dict[str, int]: sheet.cell(row=1, column=formatted_col + 1, value="release_notes_written_by_ai") header = get_header(sheet) + if "ai_note_type" not in header: + ai_col_index = header["release_notes_written_by_ai"] + sheet.insert_cols(ai_col_index + 1) + sheet.cell(row=1, column=ai_col_index + 1, value="ai_note_type") + header = get_header(sheet) + if "published_release_notes" not in header: last_col = sheet.max_column sheet.cell(row=1, column=last_col + 1, value="published_release_notes") @@ -80,6 +86,7 @@ def clear_output_columns( sheet: Any, header: dict[str, int], clear_ai: bool = True, + clear_published: bool = True, start_row: int | None = None, end_row: int | None = None, ) -> None: @@ -88,7 +95,10 @@ def clear_output_columns( for row_number in range(effective_start, effective_end + 1): if clear_ai: sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None - sheet.cell(row=row_number, column=header["published_release_notes"]).value = None + if "ai_note_type" in header: + sheet.cell(row=row_number, column=header["ai_note_type"]).value = None + if clear_published: + sheet.cell(row=row_number, column=header["published_release_notes"]).value = None def sort_sheet_rows_by_component(sheet: Any) -> None: @@ -231,7 +241,7 @@ def should_skip_release_file(file_path: Path, target_version: tuple[int, int, in return True file_version = release_file_semver_tuple(file_path) if not file_version: - return False + return True return file_version >= target_version @@ -781,7 +791,14 
@@ def generate_notes_for_sheet( if is_not_needed_note(existing_note): print(f"Row {row_number}: skipped existing not-needed verdict", flush=True) continue - note_type = classify_note_type_from_text(existing_note, row_input.issue_type) + persisted_type = str_value( + sheet.cell(row=row_number, column=header["ai_note_type"]).value + ) if "ai_note_type" in header else "" + note_type = ( + persisted_type + if persisted_type in {"improvement", "bug_fix"} + else classify_note_type_from_text(existing_note, row_input.issue_type) + ) entries_by_row[row_number] = [ MarkdownEntry( note_type or "improvement", @@ -898,7 +915,16 @@ def collect_markdown_entries_from_sheet( if ai_note and not ai_note.startswith("AI_GENERATION_FAILED:"): if is_not_needed_note(ai_note): continue - note_type = classify_note_type_from_text(ai_note, row_input.issue_type) + persisted_type = "" + if "ai_note_type" in header: + persisted_type = str_value( + sheet.cell(row=row_number, column=header["ai_note_type"]).value + ) + note_type = ( + persisted_type + if persisted_type in {"improvement", "bug_fix"} + else classify_note_type_from_text(ai_note, row_input.issue_type) + ) entries.append( MarkdownEntry( note_type or "improvement", @@ -1104,8 +1130,11 @@ def apply_generation_result( ) return + type_cell = sheet.cell(row=result.row_number, column=header["ai_note_type"]) + if result.note_type == "not_needed": ai_cell.value = result.note + type_cell.value = "not_needed" print( f"Row {result.row_number}: {result.note}", flush=True, @@ -1113,6 +1142,7 @@ def apply_generation_result( return ai_cell.value = result.note + type_cell.value = result.note_type entries_by_row[result.row_number] = [ MarkdownEntry(result.note_type, result.component, result.note, result.raw_component) ] diff --git a/scripts/release-notes-ai-generator/github_client.py b/scripts/release-notes-ai-generator/github_client.py index f0f4d1b5e2ff2..7eb824adf7c09 100644 --- a/scripts/release-notes-ai-generator/github_client.py +++ 
b/scripts/release-notes-ai-generator/github_client.py @@ -149,7 +149,7 @@ def get_pull_files_summary( lines: list[str] = [] page = 1 total_chars = 0 - while len(lines) < max_files: + while len(lines) < max_files and total_chars < max_total_chars: files = self.get_api_json( f"/repos/{owner}/{repo}/pulls/{number}/files", params={"per_page": 100, "page": page}, From d69077007d2e430258c0358cf13f521424eb4eab Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 16:33:35 +0800 Subject: [PATCH 12/22] limit the input size --- .../release-notes-ai-generator/ai_client.py | 53 ++++++++++++++++++- .../excel_workbook.py | 32 +++++++++++ .../github_client.py | 4 +- 3 files changed, 85 insertions(+), 4 deletions(-) diff --git a/scripts/release-notes-ai-generator/ai_client.py b/scripts/release-notes-ai-generator/ai_client.py index 6f1a58d62771b..7139bcbd50858 100644 --- a/scripts/release-notes-ai-generator/ai_client.py +++ b/scripts/release-notes-ai-generator/ai_client.py @@ -4,6 +4,7 @@ from functools import lru_cache import json import os +import re import shlex import shutil import subprocess @@ -193,8 +194,8 @@ def build_generation_prompt( contributors: list[str], ) -> str: prompt_template = load_prompt_template(GENERATION_PROMPT_TEMPLATE) - improvements_reference = load_reference_file(IMPROVEMENTS_REFERENCE) - bug_fixes_reference = load_reference_file(BUG_FIXES_REFERENCE) + improvements_reference = load_english_reference_file(IMPROVEMENTS_REFERENCE) + bug_fixes_reference = load_english_reference_file(BUG_FIXES_REFERENCE) context = { "row_number": row_context.row_number, "component": row_context.component, @@ -272,6 +273,54 @@ def load_reference_file(path: Path) -> str: ) from exc +@lru_cache(maxsize=None) +def load_english_reference_file(path: Path) -> str: + """Load a shared reference file and drop its Chinese-only sections. + + The reference files are shared with the write-review-translate-release-notes + skill and document both English and Chinese styles. 
This script only generates + English release notes, so the Chinese sections are stripped before they are + injected into the AI prompt to reduce input size. The source files are left + unchanged. + """ + return strip_non_english_sections(load_reference_file(path)) + + +HEADING_RE = re.compile(r"^(#{1,6})\s+(.*\S)\s*$") +CHINESE_LIST_ITEM_RE = re.compile(r"^\s*[-*+]\s+Chinese\b", re.IGNORECASE) + + +def strip_non_english_sections(markdown: str) -> str: + """Remove any heading whose title mentions "Chinese" and its nested content. + + A section is dropped from its heading line up to (but excluding) the next + heading of the same or higher level, which also removes nested subsections + such as the Chinese opening-verbs table. Table-of-contents list items that + start with "Chinese" are also dropped so the contents no longer point to + removed sections. + """ + result: list[str] = [] + skip_level: int | None = None + for line in markdown.splitlines(): + match = HEADING_RE.match(line) + if match: + level = len(match.group(1)) + if skip_level is not None and level > skip_level: + continue # still inside the skipped section + skip_level = None # a sibling or ancestor heading ends the skip + if "chinese" in match.group(2).lower(): + skip_level = level + continue + result.append(line) + continue + if skip_level is not None: + continue + if CHINESE_LIST_ITEM_RE.match(line): + continue + result.append(line) + return re.sub(r"\n{3,}", "\n\n", "\n".join(result)).strip() + "\n" + + def extract_json_object(output: str) -> dict[str, Any]: output = output.strip() if not output: diff --git a/scripts/release-notes-ai-generator/excel_workbook.py b/scripts/release-notes-ai-generator/excel_workbook.py index 16d8a1cf25674..6b3f52b2be94b 100644 --- a/scripts/release-notes-ai-generator/excel_workbook.py +++ b/scripts/release-notes-ai-generator/excel_workbook.py @@ -1,6 +1,7 @@ from __future__ import annotations import copy +import dataclasses import re import sys from concurrent.futures 
import ThreadPoolExecutor, as_completed @@ -45,6 +46,9 @@ GRAY_FILL = PatternFill(start_color="D3D3D3", end_color="D3D3D3", fill_type="solid") NOT_NEEDED_PREFIX = "Release note is not needed:" SAME_SERIES_REASON_HEADER = "reason" +# Global cap on the combined changed-file diff (files_summary) across all PRs of a +# single row, to bound the AI input size when a row references multiple PRs. +MAX_ROW_FILES_SUMMARY_CHARS = 40000 def prepare_sheet_columns(sheet: Any) -> dict[str, int]: @@ -1091,6 +1095,7 @@ def build_row_context_from_cache(row_input: RowInput, github_cache: GitHubDataCa pulls.append(pull) if pull.author: pr_authors.append(pull.author) + pulls = cap_pull_file_summaries(pulls, MAX_ROW_FILES_SUMMARY_CHARS) return RowContext( row_number=row_input.row_number, component=row_input.component, @@ -1106,6 +1111,33 @@ def build_row_context_from_cache(row_input: RowInput, github_cache: GitHubDataCa ) +def cap_pull_file_summaries(pulls: list[Any], budget: int) -> list[Any]: + """Truncate the combined files_summary across a row's PRs to ``budget`` chars. + + PullInfo objects come from the shared GitHub cache and may be referenced by + multiple rows, so truncation returns copies (via dataclasses.replace) instead + of mutating the cached objects in place. 
+ """ + capped: list[Any] = [] + remaining = budget + for pull in pulls: + summary = pull.files_summary or "" + if len(summary) <= remaining: + capped.append(pull) + remaining -= len(summary) + continue + if remaining <= 0: + truncated = "...[changed-file information omitted to limit input size]" + else: + truncated = ( + summary[:remaining] + + "\n...[changed-file information truncated to limit input size]" + ) + capped.append(dataclasses.replace(pull, files_summary=truncated)) + remaining = 0 + return capped + + def apply_generation_result( sheet: Any, header: dict[str, int], diff --git a/scripts/release-notes-ai-generator/github_client.py b/scripts/release-notes-ai-generator/github_client.py index 7eb824adf7c09..c96ac7af86fc9 100644 --- a/scripts/release-notes-ai-generator/github_client.py +++ b/scripts/release-notes-ai-generator/github_client.py @@ -142,9 +142,9 @@ def get_pull_files_summary( owner: str, repo: str, number: str, - max_files: int = 80, + max_files: int = 40, max_patch_chars: int = 1200, - max_total_chars: int = 60000, + max_total_chars: int = 20000, ) -> str: lines: list[str] = [] page = 1 From 48dc7f75c0dad22b512295bf79c9c38b5ec40bb5 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 17:01:50 +0800 Subject: [PATCH 13/22] fix issues --- .../release-notes-ai-generator/ai_client.py | 1 + scripts/release-notes-ai-generator/cli.py | 39 ++++++++++++++----- .../excel_workbook.py | 17 +++++++- scripts/release-notes-ai-generator/models.py | 4 ++ .../prompts/generation.md | 5 +++ .../release-notes-generator-readme.md | 19 +++++---- 6 files changed, 65 insertions(+), 20 deletions(-) diff --git a/scripts/release-notes-ai-generator/ai_client.py b/scripts/release-notes-ai-generator/ai_client.py index 7139bcbd50858..7cf029e1ea348 100644 --- a/scripts/release-notes-ai-generator/ai_client.py +++ b/scripts/release-notes-ai-generator/ai_client.py @@ -207,6 +207,7 @@ def build_generation_prompt( "contributors": contributors, "issues": 
[dataclasses.asdict(issue) for issue in row_context.issues], "pull_requests": [dataclasses.asdict(pull) for pull in row_context.pulls], + "fetch_failed_urls": row_context.fetch_failed_urls, } return render_prompt_template( prompt_template, diff --git a/scripts/release-notes-ai-generator/cli.py b/scripts/release-notes-ai-generator/cli.py index f5e82ebf48f69..751408ba949a4 100644 --- a/scripts/release-notes-ai-generator/cli.py +++ b/scripts/release-notes-ai-generator/cli.py @@ -204,8 +204,10 @@ def add_export_markdown_args(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--output-release-file", help=( - "Output Markdown file. Defaults to release-{version}-updated-by-ai.md " - "if release-{version}.md already exists, otherwise release-{version}.md." + "Output Markdown file. Defaults to release-{version}-updated-by-ai.md. " + "The default never writes the canonical release-{version}.md, because " + "the generator only produces Improvements and Bug fixes, not a complete " + "release note." ), ) parser.add_argument( @@ -270,6 +272,14 @@ def run_generate(args: argparse.Namespace) -> int: raise ValueError(f"Cannot find sheet {args.sheet!r} in {args.excel}") sheet = workbook[args.sheet] + if end_row is not None and end_row > sheet.max_row: + print( + f"--end-row {end_row} exceeds the last row ({sheet.max_row}); " + f"clamping to {sheet.max_row} to avoid materializing blank rows", + flush=True, + ) + end_row = sheet.max_row + if row_range_specified: print( f"Row range specified: processing rows " @@ -336,9 +346,18 @@ def run_generate(args: argparse.Namespace) -> int: sheet, header, start_row=start_row, end_row=end_row, ) - move_not_needed_rows_to_sheet( - workbook, sheet, header, start_row=start_row, end_row=end_row, - ) + if row_range_specified: + # Moving (deleting) not-needed rows would shift the row numbers of later + # rows, breaking the stable-row-number contract that segmented resume + # relies on. 
Leave them in place; they are still excluded from Markdown + # because their AI note starts with the not-needed prefix. + print( + "Row range specified: keeping not-needed rows in place to preserve " + "row numbers for resume (they are still excluded from Markdown)", + flush=True, + ) + else: + move_not_needed_rows_to_sheet(workbook, sheet, header) save_workbook_safely(workbook, processed_excel_path) print("Phase 1 (generate) completed.", flush=True) @@ -396,10 +415,12 @@ def parse_on_off(value: str) -> str: def default_output_release_file(releases_dir: Path, version: str) -> Path: - release_file = releases_dir / f"release-{version}.md" - if release_file.is_file(): - return releases_dir / f"release-{version}-updated-by-ai.md" - return release_file + # Always write to the "-updated-by-ai" name, never the canonical + # release-.md. The generator only produces Improvements and Bug + # fixes, not a complete formal release note, so the default output must not + # be mistaken for the official file. This name is also skipped by the + # historical-note scanner, so a re-run never treats the draft as published. + return releases_dir / f"release-{version}-updated-by-ai.md" def default_processed_excel_path(excel_path: Path) -> Path: diff --git a/scripts/release-notes-ai-generator/excel_workbook.py b/scripts/release-notes-ai-generator/excel_workbook.py index 6b3f52b2be94b..406c1c5038ea5 100644 --- a/scripts/release-notes-ai-generator/excel_workbook.py +++ b/scripts/release-notes-ai-generator/excel_workbook.py @@ -939,6 +939,13 @@ def collect_markdown_entries_from_sheet( ) continue + # A failed AI row must not fall back to formated_release_note: that text + # is an unvalidated draft or placeholder. Skip it so failed rows are not + # rendered to Markdown. The fallback below is only for rows with no AI + # note at all (e.g. --involve-ai-generation OFF). 
+ if ai_note.startswith("AI_GENERATION_FAILED:"): + continue + formatted_notes = split_lines(row_input.formatted_release_note) if not formatted_notes: continue @@ -1011,6 +1018,7 @@ def prefetch_github_data(row_inputs: list[RowInput], github: Any, github_workers pr_urls = unique_ordered(url for row_input in row_inputs for url in row_input.pr_urls) issues = {} pulls = {} + failed_urls: set[str] = set() if not issue_urls and not pr_urls: return GitHubDataCache(issues=issues, pulls=pulls) @@ -1038,12 +1046,13 @@ def prefetch_github_data(row_inputs: list[RowInput], github: Any, github_workers data = future.result() except Exception as exc: # noqa: BLE001 print(f"Failed to prefetch GitHub {item_type} {url}: {exc}", file=sys.stderr, flush=True) + failed_urls.add(url) continue if item_type == "issue": issues[url] = data else: pulls[url] = data - return GitHubDataCache(issues=issues, pulls=pulls) + return GitHubDataCache(issues=issues, pulls=pulls, failed_urls=failed_urls) def generate_note_for_row( @@ -1096,6 +1105,11 @@ def build_row_context_from_cache(row_input: RowInput, github_cache: GitHubDataCa if pull.author: pr_authors.append(pull.author) pulls = cap_pull_file_summaries(pulls, MAX_ROW_FILES_SUMMARY_CHARS) + fetch_failed_urls = [ + url + for url in (*row_input.issue_urls, *row_input.pr_urls) + if url in github_cache.failed_urls + ] return RowContext( row_number=row_input.row_number, component=row_input.component, @@ -1108,6 +1122,7 @@ def build_row_context_from_cache(row_input: RowInput, github_cache: GitHubDataCa formatted_release_note=row_input.formatted_release_note, issues=issues, pulls=pulls, + fetch_failed_urls=fetch_failed_urls, ) diff --git a/scripts/release-notes-ai-generator/models.py b/scripts/release-notes-ai-generator/models.py index 7e89853cb3202..d22963460d097 100644 --- a/scripts/release-notes-ai-generator/models.py +++ b/scripts/release-notes-ai-generator/models.py @@ -60,6 +60,7 @@ class RowContext: formatted_release_note: str issues: 
list[IssueInfo] pulls: list[PullInfo] + fetch_failed_urls: list[str] = dataclasses.field(default_factory=list) @dataclasses.dataclass @@ -79,6 +80,9 @@ class RowInput: class GitHubDataCache: issues: dict[str, IssueInfo] pulls: dict[str, PullInfo] + # URLs whose GitHub data could not be fetched during prefetch, so the AI + # generates without their issue/PR body and diff. + failed_urls: set[str] = dataclasses.field(default_factory=set) @dataclasses.dataclass diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index 66b3526acdbc0..0dfa06070554d 100644 --- a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -74,6 +74,11 @@ About `formatted_release_note_from_excel`: - This field can also contain a draft release note written by the code PR author. In that case, use the draft as an important reference for the final release note, but verify and refine it against the PR code changes first and the issue description second. - Do not copy the draft blindly. Preserve its useful user-facing intent, correct unclear or inaccurate wording, and still follow all release-note style rules above. +About `fetch_failed_urls`: + +- This field lists issue or PR links whose GitHub data (title, body, labels, and changed files) could not be fetched, so the context for those links is missing. +- When it is non-empty, rely on the Excel fields (`pr_title_from_excel`, `formatted_release_note_from_excel`, `issue_type_from_excel`) to draft the note, and set `needs_review` to true. 
+ Improvements reference: {{IMPROVEMENTS_REFERENCE}} diff --git a/scripts/release-notes-ai-generator/release-notes-generator-readme.md b/scripts/release-notes-ai-generator/release-notes-generator-readme.md index eca06d286d516..9b341457113db 100644 --- a/scripts/release-notes-ai-generator/release-notes-generator-readme.md +++ b/scripts/release-notes-ai-generator/release-notes-generator-readme.md @@ -79,7 +79,7 @@ cd scripts python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ --excel /path/to/release-note-excel.xlsx \ - --releases-dir releases \ + --releases-dir ../releases \ --ai-provider azure ``` @@ -90,7 +90,7 @@ cd scripts python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ --excel /path/to/release-note-excel.xlsx \ - --releases-dir releases + --releases-dir ../releases ``` ### Phase 1: Resume from interruption @@ -102,7 +102,7 @@ cd scripts python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ --excel /path/to/release-note-excel_processed.xlsx \ - --releases-dir releases \ + --releases-dir ../releases \ --ai-provider azure \ --start-row 51 ``` @@ -113,7 +113,7 @@ You can also limit to a specific range with `--end-row`: python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ --excel /path/to/release-note-excel_processed.xlsx \ - --releases-dir releases \ + --releases-dir ../releases \ --ai-provider azure \ --start-row 51 --end-row 100 ``` @@ -129,7 +129,7 @@ cd scripts python3 -m release-notes-ai-generator export-markdown \ --version 8.5.7 \ --excel /path/to/release-note-excel_processed.xlsx \ - --releases-dir releases \ + --releases-dir ../releases \ --release-date "August 14, 2025" ``` @@ -167,7 +167,7 @@ python3 -m release-notes-ai-generator export-markdown \ | `--excel ` | Yes | None | Path to the processed Excel workbook (output of the `generate` phase). | | `--sheet ` | No | `pr_for_release_note` | Workbook sheet to read entries from. 
| | `--releases-dir ` | Yes | None | Path to the existing English release notes directory (used to determine the default output path). | -| `--output-release-file ` | No | Conditional | Output Markdown file. Defaults to `release--updated-by-ai.md` if `release-.md` already exists, otherwise `release-.md`. | +| `--output-release-file ` | No | `release--updated-by-ai.md` | Output Markdown file. The default never writes the canonical `release-.md`, because the generator produces only `Improvements` and `Bug fixes`, not a complete release note. | | `--release-date ` | No | `TBD` | Release date text for the generated Markdown header. | ## Generated files @@ -176,13 +176,12 @@ python3 -m release-notes-ai-generator export-markdown \ - The source Excel file passed to `--excel` is not overwritten (unless `--output-excel` points to the same file, which is useful for resume scenarios). - The processed Excel file is written to `_processed.xlsx` next to the source workbook, or to the path specified by `--output-excel`. -- Rows where AI determines no release note is needed are moved to a separate `release_note_not_needed` sheet in the processed workbook. +- Rows where AI determines no release note is needed are moved to a separate `release_note_not_needed` sheet in the processed workbook. This move is skipped when `--start-row` or `--end-row` is used, so that deleting rows does not shift the row numbers a later segment relies on; such rows stay in the main sheet but are still excluded from Markdown. **Phase 2 (`export-markdown`):** - The generated Markdown file is written to `--output-release-file` when that option is specified. -- If `--output-release-file` is omitted and `release-.md` already exists under `--releases-dir`, the generated Markdown file is written to `release--updated-by-ai.md`. -- If `--output-release-file` is omitted and `release-.md` does not exist under `--releases-dir`, the generated Markdown file is written to `release-.md`. 
+- If `--output-release-file` is omitted, the generated Markdown file is written to `release--updated-by-ai.md` under `--releases-dir`. The default never overwrites the canonical `release-.md`, because the generated file is an incomplete draft (only `Improvements` and `Bug fixes`). - The Excel workbook is not modified during this phase. ## Reference: processing rules @@ -278,7 +277,7 @@ This separation is intentional. If the same issue appears again in the same majo For target version `8.5.7`, the same-series quarantine sheet is named: ```text -issue_already_in_earlier_v8.5.x +issue_already_in_earlier_v8.5 ``` A row is moved to this sheet when all of the following are true: From 0843e06a5261ce783458cbec607e60edf6296ee5 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 19:21:36 +0800 Subject: [PATCH 14/22] refine generation.md --- .../release-notes-ai-generator/ai_client.py | 71 +------- .../release-notes-ai-generator/constants.py | 16 -- .../prompts/generation.md | 166 ++++++++++++++---- 3 files changed, 133 insertions(+), 120 deletions(-) diff --git a/scripts/release-notes-ai-generator/ai_client.py b/scripts/release-notes-ai-generator/ai_client.py index 7cf029e1ea348..badccc8ac238e 100644 --- a/scripts/release-notes-ai-generator/ai_client.py +++ b/scripts/release-notes-ai-generator/ai_client.py @@ -4,7 +4,6 @@ from functools import lru_cache import json import os -import re import shlex import shutil import subprocess @@ -13,11 +12,7 @@ from pathlib import Path from typing import Any -from .constants import ( - BUG_FIXES_REFERENCE, - GENERATION_PROMPT_TEMPLATE, - IMPROVEMENTS_REFERENCE, -) +from .constants import GENERATION_PROMPT_TEMPLATE from .models import GeneratedNote, RowContext @@ -146,6 +141,7 @@ def _run(self, prompt: str) -> str: "input": [{"role": "user", "content": prompt}], "max_output_tokens": self.MAX_OUTPUT_TOKENS, } + print(prompt) if not self._is_reasoning_model(): kwargs["temperature"] = self.TEMPERATURE response = 
self.client.responses.create(**kwargs) @@ -194,8 +190,6 @@ def build_generation_prompt( contributors: list[str], ) -> str: prompt_template = load_prompt_template(GENERATION_PROMPT_TEMPLATE) - improvements_reference = load_english_reference_file(IMPROVEMENTS_REFERENCE) - bug_fixes_reference = load_english_reference_file(BUG_FIXES_REFERENCE) context = { "row_number": row_context.row_number, "component": row_context.component, @@ -215,8 +209,6 @@ def build_generation_prompt( "EXPECTED_LINKS": json.dumps(expected_links, ensure_ascii=False, indent=2), "CONTRIBUTORS": json.dumps(contributors, ensure_ascii=False, indent=2), "ROW_CONTEXT": json.dumps(context, ensure_ascii=False, indent=2), - "IMPROVEMENTS_REFERENCE": improvements_reference, - "BUG_FIXES_REFERENCE": bug_fixes_reference, }, ) @@ -263,65 +255,6 @@ def strip_prompt_template_heading(template: str) -> str: return "\n".join(lines) -@lru_cache(maxsize=None) -def load_reference_file(path: Path) -> str: - try: - return path.read_text(encoding="utf-8") - except FileNotFoundError as exc: - raise FileNotFoundError( - f"Cannot find release-note reference file: {path}. " - "Make sure .ai/skills/write-review-translate-release-notes/references/ exists." - ) from exc - - -@lru_cache(maxsize=None) -def load_english_reference_file(path: Path) -> str: - """Load a shared reference file and drop its Chinese-only sections. - - The reference files are shared with the write-review-translate-release-notes - skill and document both English and Chinese styles. This script only generates - English release notes, so the Chinese sections are stripped before they are - injected into the AI prompt to reduce input size. The source files are left - unchanged. 
- """ - return strip_non_english_sections(load_reference_file(path)) - - -HEADING_RE = re.compile(r"^(#{1,6})\s+(.*\S)\s*$") -CHINESE_LIST_ITEM_RE = re.compile(r"^\s*[-*+]\s+Chinese\b", re.IGNORECASE) - - -def strip_non_english_sections(markdown: str) -> str: - """Remove any heading whose title mentions "Chinese" and its nested content. - - A section is dropped from its heading line up to (but excluding) the next - heading of the same or higher level, which also removes nested subsections - such as the Chinese opening-verbs table. Table-of-contents list items that - start with "Chinese" are also dropped so the contents no longer point to - removed sections. - """ - result: list[str] = [] - skip_level: int | None = None - for line in markdown.splitlines(): - match = HEADING_RE.match(line) - if match: - level = len(match.group(1)) - if skip_level is not None and level > skip_level: - continue # still inside the skipped section - skip_level = None # a sibling or ancestor heading ends the skip - if "chinese" in match.group(2).lower(): - skip_level = level - continue - result.append(line) - continue - if skip_level is not None: - continue - if CHINESE_LIST_ITEM_RE.match(line): - continue - result.append(line) - return re.sub(r"\n{3,}", "\n\n", "\n".join(result)).strip() + "\n" - - def extract_json_object(output: str) -> dict[str, Any]: output = output.strip() if not output: diff --git a/scripts/release-notes-ai-generator/constants.py b/scripts/release-notes-ai-generator/constants.py index 3220aa9fcfc2f..160f214b4f521 100644 --- a/scripts/release-notes-ai-generator/constants.py +++ b/scripts/release-notes-ai-generator/constants.py @@ -5,22 +5,6 @@ REPO_ROOT = Path(__file__).resolve().parents[2] -IMPROVEMENTS_REFERENCE = ( - REPO_ROOT - / ".ai" - / "skills" - / "write-review-translate-release-notes" - / "references" - / "improvements.md" -) -BUG_FIXES_REFERENCE = ( - REPO_ROOT - / ".ai" - / "skills" - / "write-review-translate-release-notes" - / "references" - / 
"bug-fixes.md" -) GENERATION_PROMPT_TEMPLATE = ( REPO_ROOT / "scripts" / "release-notes-ai-generator" / "prompts" / "generation.md" ) diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index 0dfa06070554d..1b480324ea57c 100644 --- a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -4,7 +4,7 @@ You are a senior technical writer who has profound knowledge of TiDB. Your task is to evaluate whether a TiDB issue or PR needs a release note. -- If yes, write exactly one English release note entry for it. +- If yes, write exactly **one** English release note entry for it. - If not, return a "Release note is not needed" verdict and a short reason. ## Step 1: Determine whether a release note is needed @@ -31,35 +31,87 @@ Not every PR or change warrants a release note. Before writing, determine whethe If a PR is mostly internal but the outcome is user-visible, write a release note that describes the outcome and omit the implementation details. If the only user-facing effect is indirect or speculative, lean toward returning a "not_needed" verdict. -## Step 2: Return your result +## Writing style guide -Return only a JSON object with exactly these keys: +The rules below define the wording, opening verbs, and single-entry style. Use the Improvements style when the type is `improvement`, and the Bug fixes style when the type is `bug_fix`. You output exactly one entry — never section headers, component groups, or more than one bullet. -- type: "improvement", "bug_fix", or "not_needed" -- release_note: one Markdown bullet that starts with "- " (when type is "improvement" or "bug_fix"), or "Release note is not needed: " (when type is "not_needed") -- needs_review: true or false -- reason: a short reason for the type and wording +### Improvements style -When type is "not_needed", use a short reason in the release_note field. 
Examples: -- "Release note is not needed: test-only change" -- "Release note is not needed: internal refactor, no user-visible effect" -- "Release note is not needed: flaky test fix" -- "Release note is not needed: added internal debug logging" +Lead with an action verb. Do not start with "This improves..." or "The X now supports...". -## Rules (apply only when writing a release note) +State the user benefit explicitly. Explain why the change matters in terms of performance, stability, or capability. For example, instead of "Not use the stale read request's `start_ts` to update `max_ts`," write "Avoid excessive commit request retrying by not using the Stale Read request's `start_ts` to update `max_ts`." -- Write from the user's perspective. -- Use the Excel issue_type as a strong signal, but decide the final type from the issue, PR description, and code changes. -- For improvements, follow the Improvements reference below. -- For bug fixes, follow the Bug fixes reference below. -- Do not end the release note with a period. -- Include every expected link in Markdown release-note style. -- Include every contributor as @[user](https://github.com/user). -- If there is no issue URL, use the PR link as the suffix link. -- Do not expose internal function names unless they are the user-visible behavior. -- If the available context is insufficient, still draft the best note and set needs_review to true. +Metric claims are encouraged when sourced, such as "up to 10 times performance improvement" or "improves performance by up to 62.5%". 
+ +Opening verbs: + +| Verb | When to use | +|------|-------------| +| `Support` | New capability: ```Support casting the `STRING` type to the `DOUBLE` type``` | +| `Add` | New element or mechanism: `Add a timeout mechanism for LDAP authentication` | +| `Optimize` | Algorithmic improvement: `Optimize the non-joined data in right outer join using multiple threads` | +| `Improve` | General improvement: `Improve the MySQL compatibility of ...` | +| `Avoid` | Eliminate a problem: `Avoid excessive commit request retrying by ...` | +| `Enhance` | Capability expansion | +| `Mitigate` | Risk or stability improvement | +| `Accelerate` | Speed improvement | +| `Remove` | Cleanup or deprecation | +| `Increase` | Raise a limit or capacity | + +Examples: + +``` +- Improve the MySQL compatibility of expression default values displayed in the output of `SHOW CREATE TABLE` [#52939](https://github.com/pingcap/tidb/issues/52939) @[CbcWestwolf](https://github.com/CbcWestwolf) +- Support adding multiple indexes concurrently in the ingest mode [#52596](https://github.com/pingcap/tidb/issues/52596) @[lance6716](https://github.com/lance6716) +- Add a timeout mechanism for LDAP authentication to avoid the issue of resource lock (RLock) not being released in time [#51883](https://github.com/pingcap/tidb/issues/51883) @[YangKeao](https://github.com/YangKeao) +- Avoid performing IO operations on snapshot files in Raftstore threads to improve TiKV stability [#16564](https://github.com/tikv/tikv/issues/16564) @[Connor1996](https://github.com/Connor1996) +- Improve the performance of adding indexes with `tidb_ddl_enable_fast_reorg` enabled. In internal tests, v7.5.0 improves the performance by up to 62.5% compared with v6.5.0 [#47757](https://github.com/pingcap/tidb/issues/47757) @[tangenta](https://github.com/tangenta) +``` + +Put SQL functions in backtick ALL CAPS with parentheses (`` `DATE()` ``, not `date()`) and SQL keywords in backtick ALL CAPS (`` `HAVING` ``, not `having`). 
+ +### Bug fixes style + +Lead with a fix verb phrase. Use the following accepted patterns, listed roughly by frequency in published v6.1+ notes: + +- `Fix the issue that [subject] [verb phrase]` (dominant modern pattern) +- `Fix the issue of [noun phrase] that occurs when/during [condition]` (result-first phrasing) +- `Fix the issue of [noun phrase]` (noun-centric, no trigger clause) +- `Fix the [incorrect/inaccurate] [noun]` (standalone, for example, `Fix the incorrect error message ...`) +- `Fix a [rare/potential] issue that [description]` (rare or non-deterministic bugs) +- `Fix the potential/occasional [panic/crash] that occurs when [condition]` (specific crash scenarios) +- `Fix the panic issue caused by [X]` (panic identified by cause) + +A complete entry should include the trigger condition (when it happens) and the observed impact (what the user sees), and optionally a workaround. Wrap exact error messages in backticks. + +For non-deterministic failures, both `might` and `potential` are acceptable: use `might` as an inline modal verb (`Fix the issue that TiDB might crash when ...`) and `potential` as an adjective before a noun (`Fix the potential panic that occurs when ...`). Do not use `may` or `could`. 
+ +Examples: + +``` +- Fix the issue that executing SQL statements containing tables with multi-valued indexes might return the `Can't find a proper physical plan for this query` error [#49438](https://github.com/pingcap/tidb/issues/49438) @[qw4990](https://github.com/qw4990) +- Fix the issue that automatic statistics collection gets stuck after an OOM error occurs [#51993](https://github.com/pingcap/tidb/issues/51993) @[hi-rustin](https://github.com/hi-rustin) +- Fix the issue that TiDB might crash when `tidb_mem_quota_analyze` is enabled and the memory used by updating statistics exceeds the limit [#52601](https://github.com/pingcap/tidb/issues/52601) @[hawkingrei](https://github.com/hawkingrei) +- Fix the incorrect error message displayed when an invalid default value is specified for a column [#51592](https://github.com/pingcap/tidb/issues/51592) @[danqixu](https://github.com/danqixu) +- Fix a rare issue that special event timing might cause the data loss in log backup [#16739](https://github.com/tikv/tikv/issues/16739) @[YuJuncen](https://github.com/YuJuncen) +- Fix the panic issue caused by `GetAdditionalInfo` [#8079](https://github.com/tikv/pd/issues/8079) @[HuSharp](https://github.com/HuSharp) +``` + +Anti-patterns to avoid: + +| Incorrect | Correct | +|-----------|---------| +| `Fixed the issue that ...` (past tense) | `Fix the issue that ...` (imperative) | +| `Fixes an issue where ...` | `Fix the issue that ...` | +| `Fix the issue where ...` | `Fix the issue that ...` (use `that`, not `where`) | +| `Fix the issue that ... may ...` | Use `might` or `potential` | +| `The issue of X causing Y is fixed` | `Fix the issue that X causes Y` | + +## Input data -Expected links: +The following is the data for the single row you must process. 
+ +Links to include in the release note (the entry MUST end with exactly these, no more and no fewer): {{EXPECTED_LINKS}} Contributors: @@ -68,19 +120,63 @@ Contributors: Row context: {{ROW_CONTEXT}} -About `formatted_release_note_from_excel`: +- About `formatted_release_note_from_excel`: + + - This field can be empty, `None`, or a generic placeholder such as `Please refer to [Release Notes Language Style Guide](https://pingcap.github.io/tidb-dev-guide/contribute-to-tidb/release-notes-style-guide.html) to write a quality release note.`. In these cases, treat it as no usable release-note draft. + - This field can also contain a draft release note written by the code PR author. In that case, use the draft as an important reference for the final release note, but verify and refine it against the PR code changes first and the issue description second. + - Do not copy the draft blindly. Preserve its useful user-facing intent, correct unclear or inaccurate wording, and still follow all release-note style rules below. + +- About `fetch_failed_urls`: + + - This field lists issue or PR links whose GitHub data (title, body, labels, and changed files) could not be fetched, so the context for those links is missing. + - When it is non-empty, rely on the Excel fields (`pr_title_from_excel`, `formatted_release_note_from_excel`, `issue_type_from_excel`) to draft the note, and set `needs_review` to true. + +- The `files_summary` field may end with `...[patch truncated]`. That truncation is expected; judge from the visible portion and do not treat it as missing data. + +## Rules (apply only when writing a release note) + +- Write from the user's perspective and in English. +- Use the Excel `issue_type` as a strong signal, but decide the final type from the issue, PR description, and code changes. +- Do not end the release note with a period. +- Do not expose internal function names unless they are the user-visible behavior. 
Rewrite them into observable behavior (for example, `Fix nil pointer panic in getRegionFromTS` → `Fix the potential panic that occurs when fetching region information during a Stale Read`). +- Append every contributor listed above, in order, as `@[user](https://github.com/user)`. +- End the entry with exactly the expected links listed above. Render each as `[#]()`, where `` is the issue or PR number taken from the URL. Do not invent, drop, or reorder links. +- If the available context is insufficient, still draft the best note and set `needs_review` to true. + +## Step 2: Return your result + +Return **only a raw JSON object**. Do not wrap it in Markdown code fences, and do not add any text before or after it. Use exactly these keys: + +- `type`: `"improvement"`, `"bug_fix"`, or `"not_needed"` +- `release_note`: see the format below +- `needs_review`: `true` or `false` +- `reason`: a short English reason for the type and wording + +### `release_note` format + +When `type` is `"improvement"` or `"bug_fix"`, `release_note` is one Markdown bullet assembled in this order: + +``` +- +``` + +Example: + +- Improvement example: -- This field can be empty, `None`, or a generic placeholder such as `Please refer to [Release Notes Language Style Guide](https://pingcap.github.io/tidb-dev-guide/contribute-to-tidb/release-notes-style-guide.html) to write a quality release note.`. In these cases, treat it as no usable release-note draft. -- This field can also contain a draft release note written by the code PR author. In that case, use the draft as an important reference for the final release note, but verify and refine it against the PR code changes first and the issue description second. -- Do not copy the draft blindly. Preserve its useful user-facing intent, correct unclear or inaccurate wording, and still follow all release-note style rules above. 
+``` +- Improve the MySQL compatibility of expression default values displayed in the output of `SHOW CREATE TABLE` [#52939](https://github.com/pingcap/tidb/issues/52939) @[CbcWestwolf](https://github.com/CbcWestwolf) +``` -About `fetch_failed_urls`: +- Bug fix example: -- This field lists issue or PR links whose GitHub data (title, body, labels, and changed files) could not be fetched, so the context for those links is missing. -- When it is non-empty, rely on the Excel fields (`pr_title_from_excel`, `formatted_release_note_from_excel`, `issue_type_from_excel`) to draft the note, and set `needs_review` to true. +``` +- Fix the issue that TiCDC might panic when the initialization of the Pulsar producer fails [#4937](https://github.com/pingcap/ticdc/issues/4937) @[wk989898](https://github.com/wk989898) +``` -Improvements reference: -{{IMPROVEMENTS_REFERENCE}} +When `type` is `"not_needed"`, set `release_note` to `"Release note is not needed: "` (no leading `- `, no links). Examples: -Bug fixes reference: -{{BUG_FIXES_REFERENCE}} +- `"Release note is not needed: test-only change"` +- `"Release note is not needed: internal refactor, no user-visible effect"` +- `"Release note is not needed: flaky test fix"` +- `"Release note is not needed: added internal debug logging"` From 86fb1d055299bf7a73ac5c2f6e78144370b40f96 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 19:51:30 +0800 Subject: [PATCH 15/22] Update generation.md --- .../prompts/generation.md | 137 +++++++----------- 1 file changed, 50 insertions(+), 87 deletions(-) diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index 1b480324ea57c..b2bfdba5ab284 100644 --- a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -1,50 +1,69 @@ # Generation Prompt -You are a senior technical writer who has profound knowledge of TiDB. 
+You are a senior technical writer who has profound knowledge of TiDB. Your task is to decide whether a change needs a release note, and if so, write exactly one English release note entry. -Your task is to evaluate whether a TiDB issue or PR needs a release note. +## Input data about the change -- If yes, write exactly **one** English release note entry for it. -- If not, return a "Release note is not needed" verdict and a short reason. +{{ROW_CONTEXT}} + +Expected links to include in the release note (the entry MUST contain exactly these — no more, no fewer): +{{EXPECTED_LINKS}} + +{{CONTRIBUTORS}} -## Step 1: Determine whether a release note is needed +### How to read the input fields -Not every PR or change warrants a release note. Before writing, determine whether the change is visible to TiDB users or operators. +- `formatted_release_note_from_excel`: might be empty, `None`, or a generic placeholder (treat as no usable draft). When it contains a real draft written by the PR author, use it as an important reference — preserve its user-facing intent, but verify it against the PR code changes and issue description, correct inaccurate wording, and apply all style rules below. +- `fetch_failed_urls`: Lists links whose GitHub data could not be fetched. When non-empty, rely on Excel fields (`pr_title_from_excel`, `formatted_release_note_from_excel`, `issue_type_from_excel`) and set `needs_review` to true. +- `files_summary` might end with `...[patch truncated]` — that is expected; judge from the visible portion. -### User-visible changes (write a release note) +## Classification: does this change need a release note? + +Not every PR or change warrants a release note. Before writing, determine whether the change is visible to TiDB users or operators according to the issue description, PR description, and code changes. 
+ +### Write a release note when the change is user-visible - Bug fixes that change query results, upgrade behavior, privilege checks, error messages, or compatibility -- New features, new SQL syntax or function support, or new configuration options +- New features, new SQL syntax/function support, or new configuration options - Meaningful performance improvements observable in common operations - Behavior changes that affect upgrade paths, tooling integration, or operational workflows - Default value changes for system variables or configuration parameters -### Internal-only changes (no release note needed) +### No release note needed when the change is internal-only - Test-only changes: new test cases, flaky test fixes, test infrastructure updates - Pure refactors or internal data-structure changes with no user-observable effect -- Added or improved debug/internal logs that do not surface in user-facing interfaces -- Internal CI/CD pipeline changes or developer workflow changes +- CI/CD pipeline or developer workflow changes - Code comments or source-code-only documentation changes (not user-facing docs) ### Borderline cases If a PR is mostly internal but the outcome is user-visible, write a release note that describes the outcome and omit the implementation details. If the only user-facing effect is indirect or speculative, lean toward returning a "not_needed" verdict. -## Writing style guide +### Whether improvement or bug fix -The rules below define the wording, opening verbs, and single-entry style. Use the Improvements style when the type is `improvement`, and the Bug fixes style when the type is `bug_fix`. You output exactly one entry — never section headers, component groups, or more than one bullet. +Use the Excel `issue_type` from the input data as a strong signal, but also decide the final type from the issue, PR description, and code changes. 
-### Improvements style +## Writing style (applies only when writing a release note) + +The rules below define the wording, opening verbs, and single-entry style. -Lead with an action verb. Do not start with "This improves..." or "The X now supports...". +### General rules -State the user benefit explicitly. Explain why the change matters in terms of performance, stability, or capability. For example, instead of "Not use the stale read request's `start_ts` to update `max_ts`," write "Avoid excessive commit request retrying by not using the Stale Read request's `start_ts` to update `max_ts`." +- Write from the user's perspective, in English. +- Do not end the entry with a period. +- Do not expose internal function names unless they are user-visible behavior. Rewrite into observable behavior (e.g. `Fix nil pointer panic in getRegionFromTS` → `Fix the potential panic that occurs when fetching region information during a Stale Read`). +- SQL functions: backtick ALL CAPS with parentheses (`` `DATE()` ``). SQL keywords: backtick ALL CAPS (`` `HAVING` ``). +- Use the Improvements style when the type is `improvement`, and the Bug fixes style when the type is `bug_fix`. +- Output exactly one entry — never section headers, component groups, or more than one bullet. +- If available context is insufficient, still draft the best note and set `needs_review` to true. -Metric claims are encouraged when sourced, such as "up to 10 times performance improvement" or "improves performance by up to 62.5%". +### Improvements style -Opening verbs: +Lead with an action verb. State the user benefit explicitly. Explain why the change matters in terms of performance, stability, or capability. For example, instead of "Not use the stale read request's `start_ts` to update `max_ts`," write "Avoid excessive commit request retrying by not using the Stale Read request's `start_ts` to update `max_ts`." 
+| Verb | When to use | +|------|-------------| | Verb | When to use | |------|-------------| | `Support` | New capability: ```Support casting the `STRING` type to the `DOUBLE` type``` | @@ -64,15 +83,11 @@ Examples: - Improve the MySQL compatibility of expression default values displayed in the output of `SHOW CREATE TABLE` [#52939](https://github.com/pingcap/tidb/issues/52939) @[CbcWestwolf](https://github.com/CbcWestwolf) - Support adding multiple indexes concurrently in the ingest mode [#52596](https://github.com/pingcap/tidb/issues/52596) @[lance6716](https://github.com/lance6716) - Add a timeout mechanism for LDAP authentication to avoid the issue of resource lock (RLock) not being released in time [#51883](https://github.com/pingcap/tidb/issues/51883) @[YangKeao](https://github.com/YangKeao) -- Avoid performing IO operations on snapshot files in Raftstore threads to improve TiKV stability [#16564](https://github.com/tikv/tikv/issues/16564) @[Connor1996](https://github.com/Connor1996) -- Improve the performance of adding indexes with `tidb_ddl_enable_fast_reorg` enabled. In internal tests, v7.5.0 improves the performance by up to 62.5% compared with v6.5.0 [#47757](https://github.com/pingcap/tidb/issues/47757) @[tangenta](https://github.com/tangenta) ``` -Put SQL functions in backtick ALL CAPS with parentheses (`` `DATE()` ``, not `date()`) and SQL keywords in backtick ALL CAPS (`` `HAVING` ``, not `having`). - ### Bug fixes style -Lead with a fix verb phrase. Use the following accepted patterns, listed roughly by frequency in published v6.1+ notes: +Lead with a fix verb phrase. 
Accepted patterns: - `Fix the issue that [subject] [verb phrase]` (dominant modern pattern) - `Fix the issue of [noun phrase] that occurs when/during [condition]` (result-first phrasing) @@ -90,93 +105,41 @@ Examples: ``` - Fix the issue that executing SQL statements containing tables with multi-valued indexes might return the `Can't find a proper physical plan for this query` error [#49438](https://github.com/pingcap/tidb/issues/49438) @[qw4990](https://github.com/qw4990) -- Fix the issue that automatic statistics collection gets stuck after an OOM error occurs [#51993](https://github.com/pingcap/tidb/issues/51993) @[hi-rustin](https://github.com/hi-rustin) - Fix the issue that TiDB might crash when `tidb_mem_quota_analyze` is enabled and the memory used by updating statistics exceeds the limit [#52601](https://github.com/pingcap/tidb/issues/52601) @[hawkingrei](https://github.com/hawkingrei) -- Fix the incorrect error message displayed when an invalid default value is specified for a column [#51592](https://github.com/pingcap/tidb/issues/51592) @[danqixu](https://github.com/danqixu) -- Fix a rare issue that special event timing might cause the data loss in log backup [#16739](https://github.com/tikv/tikv/issues/16739) @[YuJuncen](https://github.com/YuJuncen) - Fix the panic issue caused by `GetAdditionalInfo` [#8079](https://github.com/tikv/pd/issues/8079) @[HuSharp](https://github.com/HuSharp) ``` -Anti-patterns to avoid: +Anti-patterns: -| Incorrect | Correct | -|-----------|---------| +| Wrong | Right | +|-------|-------| | `Fixed the issue that ...` (past tense) | `Fix the issue that ...` (imperative) | | `Fixes an issue where ...` | `Fix the issue that ...` | | `Fix the issue where ...` | `Fix the issue that ...` (use `that`, not `where`) | | `Fix the issue that ... 
may ...` | Use `might` or `potential` | | `The issue of X causing Y is fixed` | `Fix the issue that X causes Y` | -## Input data +## Output format -The following is the data for the single row you must process. - -Links to include in the release note (the entry MUST end with exactly these, no more and no fewer): -{{EXPECTED_LINKS}} - -Contributors: -{{CONTRIBUTORS}} - -Row context: -{{ROW_CONTEXT}} - -- About `formatted_release_note_from_excel`: - - - This field can be empty, `None`, or a generic placeholder such as `Please refer to [Release Notes Language Style Guide](https://pingcap.github.io/tidb-dev-guide/contribute-to-tidb/release-notes-style-guide.html) to write a quality release note.`. In these cases, treat it as no usable release-note draft. - - This field can also contain a draft release note written by the code PR author. In that case, use the draft as an important reference for the final release note, but verify and refine it against the PR code changes first and the issue description second. - - Do not copy the draft blindly. Preserve its useful user-facing intent, correct unclear or inaccurate wording, and still follow all release-note style rules below. - -- About `fetch_failed_urls`: - - - This field lists issue or PR links whose GitHub data (title, body, labels, and changed files) could not be fetched, so the context for those links is missing. - - When it is non-empty, rely on the Excel fields (`pr_title_from_excel`, `formatted_release_note_from_excel`, `issue_type_from_excel`) to draft the note, and set `needs_review` to true. - -- The `files_summary` field may end with `...[patch truncated]`. That truncation is expected; judge from the visible portion and do not treat it as missing data. - -## Rules (apply only when writing a release note) - -- Write from the user's perspective and in English. -- Use the Excel `issue_type` as a strong signal, but decide the final type from the issue, PR description, and code changes. 
-- Do not end the release note with a period. -- Do not expose internal function names unless they are the user-visible behavior. Rewrite them into observable behavior (for example, `Fix nil pointer panic in getRegionFromTS` → `Fix the potential panic that occurs when fetching region information during a Stale Read`). -- Append every contributor listed above, in order, as `@[user](https://github.com/user)`. -- End the entry with exactly the expected links listed above. Render each as `[#]()`, where `` is the issue or PR number taken from the URL. Do not invent, drop, or reorder links. -- If the available context is insufficient, still draft the best note and set `needs_review` to true. - -## Step 2: Return your result - -Return **only a raw JSON object**. Do not wrap it in Markdown code fences, and do not add any text before or after it. Use exactly these keys: +Return **only a raw JSON object** — no Markdown fences, no extra text. Keys: - `type`: `"improvement"`, `"bug_fix"`, or `"not_needed"` -- `release_note`: see the format below +- `release_note`: the formatted entry (see below for the value format) - `needs_review`: `true` or `false` -- `reason`: a short English reason for the type and wording +- `reason`: short English reason for the type and wording -### `release_note` format +### `release_note` value When `type` is `"improvement"` or `"bug_fix"`, `release_note` is one Markdown bullet assembled in this order: ``` -- -``` - -Example: - -- Improvement example: - -``` -- Improve the MySQL compatibility of expression default values displayed in the output of `SHOW CREATE TABLE` [#52939](https://github.com/pingcap/tidb/issues/52939) @[CbcWestwolf](https://github.com/CbcWestwolf) +- ``` -- Bug fix example: +When `type` is `"not_needed"`, set `release_note` to the following format: ``` -- Fix the issue that TiCDC might panic when the initialization of the Pulsar producer fails [#4937](https://github.com/pingcap/ticdc/issues/4937) 
@[wk989898](https://github.com/wk989898) +Release note is not needed: ``` -When `type` is `"not_needed"`, set `release_note` to `"Release note is not needed: "` (no leading `- `, no links). Examples: - -- `"Release note is not needed: test-only change"` -- `"Release note is not needed: internal refactor, no user-visible effect"` -- `"Release note is not needed: flaky test fix"` -- `"Release note is not needed: added internal debug logging"` +Examples of `"not_needed"` reasons: `test-only change`, `internal refactor, no user-visible effect`, `flaky test fix`, `added internal debug logging`. From b0fa3245acd03241bc749efc56f36700f9f6266f Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 19:56:53 +0800 Subject: [PATCH 16/22] Update generation.md --- scripts/release-notes-ai-generator/prompts/generation.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index b2bfdba5ab284..c431cef87c257 100644 --- a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -9,6 +9,7 @@ You are a senior technical writer who has profound knowledge of TiDB. Your task Expected links to include in the release note (the entry MUST contain exactly these — no more, no fewer): {{EXPECTED_LINKS}} +Contributors (append each in order as `@[user](https://github.com/user)`): {{CONTRIBUTORS}} ### How to read the input fields @@ -33,6 +34,7 @@ Not every PR or change warrants a release note. 
Before writing, determine whethe - Test-only changes: new test cases, flaky test fixes, test infrastructure updates - Pure refactors or internal data-structure changes with no user-observable effect +- Internal debug/log changes that do not surface in user-facing interfaces - CI/CD pipeline or developer workflow changes - Code comments or source-code-only documentation changes (not user-facing docs) From e642d0525fe0faccdb615238665c20d5c484dce0 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 20:15:35 +0800 Subject: [PATCH 17/22] Update generation.md --- .../prompts/generation.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index c431cef87c257..2574cbd5927b7 100644 --- a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -56,6 +56,9 @@ The rules below define the wording, opening verbs, and single-entry style. - Do not end the entry with a period. - Do not expose internal function names unless they are user-visible behavior. Rewrite into observable behavior (e.g. `Fix nil pointer panic in getRegionFromTS` → `Fix the potential panic that occurs when fetching region information during a Stale Read`). - SQL functions: backtick ALL CAPS with parentheses (`` `DATE()` ``). SQL keywords: backtick ALL CAPS (`` `HAVING` ``). +- Normalize product names to their official capitalization: TiDB, TiKV, TiCDC, TiFlash, PD, BR, DM, TiDB Lightning, Dumpling, TiUP. Never use lowercase variants like `ticdc` or `tikv` in the release note text unless they are part of variable/parameter names or code comments. +- Use ONLY the Contributors list provided above for `@[user](url)` attribution. Ignore `author` fields inside `pull_requests[]` — they may be bot accounts (e.g. `ti-chi-bot`) from cherry-pick workflows.
+- End the entry with exactly the links from the Expected links list. Render each as `[#<number>](<url>)` where `<number>` is the issue or PR number extracted from the URL path. Do not invent, drop, or reorder links. - Use the Improvements style when the type is `improvement`, and the Bug fixes style when the type is `bug_fix`. - Output exactly one entry — never section headers, component groups, or more than one bullet. - If available context is insufficient, still draft the best note and set `needs_review` to true. @@ -64,8 +67,6 @@ The rules below define the wording, opening verbs, and single-entry style. Lead with an action verb. State the user benefit explicitly. Explain why the change matters in terms of performance, stability, or capability. For example, instead of "Not use the stale read request's `start_ts` to update `max_ts`," write "Avoid excessive commit request retrying by not using the Stale Read request's `start_ts` to update `max_ts`." -| Verb | When to use | -|------|-------------| | Verb | When to use | |------|-------------| | `Support` | New capability: ```Support casting the `STRING` type to the `DOUBLE` type``` | @@ -138,6 +139,18 @@ When `type` is `"improvement"` or `"bug_fix"`, `release_note` is one Markdown bu - ``` +Improvement example: + +``` +- Support adding multiple indexes concurrently in the ingest mode [#52596](https://github.com/pingcap/tidb/issues/52596) @[lance6716](https://github.com/lance6716) +``` + +Bug fix example: + +``` +- Fix the issue that TiCDC might panic when the initialization of the Pulsar producer fails [#4937](https://github.com/pingcap/ticdc/issues/4937) @[wk989898](https://github.com/wk989898) +``` + When `type` is `"not_needed"`, set `release_note` to the following format: ``` From f8e82be767ac4faecb3e61c95414c561bdad08d2 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 20:23:09 +0800 Subject: [PATCH 18/22] Update generation.md --- scripts/release-notes-ai-generator/prompts/generation.md | 2 +- 1 file changed, 1
insertion(+), 1 deletion(-) diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index 2574cbd5927b7..b4c10c8d30eda 100644 --- a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -154,7 +154,7 @@ Bug fix example: When `type` is `"not_needed"`, set `release_note` to the following format: ``` -Release note is not needed: +Release note is not needed for this change. Reason: ``` Examples of `"not_needed"` reasons: `test-only change`, `internal refactor, no user-visible effect`, `flaky test fix`, `added internal debug logging`. From 94cc1ba3c247a21361dc99b616a208cff3afe6c5 Mon Sep 17 00:00:00 2001 From: qiancai Date: Wed, 17 Jun 2026 20:24:15 +0800 Subject: [PATCH 19/22] Update ai_client.py --- scripts/release-notes-ai-generator/ai_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/release-notes-ai-generator/ai_client.py b/scripts/release-notes-ai-generator/ai_client.py index badccc8ac238e..d65272b395c06 100644 --- a/scripts/release-notes-ai-generator/ai_client.py +++ b/scripts/release-notes-ai-generator/ai_client.py @@ -141,7 +141,7 @@ def _run(self, prompt: str) -> str: "input": [{"role": "user", "content": prompt}], "max_output_tokens": self.MAX_OUTPUT_TOKENS, } - print(prompt) + #print(prompt) if not self._is_reasoning_model(): kwargs["temperature"] = self.TEMPERATURE response = self.client.responses.create(**kwargs) From f7205de63a63dede0a6ac84a5f21ccc616381301 Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 22 Jun 2026 10:12:37 +0800 Subject: [PATCH 20/22] Update generation.md --- scripts/release-notes-ai-generator/prompts/generation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index b4c10c8d30eda..328b2da428cb3 100644 --- 
a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -52,7 +52,7 @@ The rules below define the wording, opening verbs, and single-entry style. ### General rules -- Write from the user's perspective, in English. +- Write from the user's perspective, clearly and concisely, in English. - Do not end the entry with a period. - Do not expose internal function names unless they are user-visible behavior. Rewrite into observable behavior (e.g. `Fix nil pointer panic in getRegionFromTS` → `Fix the potential panic that occurs when fetching region information during a Stale Read`). - SQL functions: backtick ALL CAPS with parentheses (`` `DATE()` ``). SQL keywords: backtick ALL CAPS (`` `HAVING` ``). From f1b2cbaf43172de630a0e0e777d42d13f5fd39ec Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 22 Jun 2026 10:40:50 +0800 Subject: [PATCH 21/22] Merge branch 'master' into release-notes-ai-generator From d9067db1b869d689d4d6161f2781add56a7adab0 Mon Sep 17 00:00:00 2001 From: qiancai Date: Mon, 22 Jun 2026 12:01:38 +0800 Subject: [PATCH 22/22] updated examples --- .../prompts/generation.md | 6 +++--- .../release-notes-generator-readme.md | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md index 328b2da428cb3..4da6909156904 100644 --- a/scripts/release-notes-ai-generator/prompts/generation.md +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -40,7 +40,7 @@ Not every PR or change warrants a release note. Before writing, determine whethe ### Borderline cases -If a PR is mostly internal but the outcome is user-visible, write a release note that describes the outcome and omit the implementation details. If the only user-facing effect is indirect or speculative, lean toward returning a "not_needed" verdict.
+If a PR is mostly internal but the outcome is user-visible, write a release note that describes the outcome and omit the implementation details. If the only user-facing effect is indirect or speculative, lean toward returning a verdict that starts with "Release note is not needed". ### Whether improvement or bug fix @@ -151,10 +151,10 @@ Bug fix example: - Fix the issue that TiCDC might panic when the initialization of the Pulsar producer fails [#4937](https://github.com/pingcap/ticdc/issues/4937) @[wk989898](https://github.com/wk989898) ``` -When `type` is `"not_needed"`, set `release_note` to the following format: +When `type` is `"not_needed"`, `release_note` must start with "Release note is not needed": ``` Release note is not needed for this change. Reason: <reason> ``` -Examples of `"not_needed"` reasons: `test-only change`, `internal refactor, no user-visible effect`, `flaky test fix`, `added internal debug logging`. +Example types of `"not_needed"` reasons: `test-only change`, `internal refactor, no user-visible effect`, `flaky test fix`, `added internal debug logging`. diff --git a/scripts/release-notes-ai-generator/release-notes-generator-readme.md b/scripts/release-notes-ai-generator/release-notes-generator-readme.md index 9b341457113db..754cef07ab220 100644 --- a/scripts/release-notes-ai-generator/release-notes-generator-readme.md +++ b/scripts/release-notes-ai-generator/release-notes-generator-readme.md @@ -1,4 +1,4 @@ -# Release notes generator +# Readme: Release Notes AI Generator `python3 -m release-notes-ai-generator` (run from the `scripts/` directory) generates English TiDB release notes for the `Improvements` and `Bug fixes` sections according to PRs and issues in an Excel workbook.
@@ -78,8 +78,8 @@ Use Azure OpenAI: cd scripts python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ - --excel /path/to/release-note-excel.xlsx \ - --releases-dir ../releases \ + --excel <path-to-excel-file> \ + --releases-dir <path-to-releases-dir> \ --ai-provider azure ``` @@ -90,7 +90,7 @@ cd scripts python3 -m release-notes-ai-generator generate \ --version 8.5.7 \ --excel /path/to/release-note-excel.xlsx \ - --releases-dir ../releases + --releases-dir <path-to-releases-dir> ``` ### Phase 1: Resume from interruption @@ -127,10 +127,10 @@ After Phase 1 is fully complete, export the Markdown: ```bash cd scripts python3 -m release-notes-ai-generator export-markdown \ - --version 8.5.7 \ - --excel /path/to/release-note-excel_processed.xlsx \ - --releases-dir ../releases \ - --release-date "August 14, 2025" + --version <version> \ + --excel <path-to-processed-excel-file> \ + --releases-dir <path-to-releases-dir> \ + --release-date "<release-date>" ``` ## Option descriptions