diff --git a/scripts/release-notes-ai-generator/__main__.py b/scripts/release-notes-ai-generator/__main__.py new file mode 100644 index 0000000000000..114f4057f1c34 --- /dev/null +++ b/scripts/release-notes-ai-generator/__main__.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""Generate TiDB improvements and bug fixes for release notes according to PRs and issues in a specified Excel file. + +Two-phase workflow (run from the scripts/ directory): + + # Phase 1: Process Excel, call AI, write results to Excel + python3 -m release-notes-ai-generator generate \ + --version 8.5.7 \ + --excel /path/to/release-note-excel.xlsx \ + --releases-dir releases \ + --ai-provider azure + + # Phase 2: Export Markdown from the processed Excel + python3 -m release-notes-ai-generator export-markdown \ + --version 8.5.7 \ + --excel /path/to/release-note-excel_processed.xlsx \ + --releases-dir releases \ + --release-date "August 14, 2025" + +For detailed usage and options, see release-notes-generator-readme.md in this directory. 
class AIClient:
    """Base AI client with shared generation and validation logic.

    Subclasses implement ``_run``, which sends one prompt to the model and
    returns the raw text reply.
    """

    def generate(self, prompt: str, expected_links: list[str], contributors: list[str]) -> GeneratedNote:
        """Return a validated release note for ``prompt``.

        The prompt is run once. If the reply fails validation, a repair prompt
        embedding the validation errors is run exactly one more time.

        Raises:
            ValueError: If the repaired reply still fails validation. The
                message joins the second attempt's errors with ``"; "``.
        """
        result, errors = self._run_and_validate(prompt, expected_links, contributors)
        if result is not None:
            return result

        repair_prompt = build_repair_prompt(prompt, errors)
        result, repair_errors = self._run_and_validate(repair_prompt, expected_links, contributors)
        if result is not None:
            return result
        raise ValueError("; ".join(repair_errors))

    def _run_and_validate(
        self, prompt: str, expected_links: list[str], contributors: list[str]
    ) -> tuple[GeneratedNote | None, list[str]]:
        """Run ``prompt`` and return ``(note, [])`` or ``(None, errors)``."""
        output = self._run(prompt)
        try:
            data = extract_json_object(output)
        except ValueError as exc:
            # A reply without usable JSON is reported like any other
            # validation failure so that ``generate`` can attempt a repair.
            return None, [str(exc)]
        return validate_ai_response(data, expected_links, contributors)

    def _run(self, prompt: str) -> str:
        """Send ``prompt`` to the model and return its raw text output."""
        raise NotImplementedError("Subclasses must implement _run")


class CodexAIClient(AIClient):
    """AI client that invokes the Codex CLI as a subprocess."""

    def __init__(self, command: str, model: str | None, timeout: int):
        # The command line is tokenized once with shell-like quoting rules.
        self.command = shlex.split(command)
        self.model = model
        self.timeout = timeout

    def _run(self, prompt: str) -> str:
        """Run the configured command with ``prompt`` on stdin.

        Returns:
            The content of the ``--output-last-message`` file when the command
            is ``codex ... exec`` and the file is non-empty; otherwise the
            stripped stdout of the process.

        Raises:
            ValueError: If the configured command is empty.
            FileNotFoundError: If the executable cannot be found.
            RuntimeError: If the process exits with a non-zero status.
        """
        # Copy so that per-call flags never accumulate on ``self.command``.
        command = list(self.command)
        if not command:
            raise ValueError("AI command is empty. Pass a command with --ai-command.")
        if not is_executable_available(command[0]):
            raise FileNotFoundError(
                f"AI command executable not found: {command[0]!r}. "
                "Install it or pass a custom command with --ai-command."
            )

        # Everything that touches the schema/output files stays inside the
        # ``with`` block because the directory is removed on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            output_path: Path | None = None
            if self._is_codex_exec(command):
                if self.model:
                    command.extend(["-m", self.model])
                temp_path = Path(temp_dir)
                schema_path = temp_path / "ai-output-schema.json"
                output_path = temp_path / "ai-output.txt"
                schema_path.write_text(json.dumps(ai_output_schema()), encoding="utf-8")
                output_path.touch()
                command.extend(["--output-schema", str(schema_path)])
                command.extend(["--output-last-message", str(output_path)])

            completed = subprocess.run(
                command,
                input=prompt,
                text=True,
                capture_output=True,
                timeout=self.timeout,
                check=False,
            )
            if completed.returncode != 0:
                raise RuntimeError(
                    "AI command failed with exit code "
                    f"{completed.returncode}: {summarize_process_output(completed)}"
                )
            if output_path and output_path.exists():
                last_message = output_path.read_text(encoding="utf-8").strip()
                if last_message:
                    return last_message
            return completed.stdout.strip()

    @staticmethod
    def _is_codex_exec(command: list[str]) -> bool:
        """Return True when ``command`` is ``codex`` with an ``exec`` argument."""
        if not command:
            return False
        executable = Path(command[0]).name
        return executable == "codex" and "exec" in command[1:]


class AzureOpenAIClient(AIClient):
    """AI client that calls Azure OpenAI via the OpenAI Python SDK."""

    DEFAULT_MODEL = "gpt-5.4"
    MAX_OUTPUT_TOKENS = 16384
    TEMPERATURE = 0.1
    # Models whose name starts with one of these prefixes are called without
    # an explicit ``temperature`` (see ``_run``).
    REASONING_MODEL_PREFIXES = ("o1", "o3", "o4", "gpt-5")

    def __init__(self, model: str | None, timeout: int):
        """Create the SDK client from environment variables.

        Raises:
            ValueError: If ``AZURE_OPENAI_KEY`` is unset, or if neither
                ``AZURE_OPENAI_BASE_URL`` nor ``OPENAI_BASE_URL`` is set.
        """
        # Imported lazily so the SDK is only required for this provider.
        from openai import OpenAI

        key = os.environ.get("AZURE_OPENAI_KEY", "")
        base_url = (
            os.environ.get("AZURE_OPENAI_BASE_URL")
            or os.environ.get("OPENAI_BASE_URL", "")
        )
        if not key:
            raise ValueError(
                "AZURE_OPENAI_KEY environment variable is required "
                "when using --ai-provider azure"
            )
        if not base_url:
            raise ValueError(
                "AZURE_OPENAI_BASE_URL or OPENAI_BASE_URL environment variable "
                "is required when using --ai-provider azure"
            )
        self.client = OpenAI(api_key=key, base_url=base_url, timeout=timeout)
        self.model = model or self.DEFAULT_MODEL

    def _is_reasoning_model(self) -> bool:
        """Return True when the model name starts with a reasoning-model prefix."""
        return self.model.lower().startswith(self.REASONING_MODEL_PREFIXES)

    def _run(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text."""
        kwargs: dict[str, Any] = {
            "model": self.model,
            "input": [{"role": "user", "content": prompt}],
            "max_output_tokens": self.MAX_OUTPUT_TOKENS,
        }
        if not self._is_reasoning_model():
            kwargs["temperature"] = self.TEMPERATURE
        response = self.client.responses.create(**kwargs)
        return response.output_text.strip()


def is_executable_available(executable: str) -> bool:
    """Return True when ``executable`` resolves to a runnable program.

    ``shutil.which`` handles both bare command names (searched on ``PATH``)
    and explicit paths, and it rejects directories and files without execute
    permission, which a plain existence check would accept.
    """
    return shutil.which(executable) is not None


def ai_output_schema() -> dict[str, Any]:
    """Return the JSON schema that the AI reply must satisfy."""
    return {
        "type": "object",
        "additionalProperties": False,
        "required": ["type", "release_note", "needs_review", "reason"],
        "properties": {
            "type": {"type": "string", "enum": ["improvement", "bug_fix", "not_needed"]},
            "release_note": {"type": "string"},
            "needs_review": {"type": "boolean"},
            "reason": {"type": "string"},
        },
    }


def summarize_process_output(completed: subprocess.CompletedProcess[str]) -> str:
    """Return the tail of stderr and stdout for an error message, or ``"no output"``."""
    parts = []
    if completed.stderr.strip():
        parts.append("stderr:\n" + tail_output(completed.stderr))
    if completed.stdout.strip():
        parts.append("stdout:\n" + tail_output(completed.stdout))
    return "\n\n".join(parts) or "no output"


def tail_output(text: str, max_lines: int = 40, max_chars: int = 4000) -> str:
    """Return at most the last ``max_lines`` lines and ``max_chars`` characters of ``text``.

    When the character limit cuts the text, the result is prefixed with
    ``"...[truncated]"`` on its own line.
    """
    lines = text.strip().splitlines()
    # ``seq[-0:]`` is the whole sequence, so non-positive limits need an
    # explicit empty result instead of a negative-index slice.
    tail = "\n".join(lines[-max_lines:] if max_lines > 0 else [])
    if len(tail) > max_chars:
        kept = tail[-max_chars:] if max_chars > 0 else ""
        tail = "...[truncated]\n" + kept
    return tail
def build_generation_prompt(
    row_context: RowContext,
    expected_links: list[str],
    contributors: list[str],
) -> str:
    """Render the generation prompt template for one worksheet row.

    The template at ``GENERATION_PROMPT_TEMPLATE`` is filled with the expected
    links, the contributors, and a JSON dump of the row context.
    """
    prompt_template = load_prompt_template(GENERATION_PROMPT_TEMPLATE)
    context = {
        "row_number": row_context.row_number,
        "component": row_context.component,
        "raw_component_from_excel": row_context.raw_component,
        "issue_type_from_excel": row_context.issue_type,
        "pr_title_from_excel": row_context.pr_title,
        "formatted_release_note_from_excel": row_context.formatted_release_note,
        "expected_links": expected_links,
        "contributors": contributors,
        "issues": [dataclasses.asdict(issue) for issue in row_context.issues],
        "pull_requests": [dataclasses.asdict(pull) for pull in row_context.pulls],
        "fetch_failed_urls": row_context.fetch_failed_urls,
    }
    return render_prompt_template(
        prompt_template,
        {
            "EXPECTED_LINKS": json.dumps(expected_links, ensure_ascii=False, indent=2),
            "CONTRIBUTORS": json.dumps(contributors, ensure_ascii=False, indent=2),
            "ROW_CONTEXT": json.dumps(context, ensure_ascii=False, indent=2),
        },
    )


def build_repair_prompt(original_prompt: str, errors: list[str]) -> str:
    """Return a follow-up prompt that asks the model to fix its previous answer.

    The prompt is assembled from unindented lines. Dedenting an indented
    template after interpolation does not work here: the interpolated JSON
    and the original prompt contain lines at column 0, so the common margin
    is empty and the template's own indentation would remain in the prompt.
    """
    sections = [
        "Your previous answer did not satisfy the required JSON schema or release-note rules.",
        "",
        "Validation errors:",
        json.dumps(errors, ensure_ascii=False, indent=2),
        "",
        "Rewrite the answer. Return only the corrected JSON object.",
        "",
        "Original task:",
        original_prompt,
    ]
    return "\n".join(sections).strip()


def render_prompt_template(template: str, values: dict[str, str]) -> str:
    """Replace ``{{KEY}}`` placeholders in ``template`` and strip the result.

    All placeholders are substituted in a single pass, so placeholder-looking
    text inside an inserted value is never expanded by a later replacement.
    Placeholders without an entry in ``values`` are left untouched.
    """
    import re  # local import: this module does not import ``re`` at top level

    if values:
        placeholder_re = re.compile(
            "|".join(re.escape(f"{{{{{key}}}}}") for key in values)
        )
        # A function replacement inserts the value literally (no backslash
        # or group-reference processing).
        template = placeholder_re.sub(
            lambda match: values[match.group(0)[2:-2]], template
        )
    return template.strip()


@lru_cache(maxsize=None)
def load_prompt_template(path: Path) -> str:
    """Read the prompt template at ``path`` without its Markdown title.

    Results are cached per path for the lifetime of the process.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    try:
        return strip_prompt_template_heading(path.read_text(encoding="utf-8"))
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"Cannot find release-note prompt template: {path}. "
            "Make sure scripts/release-notes-ai-generator/prompts/generation.md exists."
        ) from exc


def strip_prompt_template_heading(template: str) -> str:
    """Drop a leading ``# `` heading line and one following blank line, if present."""
    lines = template.splitlines()
    if lines and lines[0].startswith("# "):
        lines = lines[1:]
    if lines and not lines[0].strip():
        lines = lines[1:]
    return "\n".join(lines)


def extract_json_object(output: str) -> dict[str, Any]:
    """Return the JSON object contained in the AI output.

    The whole output is parsed first. If that fails, every embedded JSON
    object is collected and the first one carrying all required keys is
    chosen, falling back to the first object found.

    Raises:
        ValueError: If the output is empty, contains no JSON object, or parses
            to a JSON value that is not an object.
    """
    output = output.strip()
    if not output:
        raise ValueError("AI command returned no output")
    try:
        data = json.loads(output)
    except json.JSONDecodeError:
        candidates = extract_json_object_candidates(output)
        if not candidates:
            raise ValueError("AI output did not contain a JSON object") from None
        required_keys = {"type", "release_note", "needs_review", "reason"}
        data = next(
            (candidate for candidate in candidates if required_keys <= candidate.keys()),
            candidates[0],
        )
    if not isinstance(data, dict):
        raise ValueError("AI output JSON is not an object")
    return data


def extract_json_object_candidates(output: str) -> list[dict[str, Any]]:
    """Return every JSON object that can be decoded starting at a ``{`` in ``output``.

    Nested objects are reported too, after the object that contains them.
    """
    decoder = json.JSONDecoder()
    candidates: list[dict[str, Any]] = []
    for index, char in enumerate(output):
        if char != "{":
            continue
        try:
            data, _end = decoder.raw_decode(output[index:])
        except json.JSONDecodeError:
            continue
        if isinstance(data, dict):
            candidates.append(data)
    return candidates


def validate_ai_response(
    data: dict[str, Any],
    expected_links: list[str],
    contributors: list[str],
) -> tuple[GeneratedNote | None, list[str]]:
    """Check the AI reply against the release-note rules.

    Returns:
        ``(note, [])`` when the reply is valid, otherwise ``(None, errors)``
        with one human-readable message per violated rule.
    """
    allowed_types = ("improvement", "bug_fix", "not_needed")
    errors: list[str] = []
    note_type = data.get("type")
    release_note = data.get("release_note")
    needs_review = data.get("needs_review")
    reason = data.get("reason")

    # The isinstance guard keeps an unhashable value (for example a list)
    # from raising TypeError instead of producing a validation error.
    if not isinstance(note_type, str) or note_type not in allowed_types:
        errors.append('type must be "improvement", "bug_fix", or "not_needed"')
    if not isinstance(needs_review, bool):
        errors.append("needs_review must be a boolean")
    if not isinstance(reason, str):
        errors.append("reason must be a string")

    if note_type == "not_needed":
        if not isinstance(release_note, str) or not release_note.startswith("Release note is not needed:"):
            errors.append(
                'when type is "not_needed", release_note must start with '
                '"Release note is not needed:"'
            )
    else:
        if not isinstance(release_note, str) or not release_note.startswith("- "):
            errors.append('release_note must be a string that starts with "- "')
        if isinstance(release_note, str) and release_note.rstrip().endswith("."):
            errors.append("release_note must not end with a period")
        if isinstance(release_note, str):
            for link in expected_links:
                if link and link not in release_note:
                    errors.append(f"release_note is missing expected link: {link}")
            for contributor in contributors:
                expected = f"@[{contributor}](https://github.com/{contributor})"
                if contributor and expected not in release_note:
                    errors.append(f"release_note is missing contributor: {contributor}")

    if errors:
        return None, errors
    return (
        GeneratedNote(
            note_type=str(note_type),
            release_note=str(release_note).strip(),
            needs_review=bool(needs_review),
            reason=str(reason).strip(),
        ),
        [],
    )
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command line for the ``generate`` and ``export-markdown`` phases.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1, after printing the help text, when no
            subcommand is given.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Generate English release notes with AI according to PRs and issues "
            "in a specified Excel file. Use subcommands 'generate' and 'export-markdown' "
            "to run the two phases independently."
        ),
    )
    subparsers = parser.add_subparsers(dest="command")

    # --- Phase 1: generate ---
    gen_parser = subparsers.add_parser(
        "generate",
        help=(
            "Phase 1: Process the Excel workbook — run preprocessing, call AI to "
            "generate release notes, and write results back to Excel. "
            "Does NOT produce a Markdown file."
        ),
    )
    add_generate_args(gen_parser)

    # --- Phase 2: export-markdown ---
    export_parser = subparsers.add_parser(
        "export-markdown",
        help=(
            "Phase 2: Read a processed Excel workbook and export a Markdown "
            "release-note file. Does NOT call AI or modify the Excel."
        ),
    )
    add_export_markdown_args(export_parser)

    args = parser.parse_args(argv)
    if not args.command:
        parser.print_help()
        raise SystemExit(1)
    return args


def add_generate_args(parser: argparse.ArgumentParser) -> None:
    """Register the options of the ``generate`` subcommand on ``parser``."""
    parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.")
    parser.add_argument("--excel", required=True, help="Path to the release note Excel workbook.")
    parser.add_argument(
        "--releases-dir",
        required=True,
        help="Path to the existing English release notes directory.",
    )
    parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.")
    parser.add_argument(
        "--ai-provider",
        choices=["codex", "azure"],
        default="codex",
        help=(
            "AI provider to use. 'codex' runs the Codex CLI as a subprocess "
            "(requires codex to be installed). 'azure' calls Azure OpenAI via the "
            "OpenAI Python SDK (requires AZURE_OPENAI_KEY and AZURE_OPENAI_BASE_URL "
            "or OPENAI_BASE_URL environment variables). Default: codex."
        ),
    )
    parser.add_argument(
        "--ai-command",
        default="codex --ask-for-approval never exec --sandbox read-only --ephemeral",
        help="Command-line AI command (only used with --ai-provider codex). The prompt is passed through stdin.",
    )
    parser.add_argument(
        "--ai-model",
        default="gpt-5.4",
        help="Model name. Passed to codex exec with -m, or used as the model parameter for Azure OpenAI.",
    )
    parser.add_argument(
        "--involve-ai-generation",
        type=parse_on_off,
        default="ON",
        help=(
            "Whether to use AI for non-dup release notes. Use ON to generate with AI, "
            "or OFF to skip AI generation and only run preprocessing. Default: ON."
        ),
    )
    parser.add_argument(
        "--ai-timeout",
        type=int,
        default=600,
        help="Timeout in seconds for each AI command invocation.",
    )
    parser.add_argument(
        "--ai-workers",
        type=int,
        default=3,
        help=(
            "Number of concurrent AI command invocations. The default is conservative "
            "for codex exec subprocesses."
        ),
    )
    parser.add_argument(
        "--github-workers",
        type=int,
        default=8,
        help="Number of concurrent GitHub API prefetch workers.",
    )
    parser.add_argument(
        "--author-workers",
        type=int,
        default=3,
        help="Number of concurrent workers used to resolve bot-authored cherry-pick PR authors.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        type=int,
        default=1,
        help=(
            "Save the Excel workbook after every N completed AI rows. "
            "Default: 1. Use 0 to disable."
        ),
    )
    parser.add_argument(
        "--force-regenerate",
        action="store_true",
        help="Clear existing AI release notes and regenerate all non-dup rows.",
    )
    parser.add_argument(
        "--skip-scope-preprocess",
        action="store_true",
        help="Skip moving not-in-scope PR rows to the PRs_not_in_scope sheet.",
    )
    parser.add_argument(
        "--scope-base-branch-start-date",
        help=(
            "Override the estimated release-m.n branch start date for x.y.0 scope "
            "preprocessing, in YYYY-MM-DD format."
        ),
    )
    parser.add_argument(
        "--start-row",
        type=int,
        default=None,
        help=(
            "Excel row number to start processing from (1-indexed, row 1 is the header). "
            "Use this to resume from a previous interruption. When specified, "
            "preprocessing steps (sort, merge, scope filter, same-series move) are "
            "skipped because they should have been completed in the first run. "
            "Default: process all data rows."
        ),
    )
    parser.add_argument(
        "--end-row",
        type=int,
        default=None,
        help=(
            "Excel row number to stop processing at (inclusive, 1-indexed). "
            "Default: last row in the sheet."
        ),
    )
    parser.add_argument(
        "--output-excel",
        default=None,
        help=(
            "Path for the processed Excel output. "
            "Default: the input file name with a _processed suffix added "
            "(for example, notes.xlsx becomes notes_processed.xlsx), "
            "in the same directory."
        ),
    )


def add_export_markdown_args(parser: argparse.ArgumentParser) -> None:
    """Register the options of the ``export-markdown`` subcommand on ``parser``."""
    parser.add_argument("--version", required=True, help="Target TiDB version, for example 8.5.7.")
    parser.add_argument(
        "--excel",
        required=True,
        help="Path to the processed Excel workbook (output of the 'generate' phase).",
    )
    parser.add_argument("--sheet", default="pr_for_release_note", help="Workbook sheet name.")
    parser.add_argument(
        "--releases-dir",
        required=True,
        help="Path to the existing English release notes directory (used for default output path).",
    )
    parser.add_argument(
        "--output-release-file",
        help=(
            "Output Markdown file. Defaults to release-{version}-updated-by-ai.md. "
            "The default never writes the canonical release-{version}.md, because "
            "the generator only produces Improvements and Bug fixes, not a complete "
            "release note."
        ),
    )
    parser.add_argument(
        "--release-date",
        default="TBD",
        help='Release date text for the Markdown header, for example "August 14, 2025".',
    )


def main(argv: list[str] | None = None) -> int:
    """Dispatch to the selected phase and return its process exit status.

    Args:
        argv: Argument list to parse. ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = parse_args(argv)
    if args.command == "generate":
        return run_generate(args)
    if args.command == "export-markdown":
        return run_export_markdown(args)
    return 1
def _validate_generate_args(args: argparse.Namespace) -> object:
    """Validate numeric options and the row range; return the parsed scope start date (or None)."""
    validate_positive_int("--ai-workers", args.ai_workers)
    validate_positive_int("--github-workers", args.github_workers)
    validate_positive_int("--author-workers", args.author_workers)
    if args.checkpoint_interval < 0:
        raise ValueError("--checkpoint-interval must be greater than or equal to 0")
    base_branch_start_date = None
    if args.scope_base_branch_start_date:
        base_branch_start_date = parse_date_value(args.scope_base_branch_start_date)
        if not base_branch_start_date:
            raise ValueError("--scope-base-branch-start-date must use YYYY-MM-DD format")

    start_row = args.start_row
    end_row = args.end_row
    if start_row is not None and start_row < 2:
        raise ValueError("--start-row must be >= 2 (row 1 is the header)")
    if end_row is not None and end_row < 2:
        raise ValueError("--end-row must be >= 2 (row 1 is the header)")
    if start_row is not None and end_row is not None and start_row > end_row:
        raise ValueError("--start-row must be <= --end-row")
    return base_branch_start_date


def _build_ai_client(args: argparse.Namespace):
    """Return the AI client selected by --ai-provider, or None when AI generation is OFF."""
    if args.involve_ai_generation != "ON":
        return None
    if args.ai_provider == "azure":
        return AzureOpenAIClient(args.ai_model, args.ai_timeout)
    return CodexAIClient(args.ai_command, args.ai_model, args.ai_timeout)


def _preprocess_whole_sheet(workbook, sheet, args: argparse.Namespace, github, base_branch_start_date):
    """Run the full-sheet preprocessing steps and return the refreshed header map."""
    if not args.skip_scope_preprocess:
        move_prs_not_in_scope(
            workbook,
            sheet,
            args.version,
            Path(args.releases_dir),
            github,
            base_branch_start_date=base_branch_start_date,
        )
    sort_sheet_rows_by_component(sheet)
    header = prepare_sheet_columns(sheet)
    clear_output_columns(sheet, header, clear_ai=args.force_regenerate)

    existing_notes = store_existing_release_notes(Path(args.releases_dir), args.version)
    move_rows_with_issues_already_in_same_series(
        workbook,
        sheet,
        header,
        existing_notes,
        args.version,
    )
    update_pr_authors_and_dup_notes(
        sheet,
        header,
        existing_notes,
        github,
        author_workers=args.author_workers,
    )
    merge_rows_by_issue_and_component(sheet, header)
    return header


def run_generate(args: argparse.Namespace) -> int:
    """Run phase 1: preprocess the workbook, generate notes, and save the processed Excel.

    Option validation happens first, before the GitHub token is read, any AI
    client is created, or the workbook is opened.

    Returns:
        0 on success.

    Raises:
        ValueError: If an option value is invalid or the sheet is missing.
        SystemExit: If the GitHub token cannot be loaded.
    """
    base_branch_start_date = _validate_generate_args(args)
    row_range_specified = args.start_row is not None or args.end_row is not None
    start_row = args.start_row
    end_row = args.end_row

    try:
        token = load_github_token()
    except ValueError as exc:
        raise SystemExit(f"error: {exc}") from None
    github = GitHubClient(token)
    involve_ai_generation = args.involve_ai_generation == "ON"
    ai = _build_ai_client(args)

    excel_path = Path(args.excel)
    processed_excel_path = (
        Path(args.output_excel) if args.output_excel
        else default_processed_excel_path(excel_path)
    )
    workbook = openpyxl.load_workbook(excel_path)
    if args.sheet not in workbook.sheetnames:
        raise ValueError(f"Cannot find sheet {args.sheet!r} in {args.excel}")
    sheet = workbook[args.sheet]

    if end_row is not None and end_row > sheet.max_row:
        print(
            f"--end-row {end_row} exceeds the last row ({sheet.max_row}); "
            f"clamping to {sheet.max_row} to avoid materializing blank rows",
            flush=True,
        )
        end_row = sheet.max_row

    if row_range_specified:
        # A row range means a resumed run: the sheet-wide preprocessing steps
        # are skipped and only the columns are prepared.
        print(
            f"Row range specified: processing rows "
            f"{start_row or 2} to {end_row or sheet.max_row} "
            f"(skipping preprocessing steps)",
            flush=True,
        )
        header = prepare_sheet_columns(sheet)
        if args.force_regenerate:
            clear_output_columns(
                sheet, header, clear_ai=True, clear_published=False,
                start_row=start_row, end_row=end_row,
            )
    else:
        header = _preprocess_whole_sheet(workbook, sheet, args, github, base_branch_start_date)

    if involve_ai_generation:
        checkpoint_callback = build_checkpoint_callback(
            workbook,
            processed_excel_path,
            args.checkpoint_interval,
        )
        generate_notes_for_sheet(
            sheet,
            header,
            github,
            ai,
            ai_workers=args.ai_workers,
            github_workers=args.github_workers,
            checkpoint_callback=checkpoint_callback,
            start_row=start_row,
            end_row=end_row,
        )
    else:
        generate_notes_without_ai(
            sheet, header, start_row=start_row, end_row=end_row,
        )

    if row_range_specified:
        # Moving (deleting) not-needed rows would shift the row numbers of later
        # rows, breaking the stable-row-number contract that segmented resume
        # relies on. Leave them in place; they are still excluded from Markdown
        # because their AI note starts with the not-needed prefix.
        print(
            "Row range specified: keeping not-needed rows in place to preserve "
            "row numbers for resume (they are still excluded from Markdown)",
            flush=True,
        )
    else:
        move_not_needed_rows_to_sheet(workbook, sheet, header)
    save_workbook_safely(workbook, processed_excel_path)

    print("Phase 1 (generate) completed.", flush=True)
    print(f" Input Excel: {excel_path}", flush=True)
    print(f" Processed Excel: {processed_excel_path}", flush=True)
    print(
        f" Next step: run 'export-markdown' with --excel {processed_excel_path} "
        f"to generate the Markdown file.",
        flush=True,
    )
    return 0


def run_export_markdown(args: argparse.Namespace) -> int:
    """Run phase 2: read the processed workbook and write the Markdown release file.

    Returns:
        0 on success.

    Raises:
        ValueError: If the sheet or the ``release_notes_written_by_ai`` column
            is missing.
    """
    excel_path = Path(args.excel)
    output_file = (
        Path(args.output_release_file)
        if args.output_release_file
        else default_output_release_file(Path(args.releases_dir), args.version)
    )

    workbook = openpyxl.load_workbook(excel_path, data_only=True)
    if args.sheet not in workbook.sheetnames:
        raise ValueError(f"Cannot find sheet {args.sheet!r} in {args.excel}")
    sheet = workbook[args.sheet]
    header = get_header(sheet)

    if "release_notes_written_by_ai" not in header:
        raise ValueError(
            f"Sheet {args.sheet!r} does not have a 'release_notes_written_by_ai' column. "
            "Make sure you are pointing to the processed Excel from the 'generate' phase."
        )

    markdown_entries = collect_markdown_entries_from_sheet(sheet, header)
    workbook.close()

    write_release_file(output_file, args.version, args.release_date, markdown_entries)

    print("Phase 2 (export-markdown) completed.", flush=True)
    print(f" Input Excel: {excel_path}", flush=True)
    print(f" Generated release note file: {output_file}", flush=True)
    return 0


def validate_positive_int(name: str, value: int) -> None:
    """Raise ValueError when ``value`` is smaller than 1; ``name`` is the option shown in the message."""
    if value < 1:
        raise ValueError(f"{name} must be greater than or equal to 1")


def parse_on_off(value: str) -> str:
    """Normalize an ON/OFF option value (argparse ``type=`` callback).

    Raises:
        argparse.ArgumentTypeError: If the value is neither ON nor OFF
            (case-insensitive, surrounding whitespace ignored).
    """
    normalized = value.strip().upper()
    if normalized not in {"ON", "OFF"}:
        raise argparse.ArgumentTypeError("value must be ON or OFF")
    return normalized
def default_output_release_file(releases_dir: Path, version: str) -> Path:
    """Return the default Markdown output path, ``release-{version}-updated-by-ai.md``."""
    # The "-updated-by-ai" name is used on purpose instead of the canonical
    # release-{version}.md: the generator only produces Improvements and Bug
    # fixes, not a complete formal release note, so the default output must
    # not be mistaken for the official file. This name is also skipped by the
    # historical-note scanner, so a re-run never treats the draft as published.
    file_name = f"release-{version}-updated-by-ai.md"
    return releases_dir / file_name


def default_processed_excel_path(excel_path: Path) -> Path:
    """Return the output path for the processed workbook.

    A workbook whose stem already ends with ``_processed`` maps to itself;
    otherwise ``_processed`` is appended to the stem in the same directory.
    """
    if excel_path.stem.endswith("_processed"):
        return excel_path
    processed_name = f"{excel_path.stem}_processed{excel_path.suffix}"
    return excel_path.with_name(processed_name)


def build_checkpoint_callback(
    workbook: openpyxl.Workbook,
    excel_path: Path,
    checkpoint_interval: int,
):
    """Return a ``(completed, total)`` callback that saves the workbook periodically.

    Returns None when ``checkpoint_interval`` is zero or negative. Otherwise
    the callback saves after every ``checkpoint_interval`` completed rows and
    whenever ``completed`` equals ``total``.
    """
    if checkpoint_interval <= 0:
        return None

    def checkpoint(completed: int, total: int) -> None:
        on_interval = completed % checkpoint_interval == 0
        is_last = completed == total
        if not (on_interval or is_last):
            return
        save_workbook_safely(workbook, excel_path)
        print(
            f"Checkpoint saved after {completed}/{total} AI row(s): {excel_path}",
            flush=True,
        )

    return checkpoint


def save_workbook_safely(workbook: openpyxl.Workbook, excel_path: Path) -> None:
    """Save ``workbook`` to ``excel_path`` through a temporary file in the same directory.

    The workbook is written to a temporary file first and then moved over the
    target with ``os.replace``.

    Raises:
        RuntimeError: If saving or replacing fails. When only the replace step
            fails, the complete temporary workbook is kept and its path is
            included in the message.
    """
    excel_path = excel_path.resolve()
    with tempfile.NamedTemporaryFile(
        prefix=f".{excel_path.stem}.",
        suffix=excel_path.suffix,
        dir=excel_path.parent,
        delete=False,
    ) as handle:
        # Only the reserved name is needed; the handle is closed right away.
        temp_path = Path(handle.name)

    try:
        workbook.save(temp_path)
    except Exception as exc:
        # A failed save leaves at most a partial file behind; remove it.
        temp_path.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to save workbook {excel_path}: {exc}") from exc

    try:
        os.replace(temp_path, excel_path)
    except Exception as exc:
        if temp_path.exists():
            raise RuntimeError(
                f"Failed to replace {excel_path}: {exc}. "
                f"A complete temporary workbook remains at {temp_path}."
            ) from exc
        temp_path.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to save workbook {excel_path}: {exc}") from exc


def load_github_token() -> str:
    """Return the stripped ``GITHUB_TOKEN`` environment variable.

    Raises:
        ValueError: If the variable is unset or blank.
    """
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token:
        return token
    raise ValueError("GITHUB_TOKEN environment variable is required")
# NOTE(review): the named groups in these patterns were unnamed in the source
# as received ("(?P[...])"), which is not valid regex syntax and makes
# re.compile() raise at import time. The names below (owner, repo, kind,
# number) are reconstructed; confirm them against every caller that reads
# groups by name before merging.

# Matches a GitHub issue or pull-request URL; "kind" is "issues" or "pull".
GITHUB_ITEM_URL_RE = re.compile(
    r"https://github\.com/(?P<owner>[^/\s]+)/(?P<repo>[\w.-]+)/"
    r"(?P<kind>issues|pull)/(?P<number>\d+)"
)
# Matches a GitHub issue URL only.
ISSUE_URL_RE = re.compile(
    r"https://github\.com/(?P<owner>[^/\s]+)/(?P<repo>[\w.-]+)/issues/(?P<number>\d+)"
)
# Matches a GitHub pull-request URL only.
PR_URL_RE = re.compile(
    r"https://github\.com/(?P<owner>[^/\s]+)/(?P<repo>[\w.-]+)/pull/(?P<number>\d+)"
)
# Captures the text between "@[" and "]", as in "@[name](https://github.com/name)".
AUTHOR_RE = re.compile(r"@\[([^\]]+)\]")
+ "dm": "TiDB Data Migration (DM)", + "tidb data migration": "TiDB Data Migration (DM)", + "tidb data migration (dm)": "TiDB Data Migration (DM)", + "tidb lightning": "TiDB Lightning", + "lightning": "TiDB Lightning", + "dumpling": "Dumpling", + "tiup": "TiUP", + "tidb binlog": "TiDB Binlog", + "ng monitoring": "TiDB", + "sync_diff": "sync-diff-inspector", + "sync-diff-inspector": "sync-diff-inspector", + "sync diff inspector": "sync-diff-inspector", + "planner": "TiDB", + "execution": "TiDB", + "sql-infra": "TiDB", + "transaction": "TiDB", + "engine": "TiDB", + "observability": "TiDB", + "dxf": "TiDB", + "storage": "TiDB", + "tidb-dashboard": "TiDB", + "tidb dashboard": "TiDB", + "ddl": "TiDB", + "coprocessor": "TiDB", + "compute": "TiDB", + "scheduling": "TiDB", + "spm": "TiDB", + "ng-monitoring": "TiDB", +} diff --git a/scripts/release-notes-ai-generator/excel_workbook.py b/scripts/release-notes-ai-generator/excel_workbook.py new file mode 100644 index 0000000000000..406c1c5038ea5 --- /dev/null +++ b/scripts/release-notes-ai-generator/excel_workbook.py @@ -0,0 +1,1328 @@ +from __future__ import annotations + +import copy +import dataclasses +import re +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from collections import OrderedDict +from pathlib import Path +from typing import Any, Callable + +from openpyxl.styles import PatternFill + +from .ai_client import build_generation_prompt +from .constants import ( + AUTHOR_RE, + BOT_AUTHORS, + COMPONENT_HEADERS, + GITHUB_ITEM_URL_RE, + REQUIRED_HEADERS, + TOOL_COMPONENTS, + TOP_LEVEL_COMPONENTS, +) +from .models import ( + ExistingNote, + GitHubDataCache, + MarkdownEntry, + RowContext, + RowGenerationResult, + RowInput, +) +from .utils import ( + extract_issue_urls, + extract_pr_urls, + normalize_component, + normalize_raw_component, + normalized_release_component, + replace_author_markdown, + split_lines, + split_multi_value, + str_value, + unique_ordered, +) + + +GRAY_FILL = 
def prepare_sheet_columns(sheet: Any) -> dict[str, int]:
    """Ensure the output columns exist and return the header-name to column-index map.

    ``release_notes_written_by_ai`` is inserted right after
    ``formated_release_note``, ``ai_note_type`` right after
    ``release_notes_written_by_ai``, and ``published_release_notes`` is
    appended after the last column. Existing columns are left alone.

    Raises:
        ValueError: If a required source column is missing.
    """
    header = get_header(sheet)
    missing = sorted(REQUIRED_HEADERS - set(header))
    if missing:
        raise ValueError(f"Missing required Excel columns: {', '.join(missing)}")
    # Called for its validation only; the returned column is not needed here.
    get_component_col(header)

    def insert_after(anchor_col: int, title: str) -> dict[str, int]:
        new_col = anchor_col + 1
        sheet.insert_cols(new_col)
        sheet.cell(row=1, column=new_col, value=title)
        # Inserting shifts later columns, so the map is rebuilt from the sheet.
        return get_header(sheet)

    formatted_col = header["formated_release_note"]
    if not header.get("release_notes_written_by_ai"):
        header = insert_after(formatted_col, "release_notes_written_by_ai")

    if "ai_note_type" not in header:
        header = insert_after(header["release_notes_written_by_ai"], "ai_note_type")

    if "published_release_notes" not in header:
        sheet.cell(row=1, column=sheet.max_column + 1, value="published_release_notes")
        header = get_header(sheet)
    return header


def get_header(sheet: Any) -> dict[str, int]:
    """Map each non-empty, stripped header cell of row 1 to its 1-based column index."""
    return {
        str(cell.value).strip(): position
        for position, cell in enumerate(sheet[1], start=1)
        if cell.value
    }


def clear_output_columns(
    sheet: Any,
    header: dict[str, int],
    clear_ai: bool = True,
    clear_published: bool = True,
    start_row: int | None = None,
    end_row: int | None = None,
) -> None:
    """Blank the generated-output cells for a range of rows.

    Args:
        sheet: Worksheet to modify.
        header: Header-name to column-index map.
        clear_ai: Clear ``release_notes_written_by_ai`` and, when that column
            exists, ``ai_note_type``.
        clear_published: Clear ``published_release_notes``.
        start_row: First row to clear; defaults to row 2.
        end_row: Last row to clear (inclusive); defaults to ``sheet.max_row``.
    """
    first_row = 2 if start_row is None else start_row
    last_row = sheet.max_row if end_row is None else end_row

    row_number = first_row
    while row_number <= last_row:
        if clear_ai:
            sheet.cell(row=row_number, column=header["release_notes_written_by_ai"]).value = None
            note_type_col = header.get("ai_note_type")
            if "ai_note_type" in header:
                sheet.cell(row=row_number, column=note_type_col).value = None
        if clear_published:
            sheet.cell(row=row_number, column=header["published_release_notes"]).value = None
        row_number += 1
def sort_sheet_rows_by_component(sheet: Any) -> None:
    """Reorder the data rows (row 2 onward) by their component value.

    Every row is captured first, then written back in sorted order. The sort
    is stable, so rows sharing a component keep their relative order. Nothing
    is written (and nothing is printed) when the rows are already in order or
    the sheet has at most one data row.
    """
    component_col = get_component_col(get_header(sheet))
    if sheet.max_row <= 2:
        return

    captured = []
    for row_number in range(2, sheet.max_row + 1):
        sort_key = component_sort_key(sheet.cell(row=row_number, column=component_col).value)
        captured.append((row_number, sort_key, snapshot_row(sheet, row_number)))

    ordered = sorted(captured, key=lambda entry: entry[1])
    already_sorted = [entry[0] for entry in ordered] == [entry[0] for entry in captured]
    if already_sorted:
        return

    for destination, entry in enumerate(ordered, start=2):
        restore_row(sheet, destination, entry[2])

    print("Sorted worksheet rows by component before release-note generation", flush=True)


def component_sort_key(value: Any) -> tuple[int, str]:
    """Return a sort key that orders named components before empty ones.

    Named components sort case-insensitively; rows whose normalized component
    is empty get ``(1, "")`` and therefore land at the end.
    """
    component = normalize_raw_component(value)
    return (0, component.casefold()) if component else (1, "")


def snapshot_row(sheet: Any, row_number: int) -> dict[str, Any]:
    """Capture the row dimension settings and every cell of ``row_number``."""
    dimension = sheet.row_dimensions[row_number]
    captured: dict[str, Any] = {
        "height": dimension.height,
        "hidden": dimension.hidden,
        "outline_level": dimension.outlineLevel,
        "collapsed": dimension.collapsed,
    }
    captured["cells"] = [
        snapshot_cell(sheet.cell(row=row_number, column=column_index))
        for column_index in range(1, sheet.max_column + 1)
    ]
    return captured


def snapshot_cell(cell: Any) -> dict[str, Any]:
    """Capture a cell's value, style, number format, hyperlink and comment.

    Style, hyperlink and comment are shallow-copied so the snapshot does not
    share those objects with the live cell.
    """
    link = cell.hyperlink
    note = cell.comment
    return {
        "value": cell.value,
        "style": copy.copy(cell._style),
        "number_format": cell.number_format,
        "hyperlink": copy.copy(link) if link else None,
        "comment": copy.copy(note) if note else None,
    }
def get_component_col(header: dict[str, int]) -> int:
    """Return the column index of the first ``COMPONENT_HEADERS`` name present.

    Raises ``ValueError`` when none of the accepted names is in ``header``.
    """
    located = next((header[name] for name in COMPONENT_HEADERS if name in header), None)
    if located is None:
        raise ValueError("Missing required Excel column: component or components")
    return located


def issue_urls_for_row(sheet: Any, header: dict[str, int], row_number: int) -> list[str]:
    """Collect the de-duplicated issue URLs mentioned in one row.

    The optional ``issue_url`` column is scanned first (when the sheet has
    it), followed by ``formated_release_note``; first-seen order is kept.
    """
    source_columns = ["formated_release_note"]
    if "issue_url" in header:
        source_columns.insert(0, "issue_url")
    texts = [
        str_value(sheet.cell(row=row_number, column=header[column_name]).value)
        for column_name in source_columns
    ]
    return unique_ordered(url for text in texts for url in extract_issue_urls(text))


def first_issue_url_for_row(sheet: Any, header: dict[str, int], row_number: int) -> str | None:
    """Return the row's first issue URL, or ``None`` when it has none."""
    urls = issue_urls_for_row(sheet, header, row_number)
    if not urls:
        return None
    return urls[0]
# Shared ``major.minor.patch`` fragment. The named groups are required because
# the tuples below are built from match.group("major"/"minor"/"patch").
_SEMVER_CORE = r"(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)"
_VERSION_PREFIX_RE = re.compile(r"^" + _SEMVER_CORE)
_RELEASE_FILE_STEM_RE = re.compile(r"^release-" + _SEMVER_CORE)


def _semver_from_match(match: re.Match[str]) -> tuple[int, int, int]:
    """Convert a match of ``_SEMVER_CORE`` into an integer version triple."""
    return (
        int(match.group("major")),
        int(match.group("minor")),
        int(match.group("patch")),
    )


def should_skip_release_file(file_path: Path, target_version: tuple[int, int, int]) -> bool:
    """Tell whether a release-note file must be ignored as an existing source.

    A file is skipped when its stem contains ``updated-by-ai``, when its name
    does not carry a ``release-X.Y.Z`` version, or when that version is the
    target version or newer (only strictly earlier releases are kept).
    """
    if "updated-by-ai" in file_path.stem:
        return True
    file_version = release_file_semver_tuple(file_path)
    if file_version is None:
        return True
    return file_version >= target_version


def parse_semver_tuple(version: str) -> tuple[int, int, int]:
    """Parse the leading ``major.minor.patch`` of ``version`` into integers.

    Anything after the patch number (for example ``-rc1``) is ignored.

    Raises:
        ValueError: if ``version`` does not start with three dotted numbers.
    """
    match = _VERSION_PREFIX_RE.match(version)
    if not match:
        raise ValueError(f"Invalid TiDB version: {version}")
    return _semver_from_match(match)


def release_file_semver_tuple(file_path: Path) -> tuple[int, int, int] | None:
    """Extract the version from a ``release-X.Y.Z*`` file stem.

    Returns ``None`` when the stem does not start with that pattern.
    """
    match = _RELEASE_FILE_STEM_RE.match(file_path.stem)
    if not match:
        return None
    return _semver_from_match(match)
def update_pr_authors_and_dup_notes(
    sheet: Any,
    header: dict[str, int],
    existing_notes: list[ExistingNote],
    github: Any,
    author_workers: int = 1,
) -> None:
    """Replace bot authors, then record already-published notes per row.

    After bot authors are resolved, each data row's issue URLs are looked up
    in ``existing_notes``. An existing note counts as a duplicate when it
    lists no authors, or when it shares at least one author with the row.
    Matching note texts are written, de-duplicated and newline-joined, to
    ``published_release_notes`` and the row is highlighted via ``fill_row``.
    """
    apply_bot_author_replacements(sheet, header, github, author_workers)
    existing_notes_by_url = index_existing_notes_by_url(existing_notes)

    for row_number in range(2, sheet.max_row + 1):
        author_cell = sheet.cell(row=row_number, column=header["pr_author"])
        current_author = str_value(author_cell.value)

        issue_urls = issue_urls_for_row(sheet, header, row_number)
        if not issue_urls:
            continue

        # Built once per row instead of once per candidate note.
        row_authors = set(split_multi_value(current_author))
        dup_notes = [
            existing.dup_text
            for issue_url in issue_urls
            for existing in existing_notes_by_url.get(issue_url, [])
            if not existing.authors or not row_authors.isdisjoint(existing.authors)
        ]
        if not dup_notes:
            continue

        sheet.cell(
            row=row_number,
            column=header["published_release_notes"],
            value="\n".join(unique_ordered(dup_notes)),
        )
        fill_row(sheet, row_number)
        print(
            f"Row {row_number}: found duplicated release note for {', '.join(issue_urls)}",
            flush=True,
        )


def move_rows_with_issues_already_in_same_series(
    workbook: Any,
    sheet: Any,
    header: dict[str, int],
    existing_notes: list[ExistingNote],
    version: str,
) -> int:
    """Move rows whose issues were already released earlier in the same series.

    A row qualifies when any of its issue URLs appears in a release-note file
    of the same major.minor series with a lower version than ``version``.
    Qualifying rows are appended, with a reason, to the per-series sheet and
    then deleted from ``sheet``.

    Returns:
        The number of rows moved (0 when nothing qualified).
    """
    files_by_issue_url = same_series_release_files_by_issue_url(existing_notes, version)
    if not files_by_issue_url:
        return 0

    rows_to_move: list[tuple[int, str]] = []
    for row_number in range(2, sheet.max_row + 1):
        issue_urls = issue_urls_for_row(sheet, header, row_number)
        reason = same_series_issue_reason(issue_urls, files_by_issue_url)
        if reason:
            rows_to_move.append((row_number, reason))

    # Create or reconcile the target sheet only once there is something to
    # move, so a run with no matches does not leave an empty sheet behind.
    if not rows_to_move:
        return 0

    target_sheet_name = same_series_issues_sheet_name(version)
    target, reason_col = ensure_sheet_with_reason(workbook, sheet, target_sheet_name)

    for row_number, reason in rows_to_move:
        append_row_with_reason(sheet, target, row_number, reason, reason_col)

    # Delete bottom-up so the remaining row numbers stay valid.
    for row_number, _reason in reversed(rows_to_move):
        sheet.delete_rows(row_number, 1)

    print(
        f"Moved {len(rows_to_move)} row(s) to {target_sheet_name} because their issues "
        "already appear in earlier release notes from the same major.minor series",
        flush=True,
    )
    return len(rows_to_move)
def copy_header_row(source_sheet: Any, target_sheet: Any) -> None:
    """Copy row 1 of ``source_sheet`` onto row 1 of ``target_sheet``."""
    last_column = source_sheet.max_column
    for column_index in range(1, last_column + 1):
        origin = source_sheet.cell(row=1, column=column_index)
        destination = target_sheet.cell(row=1, column=column_index)
        copy_cell(origin, destination)


def same_series_release_files_by_issue_url(
    existing_notes: list[ExistingNote],
    version: str,
) -> dict[str, list[str]]:
    """Map issue URLs to the earlier same-series release files that cite them.

    Only notes whose URL is of kind ``issues`` are considered, and only files
    that share the target's major.minor and have a strictly lower version.
    Each file list is de-duplicated and sorted with
    ``release_file_name_sort_key``.
    """
    target = parse_semver_tuple(version)
    collected: dict[str, list[str]] = {}

    for note in existing_notes:
        url_match = GITHUB_ITEM_URL_RE.search(note.url)
        if url_match is None or url_match.group("kind") != "issues":
            continue

        noted_in = release_file_semver_tuple(Path(note.file_name))
        if not noted_in:
            continue
        same_series = noted_in[:2] == target[:2]
        if not (same_series and noted_in < target):
            continue

        names = collected.setdefault(note.url, [])
        if note.file_name not in names:
            names.append(note.file_name)

    return {
        url: sorted(names, key=release_file_name_sort_key)
        for url, names in collected.items()
    }


def same_series_issues_sheet_name(version: str) -> str:
    """Build the worksheet name used for rows already covered in the series."""
    parsed = parse_semver_tuple(version)
    return f"issue_already_in_earlier_v{parsed[0]}.{parsed[1]}"


def same_series_issue_reason(
    issue_urls: list[str],
    files_by_issue_url: dict[str, list[str]],
) -> str | None:
    """Describe which of ``issue_urls`` already appear in earlier files.

    Returns one ``"<url> appears in <files>"`` clause per matching URL joined
    by ``"; "``, or ``None`` when no URL has any recorded file.
    """
    clauses = [
        f"{url} appears in {', '.join(files_by_issue_url[url])}"
        for url in issue_urls
        if files_by_issue_url.get(url)
    ]
    if not clauses:
        return None
    return "; ".join(clauses)
def ensure_sheet_with_reason(
    workbook: Any,
    source_sheet: Any,
    target_sheet_name: str,
) -> tuple[Any, int]:
    """Return the target sheet and the column index of its reason header.

    The sheet is created when missing. A new sheet, or an existing one whose
    first header cell is empty, receives a full copy of the source header; an
    existing sheet with a header only has its reason column reconciled.
    """
    if target_sheet_name not in workbook.sheetnames:
        created = workbook.create_sheet(target_sheet_name)
        return created, copy_header_with_reason(source_sheet, created)

    existing = workbook[target_sheet_name]
    header_is_blank = not str_value(existing.cell(row=1, column=1).value)
    install_header = (
        copy_header_with_reason if header_is_blank else ensure_same_series_reason_header
    )
    return existing, install_header(source_sheet, existing)


def copy_header_with_reason(source_sheet: Any, target_sheet: Any) -> int:
    """Copy the source header row, then ensure the reason header exists."""
    for column_index in range(1, source_sheet.max_column + 1):
        origin = source_sheet.cell(row=1, column=column_index)
        destination = target_sheet.cell(row=1, column=column_index)
        copy_cell(origin, destination)
    return ensure_same_series_reason_header(source_sheet, target_sheet)


def ensure_same_series_reason_header(source_sheet: Any, target_sheet: Any) -> int:
    """Place the reason header after every source column and return its index.

    When the target has no reason header yet, one is written just past the
    wider of the two sheets. When it already has one that sits inside the
    source's column range, blank columns are inserted in front of it until it
    lies beyond the last source column. Empty header cells in the source
    range are filled from the source in both cases.
    """
    existing_col = find_header_column(target_sheet, SAME_SERIES_REASON_HEADER)
    if existing_col:
        position = existing_col
        # Each insertion shifts the reason header one column to the right.
        while position <= source_sheet.max_column:
            target_sheet.insert_cols(position)
            position += 1
        copy_missing_header_cells(source_sheet, target_sheet)
        return position

    appended_col = max(source_sheet.max_column, target_sheet.max_column) + 1
    copy_missing_header_cells(source_sheet, target_sheet)
    target_sheet.cell(row=1, column=appended_col, value=SAME_SERIES_REASON_HEADER)
    return appended_col


def copy_missing_header_cells(source_sheet: Any, target_sheet: Any) -> None:
    """Fill empty target header cells from the matching source header cells."""
    for column_index in range(1, source_sheet.max_column + 1):
        destination = target_sheet.cell(row=1, column=column_index)
        if str_value(destination.value):
            continue
        copy_cell(source_sheet.cell(row=1, column=column_index), destination)
def append_row_with_reason(
    source_sheet: Any,
    target_sheet: Any,
    row_number: int,
    reason: str,
    reason_col: int,
) -> None:
    """Append one source row to the target sheet and record why it moved.

    The new row goes directly below the target's current last row. Row
    dimension settings and every cell are copied, then ``reason`` is written
    into ``reason_col`` of the new row.
    """
    new_row = target_sheet.max_row + 1
    origin_dimension = source_sheet.row_dimensions[row_number]
    new_dimension = target_sheet.row_dimensions[new_row]
    for attribute in ("height", "hidden", "outlineLevel", "collapsed"):
        setattr(new_dimension, attribute, getattr(origin_dimension, attribute))

    for column_index in range(1, source_sheet.max_column + 1):
        origin = source_sheet.cell(row=row_number, column=column_index)
        destination = target_sheet.cell(row=new_row, column=column_index)
        copy_cell(origin, destination)
    target_sheet.cell(row=new_row, column=reason_col, value=reason)


def copy_cell(source_cell: Any, target_cell: Any) -> None:
    """Copy value, style, number format, hyperlink and comment between cells.

    The value is always copied. Each other attribute is copied only when the
    source has it, so the target keeps its own setting otherwise. Style,
    hyperlink and comment are shallow copies, not shared objects.
    """
    target_cell.value = source_cell.value

    if source_cell.has_style:
        target_cell._style = copy.copy(source_cell._style)

    number_format = source_cell.number_format
    if number_format:
        target_cell.number_format = number_format

    link = source_cell.hyperlink
    if link:
        target_cell._hyperlink = copy.copy(link)

    note = source_cell.comment
    if note:
        target_cell.comment = copy.copy(note)
def bot_author_requests(sheet: Any, header: dict[str, int]) -> list[tuple[int, str, str, str]]:
    """List the data rows whose PR author is a bot and that have a PR link.

    Each entry is ``(row_number, pr_link, pr_title, current_author)``. Rows
    whose author is not in ``BOT_AUTHORS`` or whose ``pr_link`` is empty are
    left out.
    """

    def read(row_number: int, column_name: str) -> str:
        # Look up the column lazily so pr_title is only read for bot rows.
        return str_value(sheet.cell(row=row_number, column=header[column_name]).value)

    pending: list[tuple[int, str, str, str]] = []
    for row_number in range(2, sheet.max_row + 1):
        author = read(row_number, "pr_author")
        link = read(row_number, "pr_link")
        if author in BOT_AUTHORS and link:
            title = read(row_number, "pr_title")
            pending.append((row_number, link, title, author))
    return pending
def print_bot_author_progress(
    completed: int,
    total: int,
    row_number: int,
    current_author: str,
    actual_author: str,
) -> None:
    """Print one progress line for a resolved bot-author lookup."""
    if actual_author == current_author:
        status = "unchanged"
    else:
        status = f"{current_author} -> {actual_author}"
    print(
        f"Resolved bot author {completed}/{total}: row {row_number} ({status})",
        flush=True,
    )


def resolve_bot_author(github: Any, request: tuple[int, str, str, str]) -> str:
    """Ask ``github`` for the original author behind a bot-authored PR.

    ``request`` is ``(row_number, pr_link, pr_title, current_author)``. This
    is best-effort: any exception from the lookup is reported on stderr and
    the current (bot) author is returned unchanged.
    """
    row_number, pr_link, pr_title, fallback_author = request
    try:
        resolved = github.get_original_author_for_cherry_pick(
            row_number,
            pr_link,
            pr_title,
            fallback_author,
        )
    except Exception as exc:  # noqa: BLE001
        print(
            f"Row {row_number}: failed to resolve bot author for {pr_link}: {exc}",
            file=sys.stderr,
            flush=True,
        )
        resolved = fallback_author
    return resolved


def index_existing_notes_by_url(existing_notes: list[ExistingNote]) -> dict[str, list[ExistingNote]]:
    """Group existing notes by URL, keeping one note per (URL, authors) pair.

    The first note seen for a given URL and author tuple is kept; later notes
    with the same pair are dropped.
    """
    grouped: dict[str, list[ExistingNote]] = {}
    visited: set[tuple[str, tuple[str, ...]]] = set()
    for note in existing_notes:
        fingerprint = (note.url, tuple(note.authors))
        if fingerprint not in visited:
            visited.add(fingerprint)
            grouped.setdefault(note.url, []).append(note)
    return grouped
def merge_pr_links(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None:
    """Write the de-duplicated PR links of ``rows`` into ``keep_row``."""
    link_col = header["pr_link"]
    gathered = [
        link
        for row in rows
        for link in split_multi_value(sheet.cell(row=row, column=link_col).value)
    ]
    sheet.cell(row=keep_row, column=link_col, value=", ".join(unique_ordered(gathered)))


def merge_authors(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None:
    """Write the de-duplicated PR authors of ``rows`` into ``keep_row``."""
    author_col = header["pr_author"]
    gathered = [
        author
        for row in rows
        for author in split_multi_value(sheet.cell(row=row, column=author_col).value)
    ]
    sheet.cell(row=keep_row, column=author_col, value=", ".join(unique_ordered(gathered)))


def merge_dup_notes(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None:
    """Merge the published-note lines of ``rows`` into ``keep_row``.

    The cell is left untouched when none of the rows has any published note.
    """
    published_col = header["published_release_notes"]
    gathered = [
        line
        for row in rows
        for line in split_lines(sheet.cell(row=row, column=published_col).value)
    ]
    if not gathered:
        return
    sheet.cell(row=keep_row, column=published_col, value="\n".join(unique_ordered(gathered)))


def fill_first_empty_values(sheet: Any, header: dict[str, int], keep_row: int, rows: list[int]) -> None:
    """Fill empty cells of ``keep_row`` from the other rows of the group.

    For each column that is empty in ``keep_row``, the first non-empty value
    found in ``rows[1:]`` is copied over. The PR link, PR author, published
    notes and AI note columns are excluded.
    """
    excluded = {
        header[name]
        for name in (
            "pr_link",
            "pr_author",
            "published_release_notes",
            "release_notes_written_by_ai",
        )
    }
    donors = rows[1:]
    for column_index in range(1, sheet.max_column + 1):
        if column_index in excluded:
            continue
        kept = sheet.cell(row=keep_row, column=column_index)
        if str_value(kept.value):
            continue
        for donor_row in donors:
            candidate = sheet.cell(row=donor_row, column=column_index).value
            if str_value(candidate):
                kept.value = candidate
                break
def generate_notes_without_ai(
    sheet: Any,
    header: dict[str, int],
    start_row: int | None = None,
    end_row: int | None = None,
) -> list[MarkdownEntry]:
    """Build Markdown entries straight from the sheet, without calling AI.

    Rows from ``start_row`` (default 2) through ``end_row`` (default the last
    row) are processed in order. A row with text in
    ``published_release_notes`` contributes its duplicate-note entries.
    Otherwise each line of ``formated_release_note`` becomes one entry, typed
    by ``classify_note_type_from_text`` and falling back to ``"improvement"``.
    Rows with neither are reported on stderr and skipped.
    """
    first_row = 2 if start_row is None else start_row
    last_row = sheet.max_row if end_row is None else end_row
    published_col = header["published_release_notes"]
    collected: list[MarkdownEntry] = []

    for row_number in range(first_row, last_row + 1):
        row_input = build_row_input(sheet, header, row_number)
        published = str_value(sheet.cell(row=row_number, column=published_col).value)
        if published:
            collected.extend(dup_entries_for_row(row_input, published))
            continue

        draft_lines = split_lines(row_input.formatted_release_note)
        if not draft_lines:
            print(
                f"Row {row_number}: skipped non-dup row because formated_release_note is empty",
                file=sys.stderr,
                flush=True,
            )
            continue

        # The type is classified once from the whole draft and shared by
        # every line of that draft.
        note_type = classify_note_type_from_text(
            row_input.formatted_release_note,
            row_input.issue_type,
        )
        collected.extend(
            MarkdownEntry(
                note_type or "improvement",
                row_input.component,
                line,
                row_input.raw_component,
            )
            for line in draft_lines
        )

    print(
        f"AI generation is OFF; generated Markdown from formated_release_note for {len(collected)} note(s)",
        flush=True,
    )
    return collected
def dup_entries_for_row(row_input: RowInput, dup_text: str) -> list[MarkdownEntry]:
    """Turn a row's already-published note lines into Markdown entries.

    Each line of ``dup_text`` is classified on its own. Lines that do not
    classify as ``improvement`` or ``bug_fix`` are dropped. The component
    comes from the note line when one can be parsed, else from the row.
    """
    accepted_types = {"improvement", "bug_fix"}
    collected: list[MarkdownEntry] = []
    for line in split_lines(dup_text):
        note_type = classify_note_type_from_text(line, row_input.issue_type)
        component = parse_component_from_dup(line) or row_input.component
        if note_type not in accepted_types:
            continue
        collected.append(
            MarkdownEntry(
                note_type,
                normalize_component(component),
                line,
                row_input.raw_component,
            )
        )
    return collected
def is_reusable_ai_note(note: str) -> bool:
    """Return True for a non-empty AI note that is not a failure marker."""
    if not note:
        return False
    return not note.startswith("AI_GENERATION_FAILED:")


def is_not_needed_note(note: str) -> bool:
    """Return True when the note starts with the not-needed verdict prefix."""
    return note.startswith(NOT_NEEDED_PREFIX)


def prefetch_github_data(row_inputs: list[RowInput], github: Any, github_workers: int) -> GitHubDataCache:
    """Fetch every issue and PR referenced by ``row_inputs`` via a thread pool.

    URLs are de-duplicated before submission. A failed fetch is reported on
    stderr and its URL recorded in ``failed_urls`` instead of aborting the
    batch. When no URL is referenced at all, an empty cache is returned
    without starting a pool.
    """
    wanted_issues = unique_ordered(url for row in row_inputs for url in row.issue_urls)
    wanted_pulls = unique_ordered(url for row in row_inputs for url in row.pr_urls)
    issues = {}
    pulls = {}
    failed_urls: set[str] = set()

    if not (wanted_issues or wanted_pulls):
        return GitHubDataCache(issues=issues, pulls=pulls)

    print(
        f"Prefetching GitHub data: {len(wanted_issues)} issue(s), {len(wanted_pulls)} PR(s) "
        f"with {github_workers} worker(s)",
        flush=True,
    )

    with ThreadPoolExecutor(max_workers=github_workers) as pool:
        # Each future maps to (item type, url) so results can be routed
        # whatever order they complete in.
        pending = {}
        for url in wanted_issues:
            pending[pool.submit(github.get_issue, url)] = ("issue", url)
        for url in wanted_pulls:
            pending[pool.submit(github.get_pull, url)] = ("pull", url)

        for finished in as_completed(pending):
            item_type, url = pending[finished]
            try:
                payload = finished.result()
            except Exception as exc:  # noqa: BLE001
                print(f"Failed to prefetch GitHub {item_type} {url}: {exc}", file=sys.stderr, flush=True)
                failed_urls.add(url)
                continue
            bucket = issues if item_type == "issue" else pulls
            bucket[url] = payload
    return GitHubDataCache(issues=issues, pulls=pulls, failed_urls=failed_urls)
def build_row_context_from_cache(row_input: RowInput, github_cache: GitHubDataCache) -> RowContext:
    """Combine a row's spreadsheet data with its prefetched GitHub data.

    Issues and PRs missing from the cache are left out. Authors of cached PRs
    are appended to the row's own authors and the result de-duplicated. The
    PRs' changed-file summaries are capped at ``MAX_ROW_FILES_SUMMARY_CHARS``
    in total, and URLs whose prefetch failed go in ``fetch_failed_urls``.
    """
    authors = list(row_input.pr_authors)

    cached_issues = []
    for issue_url in row_input.issue_urls:
        if issue_url in github_cache.issues:
            cached_issues.append(github_cache.issues[issue_url])

    cached_pulls = []
    for pr_url in row_input.pr_urls:
        pull = github_cache.pulls.get(pr_url)
        if pull:
            cached_pulls.append(pull)
            if pull.author:
                authors.append(pull.author)
    cached_pulls = cap_pull_file_summaries(cached_pulls, MAX_ROW_FILES_SUMMARY_CHARS)

    referenced_urls = (*row_input.issue_urls, *row_input.pr_urls)
    unreachable = [url for url in referenced_urls if url in github_cache.failed_urls]

    return RowContext(
        row_number=row_input.row_number,
        component=row_input.component,
        raw_component=row_input.raw_component,
        issue_type=row_input.issue_type,
        pr_title=row_input.pr_title,
        pr_authors=unique_ordered(authors),
        pr_urls=row_input.pr_urls,
        issue_urls=row_input.issue_urls,
        formatted_release_note=row_input.formatted_release_note,
        issues=cached_issues,
        pulls=cached_pulls,
        fetch_failed_urls=unreachable,
    )


def cap_pull_file_summaries(pulls: list[Any], budget: int) -> list[Any]:
    """Limit the combined ``files_summary`` text of ``pulls`` to ``budget`` chars.

    Summaries are consumed in order. A summary that fits the remaining budget
    is kept as is. The first one that does not fit is cut to the remaining
    length and gets a truncation marker; later non-empty ones are replaced by
    an omission marker. The markers themselves are not counted against the
    budget. Shortened entries are copies made with ``dataclasses.replace``,
    so the pull objects passed in are never modified.
    """
    omitted_marker = "...[changed-file information omitted to limit input size]"
    truncated_marker = "\n...[changed-file information truncated to limit input size]"

    result: list[Any] = []
    left = budget
    for pull in pulls:
        text = pull.files_summary or ""
        if len(text) <= left:
            result.append(pull)
            left -= len(text)
            continue
        replacement = text[:left] + truncated_marker if left > 0 else omitted_marker
        result.append(dataclasses.replace(pull, files_summary=replacement))
        left = 0
    return result
generated {result.note_type}{review_marker}: {result.reason}", + flush=True, + ) + + +def release_component_for_row(sheet: Any, header: dict[str, int], row_number: int) -> str: + raw_component = normalize_raw_component( + sheet.cell(row=row_number, column=get_component_col(header)).value + ) + raw_lower = raw_component.lower() + raw_release_component = release_component_from_raw(raw_component) + if raw_release_component: + return raw_release_component + + urls = issue_urls_for_row(sheet, header, row_number) + urls.extend(extract_pr_urls(str_value(sheet.cell(row=row_number, column=header["pr_link"]).value))) + repos = {match.group("repo").lower() for url in urls for match in [GITHUB_ITEM_URL_RE.search(url)] if match} + + if "pd" in repos: + return "PD" + if "tikv" in repos: + return "TiKV" + if "tiflash" in repos: + return "TiFlash" + if "ng-monitoring" in repos: + return "TiDB" + if "tiup" in repos: + return "TiUP" + if repos.intersection({"tiflow", "ticdc"}): + if "dm" in raw_lower and "cdc" not in raw_lower: + return "TiDB Data Migration (DM)" + return "TiCDC" + if "tidb" in repos: + if "br" in raw_lower: + return "Backup & Restore (BR)" + if "lightning" in raw_lower: + return "TiDB Lightning" + if "dumpling" in raw_lower: + return "Dumpling" + return "TiDB" + if "tidb-dashboard" in repos: + return "TiDB" + return normalize_component(raw_component) + + +def release_component_from_raw(raw_component: str) -> str: + normalized_raw = normalize_component(raw_component) + if normalized_raw in TOP_LEVEL_COMPONENTS or normalized_raw in TOOL_COMPONENTS: + return normalized_raw + + token_components = [ + normalize_component(token) + for token in split_multi_value(raw_component) + ] + if not token_components: + return "" + + for component in [ + "Backup & Restore (BR)", + "TiDB Lightning", + "Dumpling", + "TiUP", + "sync-diff-inspector", + ]: + if component in token_components: + return component + + for component in TOP_LEVEL_COMPONENTS: + if component in token_components: 
+ return component + + if "TiDB Data Migration (DM)" in token_components: + return "TiDB Data Migration (DM)" + if "TiCDC" in token_components: + return "TiCDC" + + return "" + + +def classify_note_level(note_level: str) -> tuple[str | None, str | None]: + labels = [label.strip() for label in re.findall(r">\s*([^>]+)", note_level)] + if not labels: + return None, None + section = labels[0].lower() + note_type = None + if "bug fixes" in section or "error fixes" in section: + note_type = "bug_fix" + elif "improvements" in section: + note_type = "improvement" + + component_labels = labels[1:] + if component_labels and component_labels[0].lower() == "tools": + component_labels = component_labels[1:] + for label in reversed(component_labels): + component = normalized_release_component(label) + if component: + return note_type, component + return note_type, None + + +def classify_note_type_from_text(note: str, issue_type: str) -> str | None: + note_lower = note.lower() + issue_type_lower = issue_type.lower() + if "> bug fixes" in note_lower or "> 错误修复" in note_lower: + return "bug_fix" + if "> improvements" in note_lower or "> 改进提升" in note_lower: + return "improvement" + if "bug" in issue_type_lower or "fix" in issue_type_lower: + return "bug_fix" + if "improvement" in issue_type_lower or "enhancement" in issue_type_lower: + return "improvement" + if note.strip().startswith("- Fix "): + return "bug_fix" + return "improvement" + + +def parse_component_from_dup(note: str) -> str | None: + labels = [label.strip() for label in re.findall(r">\s*([^>]+)", note)] + cleaned: list[str] = [] + for label in labels: + if " - " in label: + label = label.split(" - ", 1)[0] + cleaned.append(label.strip()) + if len(cleaned) < 2: + return None + return normalized_release_component(cleaned[-1]) + + +def fill_row(sheet: Any, row_number: int) -> None: + for column in range(1, sheet.max_column + 1): + sheet.cell(row=row_number, column=column).fill = copy.copy(GRAY_FILL) diff --git 
a/scripts/release-notes-ai-generator/github_client.py b/scripts/release-notes-ai-generator/github_client.py new file mode 100644 index 0000000000000..c96ac7af86fc9 --- /dev/null +++ b/scripts/release-notes-ai-generator/github_client.py @@ -0,0 +1,321 @@ +from __future__ import annotations + +import re +import sys +import threading +import time +from typing import Any + +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry + +from .constants import GITHUB_ITEM_URL_RE +from .models import IssueInfo, PullInfo +from .utils import parse_github_url + + +def create_retry_policy() -> Retry: + return Retry( + total=3, + connect=3, + read=3, + status=3, + backoff_factor=1, + status_forcelist=(500, 502, 503, 504), + allowed_methods=frozenset(["GET"]), + respect_retry_after_header=True, + raise_on_status=False, + ) + + +class GitHubClient: + def __init__( + self, + token: str | None, + max_rate_limit_retries: int = 3, + max_rate_limit_sleep: int = 600, + ): + self.max_rate_limit_retries = max_rate_limit_retries + self.max_rate_limit_sleep = max_rate_limit_sleep + self.headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28", + } + if token: + self.headers["Authorization"] = f"Bearer {token}" + self._thread_local = threading.local() + + def get_session(self) -> requests.Session: + session = getattr(self._thread_local, "session", None) + if session is None: + session = requests.Session() + session.headers.update(self.headers) + adapter = HTTPAdapter(max_retries=create_retry_policy()) + session.mount("https://", adapter) + self._thread_local.session = session + return session + + def get_json(self, api_path: str) -> dict[str, Any]: + data = self.get_api_json(api_path) + if not isinstance(data, dict): + raise ValueError(f"Expected object response from {api_path}") + return data + + def get_api_json(self, api_path: str, params: dict[str, Any] | None = None) -> Any: + return 
self.get_url_json(f"https://api.github.com{api_path}", params=params) + + def get_url_json(self, url: str, params: dict[str, Any] | None = None) -> Any: + last_response: requests.Response | None = None + for attempt in range(self.max_rate_limit_retries + 1): + response = self.get_session().get(url, params=params, timeout=30) + last_response = response + if self.is_rate_limited(response) and attempt < self.max_rate_limit_retries: + sleep_seconds = self.rate_limit_sleep_seconds(response, attempt) + print( + "GitHub API rate limit reached; retrying in " + f"{sleep_seconds} seconds: {url}", + file=sys.stderr, + flush=True, + ) + time.sleep(sleep_seconds) + continue + response.raise_for_status() + return response.json() + if last_response is not None: + last_response.raise_for_status() + raise RuntimeError(f"GitHub API request failed: {url}") + + def is_rate_limited(self, response: requests.Response) -> bool: + if response.status_code == 429: + return True + if response.status_code != 403: + return False + if response.headers.get("x-ratelimit-remaining") == "0": + return True + message = response.text.lower() + return "rate limit" in message or "abuse detection" in message + + def rate_limit_sleep_seconds(self, response: requests.Response, attempt: int) -> int: + retry_after = response.headers.get("retry-after") + if retry_after and retry_after.isdigit(): + return min(max(int(retry_after), 1), self.max_rate_limit_sleep) + reset = response.headers.get("x-ratelimit-reset") + if reset and reset.isdigit(): + wait_seconds = int(reset) - int(time.time()) + 5 + return min(max(wait_seconds, 1), self.max_rate_limit_sleep) + return min(2 ** attempt, self.max_rate_limit_sleep) + + def get_pull(self, pr_url: str) -> PullInfo: + owner, repo, number = parse_github_url(pr_url, "pull") + pull = self.get_json(f"/repos/{owner}/{repo}/pulls/{number}") + files_summary = self.get_pull_files_summary(owner, repo, number) + return PullInfo( + url=pr_url, + title=str(pull.get("title") or ""), + 
body=str(pull.get("body") or ""), + author=str((pull.get("user") or {}).get("login") or ""), + head_ref=str((pull.get("head") or {}).get("ref") or ""), + base_ref=str((pull.get("base") or {}).get("ref") or ""), + files_summary=files_summary, + merged_at=str(pull.get("merged_at") or ""), + created_at=str(pull.get("created_at") or ""), + ) + + def get_issue(self, issue_url: str) -> IssueInfo: + owner, repo, number = parse_github_url(issue_url, "issues") + issue = self.get_json(f"/repos/{owner}/{repo}/issues/{number}") + labels = [ + str(label.get("name")) + for label in issue.get("labels", []) + if isinstance(label, dict) and label.get("name") + ] + return IssueInfo( + url=issue_url, + title=str(issue.get("title") or ""), + body=str(issue.get("body") or ""), + labels=labels, + ) + + def get_pull_files_summary( + self, + owner: str, + repo: str, + number: str, + max_files: int = 40, + max_patch_chars: int = 1200, + max_total_chars: int = 20000, + ) -> str: + lines: list[str] = [] + page = 1 + total_chars = 0 + while len(lines) < max_files and total_chars < max_total_chars: + files = self.get_api_json( + f"/repos/{owner}/{repo}/pulls/{number}/files", + params={"per_page": 100, "page": page}, + ) + if not isinstance(files, list) or not files: + break + for item in files: + if len(lines) >= max_files or total_chars >= max_total_chars: + break + if not isinstance(item, dict): + continue + patch = str(item.get("patch") or "") + if len(patch) > max_patch_chars: + patch = patch[:max_patch_chars] + "\n...[patch truncated]" + block = "\n".join( + [ + f"file: {item.get('filename', '')}", + f"status: {item.get('status', '')}", + f"additions: {item.get('additions', 0)}", + f"deletions: {item.get('deletions', 0)}", + "patch:", + patch, + ] + ) + lines.append(block) + total_chars += len(block) + page += 1 + if not lines: + return "No changed-file information is available." 
+ if len(lines) >= max_files: + lines.append("...[file list truncated]") + return "\n\n".join(lines) + + def list_pulls_for_base( + self, + owner: str, + repo: str, + base: str, + state: str = "closed", + max_pages: int = 10, + ) -> list[PullInfo]: + pulls: list[PullInfo] = [] + for page in range(1, max_pages + 1): + data = self.get_api_json( + f"/repos/{owner}/{repo}/pulls", + params={ + "state": state, + "base": base, + "sort": "created", + "direction": "asc", + "per_page": 100, + "page": page, + }, + ) + if not isinstance(data, list) or not data: + break + for pull in data: + if not isinstance(pull, dict): + continue + pulls.append( + PullInfo( + url=str(pull.get("html_url") or ""), + title=str(pull.get("title") or ""), + body=str(pull.get("body") or ""), + author=str((pull.get("user") or {}).get("login") or ""), + head_ref=str((pull.get("head") or {}).get("ref") or ""), + base_ref=str((pull.get("base") or {}).get("ref") or ""), + files_summary="", + merged_at=str(pull.get("merged_at") or ""), + created_at=str(pull.get("created_at") or ""), + ) + ) + if len(data) < 100: + break + return pulls + + def get_original_author_for_cherry_pick( + self, row_number: int, cp_pr_link: str, cp_pr_title: str, current_author: str + ) -> str: + default_owner, default_repo, _cp_number = parse_github_url(cp_pr_link, "pull") + target_ref = find_original_pr_reference(cp_pr_title, default_owner, default_repo) + if not target_ref: + try: + cp_info = self.get_pull(cp_pr_link) + target_ref = ( + find_original_pr_reference(cp_info.head_ref, default_owner, default_repo) + or find_original_pr_reference(cp_info.title, default_owner, default_repo) + or find_original_pr_reference(cp_info.body, default_owner, default_repo) + ) + except Exception as exc: # noqa: BLE001 + print( + f"Row {row_number}: failed to inspect cherry-pick PR " + f"{cp_pr_link}: {exc}", + file=sys.stderr, + ) + return current_author + + if not target_ref: + print( + f"Row {row_number}: failed to find the original PR for 
" + f"{cp_pr_link} created by {current_author}.", + file=sys.stderr, + ) + return current_author + + target_owner, target_repo, target_number = target_ref + target_pr_link = f"https://github.com/{target_owner}/{target_repo}/pull/{target_number}" + try: + return self.get_pull(target_pr_link).author or current_author + except Exception as exc: # noqa: BLE001 + print( + f"Row {row_number}: failed to find the non-bot author for " + f"{cp_pr_link}: {exc}", + file=sys.stderr, + ) + return current_author + + +def find_original_pr_reference( + text: str, + default_owner: str, + default_repo: str, +) -> tuple[str, str, str] | None: + text = text or "" + marker_lines = [ + line + for line in text.splitlines() + if re.search(r"\b(backport|cherry[- ]?pick|original|source|from)\b", line, re.I) + ] + for line in marker_lines: + reference = find_pr_reference_in_text(line, default_owner, default_repo) + if reference: + return reference + + same_repo = re.search(r"\(#(?P\d+)\)\s*$", text) + if same_repo: + return default_owner, default_repo, same_repo.group("number") + + branch = re.search(r"(?:^|[/_-])cherry-pick-(?P\d+)(?:\D|$)", text) + if branch: + return default_owner, default_repo, branch.group("number") + + if "\n" not in text and len(text) <= 300: + return find_pr_reference_in_text(text, default_owner, default_repo) + + return None + + +def find_pr_reference_in_text( + text: str, + default_owner: str, + default_repo: str, +) -> tuple[str, str, str] | None: + for full_url in GITHUB_ITEM_URL_RE.finditer(text or ""): + if full_url.group("kind") == "pull": + return full_url.group("owner"), full_url.group("repo"), full_url.group("number") + + cross_repo = re.search( + r"(?[\w.-]+)/(?P[\w.-]+)#(?P\d+)\b", + text or "", + ) + if cross_repo: + return cross_repo.group("owner"), cross_repo.group("repo"), cross_repo.group("number") + + same_repo = re.search(r"\(#(?P\d+)\)\s*$", text or "") + if same_repo: + return default_owner, default_repo, same_repo.group("number") + + return None 
diff --git a/scripts/release-notes-ai-generator/markdown_writer.py b/scripts/release-notes-ai-generator/markdown_writer.py new file mode 100644 index 0000000000000..38d02cdf51950 --- /dev/null +++ b/scripts/release-notes-ai-generator/markdown_writer.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +from collections import defaultdict +from pathlib import Path + +from .constants import TOOL_COMPONENTS, TOP_LEVEL_COMPONENTS +from .models import MarkdownEntry +from .utils import normalize_component, str_value + + +def write_release_file( + output_file: Path, + version: str, + release_date: str, + entries: list[MarkdownEntry], +) -> None: + major_minor = ".".join(version.split(".")[:2]) + grouped = group_markdown_entries(entries) + content: list[str] = [ + "---", + f"title: TiDB {version} Release Notes", + f"summary: Learn about the improvements and bug fixes in TiDB {version}.", + "---", + "", + f"# TiDB {version} Release Notes", + "", + f"Release date: {release_date}", + "", + f"TiDB version: {version}", + "", + "Quick access: " + f"[Quick start](https://docs.pingcap.com/tidb/v{major_minor}/quick-start-with-tidb) | " + f"[Production deployment](https://docs.pingcap.com/tidb/v{major_minor}/production-deployment-using-tiup)", + "", + ] + + content.extend(render_section("## Improvements", grouped["improvement"])) + content.append("") + content.extend(render_section("## Bug fixes", grouped["bug_fix"])) + content.append("") + while content and content[-1] == "": + content.pop() + + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.write_text("\n".join(content) + "\n", encoding="utf-8") + + +def group_markdown_entries(entries: list[MarkdownEntry]) -> dict[str, dict[str, list[MarkdownEntry]]]: + grouped: dict[str, dict[str, list[MarkdownEntry]]] = { + "improvement": defaultdict(list), + "bug_fix": defaultdict(list), + } + for entry in entries: + if entry.note_type not in grouped: + continue + component = normalize_component(entry.component) or 
"Other" + grouped[entry.note_type][component].append(entry) + return grouped + + +def render_section(title: str, entries_by_component: dict[str, list[MarkdownEntry]]) -> list[str]: + lines = [title, ""] + top_components = [ + component + for component in TOP_LEVEL_COMPONENTS + if component in entries_by_component and entries_by_component[component] + ] + unknown_top_components = sorted( + component + for component in entries_by_component + if component not in TOP_LEVEL_COMPONENTS + and component not in TOOL_COMPONENTS + and entries_by_component[component] + ) + tool_components = [ + component + for component in TOOL_COMPONENTS + if component in entries_by_component and entries_by_component[component] + ] + + for component in top_components + unknown_top_components: + lines.append(f"+ {component}") + lines.append("") + for entry in entries_by_component[component]: + lines.append(f" {note_with_component_marker(entry)}") + lines.append("") + + if tool_components: + lines.append("+ Tools") + lines.append("") + for component in tool_components: + lines.append(f" + {component}") + lines.append("") + for entry in entries_by_component[component]: + lines.append(f" {note_with_component_marker(entry)}") + lines.append("") + + while lines and lines[-1] == "": + lines.pop() + return lines + + +def note_with_component_marker(entry: MarkdownEntry) -> str: + note = ensure_release_note_bullet(entry.note) + raw_component = sanitize_component_marker(entry.raw_component) + if not raw_component or "" + + +def ensure_release_note_bullet(note: str) -> str: + note = str_value(note) + if note.startswith("- "): + return note + if note.startswith(("+ ", "* ")): + return "- " + note[2:].lstrip() + return f"- {note}" + + +def sanitize_component_marker(component: str) -> str: + return " ".join(str_value(component).replace("--", "- -").split()) diff --git a/scripts/release-notes-ai-generator/models.py b/scripts/release-notes-ai-generator/models.py new file mode 100644 index 
0000000000000..d22963460d097 --- /dev/null +++ b/scripts/release-notes-ai-generator/models.py @@ -0,0 +1,105 @@ +from __future__ import annotations + +import dataclasses + + +@dataclasses.dataclass +class ExistingNote: + url: str + line: str + file_name: str + note_level: str + authors: list[str] + note_type: str | None + component: str | None + + @property + def dup_text(self) -> str: + return f"- (dup): {self.file_name} {self.note_level} {self.line}" + + +@dataclasses.dataclass +class PullInfo: + url: str + title: str + body: str + author: str + head_ref: str + base_ref: str + files_summary: str + merged_at: str = "" + created_at: str = "" + + +@dataclasses.dataclass +class IssueInfo: + url: str + title: str + body: str + labels: list[str] + + +@dataclasses.dataclass +class GeneratedNote: + note_type: str + release_note: str + needs_review: bool + reason: str + + +@dataclasses.dataclass +class RowContext: + row_number: int + component: str + raw_component: str + issue_type: str + pr_title: str + pr_authors: list[str] + pr_urls: list[str] + issue_urls: list[str] + formatted_release_note: str + issues: list[IssueInfo] + pulls: list[PullInfo] + fetch_failed_urls: list[str] = dataclasses.field(default_factory=list) + + +@dataclasses.dataclass +class RowInput: + row_number: int + component: str + raw_component: str + issue_type: str + pr_title: str + pr_authors: list[str] + pr_urls: list[str] + issue_urls: list[str] + formatted_release_note: str + + +@dataclasses.dataclass +class GitHubDataCache: + issues: dict[str, IssueInfo] + pulls: dict[str, PullInfo] + # URLs whose GitHub data could not be fetched during prefetch, so the AI + # generates without their issue/PR body and diff. 
+ failed_urls: set[str] = dataclasses.field(default_factory=set) + + +@dataclasses.dataclass +class MarkdownEntry: + note_type: str + component: str + note: str + raw_component: str = "" + + +@dataclasses.dataclass +class RowGenerationResult: + row_number: int + component: str + raw_component: str + note_type: str | None + note: str | None + error: str | None + needs_review: bool = False + reason: str = "" diff --git a/scripts/release-notes-ai-generator/prompts/generation.md b/scripts/release-notes-ai-generator/prompts/generation.md new file mode 100644 index 0000000000000..b4c10c8d30eda --- /dev/null +++ b/scripts/release-notes-ai-generator/prompts/generation.md @@ -0,0 +1,160 @@ +# Generation Prompt + +You are a senior technical writer who has profound knowledge of TiDB. Your task is to decide whether a change needs a release note, and if so, write exactly one English release note entry. + +## Input data about the change + +{{ROW_CONTEXT}} + +Expected links to include in the release note (the entry MUST contain exactly these — no more, no fewer): +{{EXPECTED_LINKS}} + +Contributors (append each in order as `@[user](https://github.com/user)`): +{{CONTRIBUTORS}} + +### How to read the input fields + +- `formatted_release_note_from_excel`: might be empty, `None`, or a generic placeholder (treat as no usable draft). When it contains a real draft written by the PR author, use it as an important reference — preserve its user-facing intent, but verify it against the PR code changes and issue description, correct inaccurate wording, and apply all style rules below. +- `fetch_failed_urls`: Lists links whose GitHub data could not be fetched. When non-empty, rely on Excel fields (`pr_title_from_excel`, `formatted_release_note_from_excel`, `issue_type_from_excel`) and set `needs_review` to true. +- `files_summary` might end with `...[patch truncated]` — that is expected; judge from the visible portion. + +## Classification: does this change need a release note? 
+ +Not every PR or change warrants a release note. Before writing, determine whether the change is visible to TiDB users or operators according to the issue description, PR description, and code changes. + +### Write a release note when the change is user-visible + +- Bug fixes that change query results, upgrade behavior, privilege checks, error messages, or compatibility +- New features, new SQL syntax/function support, or new configuration options +- Meaningful performance improvements observable in common operations +- Behavior changes that affect upgrade paths, tooling integration, or operational workflows +- Default value changes for system variables or configuration parameters + +### No release note needed when the change is internal-only + +- Test-only changes: new test cases, flaky test fixes, test infrastructure updates +- Pure refactors or internal data-structure changes with no user-observable effect +- Internal debug/log changes that do not surface in user-facing interfaces +- CI/CD pipeline or developer workflow changes +- Code comments or source-code-only documentation changes (not user-facing docs) + +### Borderline cases + +If a PR is mostly internal but the outcome is user-visible, write a release note that describes the outcome and omit the implementation details. If the only user-facing effect is indirect or speculative, lean toward returning a "not_needed" verdict. + +### Whether improvement or bug fix + +Use the Excel `issue_type` from the input data as a strong signal, but also decide the final type from the issue, PR description, and code changes. + +## Writing style (applies only when writing a release note) + +The rules below define the wording, opening verbs, and single-entry style. + +### General rules + +- Write from the user's perspective, in English. +- Do not end the entry with a period. +- Do not expose internal function names unless they are user-visible behavior. Rewrite into observable behavior (e.g. 
`Fix nil pointer panic in getRegionFromTS` → `Fix the potential panic that occurs when fetching region information during a Stale Read`). +- SQL functions: backtick ALL CAPS with parentheses (`` `DATE()` ``). SQL keywords: backtick ALL CAPS (`` `HAVING` ``). +- Normalize product names to their official capitalization: TiDB, TiKV, TiCDC, TiFlash, PD, BR, DM, TiDB Lightning, Dumpling, TiUP. Never use lowercase variants like `ticdc` or `tikv` in the release note text unless they are part of variable/parameter names or code comments. +- Use ONLY the Contributors list provided above for `@[user](url)` attribution. Ignore `author` fields inside `pull_requests[]` — they may be bot accounts (e.g. `ti-chi-bot`) from cherry-pick workflows. +- End the entry with exactly the links from the Expected links list. Render each as `[#<number>](<url>)` where `<number>` is the issue or PR number extracted from the URL path. Do not invent, drop, or reorder links. +- Use the Improvements style when the type is `improvement`, and the Bug fixes style when the type is `bug_fix`. +- Output exactly one entry — never section headers, component groups, or more than one bullet. +- If available context is insufficient, still draft the best note and set `needs_review` to true. + +### Improvements style + +Lead with an action verb. State the user benefit explicitly. Explain why the change matters in terms of performance, stability, or capability. For example, instead of "Not use the stale read request's `start_ts` to update `max_ts`," write "Avoid excessive commit request retrying by not using the Stale Read request's `start_ts` to update `max_ts`." 
+ +| Verb | When to use | +|------|-------------| +| `Support` | New capability: ```Support casting the `STRING` type to the `DOUBLE` type``` | +| `Add` | New element or mechanism: `Add a timeout mechanism for LDAP authentication` | +| `Optimize` | Algorithmic improvement: `Optimize the non-joined data in right outer join using multiple threads` | +| `Improve` | General improvement: `Improve the MySQL compatibility of ...` | +| `Avoid` | Eliminate a problem: `Avoid excessive commit request retrying by ...` | +| `Enhance` | Capability expansion | +| `Mitigate` | Risk or stability improvement | +| `Accelerate` | Speed improvement | +| `Remove` | Cleanup or deprecation | +| `Increase` | Raise a limit or capacity | + +Examples: + +``` +- Improve the MySQL compatibility of expression default values displayed in the output of `SHOW CREATE TABLE` [#52939](https://github.com/pingcap/tidb/issues/52939) @[CbcWestwolf](https://github.com/CbcWestwolf) +- Support adding multiple indexes concurrently in the ingest mode [#52596](https://github.com/pingcap/tidb/issues/52596) @[lance6716](https://github.com/lance6716) +- Add a timeout mechanism for LDAP authentication to avoid the issue of resource lock (RLock) not being released in time [#51883](https://github.com/pingcap/tidb/issues/51883) @[YangKeao](https://github.com/YangKeao) +``` + +### Bug fixes style + +Lead with a fix verb phrase. 
Accepted patterns: + +- `Fix the issue that [subject] [verb phrase]` (dominant modern pattern) +- `Fix the issue of [noun phrase] that occurs when/during [condition]` (result-first phrasing) +- `Fix the issue of [noun phrase]` (noun-centric, no trigger clause) +- `Fix the [incorrect/inaccurate] [noun]` (standalone, for example, `Fix the incorrect error message ...`) +- `Fix a [rare/potential] issue that [description]` (rare or non-deterministic bugs) +- `Fix the potential/occasional [panic/crash] that occurs when [condition]` (specific crash scenarios) +- `Fix the panic issue caused by [X]` (panic identified by cause) + +A complete entry should include the trigger condition (when it happens) and the observed impact (what the user sees), and optionally a workaround. Wrap exact error messages in backticks. + +For non-deterministic failures, both `might` and `potential` are acceptable: use `might` as an inline modal verb (`Fix the issue that TiDB might crash when ...`) and `potential` as an adjective before a noun (`Fix the potential panic that occurs when ...`). Do not use `may` or `could`. 
+ +Examples: + +``` +- Fix the issue that executing SQL statements containing tables with multi-valued indexes might return the `Can't find a proper physical plan for this query` error [#49438](https://github.com/pingcap/tidb/issues/49438) @[qw4990](https://github.com/qw4990) +- Fix the issue that TiDB might crash when `tidb_mem_quota_analyze` is enabled and the memory used by updating statistics exceeds the limit [#52601](https://github.com/pingcap/tidb/issues/52601) @[hawkingrei](https://github.com/hawkingrei) +- Fix the panic issue caused by `GetAdditionalInfo` [#8079](https://github.com/tikv/pd/issues/8079) @[HuSharp](https://github.com/HuSharp) +``` + +Anti-patterns: + +| Wrong | Right | +|-------|-------| +| `Fixed the issue that ...` (past tense) | `Fix the issue that ...` (imperative) | +| `Fixes an issue where ...` | `Fix the issue that ...` | +| `Fix the issue where ...` | `Fix the issue that ...` (use `that`, not `where`) | +| `Fix the issue that ... may ...` | Use `might` or `potential` | +| `The issue of X causing Y is fixed` | `Fix the issue that X causes Y` | + +## Output format + +Return **only a raw JSON object** — no Markdown fences, no extra text. 
Keys: + +- `type`: `"improvement"`, `"bug_fix"`, or `"not_needed"` +- `release_note`: the formatted entry (see below for the value format) +- `needs_review`: `true` or `false` +- `reason`: short English reason for the type and wording + +### `release_note` value + +When `type` is `"improvement"` or `"bug_fix"`, `release_note` is one Markdown bullet assembled in this order: + +``` +- <description> <expected links> <contributor attributions> +``` + +Improvement example: + +``` +- Support adding multiple indexes concurrently in the ingest mode [#52596](https://github.com/pingcap/tidb/issues/52596) @[lance6716](https://github.com/lance6716) +``` + +Bug fix example: + +``` +- Fix the issue that TiCDC might panic when the initialization of the Pulsar producer fails [#4937](https://github.com/pingcap/ticdc/issues/4937) @[wk989898](https://github.com/wk989898) +``` + +When `type` is `"not_needed"`, set `release_note` to the following format: + +``` +Release note is not needed for this change. Reason: <reason> +``` + +Examples of `"not_needed"` reasons: `test-only change`, `internal refactor, no user-visible effect`, `flaky test fix`, `added internal debug logging`. diff --git a/scripts/release-notes-ai-generator/release-notes-generator-readme.md b/scripts/release-notes-ai-generator/release-notes-generator-readme.md new file mode 100644 index 0000000000000..9b341457113db --- /dev/null +++ b/scripts/release-notes-ai-generator/release-notes-generator-readme.md @@ -0,0 +1,471 @@ +# Release notes generator + +`python3 -m release-notes-ai-generator` (run from the `scripts/` directory) generates English TiDB release notes for the `Improvements` and `Bug fixes` sections according to PRs and issues in an Excel workbook. + +The generator uses a two-phase workflow: + +1. **`generate`** (Phase 1): Processes the source Excel workbook — runs preprocessing, calls AI to generate release notes, and writes results back to Excel. Supports row-range arguments (`--start-row` / `--end-row`) for resuming after interruptions. +2. 
**`export-markdown`** (Phase 2): Reads the processed Excel and exports a Markdown release note file. Does not call AI or modify the Excel. + +By default, the source workbook is not overwritten: all processing results are written to a separate processed workbook (`<source-name>_processed.xlsx`). The source file is modified only if `--output-excel` explicitly points to it. + +## What it does + +**Scope filtering** + +- Filters out rows of PRs and issues that are not in the target release scope. +- Moves issues that already appeared in earlier release notes from the same major.minor series to a separate worksheet for review. + +**Author correction** + +- Resolves bot-authored cherry-pick rows to the original PR author when possible. + +**Duplicate handling** + +- Reuses already-published release note entries as `(dup)` entries when appropriate. + +**Release note generation** + +- Generates English release notes with AI from workbook data, GitHub PR and issue context, changed-file summaries, and repo-local release note writing references. + +**Component mapping** + +- Maps workbook components to the corresponding release note Markdown components. + +**Markdown generation** + +- Writes `Improvements` and `Bug fixes` entries to a Markdown release note draft. + +The generator does not create a complete formal release note. It does not generate sections such as compatibility changes, known issues, deprecations, or upgrade notes. + +## Prerequisites + +- Install Python dependencies: + + ```bash + python3 -m pip install -r scripts/release-notes-ai-generator/requirements.txt + ``` + +- Prepare a GitHub token with access to the public repositories and set the GitHub token in the `GITHUB_TOKEN` environment variable: + + ```bash + export GITHUB_TOKEN=<your-github-token> + ``` + +- Prepare the AI settings in your environment. + + - If you use Azure OpenAI (`--ai-provider azure`), set the following environment variables: + + ```bash + export AZURE_OPENAI_KEY=<your-azure-openai-key> + export AZURE_OPENAI_BASE_URL=<your-azure-openai-base-url> + ``` + + - If you use Codex CLI (the default `--ai-provider codex`), install and log in to Codex CLI. 
The default `--ai-command` uses `codex exec`, so the installed Codex CLI must support `exec`, `--sandbox read-only`, `--ephemeral`, `--output-schema`, `--output-last-message`, and `-m `. + +## Typical usage examples + +The generator uses two subcommands that run independently: + +- `generate` (Phase 1): processes the Excel workbook, calls AI, writes results back to Excel. +- `export-markdown` (Phase 2): reads the processed Excel and outputs a Markdown file. + +### Phase 1: Generate release notes into Excel + +Use Azure OpenAI: + +```bash +cd scripts +python3 -m release-notes-ai-generator generate \ + --version 8.5.7 \ + --excel /path/to/release-note-excel.xlsx \ + --releases-dir ../releases \ + --ai-provider azure +``` + +Use Codex CLI: + +```bash +cd scripts +python3 -m release-notes-ai-generator generate \ + --version 8.5.7 \ + --excel /path/to/release-note-excel.xlsx \ + --releases-dir ../releases +``` + +### Phase 1: Resume from interruption + +If the first run is interrupted (e.g. API quota exhausted), resume from where it left off using `--start-row`: + +```bash +cd scripts +python3 -m release-notes-ai-generator generate \ + --version 8.5.7 \ + --excel /path/to/release-note-excel_processed.xlsx \ + --releases-dir ../releases \ + --ai-provider azure \ + --start-row 51 +``` + +You can also limit to a specific range with `--end-row`: + +```bash +python3 -m release-notes-ai-generator generate \ + --version 8.5.7 \ + --excel /path/to/release-note-excel_processed.xlsx \ + --releases-dir ../releases \ + --ai-provider azure \ + --start-row 51 --end-row 100 +``` + +When `--start-row` or `--end-row` is specified, preprocessing steps (sort, merge, scope filter, same-series move) are skipped because they were completed in the first run. 
+ +### Phase 2: Export Markdown from processed Excel + +After Phase 1 is fully complete, export the Markdown: + +```bash +cd scripts +python3 -m release-notes-ai-generator export-markdown \ + --version 8.5.7 \ + --excel /path/to/release-note-excel_processed.xlsx \ + --releases-dir ../releases \ + --release-date "August 14, 2025" +``` + +## Option descriptions + +### `generate` subcommand options + +| Option | Required | Default value | Description | +| --- | --- | --- | --- | +| `--version ` | Yes | None | Target TiDB version. Used for scope filtering, existing release-note lookup, and the default output file name. | +| `--excel ` | Yes | None | Path to the source release note Excel file. The source workbook is not overwritten. The processed workbook is written to `_processed.xlsx` (or the path specified by `--output-excel`). | +| `--releases-dir ` | Yes | None | Path to the existing English release notes directory. Used for historical release note scanning and scope filtering. | +| `--sheet ` | No | `pr_for_release_note` | Workbook sheet to process. | +| `--ai-provider ` | No | `codex` | AI provider to use. `codex` runs the Codex CLI as a subprocess. `azure` calls Azure OpenAI via the OpenAI Python SDK. | +| `--ai-command ` | No | `codex --ask-for-approval never exec --sandbox read-only --ephemeral` | Command used to invoke the AI generator (only used with `--ai-provider codex`). | +| `--ai-model ` | No | `gpt-5.4` | Model name. Passed to `codex exec` with `-m`, or used as the model parameter for Azure OpenAI. | +| `--involve-ai-generation ` | No | `ON` | Whether to generate non-duplicate release notes with AI. Use `OFF` to skip AI generation and only run preprocessing. | +| `--ai-timeout ` | No | `600` | Timeout in seconds for each AI command invocation. | +| `--ai-workers ` | No | `3` | Number of concurrent AI command invocations. | +| `--github-workers ` | No | `8` | Number of concurrent GitHub API prefetch workers. 
| +| `--author-workers ` | No | `3` | Number of concurrent workers used to resolve bot-authored cherry-pick PR authors. | +| `--checkpoint-interval ` | No | `1` | Save the processed workbook after every N completed AI rows. Use `0` to disable. | +| `--force-regenerate` | No | Disabled | Clear existing AI-generated notes and regenerate all non-duplicate rows. | +| `--skip-scope-preprocess` | No | Disabled | Skip moving not-in-scope PR rows to the `PRs_not_in_scope` sheet. | +| `--scope-base-branch-start-date ` | No | Estimated from release history | Override the estimated release-m.n branch start date for x.y.0 scope preprocessing. | +| `--start-row ` | No | First data row | Excel row number to start processing from (1-indexed, row 1 is the header). When specified, preprocessing steps are skipped. Use this to resume after an interruption. | +| `--end-row ` | No | Last row | Excel row number to stop processing at (inclusive, 1-indexed). | +| `--output-excel ` | No | `_processed.xlsx` | Path for the processed Excel output. | + +### `export-markdown` subcommand options + +| Option | Required | Default value | Description | +| --- | --- | --- | --- | +| `--version ` | Yes | None | Target TiDB version. Used for the Markdown front matter and default output file name. | +| `--excel ` | Yes | None | Path to the processed Excel workbook (output of the `generate` phase). | +| `--sheet ` | No | `pr_for_release_note` | Workbook sheet to read entries from. | +| `--releases-dir ` | Yes | None | Path to the existing English release notes directory (used to determine the default output path). | +| `--output-release-file ` | No | `release--updated-by-ai.md` | Output Markdown file. The default never writes the canonical `release-.md`, because the generator produces only `Improvements` and `Bug fixes`, not a complete release note. | +| `--release-date ` | No | `TBD` | Release date text for the generated Markdown header. 
| + +## Generated files + +**Phase 1 (`generate`):** + +- The source Excel file passed to `--excel` is not overwritten (unless `--output-excel` points to the same file, which is useful for resume scenarios). +- The processed Excel file is written to `_processed.xlsx` next to the source workbook, or to the path specified by `--output-excel`. +- Rows where AI determines no release note is needed are moved to a separate `release_note_not_needed` sheet in the processed workbook. This move is skipped when `--start-row` or `--end-row` is used, so that deleting rows does not shift the row numbers a later segment relies on; such rows stay in the main sheet but are still excluded from Markdown. + +**Phase 2 (`export-markdown`):** + +- The generated Markdown file is written to `--output-release-file` when that option is specified. +- If `--output-release-file` is omitted, the generated Markdown file is written to `release--updated-by-ai.md` under `--releases-dir`. The default never overwrites the canonical `release-.md`, because the generated file is an incomplete draft (only `Improvements` and `Bug fixes`). +- The Excel workbook is not modified during this phase. + +## Reference: processing rules + +The following sections describe the main processing logic and rules used by the generator. + +### Processing pipeline + +| Stage | What happens | Review value | +| --- | --- | --- | +| Scope filtering | Out-of-scope rows are moved to `PRs_not_in_scope` with a reason. | Reviewers can see why a row was excluded. | +| Workbook setup | Rows are sorted by component, and output columns are added or reset. | Related rows are easier to inspect, and generated data stays separate from source data. | +| Historical scan | Existing release notes are indexed by GitHub URL, contributor, section, and component. | The generator can reuse published wording instead of drafting duplicate text. 
| +| Same-series quarantine | Issues already published in the same major.minor series are moved to a separate sheet. | Repeated issues in the same series are visible for manual review. | +| Duplicate marking | Reusable historical entries are written to `published_release_notes` and rendered as `(dup)` entries. | The output keeps the reviewed published note and its source location. | +| Author replacement | Bot-authored cherry-pick rows are resolved to the original PR author when possible. | Contributor suffixes and duplicate matching use the real author. | +| Row merging | Rows with the same first issue URL and raw Excel component are merged. | Multiple PRs for one issue produce one release note entry. | +| Entry generation | Non-duplicate rows are generated by AI or copied from `formated_release_note` in non-AI mode. | The same preprocessing works for both drafting and dry-run workflows. | +| Markdown rendering | Entries are grouped by type and Markdown component. | The draft follows the expected release note structure. | + +### Scope filtering + +Scope filtering removes rows that should not appear in the target release note. Removed rows are copied to `PRs_not_in_scope`, receive a `Reason` value, and are deleted from the main sheet in the processed workbook. + +General rules: + +| Condition | Result | Why | +| --- | --- | --- | +| `pr_status` is not `merged` | Move the row to `PRs_not_in_scope`. | Unmerged changes should not be documented as released. | +| `pr_merge_time` is empty or cannot be parsed | Keep the row. | The generator cannot prove that the row is out of scope. | + +Patch-release rules: + +For a patch release such as `8.5.7`, the generator finds the previous patch release date in `releases/release-timeline.md`. When parsing `release-timeline.md`, the generator skips non-semver entries such as `Pre-GA`. + +| Condition | Result | Why | +| --- | --- | --- | +| The PR was merged before the previous patch release date. 
| Move the row to `PRs_not_in_scope`. | The PR should already have been considered for the previous patch release. | +| The PR was merged on or after the previous patch release date. | Keep the row. | The PR falls into the target patch-release window. | + +`x.y.0` release rules: + +For an `x.y.0` release, the generator uses `releases/release-timeline.md` and release-branch PR data to avoid including changes that were already shipped in the latest previous major.minor release. + +| Condition | Result | Why | +| --- | --- | --- | +| The PR was merged on or after the latest previously released `x.y.0` date. | Keep the row. | The PR is newer than that previous release boundary. | +| The PR was merged before the estimated start date of the previous release branch. | Move the row to `PRs_not_in_scope`. | The PR is older than the branch window for the previous major.minor release. | +| The PR was merged during the previous release-branch window, and a cherry-pick PR for the previous release branch was merged before that previous release date. | Move the row to `PRs_not_in_scope`. | The change was already included through that cherry-pick. | +| No earlier-release evidence is found. | Keep the row. | The generator keeps the row when it cannot prove that the change is out of scope. | + +The estimated release-branch start date comes from the earliest closed PR that targets the previous release branch. You can override it with `--scope-base-branch-start-date`. + +When matching a cherry-pick PR to the original PR, the generator recognizes: + +- The full original PR URL. +- A cross-repository reference such as `pingcap/tidb#12345`. +- A same-repository suffix such as `(#12345)`. +- A branch or text pattern such as `cherry-pick-12345`. +- A line that contains `backport`, `cherry-pick`, `original`, `source`, or `from` together with `#12345`. 
+ +### Historical release note index + +The generator scans existing Markdown files under `--releases-dir` before it decides whether a workbook row is a duplicate. + +The scanner: + +- Ignores generated drafts whose file name contains `updated-by-ai`. +- Ignores release-note files whose version is greater than or equal to the target version. +- Tracks the current release-note section and component from headings and component bullets. +- Extracts every GitHub issue or PR URL from a release note line. +- Extracts contributors from `@[user](https://github.com/user)` suffixes. +- Classifies each historical line as `improvement` or `bug_fix` from its surrounding section. +- Records the surrounding Markdown component when possible. + +Each historical entry can later be reused in this format: + +```markdown +- (dup): +``` + +This preserves the published wording and shows the source file and component path. + +### Repeated issues and duplicates + +The generator handles repeated issues in two different ways: + +- Same-series repeats are moved to a separate worksheet for review. +- Reusable duplicates from other series are rendered as `(dup)` entries. + +This separation is intentional. If the same issue appears again in the same major.minor series, it is often a sign that the row needs human judgment. If the issue has already been documented elsewhere and the author check passes, reusing the published note is usually safer than drafting a new sentence. + +For target version `8.5.7`, the same-series quarantine sheet is named: + +```text +issue_already_in_earlier_v8.5 +``` + +A row is moved to this sheet when all of the following are true: + +- The row has an issue URL in `issue_url` or `formated_release_note`. +- The same issue URL appears in an existing release-note file. +- The existing release-note file is from the same major.minor series. +- The existing release-note file version is earlier than the target version. + +Rows in this sheet are not rendered to Markdown. 
+ +After same-series rows are moved out, the generator marks remaining rows as duplicates when their issue URLs match reusable historical entries. + +| Rule | Behavior | +| --- | --- | +| Issue URL source | The generator reads issue URLs from `issue_url`, if present, and from `formated_release_note`. | +| PR URL source | PR URLs are not used for duplicate matching. They are used for AI context and component inference. | +| Author check | If a historical note has contributors, at least one current row author must match a historical contributor. If the historical note has no contributors, the URL match is enough. | +| Workbook output | Matching historical notes are written to `published_release_notes`, and the row is filled in gray. | +| Markdown output | Duplicate rows are rendered from `published_release_notes`; they do not go through AI generation. | +| Type selection | The generator uses the historical section when possible. Otherwise, it falls back to the current row `issue_type`. | +| Component selection | The generator uses the historical component path when possible. Otherwise, it falls back to the current row component. | + +### Author and row normalization + +Cherry-pick PRs are often authored by `ti-chi-bot` or `ti-srebot`. For rows with those authors, the generator tries to find the original PR from the cherry-pick PR title, branch name, or body. + +When the original PR is found, the generator: + +- Replaces `pr_author` with the original PR author. +- Updates author Markdown in `formated_release_note` from the bot account to the original author. + +If the original PR cannot be found, the row keeps the bot author. This avoids blocking the whole run because of one incomplete cherry-pick reference. + +Rows are then merged when they have the same first issue URL and the same raw Excel component. For each merged group, the first row is kept. The kept row receives: + +- The union of `pr_link` values. +- The union of `pr_author` values. 
+- The union of duplicate notes from `published_release_notes`. +- The first available non-empty value for other empty cells. + +Rows are grouped by the raw Excel component, not the normalized Markdown component. This keeps workbook distinctions intact until the final component mapping stage. + +### Entry generation + +With `--involve-ai-generation ON`, the generator calls the configured AI provider (the Codex CLI command or Azure OpenAI) for non-duplicate rows that do not already have reusable text in `release_notes_written_by_ai`. + +The prompt includes: + +- The raw Excel component and normalized Markdown component. +- Workbook fields such as `issue_type`, `pr_title`, `formated_release_note`, expected links, and contributors. +- GitHub issue titles, bodies, and labels. +- GitHub PR titles, bodies, authors, branches, merge times, and changed-file summaries. +- The repository-local writing references for improvements and bug fixes. +- The prompt template in `scripts/release-notes-ai-generator/prompts/generation.md`. + +The AI output must be a JSON object with these fields: + +| Field | Rule | +| --- | --- | +| `type` | Must be `improvement`, `bug_fix`, or `not_needed`. | +| `release_note` | For `improvement` and `bug_fix`, must be one Markdown bullet that starts with a hyphen followed by a space. For `not_needed`, must be a sentence of the form `Release note is not needed for this change. Reason: <reason>`. | +| `needs_review` | Must be a boolean. | +| `reason` | Must explain the type and wording choice. | + +The generator validates that the release note bullet: + +- Starts with a hyphen followed by a space. +- Does not end with a period. +- Includes every expected issue or PR link. +- Includes every non-bot contributor as `@[user](https://github.com/user)`. + +If validation fails, the generator sends one repair prompt. If the repaired output still fails, the row is marked as: + +```text +AI_GENERATION_FAILED: <validation errors> +``` + +Failed rows are not rendered to Markdown. + +If `release_notes_written_by_ai` already contains a value and does not start with `AI_GENERATION_FAILED:`, the generator reuses it instead of calling AI again. 
Use `--force-regenerate` to clear existing AI output and regenerate all non-duplicate rows. + +With `--involve-ai-generation OFF`, the generator does not call the AI command. For non-duplicate rows, it splits `formated_release_note` into non-empty lines and renders those lines as Markdown entries. The preprocessing pipeline still runs in non-AI mode. + +### Component mapping + +The generator maps each workbook component to a Markdown release-note component before rendering. It also keeps the original workbook component in an HTML comment after each generated entry: + +```markdown +- Improve ... [#12345](https://github.com/pingcap/tidb/issues/12345) @[user](https://github.com/user) +``` + +This marker lets reviewers trace the generated component back to the workbook value without changing the visible release-note text. + +The generator resolves components in this order: + +1. If the raw workbook value is already a known release-note component or alias, use that value. +2. If the raw workbook value contains multiple comma-separated or newline-separated values, apply the multi-value priority rules. +3. If the workbook value still cannot be resolved, infer the component from the GitHub repositories in the issue and PR URLs. +4. If no rule matches, use the normalized raw workbook value. +5. If the final value is empty, render the entry under `Other`. 
+ +Direct aliases: + +| Excel component value | Markdown component | +| --- | --- | +| `tidb` | `TiDB` | +| `tikv` | `TiKV` | +| `pd` | `PD` | +| `tiflash` | `TiFlash` | +| `tiproxy` | `TiProxy` | +| `br`, `backup & restore`, `backup & restore (br)` | `Backup & Restore (BR)` | +| `cdc`, `ticdc` | `TiCDC` | +| `dm`, `tidb data migration`, `tidb data migration (dm)` | `TiDB Data Migration (DM)` | +| `tidb lightning`, `lightning` | `TiDB Lightning` | +| `dumpling` | `Dumpling` | +| `tiup` | `TiUP` | +| `tidb binlog` | `TiDB Binlog` | +| `sync_diff`, `sync-diff-inspector`, `sync diff inspector` | `sync-diff-inspector` | + +TiDB subcomponent aliases: + +| Excel component value | Markdown component | +| --- | --- | +| `ng monitoring`, `ng-monitoring` | `TiDB` | +| `planner` | `TiDB` | +| `execution` | `TiDB` | +| `sql-infra` | `TiDB` | +| `transaction` | `TiDB` | +| `engine` | `TiDB` | +| `observability` | `TiDB` | +| `dxf` | `TiDB` | +| `storage` | `TiDB` | +| `tidb-dashboard`, `tidb dashboard` | `TiDB` | +| `ddl` | `TiDB` | +| `coprocessor` | `TiDB` | +| `compute` | `TiDB` | +| `scheduling` | `TiDB` | +| `spm` | `TiDB` | + +When a workbook cell contains multiple component values, the generator applies this priority: + +1. Tool components with stronger source meaning: `Backup & Restore (BR)`, `TiDB Lightning`, `Dumpling`, `TiUP`, and `sync-diff-inspector`. +2. Top-level components: `TiDB`, `TiKV`, `PD`, `TiFlash`, and `TiProxy`. +3. `TiDB Data Migration (DM)`. +4. `TiCDC`. 
+ +Repository fallback rules: + +| Repository evidence | Markdown component | +| --- | --- | +| `pd` | `PD` | +| `tikv` | `TiKV` | +| `tiflash` | `TiFlash` | +| `ng-monitoring` | `TiDB` | +| `tiup` | `TiUP` | +| `tiflow` or `ticdc`, and the raw component contains `dm` but not `cdc` | `TiDB Data Migration (DM)` | +| `tiflow` or `ticdc`, otherwise | `TiCDC` | +| `tidb`, and the raw component contains `br` | `Backup & Restore (BR)` | +| `tidb`, and the raw component contains `lightning` | `TiDB Lightning` | +| `tidb`, and the raw component contains `dumpling` | `Dumpling` | +| `tidb`, otherwise | `TiDB` | +| `tidb-dashboard` | `TiDB` | + +### Markdown rendering and safe saving + +The generated file contains front matter, the `# TiDB Release Notes` heading, release metadata, quick access links, `## Improvements`, and `## Bug fixes`. + +Entries are grouped by type and component. Top-level components are rendered in this order: + +```text +TiDB, TiKV, PD, TiFlash, TiProxy +``` + +Tool components are rendered under `+ Tools` in this order: + +```text +Backup & Restore (BR), TiCDC, TiDB Data Migration (DM), TiDB Lightning, Dumpling, TiUP, TiDB Binlog, sync-diff-inspector +``` + +Known top-level components are rendered first. Unknown non-tool components are rendered next in alphabetical order. Tool components are rendered last under `Tools`. + +Before writing an entry, the renderer normalizes its bullet marker to a hyphen followed by a space. If the entry does not already contain a component marker, the renderer appends the raw workbook component as an HTML comment. + +The processed workbook is saved to `_processed.xlsx`. During AI generation, `--checkpoint-interval` controls how often the processed workbook is saved: + +- The default value `1` saves after every completed AI row. +- `0` disables checkpoint saves. + +Workbook saves are atomic. The generator first writes a temporary file in the target directory and then replaces the processed workbook. 
If replacement fails after a complete temporary workbook has been written, the error message includes the temporary file path. + diff --git a/scripts/release-notes-ai-generator/requirements.txt b/scripts/release-notes-ai-generator/requirements.txt new file mode 100644 index 0000000000000..1168030c0fdca --- /dev/null +++ b/scripts/release-notes-ai-generator/requirements.txt @@ -0,0 +1,4 @@ +openpyxl>=3.1 +openai>=1.66 +requests>=2.31 +urllib3>=1.26 diff --git a/scripts/release-notes-ai-generator/scope_filter.py b/scripts/release-notes-ai-generator/scope_filter.py new file mode 100644 index 0000000000000..019824068d6e1 --- /dev/null +++ b/scripts/release-notes-ai-generator/scope_filter.py @@ -0,0 +1,366 @@ +from __future__ import annotations + +import copy +import re +from dataclasses import dataclass +from datetime import date, datetime +from pathlib import Path +from typing import Any + +from .excel_workbook import get_header +from .models import PullInfo +from .utils import parse_github_url, str_value + + +OUT_OF_SCOPE_SHEET = "PRs_not_in_scope" +REASON_HEADER = "Reason" +SCOPE_REQUIRED_HEADERS = {"pr_status", "pr_merge_time", "pr_link"} + + +@dataclass(frozen=True) +class Version: + major: int + minor: int + patch: int + + @property + def release_branch(self) -> str: + return f"release-{self.major}.{self.minor}" + + @property + def text(self) -> str: + return f"{self.major}.{self.minor}.{self.patch}" + + @property + def previous_patch_text(self) -> str: + return f"{self.major}.{self.minor}.{self.patch - 1}" + + +@dataclass(frozen=True) +class TimelineRelease: + version: Version + display_version: str + release_date: date + + +@dataclass +class ScopeContext: + version: Version + releases_dir: Path + github: Any + base_branch_start_date: date | None = None + timeline: list[TimelineRelease] | None = None + release_branch_pulls: dict[str, list[PullInfo]] | None = None + + def __post_init__(self) -> None: + if self.timeline is None: + self.timeline = 
parse_release_timeline(self.releases_dir / "release-timeline.md") + if self.release_branch_pulls is None: + self.release_branch_pulls = {} + + +def move_prs_not_in_scope( + workbook: Any, + sheet: Any, + version: str, + releases_dir: Path, + github: Any, + base_branch_start_date: date | None = None, + target_sheet_name: str = OUT_OF_SCOPE_SHEET, +) -> int: + header = get_header(sheet) + missing = sorted(SCOPE_REQUIRED_HEADERS - set(header)) + if missing: + raise ValueError( + "Missing required Excel columns for scope preprocessing: " + + ", ".join(missing) + ) + + context = ScopeContext( + version=parse_version(version), + releases_dir=releases_dir, + github=github, + base_branch_start_date=base_branch_start_date, + ) + target = ensure_out_of_scope_sheet(workbook, sheet, target_sheet_name) + + rows_to_move: list[tuple[int, str]] = [] + for row_number in range(2, sheet.max_row + 1): + reason = out_of_scope_reason(sheet, header, row_number, context) + if reason: + rows_to_move.append((row_number, reason)) + + for row_number, reason in rows_to_move: + append_row_with_reason(sheet, target, row_number, reason) + + for row_number, _reason in reversed(rows_to_move): + sheet.delete_rows(row_number, 1) + + if rows_to_move: + print( + f"Moved {len(rows_to_move)} row(s) to {target_sheet_name} before release-note generation", + flush=True, + ) + return len(rows_to_move) + + +def ensure_out_of_scope_sheet(workbook: Any, source_sheet: Any, target_sheet_name: str) -> Any: + if target_sheet_name in workbook.sheetnames: + target = workbook[target_sheet_name] + if target.max_row == 0 or not target.cell(row=1, column=1).value: + copy_header(source_sheet, target) + else: + ensure_reason_header(source_sheet, target) + return target + + target = workbook.create_sheet(target_sheet_name) + copy_header(source_sheet, target) + return target + + +def copy_header(source_sheet: Any, target_sheet: Any) -> None: + for column in range(1, source_sheet.max_column + 1): + 
copy_cell(source_sheet.cell(row=1, column=column), target_sheet.cell(row=1, column=column)) + ensure_reason_header(source_sheet, target_sheet) + + +def ensure_reason_header(source_sheet: Any, target_sheet: Any) -> None: + target_sheet.cell(row=1, column=source_sheet.max_column + 1, value=REASON_HEADER) + + +def append_row_with_reason(source_sheet: Any, target_sheet: Any, row_number: int, reason: str) -> None: + target_row = target_sheet.max_row + 1 + for column in range(1, source_sheet.max_column + 1): + copy_cell( + source_sheet.cell(row=row_number, column=column), + target_sheet.cell(row=target_row, column=column), + ) + target_sheet.cell(row=target_row, column=source_sheet.max_column + 1, value=reason) + + +def copy_cell(source_cell: Any, target_cell: Any) -> None: + target_cell.value = source_cell.value + if source_cell.has_style: + target_cell._style = copy.copy(source_cell._style) + if source_cell.number_format: + target_cell.number_format = source_cell.number_format + if source_cell.hyperlink: + target_cell._hyperlink = copy.copy(source_cell.hyperlink) + if source_cell.comment: + target_cell.comment = copy.copy(source_cell.comment) + + +def out_of_scope_reason( + sheet: Any, + header: dict[str, int], + row_number: int, + context: ScopeContext, +) -> str | None: + status = str_value(sheet.cell(row=row_number, column=header["pr_status"]).value).lower() + if status != "merged": + return f"PR status is {status or 'empty'}, not merged" + + merge_date = parse_date_value(sheet.cell(row=row_number, column=header["pr_merge_time"]).value) + if not merge_date: + return None + + if context.version.patch >= 1: + previous_date = release_date_for_version(context.timeline or [], context.version.previous_patch_text) + if not previous_date: + raise ValueError( + f"Cannot find release date for previous version {context.version.previous_patch_text} " + "in releases/release-timeline.md" + ) + if merge_date < previous_date: + return ( + f"PR merged on {merge_date.isoformat()}, 
def major_release_out_of_scope_reason(
    sheet: Any,
    header: dict[str, int],
    row_number: int,
    merge_date: date,
    context: ScopeContext,
) -> str | None:
    """Explain why a row's PR is out of scope for the target release, if it is.

    The PR is judged against the x.y.0 release that
    ``latest_released_zero_patch`` selects from ``context.timeline``.

    Args:
        sheet: Worksheet-like object; only
            ``sheet.cell(row=..., column=...).value`` is used.
        header: Maps column names to column indexes; ``"pr_link"`` is read.
        row_number: Row whose PR link is inspected.
        merge_date: Merge date of the PR on that row.
        context: Scope state; ``timeline``, ``version.text`` and
            ``base_branch_start_date`` are read here.

    Returns:
        A human-readable reason when the PR is out of scope, otherwise None.

    Raises:
        ValueError: If no x.y.0 release can be found in the timeline.
    """
    latest_zero = latest_released_zero_patch(context.timeline or [], context.version.text)
    if not latest_zero:
        raise ValueError("Cannot find a previously released x.y.0 version in releases/release-timeline.md")

    # Merged on or after the x.y.0 release date: in scope.
    if merge_date >= latest_zero.release_date:
        return None

    # An explicit branch start date wins; otherwise fall back to the estimate.
    branch_start = context.base_branch_start_date or estimated_release_branch_start_date(context, latest_zero)
    if not branch_start:
        return None
    if merge_date < branch_start:
        # NOTE(review): the message says "estimated" even when
        # context.base_branch_start_date supplied the value.
        return (
            f"PR merged on {merge_date.isoformat()}, before estimated {latest_zero.version.release_branch} "
            f"branch start date {branch_start.isoformat()}"
        )

    # Merged between branch start and the x.y.0 release: the PR is out of
    # scope only if a cherry-pick was merged to the release branch before the
    # x.y.0 release date.
    pr_link = str_value(sheet.cell(row=row_number, column=header["pr_link"]).value)
    cherry_pick = find_release_branch_cherry_pick(context, latest_zero, pr_link)
    if not cherry_pick:
        return None
    cherry_pick_date = parse_date_value(cherry_pick.merged_at)
    if cherry_pick_date and cherry_pick_date < latest_zero.release_date:
        return (
            f"Cherry-pick PR {cherry_pick.url} merged on {cherry_pick_date.isoformat()} "
            f"before {latest_zero.display_version} release date {latest_zero.release_date.isoformat()}"
        )
    return None


def estimated_release_branch_start_date(
    context: ScopeContext,
    latest_zero: TimelineRelease,
) -> date | None:
    """Return the earliest parseable ``created_at`` date among the release-branch pulls.

    Pulls are taken from ``release_branch_pulls`` for
    ``latest_zero.version.release_branch``. Returns None when no pull has a
    parseable creation date.
    """
    pulls = release_branch_pulls(context, latest_zero.version.release_branch)
    parsed_dates = (parse_date_value(pull.created_at) for pull in pulls)
    return min((day for day in parsed_dates if day), default=None)


def find_release_branch_cherry_pick(
    context: ScopeContext,
    latest_zero: TimelineRelease,
    pr_link: str,
) -> PullInfo | None:
    """Find the earliest-merged release-branch pull that references ``pr_link``.

    Returns None when ``pr_link`` is not a GitHub pull URL, does not belong to
    ``pingcap/tidb``, or no referencing pull has a parseable merge date.
    """
    try:
        owner, repo, number = parse_github_url(pr_link, "pull")
    except ValueError:
        return None
    if (owner, repo) != ("pingcap", "tidb"):
        return None

    dated_candidates = []
    for pull in release_branch_pulls(context, latest_zero.version.release_branch):
        # `or ""` keeps a missing (None) field from breaking the join.
        fields = (pull.title, pull.body, pull.head_ref, pull.url)
        haystack = "\n".join(field or "" for field in fields)
        if not references_original_pr(haystack, owner, repo, number, pr_link):
            continue
        # Parse the merge date once and reuse it for filtering and ordering.
        merged_on = parse_date_value(pull.merged_at)
        if merged_on:
            dated_candidates.append((merged_on, pull))

    if not dated_candidates:
        return None
    # min() keeps the first candidate on ties, i.e. listing order.
    return min(dated_candidates, key=lambda candidate: candidate[0])[1]


# NOTE(review): the `def` line of this function was not legible in the reviewed
# text. Name and argument order follow its call sites; parameter names follow
# the body. Confirm the annotations against the real file.
def release_branch_pulls(context: ScopeContext, branch: str) -> list[PullInfo]:
    """Return the closed ``pingcap/tidb`` pulls whose base is ``branch``.

    Results are stored in ``context.release_branch_pulls`` keyed by branch, so
    ``context.github.list_pulls_for_base`` is called at most once per branch.
    """
    # Narrows the cache from "optional" to "present" before it is indexed.
    assert context.release_branch_pulls is not None
    if branch not in context.release_branch_pulls:
        context.release_branch_pulls[branch] = context.github.list_pulls_for_base(
            "pingcap",
            "tidb",
            branch,
            state="closed",
        )
    return context.release_branch_pulls[branch]


def parse_release_timeline(path: Path) -> list[TimelineRelease]:
    """Parse release rows from a Markdown release-timeline table.

    A row is recognised when it contains ``| [label](link) | YYYY-MM-DD |``.
    Rows whose label is rejected by ``parse_version`` are skipped.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    releases: list[TimelineRelease] = []
    if not path.exists():
        raise FileNotFoundError(f"Cannot find release timeline: {path}")
    # The group names must match the match.group("version") and
    # match.group("date") lookups below.
    pattern = re.compile(
        r"\|\s*\[(?P<version>[^\]]+)\]\([^)]+\)\s*\|\s*(?P<date>\d{4}-\d{2}-\d{2})\s*\|"
    )
    for line in path.read_text(encoding="utf-8").splitlines():
        match = pattern.search(line)
        if not match:
            continue
        label = match.group("version")
        try:
            version = parse_version(label)
        except ValueError:
            continue
        release_date = date.fromisoformat(match.group("date"))
        releases.append(TimelineRelease(version, label, release_date))
    return releases
def release_date_for_version(timeline: list[TimelineRelease], version_text: str) -> date | None:
    """Return the release date of the first timeline entry whose version text equals ``version_text``.

    Returns None when no entry matches.
    """
    return next(
        (release.release_date for release in timeline if release.version.text == version_text),
        None,
    )


def latest_released_zero_patch(
    timeline: list[TimelineRelease],
    target_version_text: str,
) -> TimelineRelease | None:
    """Return the x.y.0 release with the latest release date, excluding the target.

    An entry is a candidate when ``version.patch == 0`` and its version text
    differs from ``target_version_text``. Returns None without candidates.
    """
    candidates = (
        release
        for release in timeline
        if release.version.patch == 0 and release.version.text != target_version_text
    )
    return max(candidates, key=lambda release: release.release_date, default=None)


def parse_version(version: str) -> Version:
    """Parse the leading ``major.minor.patch`` of ``version`` into a ``Version``.

    Anything after the third number is ignored, because the pattern is
    anchored only at the start.

    Raises:
        ValueError: If ``version`` does not start with three dot-separated numbers.
    """
    # The group names must match the match.group(...) lookups below.
    match = re.match(r"^(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)", version)
    if not match:
        raise ValueError(f"Invalid TiDB version: {version}")
    return Version(
        major=int(match.group("major")),
        minor=int(match.group("minor")),
        patch=int(match.group("patch")),
    )


def parse_date_value(value: Any) -> date | None:
    """Convert a cell or API value into a ``date``; return None if that is not possible.

    ``datetime`` and ``date`` objects are converted directly. Any other value
    is stringified and parsed as an ISO timestamp. If that fails, the first
    valid ``YYYY-MM-DD`` substring is used.
    """
    if value is None:
        return None
    # datetime is a subclass of date, so it has to be checked first.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, date):
        return value
    text = str_value(value)
    if not text:
        return None
    # Spell "Z" as an explicit UTC offset before calling fromisoformat.
    text = text.replace("Z", "+00:00")
    try:
        return datetime.fromisoformat(text).date()
    except ValueError:
        pass
    # A digit pattern can match an impossible date such as 2025-02-30. Skip
    # those instead of letting ValueError escape a function that otherwise
    # reports failure by returning None.
    for candidate in re.finditer(r"\d{4}-\d{2}-\d{2}", text):
        try:
            return date.fromisoformat(candidate.group())
        except ValueError:
            continue
    return None


def parse_github_url(url: str, expected_kind: str) -> tuple[str, str, str]:
    """Split a GitHub item URL into ``(owner, repo, number)``.

    Raises:
        ValueError: If ``url`` does not match ``GITHUB_ITEM_URL_RE`` or its
            ``kind`` group differs from ``expected_kind``.
    """
    match = GITHUB_ITEM_URL_RE.search(url)
    if not match:
        raise ValueError(f"Invalid GitHub URL: {url}")
    if match.group("kind") != expected_kind:
        raise ValueError(f"Expected a GitHub {expected_kind} URL, got: {url}")
    return match.group("owner"), match.group("repo"), match.group("number")


def extract_issue_urls(text: str) -> list[str]:
    """Return the distinct ``ISSUE_URL_RE`` matches in ``text``, in first-seen order."""
    return unique_ordered(match.group() for match in ISSUE_URL_RE.finditer(text or ""))


def extract_pr_urls(text: str) -> list[str]:
    """Return the distinct ``PR_URL_RE`` matches in ``text``, in first-seen order."""
    return unique_ordered(match.group() for match in PR_URL_RE.finditer(text or ""))


def replace_author_markdown(text: str, old_author: str, new_author: str) -> str:
    """Rewrite Markdown profile links for ``old_author`` to point at ``new_author``.

    The closing parenthesis is not part of the search string, so whatever
    follows the user name in the link is left untouched. A falsy ``text``
    yields an empty string.
    """
    text = text or ""
    return text.replace(
        f"[{old_author}](https://github.com/{old_author}",
        f"[{new_author}](https://github.com/{new_author}",
    )


def normalize_component(component: str) -> str:
    """Collapse whitespace in ``component`` and map it through ``COMPONENT_ALIASES``.

    The alias lookup uses the lower-cased name. Names without an alias are
    returned whitespace-normalised but otherwise unchanged.
    """
    cleaned = " ".join(str_value(component).split())
    if not cleaned:
        return ""
    return COMPONENT_ALIASES.get(cleaned.lower(), cleaned)


def normalize_raw_component(component: Any) -> str:
    """Return ``component`` as text with whitespace runs collapsed; no alias lookup."""
    return " ".join(str_value(component).split())


def normalized_release_component(component: str) -> str | None:
    """Return the normalised component if it is a known release component, else None.

    Known means a member of ``TOP_LEVEL_COMPONENTS`` or ``TOOL_COMPONENTS``.
    """
    normalized = normalize_component(component)
    if normalized in TOP_LEVEL_COMPONENTS or normalized in TOOL_COMPONENTS:
        return normalized
    return None


def split_multi_value(value: Any) -> list[str]:
    """Split a comma- or newline-separated value into stripped, non-empty items."""
    text = str_value(value)
    if not text:
        return []
    return [item.strip() for item in text.replace("\n", ",").split(",") if item.strip()]


def split_lines(value: Any) -> list[str]:
    """Split a value into stripped, non-empty lines."""
    text = str_value(value)
    if not text:
        return []
    return [line.strip() for line in text.splitlines() if line.strip()]


def unique_ordered(values: Iterable[str]) -> list[str]:
    """Return the stripped, non-empty values with duplicates removed, keeping first-seen order."""
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(filter(None, map(str_value, values))))


def str_value(value: Any) -> str:
    """Return ``value`` as a stripped string; None becomes an empty string."""
    if value is None:
        return ""
    return str(value).strip()