#!/usr/bin/env python3 # dofs/tools/extract_api.py # Extract DOFS public API from headers and emit: # - Per-header Markdown files under docs/.md (mirrors src tree) # - One JSON index at docs/index.json from __future__ import annotations import argparse import json import re from dataclasses import dataclass, asdict from pathlib import Path from typing import List, Optional, Tuple, Dict # -------- Repo roots -------- def _detect_repo_root() -> Path: p = Path(__file__).resolve() for anc in [p.parent, *p.parents]: if (anc / "src").is_dir(): return anc return p.parent REPO_ROOT = _detect_repo_root() # .../dofs SRC_ROOT = REPO_ROOT / "src" OUT_DIR_DEFAULT = REPO_ROOT / "docs" # mirror into docs/ # -------- IO helpers -------- def read_text(p: Path) -> str: return p.read_text(encoding="utf-8", errors="ignore") def iter_headers(root: Path) -> List[Path]: return sorted(root.rglob("*.h")) def strip_comments_and_literals(code: str) -> str: string_re = r'("([^"\\]|\\.)*")|(\'([^\'\\]|\\.)*\')' slc_re = r'//[^\n]*' mlc_re = r'/\*.*?\*/' def _keep_nls(m): # keep line count stable return re.sub(r'[^\n]', ' ', m.group(0)) code = re.sub(mlc_re, _keep_nls, code, flags=re.S) code = re.sub(string_re, _keep_nls, code, flags=re.S) code = re.sub(slc_re, _keep_nls, code) return code # -------- Data model -------- @dataclass class Symbol: kind: str # "free_function" | "method" | "ctor" | "dtor" | "conversion" | "macro" qualified: str signature: str file: str # e.g., "src/core/simulator.h" line: int static: bool = False const: bool = False ref_qual: str = "" template_params: str = "" # -------- Parser (same core as before; trimmed comments) -------- class Parser: def __init__(self, text: str, relpath: str): self.text = text; self.relpath = relpath self.i = 0; self.n = len(text); self.line = 1 self.ns_stack: List[str] = [] self.class_stack: List[dict] = [] self.depth_brace = 0 self.pending_template: Optional[str] = None self.syms: List[Symbol] = [] # simple guard for bogus names when we fall into bodies self._kw_block = { "if", "for", "while", "switch", "return", "case", "default", "do", "else", "break", "continue", "goto", "try", "catch" } def peek(self, k=0): j=self.i+k; return self.text[j] if 0<=j=self.n: return ch=self.text[self.i]; self.i+=1 if ch=="\n": self.line+=1 def skip_ws(self): while self.i= self.n: break if self.text.startswith("namespace", self.i): self._parse_namespace(); continue if self.text.startswith("class ", self.i) or self.text.startswith("struct ", self.i): self._parse_record(); continue if self.text.startswith("template", self.i): self.pending_template = self._parse_template_intro(); continue if self.text.startswith("public:", self.i): self._set_access("public"); self.advance(len("public:")); continue if self.text.startswith("private:", self.i): self._set_access("private"); self.advance(len("private:")); continue if self.text.startswith("protected:", self.i): self._set_access("protected"); self.advance(len("protected:")); continue ch=self.peek() if ch=="{": self.depth_brace+=1; self.advance(1); continue if ch=="}": self.depth_brace-=1; self.advance(1); self._maybe_pop(); continue self._maybe_decl_or_def() return self.syms def _skip_balanced_block(self): """ Consume a balanced {...} block starting at the current position (which must be at '{'). This does NOT touch self.depth_brace / class_stack, so it won't confuse outer block tracking. """ if self.peek() != "{": return depth = 0 # consume the first '{' self.advance(1) depth += 1 while self.i < self.n and depth > 0: ch = self.peek() if ch == "{": depth += 1 elif ch == "}": depth -= 1 self.advance(1) # --- blocks --- def _parse_namespace(self): self.advance(len("namespace")); self.skip_ws() if self.text.startswith("inline", self.i): self.advance(len("inline")); self.skip_ws() m = re.match(r'([A-Za-z_]\w*(::[A-Za-z_]\w*)*)?', self.text[self.i:]) name = ""; if m: name = m.group(0) or ""; self.advance(len(name)) self.skip_ws() if self.peek() == "{": self.advance(1); self.depth_brace += 1 self.ns_stack.append(name if name else "") def _parse_record(self): kw = "class" if self.text.startswith("class ", self.i) else "struct" self.advance(len(kw)); self.skip_ws() name = self._read_word() if not name: return while self.i") else: self.advance(1) if self.peek()=="{": self.advance(1); self.depth_brace += 1 self.class_stack.append({"name": name, "access": "public" if kw=="struct" else "private", "brace_depth": self.depth_brace}) else: self.advance(1) # forward decl def _parse_template_intro(self) -> str: self.advance(len("template")); self.skip_ws() params = self._read_balanced("<", ">") if self.peek()=="<" else "" return f"template{params}" def _set_access(self, acc: str): if self.class_stack: self.class_stack[-1]["access"]=acc def _maybe_pop(self): if self.class_stack and self.class_stack[-1]["brace_depth"] == self.depth_brace + 1: self.class_stack.pop(); return if self.ns_stack: self.ns_stack.pop() # --- helpers --- def _read_word(self) -> str: self.skip_ws() m = re.match(r'[A-Za-z_]\w*', self.text[self.i:]) if not m: return "" w = m.group(0); self.advance(len(w)); return w def _read_balanced(self, o: str, c: str) -> str: depth=1; out=o; self.advance(1) while self.i0: ch=self.peek(); out+=ch; self.advance(1) if ch==o: depth+=1 elif ch==c: depth-=1 return out def _current_ns_is_dofs(self) -> bool: if not self.ns_stack: return False chain=[p for p in self.ns_stack if p] return bool(chain) and chain[0]=="dofs" def _read_one_head(self) -> Tuple[str, str]: par=ang=sq=0; start=self.i while self.i": ang=max(0,ang-1) elif ch=="[": sq+=1 elif ch=="]": sq=max(0,sq-1) elif ch==";" and par==0 and ang==0 and sq==0: end=self.i; self.advance(1) return self.text[start:end].strip(), ";" elif ch=="{" and par==0 and ang==0 and sq==0: end=self.i return self.text[start:end].strip(), "{" self.advance(1) return "", "" def _skip_brace_block(self): """Assumes current char is '{'; skis balanced block.""" if self.peek() != "{": return brace = 0 while self.i < self.n: ch = self.peek() self.advance(1) if ch == "{": brace += 1 elif ch == "}": brace -= 1 if brace == 0: break def _consume_until_sep(self): par=ang=sq=0 while self.i": ang=max(0,ang-1) elif ch=="[": sq+=1 elif ch=="]": sq=max(0,sq-1) elif ch==";" and par==0 and ang==0 and sq==0: return elif ch=="{" and par==0 and ang==0 and sq==0: brace=1 while self.i0: c2=self.peek(); self.advance(1) if c2=="{": brace+=1 elif c2=="}": brace-=1 return def _maybe_decl_or_def(self): start_line = self.line # skip obvious non-function starts for bs in ("using ", "typedef ", "enum ", "namespace ", "static_assert"): if self.text.startswith(bs, self.i): self._consume_until_sep(); return if self.text.startswith("template ", self.i): self.pending_template = self._parse_template_intro(); return decl, endch = self._read_one_head() if not decl.strip(): return tparams = self.pending_template or "" self.pending_template = None if "friend" in decl: return if "(" not in decl or ")" not in decl: return recorded = False # classify: method vs free fn (inside dofs) in_class = bool(self.class_stack) if in_class: if self.class_stack[-1]["access"] != "public": return self._record_method(decl, start_line, tparams) recorded = True else: if self._current_ns_is_dofs(): self._record_free_function(decl, start_line, tparams) recorded = True # If we just read a function head with a body, skip the body **after** recording if endch == "{": self._skip_brace_block() return # If it wasn't recorded (e.g., not in dofs namespace for free function), # just continue; declarations ending with ';' need no additional skipping. if recorded: return else: return # --- symbol building --- def _normalize(self, s: str) -> str: return re.sub(r'\s+', ' ', s).strip() def _name_from_decl(self, decl: str) -> str: """ Find the function/method name robustly: - choose the '(' that starts the *parameter list* (angle-depth == 0) - then take the identifier immediately to its left as the name Avoids mistaking template args like 'std::function' for a function. """ # Strip trailing qualifiers after param list for stability head = re.split(r'\b(noexcept|requires)\b', decl)[0] # Scan to find the '(' that begins the parameter list at angle-depth 0 ang = 0 par_open_idx = -1 for idx, ch in enumerate(head): if ch == '<': ang += 1 elif ch == '>': ang = max(0, ang - 1) elif ch == '(' and ang == 0: par_open_idx = idx break if par_open_idx == -1: return "" # Walk left from par_open_idx to find the start of the name token j = par_open_idx - 1 # Skip whitespace while j >= 0 and head[j].isspace(): j -= 1 # Collect identifier (and allow operator forms) # First, try operator names m_op = re.search(r'(operator\s*""\s*_[A-Za-z_]\w*|operator\s*[^\s(]+)\s*$', head[:par_open_idx]) if m_op: name = m_op.group(1) else: # Regular identifier (possibly destructor) m_id = re.search(r'(~?[A-Za-z_]\w*)\s*$', head[:par_open_idx]) name = m_id.group(1) if m_id else "" if not name or name in self._kw_block: return "" return name def _qualify(self, name: str) -> str: ns = [p for p in self.ns_stack if p] q = "::".join(ns) + "::" if ns else "" if self.class_stack: q += "::".join([c["name"] for c in self.class_stack]) + "::" return (q + name) if q else name def _kind_for_method(self, name: str, cls: str) -> str: if name == cls: return "ctor" if name == f"~{cls}": return "dtor" if name.startswith("operator"): if re.match(r'operator\s+[^(\s]+', name) and "<" not in name and name != "operator()": return "conversion" return "method" return "method" def _cvref_static(self, decl: str) -> Tuple[bool,bool,str]: is_static = bool(re.search(r'(^|\s)static\s', decl)) r = decl.rfind(")") tail = decl[r+1:] if r!=-1 else "" is_const = bool(re.search(r'\bconst\b', tail)) refq = "&&" if "&&" in tail else ("&" if re.search(r'(^|\s)&(\s|$)', tail) else "") return is_static, is_const, refq def _record_method(self, decl: str, start_line: int, tparams: str): cls = self.class_stack[-1]["name"] name = self._name_from_decl(decl) if not name: return qualified = self._qualify(name) is_static, is_const, refq = self._cvref_static(decl) kind = self._kind_for_method(name, cls) sig = self._normalize((tparams + " " + decl).strip() if tparams else decl) self.syms.append(Symbol(kind=kind, qualified=qualified, signature=sig, file=self.relpath, line=start_line, static=is_static, const=is_const, ref_qual=refq, template_params=tparams or "")) def _record_free_function(self, decl: str, start_line: int, tparams: str): name = self._name_from_decl(decl) if not name: return qualified = self._qualify(name) sig = self._normalize((tparams + " " + decl).strip() if tparams else decl) self.syms.append(Symbol(kind="free_function", qualified=qualified, signature=sig, file=self.relpath, line=start_line, template_params=tparams or "")) # -------- Rendering -------- def to_json(symbols: List[Symbol]) -> str: items = [asdict(s) for s in symbols] items.sort(key=lambda s: (s["file"], s["line"], s["qualified"], s["signature"])) return json.dumps({"version": 1, "symbols": items}, indent=2) def _markdown_for_file(rel_repo_file: str, symbols: List[Symbol]) -> str: """ Build per-header Markdown for exactly the symbols whose s.file == rel_repo_file. """ title = rel_repo_file.replace("src/", "", 1) lines = [f"# {title}\n"] file_syms = [s for s in symbols if s.file == rel_repo_file] if not file_syms: lines.append("_No public API symbols found in this header._") lines.append("") return "\n".join(l.rstrip() for l in lines) # Group macros last; keep deterministic order def _order(s: Symbol): k = {"macro": 2}.get(s.kind, 1) return (k, s.qualified, s.signature) for s in sorted(file_syms, key=_order): tprefix = (s.template_params + " ") if s.template_params else "" if s.kind == "macro": # H2 with macro name, then macro head; no line numbers, no bullets lines.append(f"## `{s.qualified}`") lines.append(f"`{s.signature}`\n") else: # H2 with fully qualified name (namespace::[class::]func) # Contract/signature on the next line fqname = s.qualified if tprefix: lines.append(f"## `{fqname}`") lines.append(f"`{tprefix.strip()} {s.signature}`\n".replace(" ", " ").strip()) else: lines.append(f"## `{fqname}`") lines.append(f"`{s.signature}`\n") return "\n".join(l.rstrip() for l in lines) # -------- Robust multi-line free-function extraction -------- # Matches things like: # inline void foo(A a, # B b = std::nullopt) noexcept; # std::mutex &error_mutex() noexcept; _FREE_FN_RE = re.compile(r""" (?P ^ | [;\}\n] ) # anchor (?P (?:\s*(?:inline|constexpr|consteval|constinit|static|extern)\s+)* # storage/attrs (?:[\w:\<\>\*\&\s]+\s+)? # return type (optional for constructors, but we only accept when present) (?P[A-Za-z_]\w*)\s* # function name \( (?P [^()]* (?:\([^()]*\)[^()]*)* # balanced parens inside params ) \) (?:\s*noexcept(?:\s*\([^)]*\))?)? # optional noexcept/noexcept(expr) (?:\s*->\s*[^;{\n]+)? # optional trailing return type ) \s* (?P [;{] ) # prototype or definition """, re.VERBOSE | re.DOTALL | re.MULTILINE) def _collapse_ws(s: str) -> str: # Collapse all whitespace runs to a single space for clean signatures return " ".join(s.split()) def extract_free_functions_multiline(clean_text: str, relpath: str) -> List[Symbol]: """ Walk the file tracking namespace blocks and pick out free-function heads that can span multiple lines. Avoid class/struct/enum bodies. """ syms: List[Symbol] = [] ns_stack: List[str] = [] class_depth = 0 # crude guard: skip when inside class/struct/enum body # Token-ish scan to maintain simple block context i = 0 n = len(clean_text) while i < n: # namespace enter if clean_text.startswith("namespace", i): j = i + len("namespace") while j < n and clean_text[j].isspace(): j += 1 # Parse namespace name (could be 'dofs' or anonymous) k = j while k < n and (clean_text[k].isalnum() or clean_text[k] in "_:"): k += 1 ns_name = clean_text[j:k].strip() # Find the next '{' m = clean_text.find("{", k) if m != -1: if ns_name: ns_stack.append(ns_name) else: ns_stack.append("") # anonymous i = m + 1 continue # class/struct/enum guard if clean_text.startswith("class ", i) or clean_text.startswith("struct ", i) or clean_text.startswith("enum ", i): # Enter body at next '{' m = clean_text.find("{", i) if m != -1: class_depth += 1 i = m + 1 continue if clean_text[i] == '}': if class_depth > 0: class_depth -= 1 elif ns_stack: ns_stack.pop() i += 1 continue # Try a function head only if not inside a class-like body if class_depth == 0: m = _FREE_FN_RE.match(clean_text, i) if m: name = m.group("name") head = m.group("head") # filter obvious false positives: require a return type before name # (very rough: there must be at least one space before name inside head) if re.search(r"\S\s+" + re.escape(name) + r"\s*\(", head): qualified = "::".join([ns for ns in ns_stack if ns]) # drop anonymous qualified = f"{qualified}::{name}" if qualified else name # Build a tidy signature ender = m.group("ender") signature = _collapse_ws(head) + ender line = clean_text.count("\n", 0, m.start("head")) + 1 syms.append(Symbol(kind="free_function", qualified=qualified, signature=signature, file=relpath, line=line)) i = m.end() continue i += 1 return syms # -------- Macro extraction (function-like only) -------- _MACRO_HEAD_RE = re.compile(r'^\s*#\s*define\s+([A-Za-z_]\w*)\s*\((.*)$') def extract_function_like_macros(text: str, relpath: str) -> List[Symbol]: """ Capture lines of the form: #define NAME(args) with multi-line bodies using backslash continuations. We record: kind="macro", qualified=NAME, signature="#define NAME(args)". """ syms: List[Symbol] = [] lines = text.splitlines() i = 0 while i < len(lines): line = lines[i] m = _MACRO_HEAD_RE.match(line) if not m: i += 1 continue name = m.group(1) args_part = m.group(2) # may or may not contain closing ')' start_line = i + 1 # Collect continuation lines while trailing backslash exists. body_lines = [line] i += 1 while i < len(lines) and body_lines[-1].rstrip().endswith("\\"): body_lines.append(lines[i]) i += 1 # Reconstruct just the macro head (name + (...) args text). head = "".join(body_lines) # Try to extract the argument list reliably (balanced parens from first '(') # without being confused by body parentheses. head_from_paren = head[head.find("("):] if "(" in head else "" # Minimal balanced scan to the first matching ')' par = 0 arg_end = -1 for idx, ch in enumerate(head_from_paren): if ch == "(": par += 1 elif ch == ")": par -= 1 if par == 0: arg_end = idx break if arg_end != -1: arg_text = head_from_paren[1:arg_end] # inside (...) else: # Fallback: whatever we saw on the first line arg_text = args_part.split(")")[0] signature = f"#define {name}({arg_text.strip()})" syms.append(Symbol(kind="macro", qualified=name, signature=signature, file=relpath, line=start_line)) return syms # -------- Driver -------- def main(): ap = argparse.ArgumentParser(description="Extract DOFS public API (per-header docs).") ap.add_argument("--src", default=str(SRC_ROOT), help="Source root (default: repo/src)") ap.add_argument("--out-dir", default=str(OUT_DIR_DEFAULT), help="Docs root to mirror into (default: docs)") ap.add_argument("--stdout", action="store_true", help="Print JSON to stdout instead of writing files") args = ap.parse_args() src_root = Path(args.src).resolve() out_root = Path(args.out_dir).resolve() all_symbols: List[Symbol] = [] header_paths = iter_headers(src_root) for hp in header_paths: rel_repo = hp.relative_to(REPO_ROOT).as_posix() # e.g., src/core/simulator.h raw = read_text(hp) clean = strip_comments_and_literals(raw) p = Parser(clean, rel_repo) # C++ functions/methods (public) inside namespace dofs parsed = p.run() all_symbols.extend(parsed) # Multi-line free functions (e.g., log_error in error.h) extra_fns = extract_free_functions_multiline(clean, rel_repo) # De-duplicate by (kind, qualified, signature, file, line) seen = { (s.kind, s.qualified, s.signature, s.file, s.line) for s in all_symbols } for s in extra_fns: key = (s.kind, s.qualified, s.signature, s.file, s.line) if key not in seen: all_symbols.append(s) seen.add(key) # Function-like macros (global, regardless of namespace) all_symbols.extend(extract_function_like_macros(raw, rel_repo)) if args.stdout: print(to_json(all_symbols)) return # Write index.json under docs/ out_root.mkdir(parents=True, exist_ok=True) (out_root / "index.json").write_text(to_json(all_symbols), encoding="utf-8") # Emit one markdown per header, mirroring src/ -> docs/ # src/.h => docs/.md for hp in header_paths: rel_from_repo = hp.relative_to(REPO_ROOT).as_posix() # src/... rel_from_src = hp.relative_to(src_root).with_suffix(".md") # core/simulator.md target_path = out_root / rel_from_src target_path.parent.mkdir(parents=True, exist_ok=True) md = _markdown_for_file(rel_from_repo, all_symbols) target_path.write_text(md, encoding="utf-8") print(f"[extract_api] Wrote JSON index: {out_root/'index.json'}") print(f"[extract_api] Wrote per-header Markdown under: {out_root}") if __name__ == "__main__": main()