#!/usr/bin/env python3
# dofs/tools/extract_api.py
# Extract DOFS public API from headers and emit:
# - Per-header Markdown files under docs/<same path>.md (mirrors src tree)
# - One JSON index at docs/index.json
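#
# Typical invocations (illustrative; the flags are defined in main() below, and
# the exact paths depend on where you run the script from):
#   python dofs/tools/extract_api.py                 # write docs/index.json and per-header .md files
#   python dofs/tools/extract_api.py --stdout        # print the JSON index instead of writing files
#   python dofs/tools/extract_api.py --src path/to/src --out-dir path/to/docs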
from __future__ import annotations
import argparse
import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Tuple, Dict
# -------- Repo roots --------
def _detect_repo_root() -> Path:
p = Path(__file__).resolve()
for anc in [p.parent, *p.parents]:
if (anc / "src").is_dir():
return anc
return p.parent
REPO_ROOT = _detect_repo_root() # .../dofs
SRC_ROOT = REPO_ROOT / "src"
OUT_DIR_DEFAULT = REPO_ROOT / "docs" # mirror into docs/
# -------- IO helpers --------
def read_text(p: Path) -> str:
return p.read_text(encoding="utf-8", errors="ignore")
def iter_headers(root: Path) -> List[Path]:
return sorted(root.rglob("*.h"))
def strip_comments_and_literals(code: str) -> str:
string_re = r'("([^"\\]|\\.)*")|(\'([^\'\\]|\\.)*\')'
slc_re = r'//[^\n]*'
mlc_re = r'/\*.*?\*/'
def _keep_nls(m): # keep line count stable
return re.sub(r'[^\n]', ' ', m.group(0))
code = re.sub(mlc_re, _keep_nls, code, flags=re.S)
code = re.sub(string_re, _keep_nls, code, flags=re.S)
code = re.sub(slc_re, _keep_nls, code)
return code
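# Illustrative behaviour of strip_comments_and_literals() (values made up):
#   strip_comments_and_literals('int x = 1; // count\n')
# returns 'int x = 1; ' followed by spaces where the comment was, then '\n',
# so byte offsets and line numbers of the remaining code stay stable.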
# -------- Data model --------
@dataclass
class Symbol:
kind: str # "free_function" | "method" | "ctor" | "dtor" | "conversion" | "macro"
qualified: str
signature: str
file: str # e.g., "src/core/simulator.h"
line: int
static: bool = False
const: bool = False
ref_qual: str = ""
template_params: str = ""
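# Illustrative example of one recorded entry (hypothetical header and names):
#   Symbol(kind="method", qualified="dofs::Simulator::step",
#          signature="void step(double dt) noexcept",
#          file="src/core/simulator.h", line=42, const=False)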
# -------- Parser --------
class Parser:
def __init__(self, text: str, relpath: str):
self.text = text; self.relpath = relpath
self.i = 0; self.n = len(text); self.line = 1
self.ns_stack: List[str] = []
self.class_stack: List[dict] = []
self.depth_brace = 0
self.pending_template: Optional[str] = None
self.syms: List[Symbol] = []
# simple guard for bogus names when we fall into bodies
self._kw_block = {
"if", "for", "while", "switch", "return", "case", "default",
"do", "else", "break", "continue", "goto", "try", "catch"
}
def peek(self, k=0): j=self.i+k; return self.text[j] if 0<=j<self.n else ""
def advance(self, k=1):
for _ in range(k):
if self.i>=self.n: return
ch=self.text[self.i]; self.i+=1
if ch=="\n": self.line+=1
def skip_ws(self):
while self.i<self.n and self.text[self.i].isspace(): self.advance(1)
def run(self):
while self.i < self.n:
self.skip_ws()
if self.i >= self.n: break
if self.text.startswith("namespace", self.i): self._parse_namespace(); continue
if self.text.startswith("class ", self.i) or self.text.startswith("struct ", self.i): self._parse_record(); continue
if self.text.startswith("template", self.i):
self.pending_template = self._parse_template_intro(); continue
if self.text.startswith("public:", self.i): self._set_access("public"); self.advance(len("public:")); continue
if self.text.startswith("private:", self.i): self._set_access("private"); self.advance(len("private:")); continue
if self.text.startswith("protected:", self.i): self._set_access("protected"); self.advance(len("protected:")); continue
ch=self.peek()
if ch=="{": self.depth_brace+=1; self.advance(1); continue
if ch=="}": self.depth_brace-=1; self.advance(1); self._maybe_pop(); continue
self._maybe_decl_or_def()
return self.syms
def _skip_balanced_block(self):
"""
Consume a balanced {...} block starting at the current position
(which must be at '{'). This does NOT touch self.depth_brace /
class_stack, so it won't confuse outer block tracking.
"""
if self.peek() != "{":
return
depth = 0
# consume the first '{'
self.advance(1)
depth += 1
while self.i < self.n and depth > 0:
ch = self.peek()
if ch == "{":
depth += 1
elif ch == "}":
depth -= 1
self.advance(1)
# --- blocks ---
def _parse_namespace(self):
self.advance(len("namespace")); self.skip_ws()
if self.text.startswith("inline", self.i):
self.advance(len("inline")); self.skip_ws()
m = re.match(r'([A-Za-z_]\w*(::[A-Za-z_]\w*)*)?', self.text[self.i:])
name = "";
if m: name = m.group(0) or ""; self.advance(len(name))
self.skip_ws()
if self.peek() == "{":
self.advance(1); self.depth_brace += 1
self.ns_stack.append(name if name else "")
def _parse_record(self):
kw = "class" if self.text.startswith("class ", self.i) else "struct"
self.advance(len(kw)); self.skip_ws()
name = self._read_word()
if not name: return
while self.i<self.n and self.peek() not in "{;":
if self.peek()=="<": self._read_balanced("<", ">")
else: self.advance(1)
if self.peek()=="{":
self.advance(1); self.depth_brace += 1
self.class_stack.append({"name": name, "access": "public" if kw=="struct" else "private", "brace_depth": self.depth_brace})
else:
self.advance(1) # forward decl
def _parse_template_intro(self) -> str:
self.advance(len("template")); self.skip_ws()
params = self._read_balanced("<", ">") if self.peek()=="<" else ""
return f"template{params}"
def _set_access(self, acc: str):
if self.class_stack: self.class_stack[-1]["access"]=acc
def _maybe_pop(self):
if self.class_stack and self.class_stack[-1]["brace_depth"] == self.depth_brace + 1:
self.class_stack.pop(); return
if self.ns_stack: self.ns_stack.pop()
# --- helpers ---
def _read_word(self) -> str:
self.skip_ws()
m = re.match(r'[A-Za-z_]\w*', self.text[self.i:])
if not m: return ""
w = m.group(0); self.advance(len(w)); return w
def _read_balanced(self, o: str, c: str) -> str:
depth=1; out=o; self.advance(1)
while self.i<self.n and depth>0:
ch=self.peek(); out+=ch; self.advance(1)
if ch==o: depth+=1
elif ch==c: depth-=1
return out
def _current_ns_is_dofs(self) -> bool:
if not self.ns_stack: return False
chain=[p for p in self.ns_stack if p]
return bool(chain) and chain[0]=="dofs"
def _read_one_head(self) -> Tuple[str, str]:
par=ang=sq=0; start=self.i
while self.i<self.n:
ch=self.peek()
if ch=="(": par+=1
elif ch==")": par=max(0,par-1)
elif ch=="<": ang+=1
elif ch==">": ang=max(0,ang-1)
elif ch=="[": sq+=1
elif ch=="]": sq=max(0,sq-1)
elif ch==";" and par==0 and ang==0 and sq==0:
end=self.i; self.advance(1)
return self.text[start:end].strip(), ";"
elif ch=="{" and par==0 and ang==0 and sq==0:
end=self.i
return self.text[start:end].strip(), "{"
self.advance(1)
return "", ""
def _skip_brace_block(self):
"""Assumes current char is '{'; skis balanced block."""
if self.peek() != "{":
return
brace = 0
while self.i < self.n:
ch = self.peek()
self.advance(1)
if ch == "{":
brace += 1
elif ch == "}":
brace -= 1
if brace == 0:
break
def _consume_until_sep(self):
par=ang=sq=0
while self.i<self.n:
ch=self.peek(); self.advance(1)
if ch=="(": par+=1
elif ch==")": par=max(0,par-1)
elif ch=="<": ang+=1
elif ch==">": ang=max(0,ang-1)
elif ch=="[": sq+=1
elif ch=="]": sq=max(0,sq-1)
elif ch==";" and par==0 and ang==0 and sq==0: return
elif ch=="{" and par==0 and ang==0 and sq==0:
brace=1
while self.i<self.n and brace>0:
c2=self.peek(); self.advance(1)
if c2=="{": brace+=1
elif c2=="}": brace-=1
return
def _maybe_decl_or_def(self):
start_line = self.line
# skip obvious non-function starts
for bs in ("using ", "typedef ", "enum ", "namespace ", "static_assert"):
if self.text.startswith(bs, self.i):
self._consume_until_sep(); return
if self.text.startswith("template ", self.i):
self.pending_template = self._parse_template_intro(); return
decl, endch = self._read_one_head()
if not decl.strip(): return
tparams = self.pending_template or ""
self.pending_template = None
if "friend" in decl: return
if "(" not in decl or ")" not in decl: return
recorded = False
# classify: method vs free fn (inside dofs)
in_class = bool(self.class_stack)
if in_class:
if self.class_stack[-1]["access"] != "public": return
self._record_method(decl, start_line, tparams)
recorded = True
else:
if self._current_ns_is_dofs():
self._record_free_function(decl, start_line, tparams)
recorded = True
# If we just read a function head with a body, skip the body **after** recording
if endch == "{":
self._skip_brace_block()
return
# If it wasn't recorded (e.g., not in dofs namespace for free function),
# just continue; declarations ending with ';' need no additional skipping.
if recorded:
return
else:
return
# --- symbol building ---
def _normalize(self, s: str) -> str:
return re.sub(r'\s+', ' ', s).strip()
def _name_from_decl(self, decl: str) -> str:
"""
Find the function/method name robustly:
- choose the '(' that starts the *parameter list* (angle-depth == 0)
- then take the identifier immediately to its left as the name
Avoids mistaking template args like 'std::function<void()>' for a function.
"""
# Strip trailing qualifiers after param list for stability
head = re.split(r'\b(noexcept|requires)\b', decl)[0]
# Scan to find the '(' that begins the parameter list at angle-depth 0
ang = 0
par_open_idx = -1
for idx, ch in enumerate(head):
if ch == '<':
ang += 1
elif ch == '>':
ang = max(0, ang - 1)
elif ch == '(' and ang == 0:
par_open_idx = idx
break
if par_open_idx == -1:
return ""
# Walk left from par_open_idx to find the start of the name token
j = par_open_idx - 1
# Skip whitespace
while j >= 0 and head[j].isspace():
j -= 1
# Collect identifier (and allow operator forms)
# First, try operator names
m_op = re.search(r'(operator\s*""\s*_[A-Za-z_]\w*|operator\s*[^\s(]+)\s*$', head[:par_open_idx])
if m_op:
name = m_op.group(1)
else:
# Regular identifier (possibly destructor)
m_id = re.search(r'(~?[A-Za-z_]\w*)\s*$', head[:par_open_idx])
name = m_id.group(1) if m_id else ""
if not name or name in self._kw_block:
return ""
return name
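    # _name_from_decl(), illustrated (hypothetical declaration): for
    # 'std::function<void()> make_cb(int id)' the first '(' at angle-depth 0 is
    # the one after 'make_cb', so the extracted name is 'make_cb', not anything
    # inside the template argument list.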
def _qualify(self, name: str) -> str:
ns = [p for p in self.ns_stack if p]
q = "::".join(ns) + "::" if ns else ""
if self.class_stack:
q += "::".join([c["name"] for c in self.class_stack]) + "::"
return (q + name) if q else name
def _kind_for_method(self, name: str, cls: str) -> str:
if name == cls: return "ctor"
if name == f"~{cls}": return "dtor"
if name.startswith("operator"):
if re.match(r'operator\s+[^(\s]+', name) and "<" not in name and name != "operator()":
return "conversion"
return "method"
return "method"
def _cvref_static(self, decl: str) -> Tuple[bool,bool,str]:
is_static = bool(re.search(r'(^|\s)static\s', decl))
r = decl.rfind(")")
tail = decl[r+1:] if r!=-1 else ""
is_const = bool(re.search(r'\bconst\b', tail))
refq = "&&" if "&&" in tail else ("&" if re.search(r'(^|\s)&(\s|$)', tail) else "")
return is_static, is_const, refq
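    # _cvref_static(), illustrated with hypothetical declarations:
    #   'static Foo make()'    -> (True, False, '')
    #   'int value() const &'  -> (False, True, '&')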
def _record_method(self, decl: str, start_line: int, tparams: str):
cls = self.class_stack[-1]["name"]
name = self._name_from_decl(decl)
if not name: return
qualified = self._qualify(name)
is_static, is_const, refq = self._cvref_static(decl)
kind = self._kind_for_method(name, cls)
sig = self._normalize((tparams + " " + decl).strip() if tparams else decl)
self.syms.append(Symbol(kind=kind, qualified=qualified, signature=sig,
file=self.relpath, line=start_line,
static=is_static, const=is_const, ref_qual=refq,
template_params=tparams or ""))
def _record_free_function(self, decl: str, start_line: int, tparams: str):
name = self._name_from_decl(decl)
if not name: return
qualified = self._qualify(name)
sig = self._normalize((tparams + " " + decl).strip() if tparams else decl)
self.syms.append(Symbol(kind="free_function", qualified=qualified, signature=sig,
file=self.relpath, line=start_line,
template_params=tparams or ""))
# -------- Rendering --------
def to_json(symbols: List[Symbol]) -> str:
items = [asdict(s) for s in symbols]
items.sort(key=lambda s: (s["file"], s["line"], s["qualified"], s["signature"]))
return json.dumps({"version": 1, "symbols": items}, indent=2)
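# Shape of the emitted index (illustrative entry, values made up; the field
# names come straight from the Symbol dataclass via asdict):
# {
#   "version": 1,
#   "symbols": [
#     {"kind": "free_function", "qualified": "dofs::log_error",
#      "signature": "inline void log_error(const std::string &msg) noexcept;",
#      "file": "src/core/error.h", "line": 12, "static": false, "const": false,
#      "ref_qual": "", "template_params": ""}
#   ]
# }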
def _markdown_for_file(rel_repo_file: str, symbols: List[Symbol]) -> str:
"""
Build per-header Markdown for exactly the symbols whose s.file == rel_repo_file.
"""
title = rel_repo_file.replace("src/", "", 1)
lines = [f"# {title}\n"]
file_syms = [s for s in symbols if s.file == rel_repo_file]
if not file_syms:
lines.append("_No public API symbols found in this header._")
lines.append("")
return "\n".join(l.rstrip() for l in lines)
# Group macros last; keep deterministic order
def _order(s: Symbol):
k = {"macro": 2}.get(s.kind, 1)
return (k, s.qualified, s.signature)
for s in sorted(file_syms, key=_order):
        if s.kind == "macro":
            # H2 with the macro name, then the macro head; no line numbers, no bullets
            lines.append(f"## `{s.qualified}`")
            lines.append(f"`{s.signature}`\n")
        else:
            # H2 with the fully qualified name (namespace::[class::]func); the
            # signature follows on the next line and already carries any
            # template<...> prefix (it is prepended when the Symbol is recorded),
            # so s.template_params is not repeated here.
            lines.append(f"## `{s.qualified}`")
            lines.append(f"`{s.signature}`\n")
return "\n".join(l.rstrip() for l in lines)
# -------- Robust multi-line free-function extraction --------
# Matches things like:
# inline void foo(A a,
# B b = std::nullopt) noexcept;
# std::mutex &error_mutex() noexcept;
_FREE_FN_RE = re.compile(r"""
(?P<prefix> ^ | [;\}\n] ) # anchor
(?P<head>
(?:\s*(?:inline|constexpr|consteval|constinit|static|extern)\s+)* # storage/attrs
(?:[\w:\<\>\*\&\s]+\s+)? # return type (optional for constructors, but we only accept when present)
(?P<name>[A-Za-z_]\w*)\s* # function name
\(
(?P<params>
[^()]* (?:\([^()]*\)[^()]*)* # balanced parens inside params
)
\)
(?:\s*noexcept(?:\s*\([^)]*\))?)? # optional noexcept/noexcept(expr)
(?:\s*->\s*[^;{\n]+)? # optional trailing return type
)
\s*
(?P<ender> [;{] ) # prototype or definition
""", re.VERBOSE | re.DOTALL | re.MULTILINE)
def _collapse_ws(s: str) -> str:
# Collapse all whitespace runs to a single space for clean signatures
return " ".join(s.split())
def extract_free_functions_multiline(clean_text: str, relpath: str) -> List[Symbol]:
"""
Walk the file tracking namespace blocks and pick out free-function
heads that can span multiple lines. Avoid class/struct/enum bodies.
"""
syms: List[Symbol] = []
ns_stack: List[str] = []
class_depth = 0 # crude guard: skip when inside class/struct/enum body
# Token-ish scan to maintain simple block context
i = 0
n = len(clean_text)
while i < n:
# namespace enter
if clean_text.startswith("namespace", i):
j = i + len("namespace")
while j < n and clean_text[j].isspace():
j += 1
# Parse namespace name (could be 'dofs' or anonymous)
k = j
while k < n and (clean_text[k].isalnum() or clean_text[k] in "_:"):
k += 1
ns_name = clean_text[j:k].strip()
# Find the next '{'
m = clean_text.find("{", k)
if m != -1:
if ns_name:
ns_stack.append(ns_name)
else:
ns_stack.append("") # anonymous
i = m + 1
continue
# class/struct/enum guard
if clean_text.startswith("class ", i) or clean_text.startswith("struct ", i) or clean_text.startswith("enum ", i):
# Enter body at next '{'
m = clean_text.find("{", i)
if m != -1:
class_depth += 1
i = m + 1
continue
if clean_text[i] == '}':
if class_depth > 0:
class_depth -= 1
elif ns_stack:
ns_stack.pop()
i += 1
continue
# Try a function head only if not inside a class-like body
if class_depth == 0:
m = _FREE_FN_RE.match(clean_text, i)
if m:
name = m.group("name")
head = m.group("head")
# filter obvious false positives: require a return type before name
# (very rough: there must be at least one space before name inside head)
if re.search(r"\S\s+" + re.escape(name) + r"\s*\(", head):
qualified = "::".join([ns for ns in ns_stack if ns]) # drop anonymous
qualified = f"{qualified}::{name}" if qualified else name
# Build a tidy signature
ender = m.group("ender")
signature = _collapse_ws(head) + ender
line = clean_text.count("\n", 0, m.start("head")) + 1
syms.append(Symbol(kind="free_function",
qualified=qualified,
signature=signature,
file=relpath,
line=line))
i = m.end()
continue
i += 1
return syms
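# Illustrative: for the multi-line prototype shown above _FREE_FN_RE,
#   inline void foo(A a,
#                   B b = std::nullopt) noexcept;
# the recorded signature collapses to
#   'inline void foo(A a, B b = std::nullopt) noexcept;'
# and 'qualified' becomes 'dofs::foo' when the prototype sits inside namespace dofs.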
# -------- Macro extraction (function-like only) --------
_MACRO_HEAD_RE = re.compile(r'^\s*#\s*define\s+([A-Za-z_]\w*)\s*\((.*)$')
def extract_function_like_macros(text: str, relpath: str) -> List[Symbol]:
"""
Capture lines of the form:
#define NAME(args) <body...>
with multi-line bodies using backslash continuations.
We record: kind="macro", qualified=NAME, signature="#define NAME(args)".
"""
syms: List[Symbol] = []
lines = text.splitlines()
i = 0
while i < len(lines):
line = lines[i]
m = _MACRO_HEAD_RE.match(line)
if not m:
i += 1
continue
name = m.group(1)
args_part = m.group(2) # may or may not contain closing ')'
start_line = i + 1
# Collect continuation lines while trailing backslash exists.
body_lines = [line]
i += 1
while i < len(lines) and body_lines[-1].rstrip().endswith("\\"):
body_lines.append(lines[i])
i += 1
# Reconstruct just the macro head (name + (...) args text).
head = "".join(body_lines)
# Try to extract the argument list reliably (balanced parens from first '(')
# without being confused by body parentheses.
head_from_paren = head[head.find("("):] if "(" in head else ""
# Minimal balanced scan to the first matching ')'
par = 0
arg_end = -1
for idx, ch in enumerate(head_from_paren):
if ch == "(":
par += 1
elif ch == ")":
par -= 1
if par == 0:
arg_end = idx
break
if arg_end != -1:
arg_text = head_from_paren[1:arg_end] # inside (...)
else:
# Fallback: whatever we saw on the first line
arg_text = args_part.split(")")[0]
signature = f"#define {name}({arg_text.strip()})"
syms.append(Symbol(kind="macro",
qualified=name,
signature=signature,
file=relpath,
line=start_line))
return syms
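# Illustrative (hypothetical macro): given
#   #define DOFS_LOG_ERROR(fmt, ...) \
#       ::dofs::detail::log_impl(fmt, ##__VA_ARGS__)
# the recorded Symbol has kind="macro", qualified="DOFS_LOG_ERROR" and
# signature='#define DOFS_LOG_ERROR(fmt, ...)'.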
# -------- Driver --------
def main():
ap = argparse.ArgumentParser(description="Extract DOFS public API (per-header docs).")
ap.add_argument("--src", default=str(SRC_ROOT), help="Source root (default: repo/src)")
ap.add_argument("--out-dir", default=str(OUT_DIR_DEFAULT), help="Docs root to mirror into (default: docs)")
ap.add_argument("--stdout", action="store_true", help="Print JSON to stdout instead of writing files")
args = ap.parse_args()
src_root = Path(args.src).resolve()
out_root = Path(args.out_dir).resolve()
all_symbols: List[Symbol] = []
header_paths = iter_headers(src_root)
for hp in header_paths:
rel_repo = hp.relative_to(REPO_ROOT).as_posix() # e.g., src/core/simulator.h
raw = read_text(hp)
clean = strip_comments_and_literals(raw)
p = Parser(clean, rel_repo)
# C++ functions/methods (public) inside namespace dofs
parsed = p.run()
all_symbols.extend(parsed)
# Multi-line free functions (e.g., log_error in error.h)
extra_fns = extract_free_functions_multiline(clean, rel_repo)
# De-duplicate by (kind, qualified, signature, file, line)
seen = { (s.kind, s.qualified, s.signature, s.file, s.line) for s in all_symbols }
for s in extra_fns:
key = (s.kind, s.qualified, s.signature, s.file, s.line)
if key not in seen:
all_symbols.append(s)
seen.add(key)
# Function-like macros (global, regardless of namespace)
all_symbols.extend(extract_function_like_macros(raw, rel_repo))
if args.stdout:
print(to_json(all_symbols))
return
# Write index.json under docs/
out_root.mkdir(parents=True, exist_ok=True)
(out_root / "index.json").write_text(to_json(all_symbols), encoding="utf-8")
# Emit one markdown per header, mirroring src/ -> docs/
# src/<subpath>.h => docs/<subpath>.md
for hp in header_paths:
rel_from_repo = hp.relative_to(REPO_ROOT).as_posix() # src/...
rel_from_src = hp.relative_to(src_root).with_suffix(".md") # core/simulator.md
target_path = out_root / rel_from_src
target_path.parent.mkdir(parents=True, exist_ok=True)
md = _markdown_for_file(rel_from_repo, all_symbols)
target_path.write_text(md, encoding="utf-8")
print(f"[extract_api] Wrote JSON index: {out_root/'index.json'}")
print(f"[extract_api] Wrote per-header Markdown under: {out_root}")
if __name__ == "__main__":
main()