#!/usr/bin/env python3
# dofs/tools/extract_api.py
# Extract DOFS public API from headers and emit:
# - Per-header Markdown files under docs/<same path>.md (mirrors src tree)
# - One JSON index at docs/index.json
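#
# Typical invocations (illustrative; the flags are defined in main() below, and
# the exact paths depend on where you run the script from):
#   python dofs/tools/extract_api.py                 # write docs/index.json and per-header .md files
#   python dofs/tools/extract_api.py --stdout        # print the JSON index instead of writing files
#   python dofs/tools/extract_api.py --src path/to/src --out-dir path/to/docs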
from __future__ import annotations
import argparse
import json
import re
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Tuple, Dict
# -------- Repo roots --------
def _detect_repo_root() -> Path:
p = Path(__file__).resolve()
for anc in [p.parent, *p.parents]:
if (anc / "src").is_dir():
return anc
return p.parent
REPO_ROOT = _detect_repo_root() # .../dofs
SRC_ROOT = REPO_ROOT / "src"
OUT_DIR_DEFAULT = REPO_ROOT / "docs" # mirror into docs/
# -------- IO helpers --------
def read_text(p: Path) -> str:
return p.read_text(encoding="utf-8", errors="ignore")
def iter_headers(root: Path) -> List[Path]:
return sorted(root.rglob("*.h"))
def strip_comments_and_literals(code: str) -> str:
string_re = r'("([^"\\]|\\.)*")|(\'([^\'\\]|\\.)*\')'
slc_re = r'//[^\n]*'
mlc_re = r'/\*.*?\*/'
def _keep_nls(m): # keep line count stable
return re.sub(r'[^\n]', ' ', m.group(0))
code = re.sub(mlc_re, _keep_nls, code, flags=re.S)
code = re.sub(string_re, _keep_nls, code, flags=re.S)
code = re.sub(slc_re, _keep_nls, code)
return code
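# Illustrative behaviour of strip_comments_and_literals() (values made up):
#   strip_comments_and_literals('int x = 1; // count\n')
# returns 'int x = 1; ' followed by spaces where the comment was, then '\n',
# so byte offsets and line numbers of the remaining code stay stable.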
# -------- Data model --------
@dataclass
class Symbol:
kind: str # "free_function" | "method" | "ctor" | "dtor" | "conversion" | "macro"
qualified: str
signature: str
file: str # e.g., "src/core/simulator.h"
line: int
static: bool = False
const: bool = False
ref_qual: str = ""
template_params: str = ""
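# Illustrative example of one recorded entry (hypothetical header and names):
#   Symbol(kind="method", qualified="dofs::Simulator::step",
#          signature="void step(double dt) noexcept",
#          file="src/core/simulator.h", line=42, const=False)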
# -------- Parser --------
class Parser:
def __init__(self, text: str, relpath: str):
self.text = text; self.relpath = relpath
self.i = 0; self.n = len(text); self.line = 1
self.ns_stack: List[str] = []
self.class_stack: List[dict] = []
self.depth_brace = 0
self.pending_template: Optional[str] = None
self.syms: List[Symbol] = []
# simple guard for bogus names when we fall into bodies
self._kw_block = {
"if", "for", "while", "switch", "return", "case", "default",
"do", "else", "break", "continue", "goto", "try", "catch"
}
def peek(self, k=0): j=self.i+k; return self.text[j] if 0<=j<self.n else ""
def advance(self, k=1):
for _ in range(k):
if self.i>=self.n: return
ch=self.text[self.i]; self.i+=1
if ch=="\n": self.line+=1
def skip_ws(self):
while self.i<self.n and self.text[self.i].isspace(): self.advance(1)
def run(self):
while self.i < self.n:
self.skip_ws()
if self.i >= self.n: break
if self.text.startswith("namespace", self.i): self._parse_namespace(); continue
if self.text.startswith("class ", self.i) or self.text.startswith("struct ", self.i): self._parse_record(); continue
if self.text.startswith("template", self.i):
self.pending_template = self._parse_template_intro(); continue
if self.text.startswith("public:", self.i): self._set_access("public"); self.advance(len("public:")); continue
if self.text.startswith("private:", self.i): self._set_access("private"); self.advance(len("private:")); continue
if self.text.startswith("protected:", self.i): self._set_access("protected"); self.advance(len("protected:")); continue
ch=self.peek()
if ch=="{": self.depth_brace+=1; self.advance(1); continue
if ch=="}": self.depth_brace-=1; self.advance(1); self._maybe_pop(); continue
self._maybe_decl_or_def()
return self.syms
def _skip_balanced_block(self):
"""
Consume a balanced {...} block starting at the current position
(which must be at '{'). This does NOT touch self.depth_brace /
class_stack, so it won't confuse outer block tracking.
"""
if self.peek() != "{":
return
depth = 0
# consume the first '{'
self.advance(1)
depth += 1
while self.i < self.n and depth > 0:
ch = self.peek()
if ch == "{":
depth += 1
elif ch == "}":
depth -= 1
self.advance(1)
# --- blocks ---
def _parse_namespace(self):
self.advance(len("namespace")); self.skip_ws()
if self.text.startswith("inline", self.i):
self.advance(len("inline")); self.skip_ws()
m = re.match(r'([A-Za-z_]\w*(::[A-Za-z_]\w*)*)?', self.text[self.i:])
name = "";
if m: name = m.group(0) or ""; self.advance(len(name))
self.skip_ws()
if self.peek() == "{":
self.advance(1); self.depth_brace += 1
self.ns_stack.append(name if name else "")
def _parse_record(self):
kw = "class" if self.text.startswith("class ", self.i) else "struct"
self.advance(len(kw)); self.skip_ws()
name = self._read_word()
if not name: return
while self.i<self.n and self.peek() not in "{;":
if self.peek()=="<": self._read_balanced("<", ">")
else: self.advance(1)
if self.peek()=="{":
self.advance(1); self.depth_brace += 1
self.class_stack.append({"name": name, "access": "public" if kw=="struct" else "private", "brace_depth": self.depth_brace})
else:
self.advance(1) # forward decl
def _parse_template_intro(self) -> str:
self.advance(len("template")); self.skip_ws()
params = self._read_balanced("<", ">") if self.peek()=="<" else ""
return f"template{params}"
def _set_access(self, acc: str):
if self.class_stack: self.class_stack[-1]["access"]=acc
def _maybe_pop(self):
if self.class_stack and self.class_stack[-1]["brace_depth"] == self.depth_brace + 1:
self.class_stack.pop(); return
if self.ns_stack: self.ns_stack.pop()
# --- helpers ---
def _read_word(self) -> str:
self.skip_ws()
m = re.match(r'[A-Za-z_]\w*', self.text[self.i:])
if not m: return ""
w = m.group(0); self.advance(len(w)); return w
def _read_balanced(self, o: str, c: str) -> str:
depth=1; out=o; self.advance(1)
while self.i<self.n and depth>0:
ch=self.peek(); out+=ch; self.advance(1)
if ch==o: depth+=1
elif ch==c: depth-=1
return out
def _current_ns_is_dofs(self) -> bool:
if not self.ns_stack: return False
chain=[p for p in self.ns_stack if p]
return bool(chain) and chain[0]=="dofs"
def _read_one_head(self) -> Tuple[str, str]:
par=ang=sq=0; start=self.i
while self.i<self.n:
ch=self.peek()
if ch=="(": par+=1
elif ch==")": par=max(0,par-1)
elif ch=="<": ang+=1
elif ch==">": ang=max(0,ang-1)
elif ch=="[": sq+=1
elif ch=="]": sq=max(0,sq-1)
elif ch==";" and par==0 and ang==0 and sq==0:
end=self.i; self.advance(1)
return self.text[start:end].strip(), ";"
elif ch=="{" and par==0 and ang==0 and sq==0:
end=self.i
return self.text[start:end].strip(), "{"
self.advance(1)
return "", ""
def _skip_brace_block(self):
"""Assumes current char is '{'; skis balanced block."""
if self.peek() != "{":
return
brace = 0
while self.i < self.n:
ch = self.peek()
self.advance(1)
if ch == "{":
brace += 1
elif ch == "}":
brace -= 1
if brace == 0:
break
def _consume_until_sep(self):
par=ang=sq=0
while self.i<self.n:
ch=self.peek(); self.advance(1)
if ch=="(": par+=1
elif ch==")": par=max(0,par-1)
elif ch=="<": ang+=1
elif ch==">": ang=max(0,ang-1)
elif ch=="[": sq+=1
elif ch=="]": sq=max(0,sq-1)
elif ch==";" and par==0 and ang==0 and sq==0: return
elif ch=="{" and par==0 and ang==0 and sq==0:
brace=1
while self.i<self.n and brace>0:
c2=self.peek(); self.advance(1)
if c2=="{": brace+=1
elif c2=="}": brace-=1
return
def _maybe_decl_or_def(self):
start_line = self.line
# skip obvious non-function starts
for bs in ("using ", "typedef ", "enum ", "namespace ", "static_assert"):
if self.text.startswith(bs, self.i):
self._consume_until_sep(); return
if self.text.startswith("template ", self.i):
self.pending_template = self._parse_template_intro(); return
decl, endch = self._read_one_head()
if not decl.strip(): return
tparams = self.pending_template or ""
self.pending_template = None
if "friend" in decl: return
if "(" not in decl or ")" not in decl: return
recorded = False
# classify: method vs free fn (inside dofs)
in_class = bool(self.class_stack)
if in_class:
if self.class_stack[-1]["access"] != "public": return
self._record_method(decl, start_line, tparams)
recorded = True
else:
if self._current_ns_is_dofs():
self._record_free_function(decl, start_line, tparams)
recorded = True
# If we just read a function head with a body, skip the body **after** recording
if endch == "{":
self._skip_brace_block()
return
# If it wasn't recorded (e.g., not in dofs namespace for free function),
# just continue; declarations ending with ';' need no additional skipping.
if recorded:
return
else:
return
# --- symbol building ---
def _normalize(self, s: str) -> str:
return re.sub(r'\s+', ' ', s).strip()
def _name_from_decl(self, decl: str) -> str:
"""
Find the function/method name robustly:
- choose the '(' that starts the *parameter list* (angle-depth == 0)
- then take the identifier immediately to its left as the name
Avoids mistaking template args like 'std::function<void()>' for a function.
"""
# Strip trailing qualifiers after param list for stability
head = re.split(r'\b(noexcept|requires)\b', decl)[0]
# Scan to find the '(' that begins the parameter list at angle-depth 0
ang = 0
par_open_idx = -1
for idx, ch in enumerate(head):
if ch == '<':
ang += 1
elif ch == '>':
ang = max(0, ang - 1)
elif ch == '(' and ang == 0:
par_open_idx = idx
break
if par_open_idx == -1:
return ""
# Walk left from par_open_idx to find the start of the name token
j = par_open_idx - 1
# Skip whitespace
while j >= 0 and head[j].isspace():
j -= 1
# Collect identifier (and allow operator forms)
# First, try operator names
m_op = re.search(r'(operator\s*""\s*_[A-Za-z_]\w*|operator\s*[^\s(]+)\s*$', head[:par_open_idx])
if m_op:
name = m_op.group(1)
else:
# Regular identifier (possibly destructor)
m_id = re.search(r'(~?[A-Za-z_]\w*)\s*$', head[:par_open_idx])
name = m_id.group(1) if m_id else ""
if not name or name in self._kw_block:
return ""
return name
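    # _name_from_decl(), illustrated (hypothetical declaration): for
    # 'std::function<void()> make_cb(int id)' the first '(' at angle-depth 0 is
    # the one after 'make_cb', so the extracted name is 'make_cb', not anything
    # inside the template argument list.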
def _qualify(self, name: str) -> str:
ns = [p for p in self.ns_stack if p]
q = "::".join(ns) + "::" if ns else ""
if self.class_stack:
q += "::".join([c["name"] for c in self.class_stack]) + "::"
return (q + name) if q else name
def _kind_for_method(self, name: str, cls: str) -> str:
if name == cls: return "ctor"
if name == f"~{cls}": return "dtor"
if name.startswith("operator"):
if re.match(r'operator\s+[^(\s]+', name) and "<" not in name and name != "operator()":
return "conversion"
return "method"
return "method"
def _cvref_static(self, decl: str) -> Tuple[bool,bool,str]:
is_static = bool(re.search(r'(^|\s)static\s', decl))
r = decl.rfind(")")
tail = decl[r+1:] if r!=-1 else ""
is_const = bool(re.search(r'\bconst\b', tail))
refq = "&&" if "&&" in tail else ("&" if re.search(r'(^|\s)&(\s|$)', tail) else "")
return is_static, is_const, refq
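    # _cvref_static(), illustrated with hypothetical declarations:
    #   'static Foo make()'    -> (True, False, '')
    #   'int value() const &'  -> (False, True, '&')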
def _record_method(self, decl: str, start_line: int, tparams: str):
cls = self.class_stack[-1]["name"]
name = self._name_from_decl(decl)
if not name: return
qualified = self._qualify(name)
is_static, is_const, refq = self._cvref_static(decl)
kind = self._kind_for_method(name, cls)
sig = self._normalize((tparams + " " + decl).strip() if tparams else decl)
self.syms.append(Symbol(kind=kind, qualified=qualified, signature=sig,
file=self.relpath, line=start_line,
static=is_static, const=is_const, ref_qual=refq,
template_params=tparams or ""))
def _record_free_function(self, decl: str, start_line: int, tparams: str):
name = self._name_from_decl(decl)
if not name: return
qualified = self._qualify(name)
sig = self._normalize((tparams + " " + decl).strip() if tparams else decl)
self.syms.append(Symbol(kind="free_function", qualified=qualified, signature=sig,
file=self.relpath, line=start_line,
template_params=tparams or ""))
# -------- Rendering --------
def to_json(symbols: List[Symbol]) -> str:
items = [asdict(s) for s in symbols]
items.sort(key=lambda s: (s["file"], s["line"], s["qualified"], s["signature"]))
return json.dumps({"version": 1, "symbols": items}, indent=2)
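# Shape of the emitted index (illustrative entry, values made up; the field
# names come straight from the Symbol dataclass via asdict):
# {
#   "version": 1,
#   "symbols": [
#     {"kind": "free_function", "qualified": "dofs::log_error",
#      "signature": "inline void log_error(const std::string &msg) noexcept;",
#      "file": "src/core/error.h", "line": 12, "static": false, "const": false,
#      "ref_qual": "", "template_params": ""}
#   ]
# }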
def _markdown_for_file(rel_repo_file: str, symbols: List[Symbol]) -> str:
"""
Build per-header Markdown for exactly the symbols whose s.file == rel_repo_file.
"""
title = rel_repo_file.replace("src/", "", 1)
lines = [f"# {title}\n"]
file_syms = [s for s in symbols if s.file == rel_repo_file]
if not file_syms:
lines.append("_No public API symbols found in this header._")
lines.append("")
return "\n".join(l.rstrip() for l in lines)
# Group macros last; keep deterministic order
def _order(s: Symbol):
k = {"macro": 2}.get(s.kind, 1)
return (k, s.qualified, s.signature)
for s in sorted(file_syms, key=_order):
        if s.kind == "macro":
            # H2 with the macro name, then the macro head; no line numbers, no bullets
            lines.append(f"## `{s.qualified}`")
            lines.append(f"`{s.signature}`\n")
        else:
            # H2 with the fully qualified name (namespace::[class::]func); the
            # signature follows on the next line and already carries any
            # template<...> prefix (it is prepended when the Symbol is recorded),
            # so s.template_params is not repeated here.
            lines.append(f"## `{s.qualified}`")
            lines.append(f"`{s.signature}`\n")
return "\n".join(l.rstrip() for l in lines)
# -------- Robust multi-line free-function extraction --------
# Matches things like:
# inline void foo(A a,
# B b = std::nullopt) noexcept;
# std::mutex &error_mutex() noexcept;
_FREE_FN_RE = re.compile(r"""
(?P<prefix> ^ | [;\}\n] ) # anchor
(?P<head>
(?:\s*(?:inline|constexpr|consteval|constinit|static|extern)\s+)* # storage/attrs
(?:[\w:\<\>\*\&\s]+\s+)? # return type (optional for constructors, but we only accept when present)
(?P<name>[A-Za-z_]\w*)\s* # function name
\(
(?P<params>
[^()]* (?:\([^()]*\)[^()]*)* # balanced parens inside params
)
\)
(?:\s*noexcept(?:\s*\([^)]*\))?)? # optional noexcept/noexcept(expr)
(?:\s*->\s*[^;{\n]+)? # optional trailing return type
)
\s*
(?P<ender> [;{] ) # prototype or definition
""", re.VERBOSE | re.DOTALL | re.MULTILINE)
def _collapse_ws(s: str) -> str:
# Collapse all whitespace runs to a single space for clean signatures
return " ".join(s.split())
def extract_free_functions_multiline(clean_text: str, relpath: str) -> List[Symbol]:
"""
Walk the file tracking namespace blocks and pick out free-function
heads that can span multiple lines. Avoid class/struct/enum bodies.
"""
syms: List[Symbol] = []
ns_stack: List[str] = []
class_depth = 0 # crude guard: skip when inside class/struct/enum body
# Token-ish scan to maintain simple block context
i = 0
n = len(clean_text)
while i < n:
# namespace enter
if clean_text.startswith("namespace", i):
j = i + len("namespace")
while j < n and clean_text[j].isspace():
j += 1
# Parse namespace name (could be 'dofs' or anonymous)
k = j
while k < n and (clean_text[k].isalnum() or clean_text[k] in "_:"):
k += 1
ns_name = clean_text[j:k].strip()
# Find the next '{'
m = clean_text.find("{", k)
if m != -1:
if ns_name:
ns_stack.append(ns_name)
else:
ns_stack.append("") # anonymous
i = m + 1
continue
# class/struct/enum guard
if clean_text.startswith("class ", i) or clean_text.startswith("struct ", i) or clean_text.startswith("enum ", i):
# Enter body at next '{'
m = clean_text.find("{", i)
if m != -1:
class_depth += 1
i = m + 1
continue
if clean_text[i] == '}':
if class_depth > 0:
class_depth -= 1
elif ns_stack:
ns_stack.pop()
i += 1
continue
# Try a function head only if not inside a class-like body
if class_depth == 0:
m = _FREE_FN_RE.match(clean_text, i)
if m:
name = m.group("name")
head = m.group("head")
# filter obvious false positives: require a return type before name
# (very rough: there must be at least one space before name inside head)
if re.search(r"\S\s+" + re.escape(name) + r"\s*\(", head):
qualified = "::".join([ns for ns in ns_stack if ns]) # drop anonymous
qualified = f"{qualified}::{name}" if qualified else name
# Build a tidy signature
ender = m.group("ender")
signature = _collapse_ws(head) + ender
line = clean_text.count("\n", 0, m.start("head")) + 1
syms.append(Symbol(kind="free_function",
qualified=qualified,
signature=signature,
file=relpath,
line=line))
i = m.end()
continue
i += 1
return syms
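# Illustrative: for the multi-line prototype shown above _FREE_FN_RE,
#   inline void foo(A a,
#                   B b = std::nullopt) noexcept;
# the recorded signature collapses to
#   'inline void foo(A a, B b = std::nullopt) noexcept;'
# and 'qualified' becomes 'dofs::foo' when the prototype sits inside namespace dofs.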
# -------- Macro extraction (function-like only) --------
_MACRO_HEAD_RE = re.compile(r'^\s*#\s*define\s+([A-Za-z_]\w*)\s*\((.*)$')
def extract_function_like_macros(text: str, relpath: str) -> List[Symbol]:
"""
Capture lines of the form:
#define NAME(args) <body...>
with multi-line bodies using backslash continuations.
We record: kind="macro", qualified=NAME, signature="#define NAME(args)".
"""
syms: List[Symbol] = []
lines = text.splitlines()
i = 0
while i < len(lines):
line = lines[i]
m = _MACRO_HEAD_RE.match(line)
if not m:
i += 1
continue
name = m.group(1)
args_part = m.group(2) # may or may not contain closing ')'
start_line = i + 1
# Collect continuation lines while trailing backslash exists.
body_lines = [line]
i += 1
while i < len(lines) and body_lines[-1].rstrip().endswith("\\"):
body_lines.append(lines[i])
i += 1
# Reconstruct just the macro head (name + (...) args text).
head = "".join(body_lines)
# Try to extract the argument list reliably (balanced parens from first '(')
# without being confused by body parentheses.
head_from_paren = head[head.find("("):] if "(" in head else ""
# Minimal balanced scan to the first matching ')'
par = 0
arg_end = -1
for idx, ch in enumerate(head_from_paren):
if ch == "(":
par += 1
elif ch == ")":
par -= 1
if par == 0:
arg_end = idx
break
if arg_end != -1:
arg_text = head_from_paren[1:arg_end] # inside (...)
else:
# Fallback: whatever we saw on the first line
arg_text = args_part.split(")")[0]
signature = f"#define {name}({arg_text.strip()})"
syms.append(Symbol(kind="macro",
qualified=name,
signature=signature,
file=relpath,
line=start_line))
return syms
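# Illustrative (hypothetical macro): given
#   #define DOFS_LOG_ERROR(fmt, ...) \
#       ::dofs::detail::log_impl(fmt, ##__VA_ARGS__)
# the recorded Symbol has kind="macro", qualified="DOFS_LOG_ERROR" and
# signature='#define DOFS_LOG_ERROR(fmt, ...)'.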
# -------- Driver --------
def main():
ap = argparse.ArgumentParser(description="Extract DOFS public API (per-header docs).")
ap.add_argument("--src", default=str(SRC_ROOT), help="Source root (default: repo/src)")
ap.add_argument("--out-dir", default=str(OUT_DIR_DEFAULT), help="Docs root to mirror into (default: docs)")
ap.add_argument("--stdout", action="store_true", help="Print JSON to stdout instead of writing files")
args = ap.parse_args()
src_root = Path(args.src).resolve()
out_root = Path(args.out_dir).resolve()
all_symbols: List[Symbol] = []
header_paths = iter_headers(src_root)
for hp in header_paths:
rel_repo = hp.relative_to(REPO_ROOT).as_posix() # e.g., src/core/simulator.h
raw = read_text(hp)
clean = strip_comments_and_literals(raw)
p = Parser(clean, rel_repo)
# C++ functions/methods (public) inside namespace dofs
parsed = p.run()
all_symbols.extend(parsed)
# Multi-line free functions (e.g., log_error in error.h)
extra_fns = extract_free_functions_multiline(clean, rel_repo)
# De-duplicate by (kind, qualified, signature, file, line)
seen = { (s.kind, s.qualified, s.signature, s.file, s.line) for s in all_symbols }
for s in extra_fns:
key = (s.kind, s.qualified, s.signature, s.file, s.line)
if key not in seen:
all_symbols.append(s)
seen.add(key)
# Function-like macros (global, regardless of namespace)
all_symbols.extend(extract_function_like_macros(raw, rel_repo))
if args.stdout:
print(to_json(all_symbols))
return
# Write index.json under docs/
out_root.mkdir(parents=True, exist_ok=True)
(out_root / "index.json").write_text(to_json(all_symbols), encoding="utf-8")
# Emit one markdown per header, mirroring src/ -> docs/
# src/<subpath>.h => docs/<subpath>.md
for hp in header_paths:
rel_from_repo = hp.relative_to(REPO_ROOT).as_posix() # src/...
rel_from_src = hp.relative_to(src_root).with_suffix(".md") # core/simulator.md
target_path = out_root / rel_from_src
target_path.parent.mkdir(parents=True, exist_ok=True)
md = _markdown_for_file(rel_from_repo, all_symbols)
target_path.write_text(md, encoding="utf-8")
print(f"[extract_api] Wrote JSON index: {out_root/'index.json'}")
print(f"[extract_api] Wrote per-header Markdown under: {out_root}")
if __name__ == "__main__":
main()