#!/usr/bin/env python3
"""#17 design-soul flywheel — make the model NATIVELY elite (verify-everything applied to design).

Mirrors the Lean flywheel (66/69): generate (CANON-steered) → audit_design gate →
SELF-CORRECT from the real violations → keep ONLY elite → SFT corpus → heal.
After the heal the design PRIOR is elite, so the raw model produces oklch/grid/scale
designs with no prompt and no constraint = elite out of the box.

  python scripts/76_design_flywheel.py --selftest            # CPU: audit gate + self-correct + corpus logic
  python scripts/76_design_flywheel.py --n 200 --correct 2   # GPU: full flywheel (after miniF2F frees the GPU)
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.request

# design_canon lives in ../src relative to this script, so extend sys.path before importing it.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from design_canon import CANON, audit_design  # noqa: E402

# Prompts are cycled round-robin by main() (index i % len(DESIGN_PROMPTS)).
DESIGN_PROMPTS = [
    "A SaaS pricing page with three tiers.",
    "A dashboard card showing a KPI with a sparkline.",
    "A login form with email and password.",
    "A hero section for a developer-tools landing page.",
    "A settings panel with labelled toggles in sections.",
    "A pricing comparison table with a highlighted plan.",
    "A notification toast with icon, message, and dismiss.",
    "A profile card with avatar, name, bio, and two actions.",
]

# First fenced block, optionally tagged ```html or ```css; DOTALL so the body may span lines.
_FENCE_RE = re.compile(r"```(?:html|css)?\s*\n(.*?)```", re.S)

# Total attempts per request (first try + retries) before the error is re-raised.
_MAX_ATTEMPTS = 4


def _post(base_url, model, msgs, max_tokens=1400):
    """POST a chat-completion request and return the assistant message text.

    Args:
        base_url: API root; "/chat/completions" is appended (a trailing "/" is tolerated).
        model: value sent as the "model" field.
        msgs: list of chat messages sent as the "messages" field.
        max_tokens: value sent as the "max_tokens" field.

    Returns:
        The "content" of the first choice's message, or "" when it is missing/empty.

    Raises:
        Whatever the final attempt raised, after _MAX_ATTEMPTS failed attempts.
    """
    body = json.dumps({
        "model": model,
        "messages": msgs,
        "temperature": 0.6,
        "max_tokens": max_tokens,
        "chat_template_kwargs": {"enable_thinking": False},
    }).encode()
    url = base_url.rstrip("/") + "/chat/completions"
    for attempt in range(_MAX_ATTEMPTS):  # retry transient server hiccups (hardened, like 66_prove)
        try:
            req = urllib.request.Request(url, body, {"Content-Type": "application/json"})
            # Context manager closes the HTTP response even when parsing below fails.
            with urllib.request.urlopen(req, timeout=300) as resp:
                message = json.loads(resp.read())["choices"][0]["message"]
            return message.get("content") or ""
        except Exception:  # noqa: BLE001 — deliberately broad: any failure is retried, the last one re-raised
            if attempt == _MAX_ATTEMPTS - 1:
                raise
            time.sleep(2 * (attempt + 1))  # linear backoff: 2s, 4s, 6s


def extract_css(text):
    """Return the body of the first fenced code block in `text`, stripped.

    Falls back to the whole of `text` (stripped) when no fence matches.
    """
    m = _FENCE_RE.search(text)
    return m.group(1).strip() if m else text.strip()


def prove_elite(base_url, model, prompt, correct=2, _gen=None):
    """Generate → audit → self-correct from the real violations → return (elite_code|None, trajectory).

    `_gen(msgs)->text` is injectable so the CPU selftest can run the whole loop with no model/GPU.

    Args:
        base_url, model: forwarded to _post() when `_gen` is not supplied.
        prompt: the user design request.
        correct: number of self-correction rounds after the first generation
            (so at most `correct + 1` generations happen).
        _gen: optional callable replacing the network call.

    Returns:
        (code, trajectory) when a generation passes audit_design with no violations,
        where trajectory is the full message list ending with the accepted assistant turn;
        otherwise (None, trajectory) with every failed turn and its feedback included.
    """
    gen = _gen or (lambda msgs: _post(base_url, model, msgs))
    msgs = [{"role": "system", "content": CANON},
            {"role": "user", "content": prompt}]
    for _ in range(correct + 1):
        out = gen(msgs)
        code = extract_css(out)
        violations = audit_design(code)
        if not code and not violations:
            # An empty response has nothing to violate, but it is not an elite design either:
            # never return "" as code — ask for another attempt instead.
            violations = ["the response contained no code"]
        if not violations:  # ELITE — keep it
            return code, msgs + [{"role": "assistant", "content": out}]
        msgs += [
            {"role": "assistant", "content": out},
            # self-correct: feed the exact violations back
            {"role": "user",
             "content": "Not elite yet — fix these and output ONLY the corrected code:\n- "
                        + "\n- ".join(violations)},
        ]
    return None, msgs


def main():
    """CLI entry point: run the selftest, or generate `--n` designs and write the kept ones.

    Each kept sample is written to `--out` as one JSON line {"messages": trajectory}.
    The output file is truncated at start ("w" mode).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--selftest", action="store_true")
    ap.add_argument("--n", type=int, default=200)
    ap.add_argument("--correct", type=int, default=2)
    ap.add_argument("--base-url", default="http://127.0.0.1:8080/v1")
    ap.add_argument("--model", default="models/GLM-5.2-q3a4-v4")
    ap.add_argument("--out", default=os.path.join(os.path.dirname(__file__), "..", "heal", "design", "train.jsonl"))
    args = ap.parse_args()
    if args.selftest:
        return _selftest()

    out_dir = os.path.dirname(args.out)
    if out_dir:  # a bare filename has no directory part; os.makedirs("") would raise
        os.makedirs(out_dir, exist_ok=True)

    kept = tries = 0
    with open(args.out, "w", encoding="utf-8") as f:
        for i in range(args.n):
            prompt = DESIGN_PROMPTS[i % len(DESIGN_PROMPTS)]
            tries += 1
            code, traj = prove_elite(args.base_url, args.model, prompt, args.correct)
            if code:  # keep ONLY verified-elite → the heal corpus
                f.write(json.dumps({"messages": traj}) + "\n")
                kept += 1
            print(f" [{i + 1}/{args.n}] {'✓ elite' if code else '✗ gave up'} ({kept}/{tries} kept)", flush=True)
    print(f"\n DESIGN-FLYWHEEL: {kept}/{tries} elite designs → {args.out} (SFT this → native-elite prior)")


def _selftest():
    """GPU-free check of the audit gate, the self-correct loop and the keep-only-elite filter.

    Uses plain `assert`, so it checks nothing when Python runs with -O.
    """
    elite = (":root{--brand:oklch(0.62 0.19 255)}\n.card{padding:16px;margin:24px 0;gap:8px;"
             "border:1px solid oklch(0.9 0.02 255);font-size:16px;color:oklch(0.2 0.02 255)}")
    bad = ".card{padding:13px;margin:5px;background:#fff;color:rgb(0,0,0);font-size:17px}"
    assert audit_design(elite) == [] and len(audit_design(bad)) == 3

    # the self-correct loop: mock a model that emits BAD first, then ELITE after seeing violations
    seq = iter([f"```css\n{bad}\n```", f"```css\n{elite}\n```"])
    code, traj = prove_elite("", "", DESIGN_PROMPTS[0], correct=2, _gen=lambda msgs: next(seq))
    assert code is not None and audit_design(code) == [], code
    assert sum(1 for m in traj if m["role"] == "user") == 2, "should have fed violations back once"

    # corpus filter: a model that never gets it right → nothing kept
    code2, _ = prove_elite("", "", DESIGN_PROMPTS[0], correct=1, _gen=lambda msgs: f"```css\n{bad}\n```")
    assert code2 is None

    print(" design_flywheel selftest PASS — audit gate + self-correct-from-violations + keep-only-elite, GPU-free")
    print(" (generate-step needs the GPU; run after miniF2F → heal/design/train.jsonl → SFT = native-elite prior)")


if __name__ == "__main__":
    main()