#!/usr/bin/env python3
"""#23 facet-inclusive REAP calibration mix — protect EVERY facet's experts in the next prune.

The v4 prune used a CODE-FIRST calibration → it cost design soul (the aesthetic experts weren't exercised, so
REAP saliency under-weighted them). The deepest fix isn't healing design back — it's calibrating the prune on
ALL facets so their experts survive in the first place. This assembles a balanced calibration covering the 7
facets (design/dataviz/code/security/math/prose/architecture) + core capabilities, anchored on the elite gold
seeds (the canonical signal that fires each facet's experts). Feeds scripts/23_stream_calibrate for the re-prune.

  python scripts/78_facet_calib.py
"""
import glob
import json
import os
import random
import sys
from collections import Counter

HERE = os.path.dirname(__file__)
ROOT = os.path.join(HERE, "..")
random.seed(0)


def _text(row):
    return "\n".join(m.get("content", "") for m in row.get("messages", []) if m.get("content"))


def _sample(rel, k):
    fp = os.path.join(ROOT, rel)
    if not os.path.exists(fp):
        return []
    rows = [json.loads(line) for line in open(fp) if line.strip()]
    random.shuffle(rows)
    out = [_text(r) for r in rows]
    return [t for t in out if t][:k]


def main():
    mix = []
    # 1) the elite facet seeds — the canonical exemplar that exercises each facet's experts (the key signal)
    for fp in sorted(glob.glob(os.path.join(ROOT, "heal/facets/seeds/*"))):
        if not os.path.isfile(fp) or fp.endswith(".pyc"):
            continue
        mix.append((os.path.basename(fp).split("_")[0], open(fp, encoding="utf-8", errors="ignore").read()))
    for t in _sample("heal/design/seeds.jsonl", 9):
        mix.append(("design", t))
    # the facet CANONS — dense facet vocabulary (OKLCH/Tufte/Saltzer/Erdős/…), a strong per-facet REAP activator
    # that exercises each facet's experts so a harder prune (14/7GB) keeps them. Full balance comes from the flywheel.
    sys.path.insert(0, os.path.join(ROOT, "src"))
    from soul import FACETS  # noqa: E402
    for name, facet in FACETS.items():
        mix.append((name, facet.canon))
    # 2) balanced samples per capability from the heal corpora
    for cap, rel, k in [("design", "heal/design/train.jsonl", 40),
                        ("math", "heal/lean/train.jsonl", 40),
                        ("retrieval", "heal/callsieve/train.jsonl", 40),
                        ("general", "heal/data-v4/train.jsonl", 60)]:
        for t in _sample(rel, k):
            mix.append((cap, t))
    random.shuffle(mix)

    out = os.path.join(ROOT, "calib", "facet_mix.jsonl")
    os.makedirs(os.path.dirname(out), exist_ok=True)
    with open(out, "w") as f:
        for cap, text in mix:
            f.write(json.dumps({"text": text[:4000], "facet": cap}) + "\n")

    counts = Counter(c for c, _ in mix)
    print(f"  facet-inclusive calibration: {len(mix)} samples → {out}")
    for cap, n in counts.most_common():
        print(f"    {cap:13s}: {n}")
    facets = {"design", "dataviz", "code", "security", "math", "prose", "architecture"}
    covered = facets & set(counts)
    print(f"  facet coverage: {len(covered)}/7 {sorted(covered)}")
    print("  → REAP saliency on THIS keeps every facet's experts; the next prune loses no soul (the deepest fix)")
    assert len(mix) > 100 and len(covered) >= 6, (len(mix), covered)


if __name__ == "__main__":
    main()