| { | |
| "eval_date": "2026-04-26", | |
| "model_500M": { | |
| "checkpoint": "stage2_final/model.pt", | |
| "tests": { | |
| "text_understanding": {"status": "PASS", "result": "Coherent next-token predictions"}, | |
| "image_generation_pipeline": {"status": "PASS/FAIL", "clip_score": 0.1118, "verdict": "Near-random. Needs 10-50x more data/steps."}, | |
| "vqa_chartqa": {"status": "FAIL (expected)", "accuracy": "0/10 (0.0%)", "reason": "No VQA instruction tuning performed."}, | |
| "moonshine_audio": {"status": "PASS", "wer": "~3.5%", "latency": "0.1s"}, | |
| "kv_cache": {"status": "PASS", "reduction": "40.6%"} | |
| } | |
| }, | |
| "conclusion": "Architecture validated. Quality is not production-grade. Image generation needs $500-5000 of additional training. VQA needs $5-20 of additional training. For ASR, use Moonshine." | |
| } |