evaluation_results.json · Tinman-Lab/Tinman-SmolOmni-MLA-Toolkit at main

Add evaluation results: CLIP score, VQA, architecture verification

6674fdc verified about 2 months ago

768 Bytes

	{
	"eval_date": "2026-04-26",
	"model_500M": {
	"checkpoint": "stage2_final/model.pt",
	"tests": {
	"text_understanding": {"status": "PASS", "result": "Coherent next-token predictions"},
	"image_generation_pipeline": {"status": "PASS/FAIL", "clip_score": 0.1118, "verdict": "Near-random. Needs 10-50x more data/steps."},
	"vqa_chartqa": {"status": "FAIL (expected)", "accuracy": "0/10 (0.0%)", "reason": "No VQA instruction tuning performed."},
	"moonshine_audio": {"status": "PASS", "wer": "~3.5%", "latency": "0.1s"},
	"kv_cache": {"status": "PASS", "reduction": "40.6%"}
	}
	},
	"conclusion": "Architecture validated. Quality not production-grade. Image gen needs $500-5000 more training. VQA needs $5-20. ASR use Moonshine."
	}