{
  "model": "language_tail_kv_int4.onnx",
  "type": "merged KV-cache graph (inputs_embeds injected; input_ids for mask only)",
  "inputs": [
    "input_ids",
    "inputs_embeds",
    "attention_mask",
    "position_ids",
    "past_key_{0..35}",
    "past_value_{0..35}"
  ],
  "outputs": [
    "logits",
    "present_key_{0..35}",
    "present_value_{0..35}"
  ],
  "kv": {
    "n_layers": 36,
    "kv_heads": 2,
    "head_dim": 128
  },
  "int4_size_GB": 1.65,
  "validation": {
    "prefill_empty_past": "ok",
    "next_token": " (151672) == torch",
    "boxes": 6,
    "result": "OK"
  },
  "speed_note": "CPU MatMulNBits is slow (~0.27 tok/s int4); WebGPU is the deployment target. KV cache makes decode ~13x faster than the cache-less graph."
}