{
  "model": "language_tail_kv_int4.onnx",
  "type": "merged KV-cache graph (inputs_embeds injected; input_ids for mask only)",
  "inputs": [
    "input_ids",
    "inputs_embeds",
    "attention_mask",
    "position_ids",
    "past_key_{0..35}",
    "past_value_{0..35}"
  ],
  "outputs": [
    "logits",
    "present_key_{0..35}",
    "present_value_{0..35}"
  ],
  "kv": {
    "n_layers": 36,
    "kv_heads": 2,
    "head_dim": 128
  },
  "int4_size_GB": 1.65,
  "validation": {
    "prefill_empty_past": "ok",
    "next_token": "<ref> (151672) == torch",
    "boxes": 6,
    "result": "OK"
  },
  "speed_note": "CPU MatMulNBits is slow (~0.27 tok/s int4); WebGPU is the deployment target. KV cache makes decode ~13x faster than the cache-less graph."
}