Reza2kn
/

LocateAnything-3B-ONNX-WebGPU-INT4

+{
+  "model": "language_tail_kv_int4.onnx",
+  "type": "merged KV-cache graph (inputs_embeds injected; input_ids for mask only)",
+  "inputs": [
+    "input_ids",
+    "inputs_embeds",
+    "attention_mask",
+    "position_ids",
+    "past_key_{0..35}",
+    "past_value_{0..35}"
+  ],
+  "outputs": [
+    "logits",
+    "present_key_{0..35}",
+    "present_value_{0..35}"
+  ],
+  "kv": {
+    "n_layers": 36,
+    "kv_heads": 2,
+    "head_dim": 128
+  },
+  "int4_size_GB": 1.65,
+  "validation": {
+    "prefill_empty_past": "ok",
+    "next_token": "<ref> (151672) == torch",
+    "boxes": 6,
+    "result": "OK"
+  },
+  "speed_note": "MatMulNBits on CPU is slow (~0.27 tok/s with the int4 model); WebGPU is the deployment target. The KV cache makes decoding ~13x faster than the cache-less graph."
+}