Reza2kn committed on
Commit
4afd974
·
verified ·
1 Parent(s): ed74582

Add KV-cache INT4 language graph for in-browser WebGPU

Browse files
Files changed (1) hide show
  1. kv_validation_report.json +30 -0
kv_validation_report.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": "language_tail_kv_int4.onnx",
3
+ "type": "merged KV-cache graph (inputs_embeds injected; input_ids for mask only)",
4
+ "inputs": [
5
+ "input_ids",
6
+ "inputs_embeds",
7
+ "attention_mask",
8
+ "position_ids",
9
+ "past_key_{0..35}",
10
+ "past_value_{0..35}"
11
+ ],
12
+ "outputs": [
13
+ "logits",
14
+ "present_key_{0..35}",
15
+ "present_value_{0..35}"
16
+ ],
17
+ "kv": {
18
+ "n_layers": 36,
19
+ "kv_heads": 2,
20
+ "head_dim": 128
21
+ },
22
+ "int4_size_GB": 1.65,
23
+ "validation": {
24
+ "prefill_empty_past": "ok",
25
+ "next_token": "<ref> (151672) == torch",
26
+ "boxes": 6,
27
+ "result": "OK"
28
+ },
29
+ "speed_note": "CPU MatMulNBits is slow (~0.27 tok/s int4); WebGPU is the deployment target. KV cache makes decode ~13x faster than the cache-less graph."
30
+ }