rdtand committed on
Commit
09de726
·
verified ·
1 Parent(s): e7c8b12

v5: max-not-sum sibling aggregation, kernel shape mask, joint input_global — validator: ppl=4.16, mean_NLL=1.43, MTP P0=89.5%

Browse files
config.json CHANGED
@@ -165,60 +165,44 @@
165
  "zp_dtype": "torch.uint8"
166
  },
167
  "targets": [
168
- "re:^language_model[.]model[.]layers[.]11[.]self_attn[.]k_proj$",
169
- "re:^language_model[.]model[.]layers[.]11[.]self_attn[.]q_proj$",
170
- "re:^language_model[.]model[.]layers[.]11[.]self_attn[.]qkv_proj$",
171
- "re:^language_model[.]model[.]layers[.]11[.]self_attn[.]v_proj$",
172
- "re:^language_model[.]model[.]layers[.]15[.]self_attn[.]k_proj$",
173
- "re:^language_model[.]model[.]layers[.]15[.]self_attn[.]q_proj$",
174
- "re:^language_model[.]model[.]layers[.]15[.]self_attn[.]qkv_proj$",
175
- "re:^language_model[.]model[.]layers[.]15[.]self_attn[.]v_proj$",
176
- "re:^language_model[.]model[.]layers[.]27[.]self_attn[.]o_proj$",
177
- "re:^language_model[.]model[.]layers[.]28[.]linear_attn[.]in_proj_qkv$",
178
- "re:^language_model[.]model[.]layers[.]28[.]linear_attn[.]in_proj_qkvz$",
179
- "re:^language_model[.]model[.]layers[.]28[.]linear_attn[.]in_proj_z$",
180
  "re:^language_model[.]model[.]layers[.]29[.]linear_attn[.]in_proj_qkv$",
181
  "re:^language_model[.]model[.]layers[.]29[.]linear_attn[.]in_proj_qkvz$",
182
  "re:^language_model[.]model[.]layers[.]29[.]linear_attn[.]in_proj_z$",
183
- "re:^language_model[.]model[.]layers[.]32[.]mlp[.]gate_proj$",
184
- "re:^language_model[.]model[.]layers[.]32[.]mlp[.]gate_up_proj$",
185
- "re:^language_model[.]model[.]layers[.]32[.]mlp[.]up_proj$",
186
- "re:^language_model[.]model[.]layers[.]35[.]self_attn[.]o_proj$",
 
 
187
  "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_qkv$",
188
  "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_qkvz$",
189
  "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_z$",
190
- "re:^language_model[.]model[.]layers[.]45[.]mlp[.]gate_proj$",
191
- "re:^language_model[.]model[.]layers[.]45[.]mlp[.]gate_up_proj$",
192
- "re:^language_model[.]model[.]layers[.]45[.]mlp[.]up_proj$",
193
- "re:^language_model[.]model[.]layers[.]46[.]linear_attn[.]out_proj$",
194
  "re:^language_model[.]model[.]layers[.]46[.]mlp[.]gate_proj$",
195
  "re:^language_model[.]model[.]layers[.]46[.]mlp[.]gate_up_proj$",
196
  "re:^language_model[.]model[.]layers[.]46[.]mlp[.]up_proj$",
197
  "re:^language_model[.]model[.]layers[.]47[.]mlp[.]gate_proj$",
198
  "re:^language_model[.]model[.]layers[.]47[.]mlp[.]gate_up_proj$",
199
  "re:^language_model[.]model[.]layers[.]47[.]mlp[.]up_proj$",
200
- "re:^language_model[.]model[.]layers[.]47[.]self_attn[.]o_proj$",
 
 
 
201
  "re:^language_model[.]model[.]layers[.]56[.]linear_attn[.]in_proj_qkv$",
202
  "re:^language_model[.]model[.]layers[.]56[.]linear_attn[.]in_proj_qkvz$",
203
  "re:^language_model[.]model[.]layers[.]56[.]linear_attn[.]in_proj_z$",
204
- "re:^language_model[.]model[.]layers[.]59[.]self_attn[.]k_proj$",
205
- "re:^language_model[.]model[.]layers[.]59[.]self_attn[.]o_proj$",
206
- "re:^language_model[.]model[.]layers[.]59[.]self_attn[.]q_proj$",
207
- "re:^language_model[.]model[.]layers[.]59[.]self_attn[.]qkv_proj$",
208
- "re:^language_model[.]model[.]layers[.]59[.]self_attn[.]v_proj$",
209
- "re:^language_model[.]model[.]layers[.]63[.]mlp[.]down_proj$",
210
- "re:^language_model[.]model[.]layers[.]7[.]self_attn[.]k_proj$",
211
- "re:^language_model[.]model[.]layers[.]7[.]self_attn[.]q_proj$",
212
- "re:^language_model[.]model[.]layers[.]7[.]self_attn[.]qkv_proj$",
213
- "re:^language_model[.]model[.]layers[.]7[.]self_attn[.]v_proj$",
214
- "re:^visual[.]blocks[.]10[.]attn[.]qkv$",
215
- "re:^visual[.]blocks[.]11[.]attn[.]proj$",
216
- "re:^visual[.]blocks[.]11[.]attn[.]qkv$",
217
- "re:^visual[.]blocks[.]12[.]attn[.]qkv$",
218
- "re:^visual[.]blocks[.]13[.]attn[.]proj$",
219
- "re:^visual[.]blocks[.]14[.]attn[.]qkv$",
220
  "re:^visual[.]blocks[.]6[.]attn[.]qkv$",
221
- "re:^visual[.]blocks[.]7[.]attn[.]qkv$",
 
 
222
  "re:^visual[.]blocks[.]9[.]attn[.]qkv$"
223
  ]
224
  },
@@ -324,6 +308,9 @@
324
  "re:^language_model[.]model[.]layers[.]17[.]mlp[.]gate_proj$",
325
  "re:^language_model[.]model[.]layers[.]17[.]mlp[.]gate_up_proj$",
326
  "re:^language_model[.]model[.]layers[.]17[.]mlp[.]up_proj$",
 
 
 
327
  "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_qkv$",
328
  "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_qkvz$",
329
  "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_z$",
@@ -336,11 +323,7 @@
336
  "re:^language_model[.]model[.]layers[.]19[.]mlp[.]gate_proj$",
337
  "re:^language_model[.]model[.]layers[.]19[.]mlp[.]gate_up_proj$",
338
  "re:^language_model[.]model[.]layers[.]19[.]mlp[.]up_proj$",
339
- "re:^language_model[.]model[.]layers[.]19[.]self_attn[.]k_proj$",
340
  "re:^language_model[.]model[.]layers[.]19[.]self_attn[.]o_proj$",
341
- "re:^language_model[.]model[.]layers[.]19[.]self_attn[.]q_proj$",
342
- "re:^language_model[.]model[.]layers[.]19[.]self_attn[.]qkv_proj$",
343
- "re:^language_model[.]model[.]layers[.]19[.]self_attn[.]v_proj$",
344
  "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_qkv$",
345
  "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_qkvz$",
346
  "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_z$",
@@ -349,6 +332,9 @@
349
  "re:^language_model[.]model[.]layers[.]2[.]mlp[.]gate_proj$",
350
  "re:^language_model[.]model[.]layers[.]2[.]mlp[.]gate_up_proj$",
351
  "re:^language_model[.]model[.]layers[.]2[.]mlp[.]up_proj$",
 
 
 
352
  "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_qkv$",
353
  "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_qkvz$",
354
  "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_z$",
@@ -383,11 +369,7 @@
383
  "re:^language_model[.]model[.]layers[.]23[.]mlp[.]gate_proj$",
384
  "re:^language_model[.]model[.]layers[.]23[.]mlp[.]gate_up_proj$",
385
  "re:^language_model[.]model[.]layers[.]23[.]mlp[.]up_proj$",
386
- "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]k_proj$",
387
  "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]o_proj$",
388
- "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]q_proj$",
389
- "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]qkv_proj$",
390
- "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]v_proj$",
391
  "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_qkv$",
392
  "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_qkvz$",
393
  "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_z$",
@@ -416,6 +398,10 @@
416
  "re:^language_model[.]model[.]layers[.]27[.]mlp[.]gate_proj$",
417
  "re:^language_model[.]model[.]layers[.]27[.]mlp[.]gate_up_proj$",
418
  "re:^language_model[.]model[.]layers[.]27[.]mlp[.]up_proj$",
 
 
 
 
419
  "re:^language_model[.]model[.]layers[.]28[.]linear_attn[.]out_proj$",
420
  "re:^language_model[.]model[.]layers[.]28[.]mlp[.]down_proj$",
421
  "re:^language_model[.]model[.]layers[.]28[.]mlp[.]gate_proj$",
@@ -442,6 +428,9 @@
442
  "re:^language_model[.]model[.]layers[.]31[.]mlp[.]up_proj$",
443
  "re:^language_model[.]model[.]layers[.]32[.]linear_attn[.]out_proj$",
444
  "re:^language_model[.]model[.]layers[.]32[.]mlp[.]down_proj$",
 
 
 
445
  "re:^language_model[.]model[.]layers[.]33[.]linear_attn[.]out_proj$",
446
  "re:^language_model[.]model[.]layers[.]33[.]mlp[.]down_proj$",
447
  "re:^language_model[.]model[.]layers[.]33[.]mlp[.]gate_proj$",
@@ -456,6 +445,10 @@
456
  "re:^language_model[.]model[.]layers[.]35[.]mlp[.]gate_proj$",
457
  "re:^language_model[.]model[.]layers[.]35[.]mlp[.]gate_up_proj$",
458
  "re:^language_model[.]model[.]layers[.]35[.]mlp[.]up_proj$",
 
 
 
 
459
  "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_qkv$",
460
  "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_qkvz$",
461
  "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_z$",
@@ -487,11 +480,7 @@
487
  "re:^language_model[.]model[.]layers[.]39[.]mlp[.]gate_proj$",
488
  "re:^language_model[.]model[.]layers[.]39[.]mlp[.]gate_up_proj$",
489
  "re:^language_model[.]model[.]layers[.]39[.]mlp[.]up_proj$",
490
- "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]k_proj$",
491
  "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]o_proj$",
492
- "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]q_proj$",
493
- "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]qkv_proj$",
494
- "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]v_proj$",
495
  "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_a$",
496
  "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_b$",
497
  "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_ba$",
@@ -538,8 +527,13 @@
538
  "re:^language_model[.]model[.]layers[.]44[.]mlp[.]up_proj$",
539
  "re:^language_model[.]model[.]layers[.]45[.]linear_attn[.]out_proj$",
540
  "re:^language_model[.]model[.]layers[.]45[.]mlp[.]down_proj$",
 
 
 
 
541
  "re:^language_model[.]model[.]layers[.]46[.]mlp[.]down_proj$",
542
  "re:^language_model[.]model[.]layers[.]47[.]mlp[.]down_proj$",
 
543
  "re:^language_model[.]model[.]layers[.]48[.]linear_attn[.]out_proj$",
544
  "re:^language_model[.]model[.]layers[.]48[.]mlp[.]down_proj$",
545
  "re:^language_model[.]model[.]layers[.]48[.]mlp[.]gate_proj$",
@@ -570,11 +564,7 @@
570
  "re:^language_model[.]model[.]layers[.]51[.]mlp[.]gate_proj$",
571
  "re:^language_model[.]model[.]layers[.]51[.]mlp[.]gate_up_proj$",
572
  "re:^language_model[.]model[.]layers[.]51[.]mlp[.]up_proj$",
573
- "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]k_proj$",
574
  "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]o_proj$",
575
- "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]q_proj$",
576
- "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]qkv_proj$",
577
- "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]v_proj$",
578
  "re:^language_model[.]model[.]layers[.]52[.]linear_attn[.]in_proj_a$",
579
  "re:^language_model[.]model[.]layers[.]52[.]linear_attn[.]in_proj_b$",
580
  "re:^language_model[.]model[.]layers[.]52[.]linear_attn[.]in_proj_ba$",
@@ -642,6 +632,7 @@
642
  "re:^language_model[.]model[.]layers[.]59[.]mlp[.]gate_proj$",
643
  "re:^language_model[.]model[.]layers[.]59[.]mlp[.]gate_up_proj$",
644
  "re:^language_model[.]model[.]layers[.]59[.]mlp[.]up_proj$",
 
645
  "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_qkv$",
646
  "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_qkvz$",
647
  "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_z$",
@@ -680,6 +671,7 @@
680
  "re:^language_model[.]model[.]layers[.]62[.]mlp[.]gate_proj$",
681
  "re:^language_model[.]model[.]layers[.]62[.]mlp[.]gate_up_proj$",
682
  "re:^language_model[.]model[.]layers[.]62[.]mlp[.]up_proj$",
 
683
  "re:^language_model[.]model[.]layers[.]63[.]self_attn[.]o_proj$",
684
  "re:^language_model[.]model[.]layers[.]7[.]mlp[.]down_proj$",
685
  "re:^language_model[.]model[.]layers[.]7[.]mlp[.]gate_proj$",
@@ -717,16 +709,23 @@
717
  "re:^mtp[.]layers[.]0[.]self_attn[.]q_proj$",
718
  "re:^mtp[.]layers[.]0[.]self_attn[.]qkv_proj$",
719
  "re:^mtp[.]layers[.]0[.]self_attn[.]v_proj$",
 
 
720
  "re:^visual[.]blocks[.]10[.]mlp[.]linear_fc1$",
721
  "re:^visual[.]blocks[.]10[.]mlp[.]linear_fc2$",
 
 
722
  "re:^visual[.]blocks[.]11[.]mlp[.]linear_fc1$",
723
  "re:^visual[.]blocks[.]11[.]mlp[.]linear_fc2$",
724
  "re:^visual[.]blocks[.]12[.]attn[.]proj$",
 
725
  "re:^visual[.]blocks[.]12[.]mlp[.]linear_fc1$",
726
  "re:^visual[.]blocks[.]12[.]mlp[.]linear_fc2$",
 
727
  "re:^visual[.]blocks[.]13[.]mlp[.]linear_fc1$",
728
  "re:^visual[.]blocks[.]13[.]mlp[.]linear_fc2$",
729
  "re:^visual[.]blocks[.]14[.]attn[.]proj$",
 
730
  "re:^visual[.]blocks[.]14[.]mlp[.]linear_fc1$",
731
  "re:^visual[.]blocks[.]14[.]mlp[.]linear_fc2$",
732
  "re:^visual[.]blocks[.]15[.]attn[.]proj$",
@@ -766,15 +765,18 @@
766
  "re:^visual[.]blocks[.]24[.]attn[.]proj$",
767
  "re:^visual[.]blocks[.]24[.]attn[.]qkv$",
768
  "re:^visual[.]blocks[.]24[.]mlp[.]linear_fc1$",
 
769
  "re:^visual[.]blocks[.]25[.]attn[.]proj$",
770
  "re:^visual[.]blocks[.]25[.]attn[.]qkv$",
771
  "re:^visual[.]blocks[.]25[.]mlp[.]linear_fc1$",
772
  "re:^visual[.]blocks[.]26[.]attn[.]proj$",
773
  "re:^visual[.]blocks[.]26[.]attn[.]qkv$",
774
  "re:^visual[.]blocks[.]26[.]mlp[.]linear_fc1$",
 
775
  "re:^visual[.]blocks[.]5[.]mlp[.]linear_fc2$",
776
  "re:^visual[.]blocks[.]6[.]mlp[.]linear_fc1$",
777
  "re:^visual[.]blocks[.]6[.]mlp[.]linear_fc2$",
 
778
  "re:^visual[.]blocks[.]7[.]mlp[.]linear_fc1$",
779
  "re:^visual[.]blocks[.]7[.]mlp[.]linear_fc2$",
780
  "re:^visual[.]blocks[.]8[.]mlp[.]linear_fc1$",
@@ -796,6 +798,10 @@
796
  "language_model.model.layers.10.linear_attn.in_proj_a",
797
  "language_model.model.layers.10.linear_attn.in_proj_b",
798
  "language_model.model.layers.10.linear_attn.in_proj_ba",
 
 
 
 
799
  "language_model.model.layers.12.linear_attn.in_proj_a",
800
  "language_model.model.layers.12.linear_attn.in_proj_b",
801
  "language_model.model.layers.12.linear_attn.in_proj_ba",
@@ -805,21 +811,23 @@
805
  "language_model.model.layers.14.linear_attn.in_proj_a",
806
  "language_model.model.layers.14.linear_attn.in_proj_b",
807
  "language_model.model.layers.14.linear_attn.in_proj_ba",
 
 
 
 
808
  "language_model.model.layers.16.linear_attn.in_proj_a",
809
  "language_model.model.layers.16.linear_attn.in_proj_b",
810
  "language_model.model.layers.16.linear_attn.in_proj_ba",
811
  "language_model.model.layers.17.linear_attn.in_proj_a",
812
  "language_model.model.layers.17.linear_attn.in_proj_b",
813
  "language_model.model.layers.17.linear_attn.in_proj_ba",
814
- "language_model.model.layers.18.linear_attn.in_proj_a",
815
- "language_model.model.layers.18.linear_attn.in_proj_b",
816
- "language_model.model.layers.18.linear_attn.in_proj_ba",
 
817
  "language_model.model.layers.2.linear_attn.in_proj_a",
818
  "language_model.model.layers.2.linear_attn.in_proj_b",
819
  "language_model.model.layers.2.linear_attn.in_proj_ba",
820
- "language_model.model.layers.20.linear_attn.in_proj_a",
821
- "language_model.model.layers.20.linear_attn.in_proj_b",
822
- "language_model.model.layers.20.linear_attn.in_proj_ba",
823
  "language_model.model.layers.24.linear_attn.in_proj_a",
824
  "language_model.model.layers.24.linear_attn.in_proj_b",
825
  "language_model.model.layers.24.linear_attn.in_proj_ba",
@@ -850,7 +858,6 @@
850
  "language_model.model.layers.30.linear_attn.in_proj_qkvz",
851
  "language_model.model.layers.30.linear_attn.in_proj_z",
852
  "language_model.model.layers.31.self_attn.k_proj",
853
- "language_model.model.layers.31.self_attn.o_proj",
854
  "language_model.model.layers.31.self_attn.q_proj",
855
  "language_model.model.layers.31.self_attn.qkv_proj",
856
  "language_model.model.layers.31.self_attn.v_proj",
@@ -876,9 +883,6 @@
876
  "language_model.model.layers.35.self_attn.q_proj",
877
  "language_model.model.layers.35.self_attn.qkv_proj",
878
  "language_model.model.layers.35.self_attn.v_proj",
879
- "language_model.model.layers.36.linear_attn.in_proj_a",
880
- "language_model.model.layers.36.linear_attn.in_proj_b",
881
- "language_model.model.layers.36.linear_attn.in_proj_ba",
882
  "language_model.model.layers.38.linear_attn.in_proj_a",
883
  "language_model.model.layers.38.linear_attn.in_proj_b",
884
  "language_model.model.layers.38.linear_attn.in_proj_ba",
@@ -892,7 +896,6 @@
892
  "language_model.model.layers.42.linear_attn.in_proj_qkvz",
893
  "language_model.model.layers.42.linear_attn.in_proj_z",
894
  "language_model.model.layers.43.self_attn.k_proj",
895
- "language_model.model.layers.43.self_attn.o_proj",
896
  "language_model.model.layers.43.self_attn.q_proj",
897
  "language_model.model.layers.43.self_attn.qkv_proj",
898
  "language_model.model.layers.43.self_attn.v_proj",
@@ -942,6 +945,10 @@
942
  "language_model.model.layers.56.linear_attn.in_proj_a",
943
  "language_model.model.layers.56.linear_attn.in_proj_b",
944
  "language_model.model.layers.56.linear_attn.in_proj_ba",
 
 
 
 
945
  "language_model.model.layers.6.linear_attn.in_proj_a",
946
  "language_model.model.layers.6.linear_attn.in_proj_b",
947
  "language_model.model.layers.6.linear_attn.in_proj_ba",
@@ -951,10 +958,10 @@
951
  "language_model.model.layers.63.mlp.gate_proj",
952
  "language_model.model.layers.63.mlp.gate_up_proj",
953
  "language_model.model.layers.63.mlp.up_proj",
954
- "language_model.model.layers.63.self_attn.k_proj",
955
- "language_model.model.layers.63.self_attn.q_proj",
956
- "language_model.model.layers.63.self_attn.qkv_proj",
957
- "language_model.model.layers.63.self_attn.v_proj",
958
  "mtp.fc",
959
  "visual.blocks.0.attn.proj",
960
  "visual.blocks.0.attn.qkv",
@@ -964,15 +971,12 @@
964
  "visual.blocks.1.attn.qkv",
965
  "visual.blocks.1.mlp.linear_fc1",
966
  "visual.blocks.1.mlp.linear_fc2",
967
- "visual.blocks.10.attn.proj",
968
- "visual.blocks.13.attn.qkv",
969
  "visual.blocks.2.attn.proj",
970
  "visual.blocks.2.attn.qkv",
971
  "visual.blocks.2.mlp.linear_fc1",
972
  "visual.blocks.2.mlp.linear_fc2",
973
  "visual.blocks.21.mlp.linear_fc2",
974
  "visual.blocks.23.mlp.linear_fc2",
975
- "visual.blocks.24.mlp.linear_fc2",
976
  "visual.blocks.25.mlp.linear_fc2",
977
  "visual.blocks.26.mlp.linear_fc2",
978
  "visual.blocks.3.attn.proj",
@@ -985,12 +989,8 @@
985
  "visual.blocks.4.mlp.linear_fc2",
986
  "visual.blocks.5.attn.proj",
987
  "visual.blocks.5.attn.qkv",
988
- "visual.blocks.5.mlp.linear_fc1",
989
  "visual.blocks.6.attn.proj",
990
- "visual.blocks.7.attn.proj",
991
  "visual.blocks.8.attn.proj",
992
- "visual.blocks.8.attn.qkv",
993
- "visual.blocks.9.attn.proj",
994
  "visual.pos_embed"
995
  ],
996
  "quantization_status": "compressed"
 
165
  "zp_dtype": "torch.uint8"
166
  },
167
  "targets": [
168
+ "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]k_proj$",
169
+ "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]q_proj$",
170
+ "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]qkv_proj$",
171
+ "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]v_proj$",
 
 
 
 
 
 
 
 
172
  "re:^language_model[.]model[.]layers[.]29[.]linear_attn[.]in_proj_qkv$",
173
  "re:^language_model[.]model[.]layers[.]29[.]linear_attn[.]in_proj_qkvz$",
174
  "re:^language_model[.]model[.]layers[.]29[.]linear_attn[.]in_proj_z$",
175
+ "re:^language_model[.]model[.]layers[.]31[.]self_attn[.]o_proj$",
176
+ "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]k_proj$",
177
+ "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]q_proj$",
178
+ "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]qkv_proj$",
179
+ "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]v_proj$",
180
+ "re:^language_model[.]model[.]layers[.]43[.]self_attn[.]o_proj$",
181
  "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_qkv$",
182
  "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_qkvz$",
183
  "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_z$",
 
 
 
 
184
  "re:^language_model[.]model[.]layers[.]46[.]mlp[.]gate_proj$",
185
  "re:^language_model[.]model[.]layers[.]46[.]mlp[.]gate_up_proj$",
186
  "re:^language_model[.]model[.]layers[.]46[.]mlp[.]up_proj$",
187
  "re:^language_model[.]model[.]layers[.]47[.]mlp[.]gate_proj$",
188
  "re:^language_model[.]model[.]layers[.]47[.]mlp[.]gate_up_proj$",
189
  "re:^language_model[.]model[.]layers[.]47[.]mlp[.]up_proj$",
190
+ "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]k_proj$",
191
+ "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]q_proj$",
192
+ "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]qkv_proj$",
193
+ "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]v_proj$",
194
  "re:^language_model[.]model[.]layers[.]56[.]linear_attn[.]in_proj_qkv$",
195
  "re:^language_model[.]model[.]layers[.]56[.]linear_attn[.]in_proj_qkvz$",
196
  "re:^language_model[.]model[.]layers[.]56[.]linear_attn[.]in_proj_z$",
197
+ "re:^language_model[.]model[.]layers[.]63[.]self_attn[.]k_proj$",
198
+ "re:^language_model[.]model[.]layers[.]63[.]self_attn[.]q_proj$",
199
+ "re:^language_model[.]model[.]layers[.]63[.]self_attn[.]qkv_proj$",
200
+ "re:^language_model[.]model[.]layers[.]63[.]self_attn[.]v_proj$",
201
+ "re:^visual[.]blocks[.]13[.]attn[.]qkv$",
 
 
 
 
 
 
 
 
 
 
 
202
  "re:^visual[.]blocks[.]6[.]attn[.]qkv$",
203
+ "re:^visual[.]blocks[.]7[.]attn[.]proj$",
204
+ "re:^visual[.]blocks[.]8[.]attn[.]qkv$",
205
+ "re:^visual[.]blocks[.]9[.]attn[.]proj$",
206
  "re:^visual[.]blocks[.]9[.]attn[.]qkv$"
207
  ]
208
  },
 
308
  "re:^language_model[.]model[.]layers[.]17[.]mlp[.]gate_proj$",
309
  "re:^language_model[.]model[.]layers[.]17[.]mlp[.]gate_up_proj$",
310
  "re:^language_model[.]model[.]layers[.]17[.]mlp[.]up_proj$",
311
+ "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_a$",
312
+ "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_b$",
313
+ "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_ba$",
314
  "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_qkv$",
315
  "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_qkvz$",
316
  "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_z$",
 
323
  "re:^language_model[.]model[.]layers[.]19[.]mlp[.]gate_proj$",
324
  "re:^language_model[.]model[.]layers[.]19[.]mlp[.]gate_up_proj$",
325
  "re:^language_model[.]model[.]layers[.]19[.]mlp[.]up_proj$",
 
326
  "re:^language_model[.]model[.]layers[.]19[.]self_attn[.]o_proj$",
 
 
 
327
  "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_qkv$",
328
  "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_qkvz$",
329
  "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_z$",
 
332
  "re:^language_model[.]model[.]layers[.]2[.]mlp[.]gate_proj$",
333
  "re:^language_model[.]model[.]layers[.]2[.]mlp[.]gate_up_proj$",
334
  "re:^language_model[.]model[.]layers[.]2[.]mlp[.]up_proj$",
335
+ "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_a$",
336
+ "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_b$",
337
+ "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_ba$",
338
  "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_qkv$",
339
  "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_qkvz$",
340
  "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]in_proj_z$",
 
369
  "re:^language_model[.]model[.]layers[.]23[.]mlp[.]gate_proj$",
370
  "re:^language_model[.]model[.]layers[.]23[.]mlp[.]gate_up_proj$",
371
  "re:^language_model[.]model[.]layers[.]23[.]mlp[.]up_proj$",
 
372
  "re:^language_model[.]model[.]layers[.]23[.]self_attn[.]o_proj$",
 
 
 
373
  "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_qkv$",
374
  "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_qkvz$",
375
  "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_z$",
 
398
  "re:^language_model[.]model[.]layers[.]27[.]mlp[.]gate_proj$",
399
  "re:^language_model[.]model[.]layers[.]27[.]mlp[.]gate_up_proj$",
400
  "re:^language_model[.]model[.]layers[.]27[.]mlp[.]up_proj$",
401
+ "re:^language_model[.]model[.]layers[.]27[.]self_attn[.]o_proj$",
402
+ "re:^language_model[.]model[.]layers[.]28[.]linear_attn[.]in_proj_qkv$",
403
+ "re:^language_model[.]model[.]layers[.]28[.]linear_attn[.]in_proj_qkvz$",
404
+ "re:^language_model[.]model[.]layers[.]28[.]linear_attn[.]in_proj_z$",
405
  "re:^language_model[.]model[.]layers[.]28[.]linear_attn[.]out_proj$",
406
  "re:^language_model[.]model[.]layers[.]28[.]mlp[.]down_proj$",
407
  "re:^language_model[.]model[.]layers[.]28[.]mlp[.]gate_proj$",
 
428
  "re:^language_model[.]model[.]layers[.]31[.]mlp[.]up_proj$",
429
  "re:^language_model[.]model[.]layers[.]32[.]linear_attn[.]out_proj$",
430
  "re:^language_model[.]model[.]layers[.]32[.]mlp[.]down_proj$",
431
+ "re:^language_model[.]model[.]layers[.]32[.]mlp[.]gate_proj$",
432
+ "re:^language_model[.]model[.]layers[.]32[.]mlp[.]gate_up_proj$",
433
+ "re:^language_model[.]model[.]layers[.]32[.]mlp[.]up_proj$",
434
  "re:^language_model[.]model[.]layers[.]33[.]linear_attn[.]out_proj$",
435
  "re:^language_model[.]model[.]layers[.]33[.]mlp[.]down_proj$",
436
  "re:^language_model[.]model[.]layers[.]33[.]mlp[.]gate_proj$",
 
445
  "re:^language_model[.]model[.]layers[.]35[.]mlp[.]gate_proj$",
446
  "re:^language_model[.]model[.]layers[.]35[.]mlp[.]gate_up_proj$",
447
  "re:^language_model[.]model[.]layers[.]35[.]mlp[.]up_proj$",
448
+ "re:^language_model[.]model[.]layers[.]35[.]self_attn[.]o_proj$",
449
+ "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_a$",
450
+ "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_b$",
451
+ "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_ba$",
452
  "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_qkv$",
453
  "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_qkvz$",
454
  "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_z$",
 
480
  "re:^language_model[.]model[.]layers[.]39[.]mlp[.]gate_proj$",
481
  "re:^language_model[.]model[.]layers[.]39[.]mlp[.]gate_up_proj$",
482
  "re:^language_model[.]model[.]layers[.]39[.]mlp[.]up_proj$",
 
483
  "re:^language_model[.]model[.]layers[.]39[.]self_attn[.]o_proj$",
 
 
 
484
  "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_a$",
485
  "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_b$",
486
  "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_ba$",
 
527
  "re:^language_model[.]model[.]layers[.]44[.]mlp[.]up_proj$",
528
  "re:^language_model[.]model[.]layers[.]45[.]linear_attn[.]out_proj$",
529
  "re:^language_model[.]model[.]layers[.]45[.]mlp[.]down_proj$",
530
+ "re:^language_model[.]model[.]layers[.]45[.]mlp[.]gate_proj$",
531
+ "re:^language_model[.]model[.]layers[.]45[.]mlp[.]gate_up_proj$",
532
+ "re:^language_model[.]model[.]layers[.]45[.]mlp[.]up_proj$",
533
+ "re:^language_model[.]model[.]layers[.]46[.]linear_attn[.]out_proj$",
534
  "re:^language_model[.]model[.]layers[.]46[.]mlp[.]down_proj$",
535
  "re:^language_model[.]model[.]layers[.]47[.]mlp[.]down_proj$",
536
+ "re:^language_model[.]model[.]layers[.]47[.]self_attn[.]o_proj$",
537
  "re:^language_model[.]model[.]layers[.]48[.]linear_attn[.]out_proj$",
538
  "re:^language_model[.]model[.]layers[.]48[.]mlp[.]down_proj$",
539
  "re:^language_model[.]model[.]layers[.]48[.]mlp[.]gate_proj$",
 
564
  "re:^language_model[.]model[.]layers[.]51[.]mlp[.]gate_proj$",
565
  "re:^language_model[.]model[.]layers[.]51[.]mlp[.]gate_up_proj$",
566
  "re:^language_model[.]model[.]layers[.]51[.]mlp[.]up_proj$",
 
567
  "re:^language_model[.]model[.]layers[.]51[.]self_attn[.]o_proj$",
 
 
 
568
  "re:^language_model[.]model[.]layers[.]52[.]linear_attn[.]in_proj_a$",
569
  "re:^language_model[.]model[.]layers[.]52[.]linear_attn[.]in_proj_b$",
570
  "re:^language_model[.]model[.]layers[.]52[.]linear_attn[.]in_proj_ba$",
 
632
  "re:^language_model[.]model[.]layers[.]59[.]mlp[.]gate_proj$",
633
  "re:^language_model[.]model[.]layers[.]59[.]mlp[.]gate_up_proj$",
634
  "re:^language_model[.]model[.]layers[.]59[.]mlp[.]up_proj$",
635
+ "re:^language_model[.]model[.]layers[.]59[.]self_attn[.]o_proj$",
636
  "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_qkv$",
637
  "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_qkvz$",
638
  "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_z$",
 
671
  "re:^language_model[.]model[.]layers[.]62[.]mlp[.]gate_proj$",
672
  "re:^language_model[.]model[.]layers[.]62[.]mlp[.]gate_up_proj$",
673
  "re:^language_model[.]model[.]layers[.]62[.]mlp[.]up_proj$",
674
+ "re:^language_model[.]model[.]layers[.]63[.]mlp[.]down_proj$",
675
  "re:^language_model[.]model[.]layers[.]63[.]self_attn[.]o_proj$",
676
  "re:^language_model[.]model[.]layers[.]7[.]mlp[.]down_proj$",
677
  "re:^language_model[.]model[.]layers[.]7[.]mlp[.]gate_proj$",
 
709
  "re:^mtp[.]layers[.]0[.]self_attn[.]q_proj$",
710
  "re:^mtp[.]layers[.]0[.]self_attn[.]qkv_proj$",
711
  "re:^mtp[.]layers[.]0[.]self_attn[.]v_proj$",
712
+ "re:^visual[.]blocks[.]10[.]attn[.]proj$",
713
+ "re:^visual[.]blocks[.]10[.]attn[.]qkv$",
714
  "re:^visual[.]blocks[.]10[.]mlp[.]linear_fc1$",
715
  "re:^visual[.]blocks[.]10[.]mlp[.]linear_fc2$",
716
+ "re:^visual[.]blocks[.]11[.]attn[.]proj$",
717
+ "re:^visual[.]blocks[.]11[.]attn[.]qkv$",
718
  "re:^visual[.]blocks[.]11[.]mlp[.]linear_fc1$",
719
  "re:^visual[.]blocks[.]11[.]mlp[.]linear_fc2$",
720
  "re:^visual[.]blocks[.]12[.]attn[.]proj$",
721
+ "re:^visual[.]blocks[.]12[.]attn[.]qkv$",
722
  "re:^visual[.]blocks[.]12[.]mlp[.]linear_fc1$",
723
  "re:^visual[.]blocks[.]12[.]mlp[.]linear_fc2$",
724
+ "re:^visual[.]blocks[.]13[.]attn[.]proj$",
725
  "re:^visual[.]blocks[.]13[.]mlp[.]linear_fc1$",
726
  "re:^visual[.]blocks[.]13[.]mlp[.]linear_fc2$",
727
  "re:^visual[.]blocks[.]14[.]attn[.]proj$",
728
+ "re:^visual[.]blocks[.]14[.]attn[.]qkv$",
729
  "re:^visual[.]blocks[.]14[.]mlp[.]linear_fc1$",
730
  "re:^visual[.]blocks[.]14[.]mlp[.]linear_fc2$",
731
  "re:^visual[.]blocks[.]15[.]attn[.]proj$",
 
765
  "re:^visual[.]blocks[.]24[.]attn[.]proj$",
766
  "re:^visual[.]blocks[.]24[.]attn[.]qkv$",
767
  "re:^visual[.]blocks[.]24[.]mlp[.]linear_fc1$",
768
+ "re:^visual[.]blocks[.]24[.]mlp[.]linear_fc2$",
769
  "re:^visual[.]blocks[.]25[.]attn[.]proj$",
770
  "re:^visual[.]blocks[.]25[.]attn[.]qkv$",
771
  "re:^visual[.]blocks[.]25[.]mlp[.]linear_fc1$",
772
  "re:^visual[.]blocks[.]26[.]attn[.]proj$",
773
  "re:^visual[.]blocks[.]26[.]attn[.]qkv$",
774
  "re:^visual[.]blocks[.]26[.]mlp[.]linear_fc1$",
775
+ "re:^visual[.]blocks[.]5[.]mlp[.]linear_fc1$",
776
  "re:^visual[.]blocks[.]5[.]mlp[.]linear_fc2$",
777
  "re:^visual[.]blocks[.]6[.]mlp[.]linear_fc1$",
778
  "re:^visual[.]blocks[.]6[.]mlp[.]linear_fc2$",
779
+ "re:^visual[.]blocks[.]7[.]attn[.]qkv$",
780
  "re:^visual[.]blocks[.]7[.]mlp[.]linear_fc1$",
781
  "re:^visual[.]blocks[.]7[.]mlp[.]linear_fc2$",
782
  "re:^visual[.]blocks[.]8[.]mlp[.]linear_fc1$",
 
798
  "language_model.model.layers.10.linear_attn.in_proj_a",
799
  "language_model.model.layers.10.linear_attn.in_proj_b",
800
  "language_model.model.layers.10.linear_attn.in_proj_ba",
801
+ "language_model.model.layers.11.self_attn.k_proj",
802
+ "language_model.model.layers.11.self_attn.q_proj",
803
+ "language_model.model.layers.11.self_attn.qkv_proj",
804
+ "language_model.model.layers.11.self_attn.v_proj",
805
  "language_model.model.layers.12.linear_attn.in_proj_a",
806
  "language_model.model.layers.12.linear_attn.in_proj_b",
807
  "language_model.model.layers.12.linear_attn.in_proj_ba",
 
811
  "language_model.model.layers.14.linear_attn.in_proj_a",
812
  "language_model.model.layers.14.linear_attn.in_proj_b",
813
  "language_model.model.layers.14.linear_attn.in_proj_ba",
814
+ "language_model.model.layers.15.self_attn.k_proj",
815
+ "language_model.model.layers.15.self_attn.q_proj",
816
+ "language_model.model.layers.15.self_attn.qkv_proj",
817
+ "language_model.model.layers.15.self_attn.v_proj",
818
  "language_model.model.layers.16.linear_attn.in_proj_a",
819
  "language_model.model.layers.16.linear_attn.in_proj_b",
820
  "language_model.model.layers.16.linear_attn.in_proj_ba",
821
  "language_model.model.layers.17.linear_attn.in_proj_a",
822
  "language_model.model.layers.17.linear_attn.in_proj_b",
823
  "language_model.model.layers.17.linear_attn.in_proj_ba",
824
+ "language_model.model.layers.19.self_attn.k_proj",
825
+ "language_model.model.layers.19.self_attn.q_proj",
826
+ "language_model.model.layers.19.self_attn.qkv_proj",
827
+ "language_model.model.layers.19.self_attn.v_proj",
828
  "language_model.model.layers.2.linear_attn.in_proj_a",
829
  "language_model.model.layers.2.linear_attn.in_proj_b",
830
  "language_model.model.layers.2.linear_attn.in_proj_ba",
 
 
 
831
  "language_model.model.layers.24.linear_attn.in_proj_a",
832
  "language_model.model.layers.24.linear_attn.in_proj_b",
833
  "language_model.model.layers.24.linear_attn.in_proj_ba",
 
858
  "language_model.model.layers.30.linear_attn.in_proj_qkvz",
859
  "language_model.model.layers.30.linear_attn.in_proj_z",
860
  "language_model.model.layers.31.self_attn.k_proj",
 
861
  "language_model.model.layers.31.self_attn.q_proj",
862
  "language_model.model.layers.31.self_attn.qkv_proj",
863
  "language_model.model.layers.31.self_attn.v_proj",
 
883
  "language_model.model.layers.35.self_attn.q_proj",
884
  "language_model.model.layers.35.self_attn.qkv_proj",
885
  "language_model.model.layers.35.self_attn.v_proj",
 
 
 
886
  "language_model.model.layers.38.linear_attn.in_proj_a",
887
  "language_model.model.layers.38.linear_attn.in_proj_b",
888
  "language_model.model.layers.38.linear_attn.in_proj_ba",
 
896
  "language_model.model.layers.42.linear_attn.in_proj_qkvz",
897
  "language_model.model.layers.42.linear_attn.in_proj_z",
898
  "language_model.model.layers.43.self_attn.k_proj",
 
899
  "language_model.model.layers.43.self_attn.q_proj",
900
  "language_model.model.layers.43.self_attn.qkv_proj",
901
  "language_model.model.layers.43.self_attn.v_proj",
 
945
  "language_model.model.layers.56.linear_attn.in_proj_a",
946
  "language_model.model.layers.56.linear_attn.in_proj_b",
947
  "language_model.model.layers.56.linear_attn.in_proj_ba",
948
+ "language_model.model.layers.59.self_attn.k_proj",
949
+ "language_model.model.layers.59.self_attn.q_proj",
950
+ "language_model.model.layers.59.self_attn.qkv_proj",
951
+ "language_model.model.layers.59.self_attn.v_proj",
952
  "language_model.model.layers.6.linear_attn.in_proj_a",
953
  "language_model.model.layers.6.linear_attn.in_proj_b",
954
  "language_model.model.layers.6.linear_attn.in_proj_ba",
 
958
  "language_model.model.layers.63.mlp.gate_proj",
959
  "language_model.model.layers.63.mlp.gate_up_proj",
960
  "language_model.model.layers.63.mlp.up_proj",
961
+ "language_model.model.layers.7.self_attn.k_proj",
962
+ "language_model.model.layers.7.self_attn.q_proj",
963
+ "language_model.model.layers.7.self_attn.qkv_proj",
964
+ "language_model.model.layers.7.self_attn.v_proj",
965
  "mtp.fc",
966
  "visual.blocks.0.attn.proj",
967
  "visual.blocks.0.attn.qkv",
 
971
  "visual.blocks.1.attn.qkv",
972
  "visual.blocks.1.mlp.linear_fc1",
973
  "visual.blocks.1.mlp.linear_fc2",
 
 
974
  "visual.blocks.2.attn.proj",
975
  "visual.blocks.2.attn.qkv",
976
  "visual.blocks.2.mlp.linear_fc1",
977
  "visual.blocks.2.mlp.linear_fc2",
978
  "visual.blocks.21.mlp.linear_fc2",
979
  "visual.blocks.23.mlp.linear_fc2",
 
980
  "visual.blocks.25.mlp.linear_fc2",
981
  "visual.blocks.26.mlp.linear_fc2",
982
  "visual.blocks.3.attn.proj",
 
989
  "visual.blocks.4.mlp.linear_fc2",
990
  "visual.blocks.5.attn.proj",
991
  "visual.blocks.5.attn.qkv",
 
992
  "visual.blocks.6.attn.proj",
 
993
  "visual.blocks.8.attn.proj",
 
 
994
  "visual.pos_embed"
995
  ],
996
  "quantization_status": "compressed"
mixed_native_manifest.json CHANGED
@@ -3,10 +3,10 @@
3
  "source_recipe": "/work/artifacts/layer_config.json",
4
  "format_histogram": {
5
  "head_passthrough/BF16": 3,
6
- "linear/BF16": 114,
7
- "linear/NVFP4": 348,
8
  "layer_passthrough/BF16": 352,
9
- "linear/MXFP8": 34,
10
  "mtp_linear/NVFP4": 7,
11
  "mtp_passthrough/BF16": 8
12
  },
 
3
  "source_recipe": "/work/artifacts/layer_config.json",
4
  "format_histogram": {
5
  "head_passthrough/BF16": 3,
6
+ "linear/BF16": 118,
7
+ "linear/NVFP4": 354,
8
  "layer_passthrough/BF16": 352,
9
+ "linear/MXFP8": 24,
10
  "mtp_linear/NVFP4": 7,
11
  "mtp_passthrough/BF16": 8
12
  },
model-00002-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0c825c267e860aae88a95eb8f304c2c8248727bb8e2d200674aca444a3b93fa
3
- size 4489298320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5f3d4bca6418ab2d9fc03502124412c85c56b00000d51c2cc98a450b0d4bb13
3
+ size 4516492384
model-00003-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30495eb82723105ddde40df6959eda2c1d754e156ca56dcfbf7624934c18c9b7
3
- size 4504718112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:773d4a9f7bb6583c51a2124c732ad3cfb27430f92409516cf438ce83ec18a7c2
3
+ size 4488233624
model-00004-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1c0f2a6b8f68ee6133b5cd504ddbc8447ae79e02a75c17d51c03f774c6b923f
3
- size 4459780368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d918713c09bd215109256804998e5b7113ea5126a2822b393ba38f90e4b0d2a
3
+ size 4451868072
model-00005-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b498a777990a1c8efde1a4191a1d5069357542a2be5ab200c1f2bcd44ef5d94
3
- size 4508367648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad777d8cef79c3c4c2f3eadcb8feb447a31e2cc839950336ce3fc520cdf1e658
3
+ size 4532286704
model-00006-of-00006.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1da27c738bd49e1bad742aa3e95cef832031bf88b9672ea8a2ce0ec25738230
3
- size 2163987616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66dc99205ef90ee732d7e17ba02b4bb0c71116d4df855bb5c0bb066c7dee899e
3
+ size 2137290832
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 22668632072
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00001-of-00006.safetensors",
@@ -120,16 +120,13 @@
120
  "model.language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
121
  "model.language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00006.safetensors",
122
  "model.language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
123
- "model.language_model.layers.11.self_attn.k_proj.weight_scale": "model-00002-of-00006.safetensors",
124
  "model.language_model.layers.11.self_attn.o_proj.input_global_scale": "model-00002-of-00006.safetensors",
125
  "model.language_model.layers.11.self_attn.o_proj.weight_global_scale": "model-00002-of-00006.safetensors",
126
  "model.language_model.layers.11.self_attn.o_proj.weight_packed": "model-00002-of-00006.safetensors",
127
  "model.language_model.layers.11.self_attn.o_proj.weight_scale": "model-00002-of-00006.safetensors",
128
  "model.language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00006.safetensors",
129
  "model.language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
130
- "model.language_model.layers.11.self_attn.q_proj.weight_scale": "model-00002-of-00006.safetensors",
131
  "model.language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
132
- "model.language_model.layers.11.self_attn.v_proj.weight_scale": "model-00002-of-00006.safetensors",
133
  "model.language_model.layers.12.input_layernorm.weight": "model-00002-of-00006.safetensors",
134
  "model.language_model.layers.12.linear_attn.A_log": "model-00002-of-00006.safetensors",
135
  "model.language_model.layers.12.linear_attn.conv1d.weight": "model-00002-of-00006.safetensors",
@@ -242,16 +239,13 @@
242
  "model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
243
  "model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00006.safetensors",
244
  "model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
245
- "model.language_model.layers.15.self_attn.k_proj.weight_scale": "model-00002-of-00006.safetensors",
246
  "model.language_model.layers.15.self_attn.o_proj.input_global_scale": "model-00002-of-00006.safetensors",
247
  "model.language_model.layers.15.self_attn.o_proj.weight_global_scale": "model-00002-of-00006.safetensors",
248
  "model.language_model.layers.15.self_attn.o_proj.weight_packed": "model-00002-of-00006.safetensors",
249
  "model.language_model.layers.15.self_attn.o_proj.weight_scale": "model-00002-of-00006.safetensors",
250
  "model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00006.safetensors",
251
  "model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
252
- "model.language_model.layers.15.self_attn.q_proj.weight_scale": "model-00002-of-00006.safetensors",
253
  "model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
254
- "model.language_model.layers.15.self_attn.v_proj.weight_scale": "model-00002-of-00006.safetensors",
255
  "model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00006.safetensors",
256
  "model.language_model.layers.16.linear_attn.A_log": "model-00002-of-00006.safetensors",
257
  "model.language_model.layers.16.linear_attn.conv1d.weight": "model-00002-of-00006.safetensors",
@@ -260,23 +254,23 @@
260
  "model.language_model.layers.16.linear_attn.in_proj_b.weight": "model-00002-of-00006.safetensors",
261
  "model.language_model.layers.16.linear_attn.in_proj_qkv.input_global_scale": "model-00002-of-00006.safetensors",
262
  "model.language_model.layers.16.linear_attn.in_proj_qkv.weight_global_scale": "model-00002-of-00006.safetensors",
263
- "model.language_model.layers.16.linear_attn.in_proj_qkv.weight_packed": "model-00002-of-00006.safetensors",
264
- "model.language_model.layers.16.linear_attn.in_proj_qkv.weight_scale": "model-00002-of-00006.safetensors",
265
- "model.language_model.layers.16.linear_attn.in_proj_z.input_global_scale": "model-00002-of-00006.safetensors",
266
- "model.language_model.layers.16.linear_attn.in_proj_z.weight_global_scale": "model-00002-of-00006.safetensors",
267
- "model.language_model.layers.16.linear_attn.in_proj_z.weight_packed": "model-00002-of-00006.safetensors",
268
- "model.language_model.layers.16.linear_attn.in_proj_z.weight_scale": "model-00002-of-00006.safetensors",
269
- "model.language_model.layers.16.linear_attn.norm.weight": "model-00002-of-00006.safetensors",
270
- "model.language_model.layers.16.linear_attn.out_proj.input_global_scale": "model-00002-of-00006.safetensors",
271
- "model.language_model.layers.16.linear_attn.out_proj.weight_global_scale": "model-00002-of-00006.safetensors",
272
- "model.language_model.layers.16.linear_attn.out_proj.weight_packed": "model-00002-of-00006.safetensors",
273
- "model.language_model.layers.16.linear_attn.out_proj.weight_scale": "model-00002-of-00006.safetensors",
274
- "model.language_model.layers.16.mlp.down_proj.input_global_scale": "model-00002-of-00006.safetensors",
275
- "model.language_model.layers.16.mlp.down_proj.weight_global_scale": "model-00002-of-00006.safetensors",
276
- "model.language_model.layers.16.mlp.down_proj.weight_packed": "model-00002-of-00006.safetensors",
277
- "model.language_model.layers.16.mlp.down_proj.weight_scale": "model-00002-of-00006.safetensors",
278
- "model.language_model.layers.16.mlp.gate_proj.input_global_scale": "model-00002-of-00006.safetensors",
279
- "model.language_model.layers.16.mlp.gate_proj.weight_global_scale": "model-00002-of-00006.safetensors",
280
  "model.language_model.layers.16.mlp.gate_proj.weight_packed": "model-00003-of-00006.safetensors",
281
  "model.language_model.layers.16.mlp.gate_proj.weight_scale": "model-00003-of-00006.safetensors",
282
  "model.language_model.layers.16.mlp.up_proj.input_global_scale": "model-00003-of-00006.safetensors",
@@ -320,8 +314,14 @@
320
  "model.language_model.layers.18.linear_attn.A_log": "model-00003-of-00006.safetensors",
321
  "model.language_model.layers.18.linear_attn.conv1d.weight": "model-00003-of-00006.safetensors",
322
  "model.language_model.layers.18.linear_attn.dt_bias": "model-00003-of-00006.safetensors",
323
- "model.language_model.layers.18.linear_attn.in_proj_a.weight": "model-00003-of-00006.safetensors",
324
- "model.language_model.layers.18.linear_attn.in_proj_b.weight": "model-00003-of-00006.safetensors",
 
 
 
 
 
 
325
  "model.language_model.layers.18.linear_attn.in_proj_qkv.input_global_scale": "model-00003-of-00006.safetensors",
326
  "model.language_model.layers.18.linear_attn.in_proj_qkv.weight_global_scale": "model-00003-of-00006.safetensors",
327
  "model.language_model.layers.18.linear_attn.in_proj_qkv.weight_packed": "model-00003-of-00006.safetensors",
@@ -363,23 +363,14 @@
363
  "model.language_model.layers.19.mlp.up_proj.weight_scale": "model-00003-of-00006.safetensors",
364
  "model.language_model.layers.19.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
365
  "model.language_model.layers.19.self_attn.k_norm.weight": "model-00003-of-00006.safetensors",
366
- "model.language_model.layers.19.self_attn.k_proj.input_global_scale": "model-00003-of-00006.safetensors",
367
- "model.language_model.layers.19.self_attn.k_proj.weight_global_scale": "model-00003-of-00006.safetensors",
368
- "model.language_model.layers.19.self_attn.k_proj.weight_packed": "model-00003-of-00006.safetensors",
369
- "model.language_model.layers.19.self_attn.k_proj.weight_scale": "model-00003-of-00006.safetensors",
370
  "model.language_model.layers.19.self_attn.o_proj.input_global_scale": "model-00003-of-00006.safetensors",
371
  "model.language_model.layers.19.self_attn.o_proj.weight_global_scale": "model-00003-of-00006.safetensors",
372
  "model.language_model.layers.19.self_attn.o_proj.weight_packed": "model-00003-of-00006.safetensors",
373
  "model.language_model.layers.19.self_attn.o_proj.weight_scale": "model-00003-of-00006.safetensors",
374
  "model.language_model.layers.19.self_attn.q_norm.weight": "model-00003-of-00006.safetensors",
375
- "model.language_model.layers.19.self_attn.q_proj.input_global_scale": "model-00003-of-00006.safetensors",
376
- "model.language_model.layers.19.self_attn.q_proj.weight_global_scale": "model-00003-of-00006.safetensors",
377
- "model.language_model.layers.19.self_attn.q_proj.weight_packed": "model-00003-of-00006.safetensors",
378
- "model.language_model.layers.19.self_attn.q_proj.weight_scale": "model-00003-of-00006.safetensors",
379
- "model.language_model.layers.19.self_attn.v_proj.input_global_scale": "model-00003-of-00006.safetensors",
380
- "model.language_model.layers.19.self_attn.v_proj.weight_global_scale": "model-00003-of-00006.safetensors",
381
- "model.language_model.layers.19.self_attn.v_proj.weight_packed": "model-00003-of-00006.safetensors",
382
- "model.language_model.layers.19.self_attn.v_proj.weight_scale": "model-00003-of-00006.safetensors",
383
  "model.language_model.layers.2.input_layernorm.weight": "model-00003-of-00006.safetensors",
384
  "model.language_model.layers.2.linear_attn.A_log": "model-00003-of-00006.safetensors",
385
  "model.language_model.layers.2.linear_attn.conv1d.weight": "model-00003-of-00006.safetensors",
@@ -416,8 +407,14 @@
416
  "model.language_model.layers.20.linear_attn.A_log": "model-00003-of-00006.safetensors",
417
  "model.language_model.layers.20.linear_attn.conv1d.weight": "model-00003-of-00006.safetensors",
418
  "model.language_model.layers.20.linear_attn.dt_bias": "model-00003-of-00006.safetensors",
419
- "model.language_model.layers.20.linear_attn.in_proj_a.weight": "model-00003-of-00006.safetensors",
420
- "model.language_model.layers.20.linear_attn.in_proj_b.weight": "model-00003-of-00006.safetensors",
 
 
 
 
 
 
421
  "model.language_model.layers.20.linear_attn.in_proj_qkv.input_global_scale": "model-00003-of-00006.safetensors",
422
  "model.language_model.layers.20.linear_attn.in_proj_qkv.weight_global_scale": "model-00003-of-00006.safetensors",
423
  "model.language_model.layers.20.linear_attn.in_proj_qkv.weight_packed": "model-00003-of-00006.safetensors",
@@ -535,22 +532,16 @@
535
  "model.language_model.layers.23.mlp.up_proj.weight_scale": "model-00003-of-00006.safetensors",
536
  "model.language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
537
  "model.language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00006.safetensors",
538
- "model.language_model.layers.23.self_attn.k_proj.input_global_scale": "model-00003-of-00006.safetensors",
539
- "model.language_model.layers.23.self_attn.k_proj.weight_global_scale": "model-00003-of-00006.safetensors",
540
- "model.language_model.layers.23.self_attn.k_proj.weight_packed": "model-00003-of-00006.safetensors",
541
  "model.language_model.layers.23.self_attn.k_proj.weight_scale": "model-00003-of-00006.safetensors",
542
  "model.language_model.layers.23.self_attn.o_proj.input_global_scale": "model-00003-of-00006.safetensors",
543
  "model.language_model.layers.23.self_attn.o_proj.weight_global_scale": "model-00003-of-00006.safetensors",
544
  "model.language_model.layers.23.self_attn.o_proj.weight_packed": "model-00003-of-00006.safetensors",
545
  "model.language_model.layers.23.self_attn.o_proj.weight_scale": "model-00003-of-00006.safetensors",
546
  "model.language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00006.safetensors",
547
- "model.language_model.layers.23.self_attn.q_proj.input_global_scale": "model-00003-of-00006.safetensors",
548
- "model.language_model.layers.23.self_attn.q_proj.weight_global_scale": "model-00003-of-00006.safetensors",
549
- "model.language_model.layers.23.self_attn.q_proj.weight_packed": "model-00003-of-00006.safetensors",
550
  "model.language_model.layers.23.self_attn.q_proj.weight_scale": "model-00003-of-00006.safetensors",
551
- "model.language_model.layers.23.self_attn.v_proj.input_global_scale": "model-00003-of-00006.safetensors",
552
- "model.language_model.layers.23.self_attn.v_proj.weight_global_scale": "model-00003-of-00006.safetensors",
553
- "model.language_model.layers.23.self_attn.v_proj.weight_packed": "model-00003-of-00006.safetensors",
554
  "model.language_model.layers.23.self_attn.v_proj.weight_scale": "model-00003-of-00006.safetensors",
555
  "model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00006.safetensors",
556
  "model.language_model.layers.24.linear_attn.A_log": "model-00003-of-00006.safetensors",
@@ -664,7 +655,9 @@
664
  "model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
665
  "model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00006.safetensors",
666
  "model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
667
- "model.language_model.layers.27.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
 
 
668
  "model.language_model.layers.27.self_attn.o_proj.weight_scale": "model-00003-of-00006.safetensors",
669
  "model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00006.safetensors",
670
  "model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
@@ -675,9 +668,13 @@
675
  "model.language_model.layers.28.linear_attn.dt_bias": "model-00003-of-00006.safetensors",
676
  "model.language_model.layers.28.linear_attn.in_proj_a.weight": "model-00003-of-00006.safetensors",
677
  "model.language_model.layers.28.linear_attn.in_proj_b.weight": "model-00003-of-00006.safetensors",
678
- "model.language_model.layers.28.linear_attn.in_proj_qkv.weight": "model-00003-of-00006.safetensors",
 
 
679
  "model.language_model.layers.28.linear_attn.in_proj_qkv.weight_scale": "model-00003-of-00006.safetensors",
680
- "model.language_model.layers.28.linear_attn.in_proj_z.weight": "model-00003-of-00006.safetensors",
 
 
681
  "model.language_model.layers.28.linear_attn.in_proj_z.weight_scale": "model-00003-of-00006.safetensors",
682
  "model.language_model.layers.28.linear_attn.norm.weight": "model-00003-of-00006.safetensors",
683
  "model.language_model.layers.28.linear_attn.out_proj.input_global_scale": "model-00003-of-00006.safetensors",
@@ -791,6 +788,7 @@
791
  "model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00006.safetensors",
792
  "model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
793
  "model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
 
794
  "model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00006.safetensors",
795
  "model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
796
  "model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
@@ -800,20 +798,24 @@
800
  "model.language_model.layers.32.linear_attn.dt_bias": "model-00003-of-00006.safetensors",
801
  "model.language_model.layers.32.linear_attn.in_proj_a.weight": "model-00003-of-00006.safetensors",
802
  "model.language_model.layers.32.linear_attn.in_proj_b.weight": "model-00003-of-00006.safetensors",
803
- "model.language_model.layers.32.linear_attn.in_proj_qkv.weight": "model-00003-of-00006.safetensors",
804
- "model.language_model.layers.32.linear_attn.in_proj_z.weight": "model-00003-of-00006.safetensors",
805
- "model.language_model.layers.32.linear_attn.norm.weight": "model-00003-of-00006.safetensors",
806
- "model.language_model.layers.32.linear_attn.out_proj.input_global_scale": "model-00003-of-00006.safetensors",
807
- "model.language_model.layers.32.linear_attn.out_proj.weight_global_scale": "model-00003-of-00006.safetensors",
808
- "model.language_model.layers.32.linear_attn.out_proj.weight_packed": "model-00003-of-00006.safetensors",
809
- "model.language_model.layers.32.linear_attn.out_proj.weight_scale": "model-00003-of-00006.safetensors",
810
- "model.language_model.layers.32.mlp.down_proj.input_global_scale": "model-00003-of-00006.safetensors",
811
- "model.language_model.layers.32.mlp.down_proj.weight_global_scale": "model-00003-of-00006.safetensors",
812
  "model.language_model.layers.32.mlp.down_proj.weight_packed": "model-00004-of-00006.safetensors",
813
  "model.language_model.layers.32.mlp.down_proj.weight_scale": "model-00004-of-00006.safetensors",
814
- "model.language_model.layers.32.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
 
 
815
  "model.language_model.layers.32.mlp.gate_proj.weight_scale": "model-00004-of-00006.safetensors",
816
- "model.language_model.layers.32.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
 
 
817
  "model.language_model.layers.32.mlp.up_proj.weight_scale": "model-00004-of-00006.safetensors",
818
  "model.language_model.layers.32.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
819
  "model.language_model.layers.33.input_layernorm.weight": "model-00004-of-00006.safetensors",
@@ -884,7 +886,9 @@
884
  "model.language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
885
  "model.language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00006.safetensors",
886
  "model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
887
- "model.language_model.layers.35.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
 
 
888
  "model.language_model.layers.35.self_attn.o_proj.weight_scale": "model-00004-of-00006.safetensors",
889
  "model.language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00006.safetensors",
890
  "model.language_model.layers.35.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
@@ -893,8 +897,14 @@
893
  "model.language_model.layers.36.linear_attn.A_log": "model-00004-of-00006.safetensors",
894
  "model.language_model.layers.36.linear_attn.conv1d.weight": "model-00004-of-00006.safetensors",
895
  "model.language_model.layers.36.linear_attn.dt_bias": "model-00004-of-00006.safetensors",
896
- "model.language_model.layers.36.linear_attn.in_proj_a.weight": "model-00004-of-00006.safetensors",
897
- "model.language_model.layers.36.linear_attn.in_proj_b.weight": "model-00004-of-00006.safetensors",
 
 
 
 
 
 
898
  "model.language_model.layers.36.linear_attn.in_proj_qkv.input_global_scale": "model-00004-of-00006.safetensors",
899
  "model.language_model.layers.36.linear_attn.in_proj_qkv.weight_global_scale": "model-00004-of-00006.safetensors",
900
  "model.language_model.layers.36.linear_attn.in_proj_qkv.weight_packed": "model-00004-of-00006.safetensors",
@@ -1006,22 +1016,16 @@
1006
  "model.language_model.layers.39.mlp.up_proj.weight_scale": "model-00004-of-00006.safetensors",
1007
  "model.language_model.layers.39.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
1008
  "model.language_model.layers.39.self_attn.k_norm.weight": "model-00004-of-00006.safetensors",
1009
- "model.language_model.layers.39.self_attn.k_proj.input_global_scale": "model-00004-of-00006.safetensors",
1010
- "model.language_model.layers.39.self_attn.k_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1011
- "model.language_model.layers.39.self_attn.k_proj.weight_packed": "model-00004-of-00006.safetensors",
1012
  "model.language_model.layers.39.self_attn.k_proj.weight_scale": "model-00004-of-00006.safetensors",
1013
  "model.language_model.layers.39.self_attn.o_proj.input_global_scale": "model-00004-of-00006.safetensors",
1014
  "model.language_model.layers.39.self_attn.o_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1015
  "model.language_model.layers.39.self_attn.o_proj.weight_packed": "model-00004-of-00006.safetensors",
1016
  "model.language_model.layers.39.self_attn.o_proj.weight_scale": "model-00004-of-00006.safetensors",
1017
  "model.language_model.layers.39.self_attn.q_norm.weight": "model-00004-of-00006.safetensors",
1018
- "model.language_model.layers.39.self_attn.q_proj.input_global_scale": "model-00004-of-00006.safetensors",
1019
- "model.language_model.layers.39.self_attn.q_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1020
- "model.language_model.layers.39.self_attn.q_proj.weight_packed": "model-00004-of-00006.safetensors",
1021
  "model.language_model.layers.39.self_attn.q_proj.weight_scale": "model-00004-of-00006.safetensors",
1022
- "model.language_model.layers.39.self_attn.v_proj.input_global_scale": "model-00004-of-00006.safetensors",
1023
- "model.language_model.layers.39.self_attn.v_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1024
- "model.language_model.layers.39.self_attn.v_proj.weight_packed": "model-00004-of-00006.safetensors",
1025
  "model.language_model.layers.39.self_attn.v_proj.weight_scale": "model-00004-of-00006.safetensors",
1026
  "model.language_model.layers.4.input_layernorm.weight": "model-00004-of-00006.safetensors",
1027
  "model.language_model.layers.4.linear_attn.A_log": "model-00004-of-00006.safetensors",
@@ -1174,6 +1178,7 @@
1174
  "model.language_model.layers.43.self_attn.k_norm.weight": "model-00004-of-00006.safetensors",
1175
  "model.language_model.layers.43.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
1176
  "model.language_model.layers.43.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
 
1177
  "model.language_model.layers.43.self_attn.q_norm.weight": "model-00004-of-00006.safetensors",
1178
  "model.language_model.layers.43.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
1179
  "model.language_model.layers.43.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
@@ -1222,9 +1227,13 @@
1222
  "model.language_model.layers.45.mlp.down_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1223
  "model.language_model.layers.45.mlp.down_proj.weight_packed": "model-00004-of-00006.safetensors",
1224
  "model.language_model.layers.45.mlp.down_proj.weight_scale": "model-00004-of-00006.safetensors",
1225
- "model.language_model.layers.45.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
 
 
1226
  "model.language_model.layers.45.mlp.gate_proj.weight_scale": "model-00004-of-00006.safetensors",
1227
- "model.language_model.layers.45.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
 
 
1228
  "model.language_model.layers.45.mlp.up_proj.weight_scale": "model-00004-of-00006.safetensors",
1229
  "model.language_model.layers.45.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
1230
  "model.language_model.layers.46.input_layernorm.weight": "model-00004-of-00006.safetensors",
@@ -1236,7 +1245,9 @@
1236
  "model.language_model.layers.46.linear_attn.in_proj_qkv.weight": "model-00004-of-00006.safetensors",
1237
  "model.language_model.layers.46.linear_attn.in_proj_z.weight": "model-00004-of-00006.safetensors",
1238
  "model.language_model.layers.46.linear_attn.norm.weight": "model-00004-of-00006.safetensors",
1239
- "model.language_model.layers.46.linear_attn.out_proj.weight": "model-00004-of-00006.safetensors",
 
 
1240
  "model.language_model.layers.46.linear_attn.out_proj.weight_scale": "model-00004-of-00006.safetensors",
1241
  "model.language_model.layers.46.mlp.down_proj.input_global_scale": "model-00004-of-00006.safetensors",
1242
  "model.language_model.layers.46.mlp.down_proj.weight_global_scale": "model-00004-of-00006.safetensors",
@@ -1259,7 +1270,9 @@
1259
  "model.language_model.layers.47.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
1260
  "model.language_model.layers.47.self_attn.k_norm.weight": "model-00005-of-00006.safetensors",
1261
  "model.language_model.layers.47.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
1262
- "model.language_model.layers.47.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
 
 
1263
  "model.language_model.layers.47.self_attn.o_proj.weight_scale": "model-00005-of-00006.safetensors",
1264
  "model.language_model.layers.47.self_attn.q_norm.weight": "model-00005-of-00006.safetensors",
1265
  "model.language_model.layers.47.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
@@ -1395,22 +1408,16 @@
1395
  "model.language_model.layers.51.mlp.up_proj.weight_scale": "model-00005-of-00006.safetensors",
1396
  "model.language_model.layers.51.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
1397
  "model.language_model.layers.51.self_attn.k_norm.weight": "model-00005-of-00006.safetensors",
1398
- "model.language_model.layers.51.self_attn.k_proj.input_global_scale": "model-00005-of-00006.safetensors",
1399
- "model.language_model.layers.51.self_attn.k_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1400
- "model.language_model.layers.51.self_attn.k_proj.weight_packed": "model-00005-of-00006.safetensors",
1401
  "model.language_model.layers.51.self_attn.k_proj.weight_scale": "model-00005-of-00006.safetensors",
1402
  "model.language_model.layers.51.self_attn.o_proj.input_global_scale": "model-00005-of-00006.safetensors",
1403
  "model.language_model.layers.51.self_attn.o_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1404
  "model.language_model.layers.51.self_attn.o_proj.weight_packed": "model-00005-of-00006.safetensors",
1405
  "model.language_model.layers.51.self_attn.o_proj.weight_scale": "model-00005-of-00006.safetensors",
1406
  "model.language_model.layers.51.self_attn.q_norm.weight": "model-00005-of-00006.safetensors",
1407
- "model.language_model.layers.51.self_attn.q_proj.input_global_scale": "model-00005-of-00006.safetensors",
1408
- "model.language_model.layers.51.self_attn.q_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1409
- "model.language_model.layers.51.self_attn.q_proj.weight_packed": "model-00005-of-00006.safetensors",
1410
  "model.language_model.layers.51.self_attn.q_proj.weight_scale": "model-00005-of-00006.safetensors",
1411
- "model.language_model.layers.51.self_attn.v_proj.input_global_scale": "model-00005-of-00006.safetensors",
1412
- "model.language_model.layers.51.self_attn.v_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1413
- "model.language_model.layers.51.self_attn.v_proj.weight_packed": "model-00005-of-00006.safetensors",
1414
  "model.language_model.layers.51.self_attn.v_proj.weight_scale": "model-00005-of-00006.safetensors",
1415
  "model.language_model.layers.52.input_layernorm.weight": "model-00005-of-00006.safetensors",
1416
  "model.language_model.layers.52.linear_attn.A_log": "model-00005-of-00006.safetensors",
@@ -1666,14 +1673,13 @@
1666
  "model.language_model.layers.59.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
1667
  "model.language_model.layers.59.self_attn.k_norm.weight": "model-00005-of-00006.safetensors",
1668
  "model.language_model.layers.59.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
1669
- "model.language_model.layers.59.self_attn.k_proj.weight_scale": "model-00005-of-00006.safetensors",
1670
- "model.language_model.layers.59.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
 
1671
  "model.language_model.layers.59.self_attn.o_proj.weight_scale": "model-00005-of-00006.safetensors",
1672
  "model.language_model.layers.59.self_attn.q_norm.weight": "model-00005-of-00006.safetensors",
1673
  "model.language_model.layers.59.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
1674
- "model.language_model.layers.59.self_attn.q_proj.weight_scale": "model-00005-of-00006.safetensors",
1675
  "model.language_model.layers.59.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
1676
- "model.language_model.layers.59.self_attn.v_proj.weight_scale": "model-00005-of-00006.safetensors",
1677
  "model.language_model.layers.6.input_layernorm.weight": "model-00005-of-00006.safetensors",
1678
  "model.language_model.layers.6.linear_attn.A_log": "model-00005-of-00006.safetensors",
1679
  "model.language_model.layers.6.linear_attn.conv1d.weight": "model-00005-of-00006.safetensors",
@@ -1800,13 +1806,13 @@
1800
  "model.language_model.layers.62.linear_attn.out_proj.input_global_scale": "model-00005-of-00006.safetensors",
1801
  "model.language_model.layers.62.linear_attn.out_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1802
  "model.language_model.layers.62.linear_attn.out_proj.weight_packed": "model-00005-of-00006.safetensors",
1803
- "model.language_model.layers.62.linear_attn.out_proj.weight_scale": "model-00005-of-00006.safetensors",
1804
- "model.language_model.layers.62.mlp.down_proj.input_global_scale": "model-00005-of-00006.safetensors",
1805
- "model.language_model.layers.62.mlp.down_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1806
- "model.language_model.layers.62.mlp.down_proj.weight_packed": "model-00005-of-00006.safetensors",
1807
- "model.language_model.layers.62.mlp.down_proj.weight_scale": "model-00005-of-00006.safetensors",
1808
- "model.language_model.layers.62.mlp.gate_proj.input_global_scale": "model-00005-of-00006.safetensors",
1809
- "model.language_model.layers.62.mlp.gate_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1810
  "model.language_model.layers.62.mlp.gate_proj.weight_packed": "model-00006-of-00006.safetensors",
1811
  "model.language_model.layers.62.mlp.gate_proj.weight_scale": "model-00006-of-00006.safetensors",
1812
  "model.language_model.layers.62.mlp.up_proj.input_global_scale": "model-00006-of-00006.safetensors",
@@ -1815,20 +1821,25 @@
1815
  "model.language_model.layers.62.mlp.up_proj.weight_scale": "model-00006-of-00006.safetensors",
1816
  "model.language_model.layers.62.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
1817
  "model.language_model.layers.63.input_layernorm.weight": "model-00006-of-00006.safetensors",
1818
- "model.language_model.layers.63.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
 
 
1819
  "model.language_model.layers.63.mlp.down_proj.weight_scale": "model-00006-of-00006.safetensors",
1820
  "model.language_model.layers.63.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
1821
  "model.language_model.layers.63.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
1822
  "model.language_model.layers.63.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
1823
  "model.language_model.layers.63.self_attn.k_norm.weight": "model-00006-of-00006.safetensors",
1824
  "model.language_model.layers.63.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
 
1825
  "model.language_model.layers.63.self_attn.o_proj.input_global_scale": "model-00006-of-00006.safetensors",
1826
  "model.language_model.layers.63.self_attn.o_proj.weight_global_scale": "model-00006-of-00006.safetensors",
1827
  "model.language_model.layers.63.self_attn.o_proj.weight_packed": "model-00006-of-00006.safetensors",
1828
  "model.language_model.layers.63.self_attn.o_proj.weight_scale": "model-00006-of-00006.safetensors",
1829
  "model.language_model.layers.63.self_attn.q_norm.weight": "model-00006-of-00006.safetensors",
1830
  "model.language_model.layers.63.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
 
1831
  "model.language_model.layers.63.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
 
1832
  "model.language_model.layers.7.input_layernorm.weight": "model-00006-of-00006.safetensors",
1833
  "model.language_model.layers.7.mlp.down_proj.input_global_scale": "model-00006-of-00006.safetensors",
1834
  "model.language_model.layers.7.mlp.down_proj.weight_global_scale": "model-00006-of-00006.safetensors",
@@ -1845,16 +1856,13 @@
1845
  "model.language_model.layers.7.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
1846
  "model.language_model.layers.7.self_attn.k_norm.weight": "model-00006-of-00006.safetensors",
1847
  "model.language_model.layers.7.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
1848
- "model.language_model.layers.7.self_attn.k_proj.weight_scale": "model-00006-of-00006.safetensors",
1849
  "model.language_model.layers.7.self_attn.o_proj.input_global_scale": "model-00006-of-00006.safetensors",
1850
  "model.language_model.layers.7.self_attn.o_proj.weight_global_scale": "model-00006-of-00006.safetensors",
1851
  "model.language_model.layers.7.self_attn.o_proj.weight_packed": "model-00006-of-00006.safetensors",
1852
  "model.language_model.layers.7.self_attn.o_proj.weight_scale": "model-00006-of-00006.safetensors",
1853
  "model.language_model.layers.7.self_attn.q_norm.weight": "model-00006-of-00006.safetensors",
1854
  "model.language_model.layers.7.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
1855
- "model.language_model.layers.7.self_attn.q_proj.weight_scale": "model-00006-of-00006.safetensors",
1856
  "model.language_model.layers.7.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
1857
- "model.language_model.layers.7.self_attn.v_proj.weight_scale": "model-00006-of-00006.safetensors",
1858
  "model.language_model.layers.8.input_layernorm.weight": "model-00006-of-00006.safetensors",
1859
  "model.language_model.layers.8.linear_attn.A_log": "model-00006-of-00006.safetensors",
1860
  "model.language_model.layers.8.linear_attn.conv1d.weight": "model-00006-of-00006.safetensors",
@@ -1957,9 +1965,14 @@
1957
  "model.visual.blocks.1.norm2.bias": "model-00006-of-00006.safetensors",
1958
  "model.visual.blocks.1.norm2.weight": "model-00006-of-00006.safetensors",
1959
  "model.visual.blocks.10.attn.proj.bias": "model-00006-of-00006.safetensors",
1960
- "model.visual.blocks.10.attn.proj.weight": "model-00006-of-00006.safetensors",
 
 
 
1961
  "model.visual.blocks.10.attn.qkv.bias": "model-00006-of-00006.safetensors",
1962
- "model.visual.blocks.10.attn.qkv.weight": "model-00006-of-00006.safetensors",
 
 
1963
  "model.visual.blocks.10.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
1964
  "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
1965
  "model.visual.blocks.10.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
@@ -1976,10 +1989,14 @@
1976
  "model.visual.blocks.10.norm2.bias": "model-00006-of-00006.safetensors",
1977
  "model.visual.blocks.10.norm2.weight": "model-00006-of-00006.safetensors",
1978
  "model.visual.blocks.11.attn.proj.bias": "model-00006-of-00006.safetensors",
1979
- "model.visual.blocks.11.attn.proj.weight": "model-00006-of-00006.safetensors",
 
 
1980
  "model.visual.blocks.11.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
1981
  "model.visual.blocks.11.attn.qkv.bias": "model-00006-of-00006.safetensors",
1982
- "model.visual.blocks.11.attn.qkv.weight": "model-00006-of-00006.safetensors",
 
 
1983
  "model.visual.blocks.11.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
1984
  "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
1985
  "model.visual.blocks.11.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
@@ -2001,7 +2018,9 @@
2001
  "model.visual.blocks.12.attn.proj.weight_packed": "model-00006-of-00006.safetensors",
2002
  "model.visual.blocks.12.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
2003
  "model.visual.blocks.12.attn.qkv.bias": "model-00006-of-00006.safetensors",
2004
- "model.visual.blocks.12.attn.qkv.weight": "model-00006-of-00006.safetensors",
 
 
2005
  "model.visual.blocks.12.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2006
  "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2007
  "model.visual.blocks.12.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
@@ -2018,10 +2037,13 @@
2018
  "model.visual.blocks.12.norm2.bias": "model-00006-of-00006.safetensors",
2019
  "model.visual.blocks.12.norm2.weight": "model-00006-of-00006.safetensors",
2020
  "model.visual.blocks.13.attn.proj.bias": "model-00006-of-00006.safetensors",
2021
- "model.visual.blocks.13.attn.proj.weight": "model-00006-of-00006.safetensors",
 
 
2022
  "model.visual.blocks.13.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
2023
  "model.visual.blocks.13.attn.qkv.bias": "model-00006-of-00006.safetensors",
2024
  "model.visual.blocks.13.attn.qkv.weight": "model-00006-of-00006.safetensors",
 
2025
  "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2026
  "model.visual.blocks.13.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
2027
  "model.visual.blocks.13.mlp.linear_fc1.weight_global_scale": "model-00006-of-00006.safetensors",
@@ -2042,7 +2064,9 @@
2042
  "model.visual.blocks.14.attn.proj.weight_packed": "model-00006-of-00006.safetensors",
2043
  "model.visual.blocks.14.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
2044
  "model.visual.blocks.14.attn.qkv.bias": "model-00006-of-00006.safetensors",
2045
- "model.visual.blocks.14.attn.qkv.weight": "model-00006-of-00006.safetensors",
 
 
2046
  "model.visual.blocks.14.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2047
  "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2048
  "model.visual.blocks.14.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
@@ -2296,7 +2320,10 @@
2296
  "model.visual.blocks.24.mlp.linear_fc1.weight_packed": "model-00006-of-00006.safetensors",
2297
  "model.visual.blocks.24.mlp.linear_fc1.weight_scale": "model-00006-of-00006.safetensors",
2298
  "model.visual.blocks.24.mlp.linear_fc2.bias": "model-00006-of-00006.safetensors",
2299
- "model.visual.blocks.24.mlp.linear_fc2.weight": "model-00006-of-00006.safetensors",
 
 
 
2300
  "model.visual.blocks.24.norm1.bias": "model-00006-of-00006.safetensors",
2301
  "model.visual.blocks.24.norm1.weight": "model-00006-of-00006.safetensors",
2302
  "model.visual.blocks.24.norm2.bias": "model-00006-of-00006.safetensors",
@@ -2372,7 +2399,10 @@
2372
  "model.visual.blocks.5.attn.qkv.bias": "model-00006-of-00006.safetensors",
2373
  "model.visual.blocks.5.attn.qkv.weight": "model-00006-of-00006.safetensors",
2374
  "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2375
- "model.visual.blocks.5.mlp.linear_fc1.weight": "model-00006-of-00006.safetensors",
 
 
 
2376
  "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00006-of-00006.safetensors",
2377
  "model.visual.blocks.5.mlp.linear_fc2.input_global_scale": "model-00006-of-00006.safetensors",
2378
  "model.visual.blocks.5.mlp.linear_fc2.weight_global_scale": "model-00006-of-00006.safetensors",
@@ -2403,8 +2433,11 @@
2403
  "model.visual.blocks.6.norm2.weight": "model-00006-of-00006.safetensors",
2404
  "model.visual.blocks.7.attn.proj.bias": "model-00006-of-00006.safetensors",
2405
  "model.visual.blocks.7.attn.proj.weight": "model-00006-of-00006.safetensors",
 
2406
  "model.visual.blocks.7.attn.qkv.bias": "model-00006-of-00006.safetensors",
2407
- "model.visual.blocks.7.attn.qkv.weight": "model-00006-of-00006.safetensors",
 
 
2408
  "model.visual.blocks.7.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2409
  "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2410
  "model.visual.blocks.7.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
@@ -2424,6 +2457,7 @@
2424
  "model.visual.blocks.8.attn.proj.weight": "model-00006-of-00006.safetensors",
2425
  "model.visual.blocks.8.attn.qkv.bias": "model-00006-of-00006.safetensors",
2426
  "model.visual.blocks.8.attn.qkv.weight": "model-00006-of-00006.safetensors",
 
2427
  "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2428
  "model.visual.blocks.8.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
2429
  "model.visual.blocks.8.mlp.linear_fc1.weight_global_scale": "model-00006-of-00006.safetensors",
@@ -2440,6 +2474,7 @@
2440
  "model.visual.blocks.8.norm2.weight": "model-00006-of-00006.safetensors",
2441
  "model.visual.blocks.9.attn.proj.bias": "model-00006-of-00006.safetensors",
2442
  "model.visual.blocks.9.attn.proj.weight": "model-00006-of-00006.safetensors",
 
2443
  "model.visual.blocks.9.attn.qkv.bias": "model-00006-of-00006.safetensors",
2444
  "model.visual.blocks.9.attn.qkv.weight": "model-00006-of-00006.safetensors",
2445
  "model.visual.blocks.9.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 22668647304
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00001-of-00006.safetensors",
 
120
  "model.language_model.layers.11.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
121
  "model.language_model.layers.11.self_attn.k_norm.weight": "model-00002-of-00006.safetensors",
122
  "model.language_model.layers.11.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
 
123
  "model.language_model.layers.11.self_attn.o_proj.input_global_scale": "model-00002-of-00006.safetensors",
124
  "model.language_model.layers.11.self_attn.o_proj.weight_global_scale": "model-00002-of-00006.safetensors",
125
  "model.language_model.layers.11.self_attn.o_proj.weight_packed": "model-00002-of-00006.safetensors",
126
  "model.language_model.layers.11.self_attn.o_proj.weight_scale": "model-00002-of-00006.safetensors",
127
  "model.language_model.layers.11.self_attn.q_norm.weight": "model-00002-of-00006.safetensors",
128
  "model.language_model.layers.11.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
 
129
  "model.language_model.layers.11.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
 
130
  "model.language_model.layers.12.input_layernorm.weight": "model-00002-of-00006.safetensors",
131
  "model.language_model.layers.12.linear_attn.A_log": "model-00002-of-00006.safetensors",
132
  "model.language_model.layers.12.linear_attn.conv1d.weight": "model-00002-of-00006.safetensors",
 
239
  "model.language_model.layers.15.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
240
  "model.language_model.layers.15.self_attn.k_norm.weight": "model-00002-of-00006.safetensors",
241
  "model.language_model.layers.15.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
 
242
  "model.language_model.layers.15.self_attn.o_proj.input_global_scale": "model-00002-of-00006.safetensors",
243
  "model.language_model.layers.15.self_attn.o_proj.weight_global_scale": "model-00002-of-00006.safetensors",
244
  "model.language_model.layers.15.self_attn.o_proj.weight_packed": "model-00002-of-00006.safetensors",
245
  "model.language_model.layers.15.self_attn.o_proj.weight_scale": "model-00002-of-00006.safetensors",
246
  "model.language_model.layers.15.self_attn.q_norm.weight": "model-00002-of-00006.safetensors",
247
  "model.language_model.layers.15.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
 
248
  "model.language_model.layers.15.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
 
249
  "model.language_model.layers.16.input_layernorm.weight": "model-00002-of-00006.safetensors",
250
  "model.language_model.layers.16.linear_attn.A_log": "model-00002-of-00006.safetensors",
251
  "model.language_model.layers.16.linear_attn.conv1d.weight": "model-00002-of-00006.safetensors",
 
254
  "model.language_model.layers.16.linear_attn.in_proj_b.weight": "model-00002-of-00006.safetensors",
255
  "model.language_model.layers.16.linear_attn.in_proj_qkv.input_global_scale": "model-00002-of-00006.safetensors",
256
  "model.language_model.layers.16.linear_attn.in_proj_qkv.weight_global_scale": "model-00002-of-00006.safetensors",
257
+ "model.language_model.layers.16.linear_attn.in_proj_qkv.weight_packed": "model-00003-of-00006.safetensors",
258
+ "model.language_model.layers.16.linear_attn.in_proj_qkv.weight_scale": "model-00003-of-00006.safetensors",
259
+ "model.language_model.layers.16.linear_attn.in_proj_z.input_global_scale": "model-00003-of-00006.safetensors",
260
+ "model.language_model.layers.16.linear_attn.in_proj_z.weight_global_scale": "model-00003-of-00006.safetensors",
261
+ "model.language_model.layers.16.linear_attn.in_proj_z.weight_packed": "model-00003-of-00006.safetensors",
262
+ "model.language_model.layers.16.linear_attn.in_proj_z.weight_scale": "model-00003-of-00006.safetensors",
263
+ "model.language_model.layers.16.linear_attn.norm.weight": "model-00003-of-00006.safetensors",
264
+ "model.language_model.layers.16.linear_attn.out_proj.input_global_scale": "model-00003-of-00006.safetensors",
265
+ "model.language_model.layers.16.linear_attn.out_proj.weight_global_scale": "model-00003-of-00006.safetensors",
266
+ "model.language_model.layers.16.linear_attn.out_proj.weight_packed": "model-00003-of-00006.safetensors",
267
+ "model.language_model.layers.16.linear_attn.out_proj.weight_scale": "model-00003-of-00006.safetensors",
268
+ "model.language_model.layers.16.mlp.down_proj.input_global_scale": "model-00003-of-00006.safetensors",
269
+ "model.language_model.layers.16.mlp.down_proj.weight_global_scale": "model-00003-of-00006.safetensors",
270
+ "model.language_model.layers.16.mlp.down_proj.weight_packed": "model-00003-of-00006.safetensors",
271
+ "model.language_model.layers.16.mlp.down_proj.weight_scale": "model-00003-of-00006.safetensors",
272
+ "model.language_model.layers.16.mlp.gate_proj.input_global_scale": "model-00003-of-00006.safetensors",
273
+ "model.language_model.layers.16.mlp.gate_proj.weight_global_scale": "model-00003-of-00006.safetensors",
274
  "model.language_model.layers.16.mlp.gate_proj.weight_packed": "model-00003-of-00006.safetensors",
275
  "model.language_model.layers.16.mlp.gate_proj.weight_scale": "model-00003-of-00006.safetensors",
276
  "model.language_model.layers.16.mlp.up_proj.input_global_scale": "model-00003-of-00006.safetensors",
 
314
  "model.language_model.layers.18.linear_attn.A_log": "model-00003-of-00006.safetensors",
315
  "model.language_model.layers.18.linear_attn.conv1d.weight": "model-00003-of-00006.safetensors",
316
  "model.language_model.layers.18.linear_attn.dt_bias": "model-00003-of-00006.safetensors",
317
+ "model.language_model.layers.18.linear_attn.in_proj_a.input_global_scale": "model-00003-of-00006.safetensors",
318
+ "model.language_model.layers.18.linear_attn.in_proj_a.weight_global_scale": "model-00003-of-00006.safetensors",
319
+ "model.language_model.layers.18.linear_attn.in_proj_a.weight_packed": "model-00003-of-00006.safetensors",
320
+ "model.language_model.layers.18.linear_attn.in_proj_a.weight_scale": "model-00003-of-00006.safetensors",
321
+ "model.language_model.layers.18.linear_attn.in_proj_b.input_global_scale": "model-00003-of-00006.safetensors",
322
+ "model.language_model.layers.18.linear_attn.in_proj_b.weight_global_scale": "model-00003-of-00006.safetensors",
323
+ "model.language_model.layers.18.linear_attn.in_proj_b.weight_packed": "model-00003-of-00006.safetensors",
324
+ "model.language_model.layers.18.linear_attn.in_proj_b.weight_scale": "model-00003-of-00006.safetensors",
325
  "model.language_model.layers.18.linear_attn.in_proj_qkv.input_global_scale": "model-00003-of-00006.safetensors",
326
  "model.language_model.layers.18.linear_attn.in_proj_qkv.weight_global_scale": "model-00003-of-00006.safetensors",
327
  "model.language_model.layers.18.linear_attn.in_proj_qkv.weight_packed": "model-00003-of-00006.safetensors",
 
363
  "model.language_model.layers.19.mlp.up_proj.weight_scale": "model-00003-of-00006.safetensors",
364
  "model.language_model.layers.19.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
365
  "model.language_model.layers.19.self_attn.k_norm.weight": "model-00003-of-00006.safetensors",
366
+ "model.language_model.layers.19.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
 
 
 
367
  "model.language_model.layers.19.self_attn.o_proj.input_global_scale": "model-00003-of-00006.safetensors",
368
  "model.language_model.layers.19.self_attn.o_proj.weight_global_scale": "model-00003-of-00006.safetensors",
369
  "model.language_model.layers.19.self_attn.o_proj.weight_packed": "model-00003-of-00006.safetensors",
370
  "model.language_model.layers.19.self_attn.o_proj.weight_scale": "model-00003-of-00006.safetensors",
371
  "model.language_model.layers.19.self_attn.q_norm.weight": "model-00003-of-00006.safetensors",
372
+ "model.language_model.layers.19.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
373
+ "model.language_model.layers.19.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
 
 
 
 
 
 
374
  "model.language_model.layers.2.input_layernorm.weight": "model-00003-of-00006.safetensors",
375
  "model.language_model.layers.2.linear_attn.A_log": "model-00003-of-00006.safetensors",
376
  "model.language_model.layers.2.linear_attn.conv1d.weight": "model-00003-of-00006.safetensors",
 
407
  "model.language_model.layers.20.linear_attn.A_log": "model-00003-of-00006.safetensors",
408
  "model.language_model.layers.20.linear_attn.conv1d.weight": "model-00003-of-00006.safetensors",
409
  "model.language_model.layers.20.linear_attn.dt_bias": "model-00003-of-00006.safetensors",
410
+ "model.language_model.layers.20.linear_attn.in_proj_a.input_global_scale": "model-00003-of-00006.safetensors",
411
+ "model.language_model.layers.20.linear_attn.in_proj_a.weight_global_scale": "model-00003-of-00006.safetensors",
412
+ "model.language_model.layers.20.linear_attn.in_proj_a.weight_packed": "model-00003-of-00006.safetensors",
413
+ "model.language_model.layers.20.linear_attn.in_proj_a.weight_scale": "model-00003-of-00006.safetensors",
414
+ "model.language_model.layers.20.linear_attn.in_proj_b.input_global_scale": "model-00003-of-00006.safetensors",
415
+ "model.language_model.layers.20.linear_attn.in_proj_b.weight_global_scale": "model-00003-of-00006.safetensors",
416
+ "model.language_model.layers.20.linear_attn.in_proj_b.weight_packed": "model-00003-of-00006.safetensors",
417
+ "model.language_model.layers.20.linear_attn.in_proj_b.weight_scale": "model-00003-of-00006.safetensors",
418
  "model.language_model.layers.20.linear_attn.in_proj_qkv.input_global_scale": "model-00003-of-00006.safetensors",
419
  "model.language_model.layers.20.linear_attn.in_proj_qkv.weight_global_scale": "model-00003-of-00006.safetensors",
420
  "model.language_model.layers.20.linear_attn.in_proj_qkv.weight_packed": "model-00003-of-00006.safetensors",
 
532
  "model.language_model.layers.23.mlp.up_proj.weight_scale": "model-00003-of-00006.safetensors",
533
  "model.language_model.layers.23.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
534
  "model.language_model.layers.23.self_attn.k_norm.weight": "model-00003-of-00006.safetensors",
535
+ "model.language_model.layers.23.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
 
 
536
  "model.language_model.layers.23.self_attn.k_proj.weight_scale": "model-00003-of-00006.safetensors",
537
  "model.language_model.layers.23.self_attn.o_proj.input_global_scale": "model-00003-of-00006.safetensors",
538
  "model.language_model.layers.23.self_attn.o_proj.weight_global_scale": "model-00003-of-00006.safetensors",
539
  "model.language_model.layers.23.self_attn.o_proj.weight_packed": "model-00003-of-00006.safetensors",
540
  "model.language_model.layers.23.self_attn.o_proj.weight_scale": "model-00003-of-00006.safetensors",
541
  "model.language_model.layers.23.self_attn.q_norm.weight": "model-00003-of-00006.safetensors",
542
+ "model.language_model.layers.23.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
 
 
543
  "model.language_model.layers.23.self_attn.q_proj.weight_scale": "model-00003-of-00006.safetensors",
544
+ "model.language_model.layers.23.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
 
 
545
  "model.language_model.layers.23.self_attn.v_proj.weight_scale": "model-00003-of-00006.safetensors",
546
  "model.language_model.layers.24.input_layernorm.weight": "model-00003-of-00006.safetensors",
547
  "model.language_model.layers.24.linear_attn.A_log": "model-00003-of-00006.safetensors",
 
655
  "model.language_model.layers.27.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
656
  "model.language_model.layers.27.self_attn.k_norm.weight": "model-00003-of-00006.safetensors",
657
  "model.language_model.layers.27.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
658
+ "model.language_model.layers.27.self_attn.o_proj.input_global_scale": "model-00003-of-00006.safetensors",
659
+ "model.language_model.layers.27.self_attn.o_proj.weight_global_scale": "model-00003-of-00006.safetensors",
660
+ "model.language_model.layers.27.self_attn.o_proj.weight_packed": "model-00003-of-00006.safetensors",
661
  "model.language_model.layers.27.self_attn.o_proj.weight_scale": "model-00003-of-00006.safetensors",
662
  "model.language_model.layers.27.self_attn.q_norm.weight": "model-00003-of-00006.safetensors",
663
  "model.language_model.layers.27.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
 
668
  "model.language_model.layers.28.linear_attn.dt_bias": "model-00003-of-00006.safetensors",
669
  "model.language_model.layers.28.linear_attn.in_proj_a.weight": "model-00003-of-00006.safetensors",
670
  "model.language_model.layers.28.linear_attn.in_proj_b.weight": "model-00003-of-00006.safetensors",
671
+ "model.language_model.layers.28.linear_attn.in_proj_qkv.input_global_scale": "model-00003-of-00006.safetensors",
672
+ "model.language_model.layers.28.linear_attn.in_proj_qkv.weight_global_scale": "model-00003-of-00006.safetensors",
673
+ "model.language_model.layers.28.linear_attn.in_proj_qkv.weight_packed": "model-00003-of-00006.safetensors",
674
  "model.language_model.layers.28.linear_attn.in_proj_qkv.weight_scale": "model-00003-of-00006.safetensors",
675
+ "model.language_model.layers.28.linear_attn.in_proj_z.input_global_scale": "model-00003-of-00006.safetensors",
676
+ "model.language_model.layers.28.linear_attn.in_proj_z.weight_global_scale": "model-00003-of-00006.safetensors",
677
+ "model.language_model.layers.28.linear_attn.in_proj_z.weight_packed": "model-00003-of-00006.safetensors",
678
  "model.language_model.layers.28.linear_attn.in_proj_z.weight_scale": "model-00003-of-00006.safetensors",
679
  "model.language_model.layers.28.linear_attn.norm.weight": "model-00003-of-00006.safetensors",
680
  "model.language_model.layers.28.linear_attn.out_proj.input_global_scale": "model-00003-of-00006.safetensors",
 
788
  "model.language_model.layers.31.self_attn.k_norm.weight": "model-00003-of-00006.safetensors",
789
  "model.language_model.layers.31.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
790
  "model.language_model.layers.31.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
791
+ "model.language_model.layers.31.self_attn.o_proj.weight_scale": "model-00003-of-00006.safetensors",
792
  "model.language_model.layers.31.self_attn.q_norm.weight": "model-00003-of-00006.safetensors",
793
  "model.language_model.layers.31.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
794
  "model.language_model.layers.31.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
 
798
  "model.language_model.layers.32.linear_attn.dt_bias": "model-00003-of-00006.safetensors",
799
  "model.language_model.layers.32.linear_attn.in_proj_a.weight": "model-00003-of-00006.safetensors",
800
  "model.language_model.layers.32.linear_attn.in_proj_b.weight": "model-00003-of-00006.safetensors",
801
+ "model.language_model.layers.32.linear_attn.in_proj_qkv.weight": "model-00004-of-00006.safetensors",
802
+ "model.language_model.layers.32.linear_attn.in_proj_z.weight": "model-00004-of-00006.safetensors",
803
+ "model.language_model.layers.32.linear_attn.norm.weight": "model-00004-of-00006.safetensors",
804
+ "model.language_model.layers.32.linear_attn.out_proj.input_global_scale": "model-00004-of-00006.safetensors",
805
+ "model.language_model.layers.32.linear_attn.out_proj.weight_global_scale": "model-00004-of-00006.safetensors",
806
+ "model.language_model.layers.32.linear_attn.out_proj.weight_packed": "model-00004-of-00006.safetensors",
807
+ "model.language_model.layers.32.linear_attn.out_proj.weight_scale": "model-00004-of-00006.safetensors",
808
+ "model.language_model.layers.32.mlp.down_proj.input_global_scale": "model-00004-of-00006.safetensors",
809
+ "model.language_model.layers.32.mlp.down_proj.weight_global_scale": "model-00004-of-00006.safetensors",
810
  "model.language_model.layers.32.mlp.down_proj.weight_packed": "model-00004-of-00006.safetensors",
811
  "model.language_model.layers.32.mlp.down_proj.weight_scale": "model-00004-of-00006.safetensors",
812
+ "model.language_model.layers.32.mlp.gate_proj.input_global_scale": "model-00004-of-00006.safetensors",
813
+ "model.language_model.layers.32.mlp.gate_proj.weight_global_scale": "model-00004-of-00006.safetensors",
814
+ "model.language_model.layers.32.mlp.gate_proj.weight_packed": "model-00004-of-00006.safetensors",
815
  "model.language_model.layers.32.mlp.gate_proj.weight_scale": "model-00004-of-00006.safetensors",
816
+ "model.language_model.layers.32.mlp.up_proj.input_global_scale": "model-00004-of-00006.safetensors",
817
+ "model.language_model.layers.32.mlp.up_proj.weight_global_scale": "model-00004-of-00006.safetensors",
818
+ "model.language_model.layers.32.mlp.up_proj.weight_packed": "model-00004-of-00006.safetensors",
819
  "model.language_model.layers.32.mlp.up_proj.weight_scale": "model-00004-of-00006.safetensors",
820
  "model.language_model.layers.32.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
821
  "model.language_model.layers.33.input_layernorm.weight": "model-00004-of-00006.safetensors",
 
886
  "model.language_model.layers.35.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
887
  "model.language_model.layers.35.self_attn.k_norm.weight": "model-00004-of-00006.safetensors",
888
  "model.language_model.layers.35.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
889
+ "model.language_model.layers.35.self_attn.o_proj.input_global_scale": "model-00004-of-00006.safetensors",
890
+ "model.language_model.layers.35.self_attn.o_proj.weight_global_scale": "model-00004-of-00006.safetensors",
891
+ "model.language_model.layers.35.self_attn.o_proj.weight_packed": "model-00004-of-00006.safetensors",
892
  "model.language_model.layers.35.self_attn.o_proj.weight_scale": "model-00004-of-00006.safetensors",
893
  "model.language_model.layers.35.self_attn.q_norm.weight": "model-00004-of-00006.safetensors",
894
  "model.language_model.layers.35.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
 
897
  "model.language_model.layers.36.linear_attn.A_log": "model-00004-of-00006.safetensors",
898
  "model.language_model.layers.36.linear_attn.conv1d.weight": "model-00004-of-00006.safetensors",
899
  "model.language_model.layers.36.linear_attn.dt_bias": "model-00004-of-00006.safetensors",
900
+ "model.language_model.layers.36.linear_attn.in_proj_a.input_global_scale": "model-00004-of-00006.safetensors",
901
+ "model.language_model.layers.36.linear_attn.in_proj_a.weight_global_scale": "model-00004-of-00006.safetensors",
902
+ "model.language_model.layers.36.linear_attn.in_proj_a.weight_packed": "model-00004-of-00006.safetensors",
903
+ "model.language_model.layers.36.linear_attn.in_proj_a.weight_scale": "model-00004-of-00006.safetensors",
904
+ "model.language_model.layers.36.linear_attn.in_proj_b.input_global_scale": "model-00004-of-00006.safetensors",
905
+ "model.language_model.layers.36.linear_attn.in_proj_b.weight_global_scale": "model-00004-of-00006.safetensors",
906
+ "model.language_model.layers.36.linear_attn.in_proj_b.weight_packed": "model-00004-of-00006.safetensors",
907
+ "model.language_model.layers.36.linear_attn.in_proj_b.weight_scale": "model-00004-of-00006.safetensors",
908
  "model.language_model.layers.36.linear_attn.in_proj_qkv.input_global_scale": "model-00004-of-00006.safetensors",
909
  "model.language_model.layers.36.linear_attn.in_proj_qkv.weight_global_scale": "model-00004-of-00006.safetensors",
910
  "model.language_model.layers.36.linear_attn.in_proj_qkv.weight_packed": "model-00004-of-00006.safetensors",
 
1016
  "model.language_model.layers.39.mlp.up_proj.weight_scale": "model-00004-of-00006.safetensors",
1017
  "model.language_model.layers.39.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
1018
  "model.language_model.layers.39.self_attn.k_norm.weight": "model-00004-of-00006.safetensors",
1019
+ "model.language_model.layers.39.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
 
 
1020
  "model.language_model.layers.39.self_attn.k_proj.weight_scale": "model-00004-of-00006.safetensors",
1021
  "model.language_model.layers.39.self_attn.o_proj.input_global_scale": "model-00004-of-00006.safetensors",
1022
  "model.language_model.layers.39.self_attn.o_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1023
  "model.language_model.layers.39.self_attn.o_proj.weight_packed": "model-00004-of-00006.safetensors",
1024
  "model.language_model.layers.39.self_attn.o_proj.weight_scale": "model-00004-of-00006.safetensors",
1025
  "model.language_model.layers.39.self_attn.q_norm.weight": "model-00004-of-00006.safetensors",
1026
+ "model.language_model.layers.39.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
 
 
1027
  "model.language_model.layers.39.self_attn.q_proj.weight_scale": "model-00004-of-00006.safetensors",
1028
+ "model.language_model.layers.39.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
 
 
1029
  "model.language_model.layers.39.self_attn.v_proj.weight_scale": "model-00004-of-00006.safetensors",
1030
  "model.language_model.layers.4.input_layernorm.weight": "model-00004-of-00006.safetensors",
1031
  "model.language_model.layers.4.linear_attn.A_log": "model-00004-of-00006.safetensors",
 
1178
  "model.language_model.layers.43.self_attn.k_norm.weight": "model-00004-of-00006.safetensors",
1179
  "model.language_model.layers.43.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
1180
  "model.language_model.layers.43.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
1181
+ "model.language_model.layers.43.self_attn.o_proj.weight_scale": "model-00004-of-00006.safetensors",
1182
  "model.language_model.layers.43.self_attn.q_norm.weight": "model-00004-of-00006.safetensors",
1183
  "model.language_model.layers.43.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
1184
  "model.language_model.layers.43.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
 
1227
  "model.language_model.layers.45.mlp.down_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1228
  "model.language_model.layers.45.mlp.down_proj.weight_packed": "model-00004-of-00006.safetensors",
1229
  "model.language_model.layers.45.mlp.down_proj.weight_scale": "model-00004-of-00006.safetensors",
1230
+ "model.language_model.layers.45.mlp.gate_proj.input_global_scale": "model-00004-of-00006.safetensors",
1231
+ "model.language_model.layers.45.mlp.gate_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1232
+ "model.language_model.layers.45.mlp.gate_proj.weight_packed": "model-00004-of-00006.safetensors",
1233
  "model.language_model.layers.45.mlp.gate_proj.weight_scale": "model-00004-of-00006.safetensors",
1234
+ "model.language_model.layers.45.mlp.up_proj.input_global_scale": "model-00004-of-00006.safetensors",
1235
+ "model.language_model.layers.45.mlp.up_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1236
+ "model.language_model.layers.45.mlp.up_proj.weight_packed": "model-00004-of-00006.safetensors",
1237
  "model.language_model.layers.45.mlp.up_proj.weight_scale": "model-00004-of-00006.safetensors",
1238
  "model.language_model.layers.45.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
1239
  "model.language_model.layers.46.input_layernorm.weight": "model-00004-of-00006.safetensors",
 
1245
  "model.language_model.layers.46.linear_attn.in_proj_qkv.weight": "model-00004-of-00006.safetensors",
1246
  "model.language_model.layers.46.linear_attn.in_proj_z.weight": "model-00004-of-00006.safetensors",
1247
  "model.language_model.layers.46.linear_attn.norm.weight": "model-00004-of-00006.safetensors",
1248
+ "model.language_model.layers.46.linear_attn.out_proj.input_global_scale": "model-00004-of-00006.safetensors",
1249
+ "model.language_model.layers.46.linear_attn.out_proj.weight_global_scale": "model-00004-of-00006.safetensors",
1250
+ "model.language_model.layers.46.linear_attn.out_proj.weight_packed": "model-00004-of-00006.safetensors",
1251
  "model.language_model.layers.46.linear_attn.out_proj.weight_scale": "model-00004-of-00006.safetensors",
1252
  "model.language_model.layers.46.mlp.down_proj.input_global_scale": "model-00004-of-00006.safetensors",
1253
  "model.language_model.layers.46.mlp.down_proj.weight_global_scale": "model-00004-of-00006.safetensors",
 
1270
  "model.language_model.layers.47.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
1271
  "model.language_model.layers.47.self_attn.k_norm.weight": "model-00005-of-00006.safetensors",
1272
  "model.language_model.layers.47.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
1273
+ "model.language_model.layers.47.self_attn.o_proj.input_global_scale": "model-00005-of-00006.safetensors",
1274
+ "model.language_model.layers.47.self_attn.o_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1275
+ "model.language_model.layers.47.self_attn.o_proj.weight_packed": "model-00005-of-00006.safetensors",
1276
  "model.language_model.layers.47.self_attn.o_proj.weight_scale": "model-00005-of-00006.safetensors",
1277
  "model.language_model.layers.47.self_attn.q_norm.weight": "model-00005-of-00006.safetensors",
1278
  "model.language_model.layers.47.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
 
1408
  "model.language_model.layers.51.mlp.up_proj.weight_scale": "model-00005-of-00006.safetensors",
1409
  "model.language_model.layers.51.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
1410
  "model.language_model.layers.51.self_attn.k_norm.weight": "model-00005-of-00006.safetensors",
1411
+ "model.language_model.layers.51.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
 
 
1412
  "model.language_model.layers.51.self_attn.k_proj.weight_scale": "model-00005-of-00006.safetensors",
1413
  "model.language_model.layers.51.self_attn.o_proj.input_global_scale": "model-00005-of-00006.safetensors",
1414
  "model.language_model.layers.51.self_attn.o_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1415
  "model.language_model.layers.51.self_attn.o_proj.weight_packed": "model-00005-of-00006.safetensors",
1416
  "model.language_model.layers.51.self_attn.o_proj.weight_scale": "model-00005-of-00006.safetensors",
1417
  "model.language_model.layers.51.self_attn.q_norm.weight": "model-00005-of-00006.safetensors",
1418
+ "model.language_model.layers.51.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
 
 
1419
  "model.language_model.layers.51.self_attn.q_proj.weight_scale": "model-00005-of-00006.safetensors",
1420
+ "model.language_model.layers.51.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
 
 
1421
  "model.language_model.layers.51.self_attn.v_proj.weight_scale": "model-00005-of-00006.safetensors",
1422
  "model.language_model.layers.52.input_layernorm.weight": "model-00005-of-00006.safetensors",
1423
  "model.language_model.layers.52.linear_attn.A_log": "model-00005-of-00006.safetensors",
 
1673
  "model.language_model.layers.59.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
1674
  "model.language_model.layers.59.self_attn.k_norm.weight": "model-00005-of-00006.safetensors",
1675
  "model.language_model.layers.59.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
1676
+ "model.language_model.layers.59.self_attn.o_proj.input_global_scale": "model-00005-of-00006.safetensors",
1677
+ "model.language_model.layers.59.self_attn.o_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1678
+ "model.language_model.layers.59.self_attn.o_proj.weight_packed": "model-00005-of-00006.safetensors",
1679
  "model.language_model.layers.59.self_attn.o_proj.weight_scale": "model-00005-of-00006.safetensors",
1680
  "model.language_model.layers.59.self_attn.q_norm.weight": "model-00005-of-00006.safetensors",
1681
  "model.language_model.layers.59.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
 
1682
  "model.language_model.layers.59.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
 
1683
  "model.language_model.layers.6.input_layernorm.weight": "model-00005-of-00006.safetensors",
1684
  "model.language_model.layers.6.linear_attn.A_log": "model-00005-of-00006.safetensors",
1685
  "model.language_model.layers.6.linear_attn.conv1d.weight": "model-00005-of-00006.safetensors",
 
1806
  "model.language_model.layers.62.linear_attn.out_proj.input_global_scale": "model-00005-of-00006.safetensors",
1807
  "model.language_model.layers.62.linear_attn.out_proj.weight_global_scale": "model-00005-of-00006.safetensors",
1808
  "model.language_model.layers.62.linear_attn.out_proj.weight_packed": "model-00005-of-00006.safetensors",
1809
+ "model.language_model.layers.62.linear_attn.out_proj.weight_scale": "model-00006-of-00006.safetensors",
1810
+ "model.language_model.layers.62.mlp.down_proj.input_global_scale": "model-00006-of-00006.safetensors",
1811
+ "model.language_model.layers.62.mlp.down_proj.weight_global_scale": "model-00006-of-00006.safetensors",
1812
+ "model.language_model.layers.62.mlp.down_proj.weight_packed": "model-00006-of-00006.safetensors",
1813
+ "model.language_model.layers.62.mlp.down_proj.weight_scale": "model-00006-of-00006.safetensors",
1814
+ "model.language_model.layers.62.mlp.gate_proj.input_global_scale": "model-00006-of-00006.safetensors",
1815
+ "model.language_model.layers.62.mlp.gate_proj.weight_global_scale": "model-00006-of-00006.safetensors",
1816
  "model.language_model.layers.62.mlp.gate_proj.weight_packed": "model-00006-of-00006.safetensors",
1817
  "model.language_model.layers.62.mlp.gate_proj.weight_scale": "model-00006-of-00006.safetensors",
1818
  "model.language_model.layers.62.mlp.up_proj.input_global_scale": "model-00006-of-00006.safetensors",
 
1821
  "model.language_model.layers.62.mlp.up_proj.weight_scale": "model-00006-of-00006.safetensors",
1822
  "model.language_model.layers.62.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
1823
  "model.language_model.layers.63.input_layernorm.weight": "model-00006-of-00006.safetensors",
1824
+ "model.language_model.layers.63.mlp.down_proj.input_global_scale": "model-00006-of-00006.safetensors",
1825
+ "model.language_model.layers.63.mlp.down_proj.weight_global_scale": "model-00006-of-00006.safetensors",
1826
+ "model.language_model.layers.63.mlp.down_proj.weight_packed": "model-00006-of-00006.safetensors",
1827
  "model.language_model.layers.63.mlp.down_proj.weight_scale": "model-00006-of-00006.safetensors",
1828
  "model.language_model.layers.63.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
1829
  "model.language_model.layers.63.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
1830
  "model.language_model.layers.63.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
1831
  "model.language_model.layers.63.self_attn.k_norm.weight": "model-00006-of-00006.safetensors",
1832
  "model.language_model.layers.63.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
1833
+ "model.language_model.layers.63.self_attn.k_proj.weight_scale": "model-00006-of-00006.safetensors",
1834
  "model.language_model.layers.63.self_attn.o_proj.input_global_scale": "model-00006-of-00006.safetensors",
1835
  "model.language_model.layers.63.self_attn.o_proj.weight_global_scale": "model-00006-of-00006.safetensors",
1836
  "model.language_model.layers.63.self_attn.o_proj.weight_packed": "model-00006-of-00006.safetensors",
1837
  "model.language_model.layers.63.self_attn.o_proj.weight_scale": "model-00006-of-00006.safetensors",
1838
  "model.language_model.layers.63.self_attn.q_norm.weight": "model-00006-of-00006.safetensors",
1839
  "model.language_model.layers.63.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
1840
+ "model.language_model.layers.63.self_attn.q_proj.weight_scale": "model-00006-of-00006.safetensors",
1841
  "model.language_model.layers.63.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
1842
+ "model.language_model.layers.63.self_attn.v_proj.weight_scale": "model-00006-of-00006.safetensors",
1843
  "model.language_model.layers.7.input_layernorm.weight": "model-00006-of-00006.safetensors",
1844
  "model.language_model.layers.7.mlp.down_proj.input_global_scale": "model-00006-of-00006.safetensors",
1845
  "model.language_model.layers.7.mlp.down_proj.weight_global_scale": "model-00006-of-00006.safetensors",
 
1856
  "model.language_model.layers.7.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
1857
  "model.language_model.layers.7.self_attn.k_norm.weight": "model-00006-of-00006.safetensors",
1858
  "model.language_model.layers.7.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
 
1859
  "model.language_model.layers.7.self_attn.o_proj.input_global_scale": "model-00006-of-00006.safetensors",
1860
  "model.language_model.layers.7.self_attn.o_proj.weight_global_scale": "model-00006-of-00006.safetensors",
1861
  "model.language_model.layers.7.self_attn.o_proj.weight_packed": "model-00006-of-00006.safetensors",
1862
  "model.language_model.layers.7.self_attn.o_proj.weight_scale": "model-00006-of-00006.safetensors",
1863
  "model.language_model.layers.7.self_attn.q_norm.weight": "model-00006-of-00006.safetensors",
1864
  "model.language_model.layers.7.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
 
1865
  "model.language_model.layers.7.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
 
1866
  "model.language_model.layers.8.input_layernorm.weight": "model-00006-of-00006.safetensors",
1867
  "model.language_model.layers.8.linear_attn.A_log": "model-00006-of-00006.safetensors",
1868
  "model.language_model.layers.8.linear_attn.conv1d.weight": "model-00006-of-00006.safetensors",
 
1965
  "model.visual.blocks.1.norm2.bias": "model-00006-of-00006.safetensors",
1966
  "model.visual.blocks.1.norm2.weight": "model-00006-of-00006.safetensors",
1967
  "model.visual.blocks.10.attn.proj.bias": "model-00006-of-00006.safetensors",
1968
+ "model.visual.blocks.10.attn.proj.input_global_scale": "model-00006-of-00006.safetensors",
1969
+ "model.visual.blocks.10.attn.proj.weight_global_scale": "model-00006-of-00006.safetensors",
1970
+ "model.visual.blocks.10.attn.proj.weight_packed": "model-00006-of-00006.safetensors",
1971
+ "model.visual.blocks.10.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
1972
  "model.visual.blocks.10.attn.qkv.bias": "model-00006-of-00006.safetensors",
1973
+ "model.visual.blocks.10.attn.qkv.input_global_scale": "model-00006-of-00006.safetensors",
1974
+ "model.visual.blocks.10.attn.qkv.weight_global_scale": "model-00006-of-00006.safetensors",
1975
+ "model.visual.blocks.10.attn.qkv.weight_packed": "model-00006-of-00006.safetensors",
1976
  "model.visual.blocks.10.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
1977
  "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
1978
  "model.visual.blocks.10.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
 
1989
  "model.visual.blocks.10.norm2.bias": "model-00006-of-00006.safetensors",
1990
  "model.visual.blocks.10.norm2.weight": "model-00006-of-00006.safetensors",
1991
  "model.visual.blocks.11.attn.proj.bias": "model-00006-of-00006.safetensors",
1992
+ "model.visual.blocks.11.attn.proj.input_global_scale": "model-00006-of-00006.safetensors",
1993
+ "model.visual.blocks.11.attn.proj.weight_global_scale": "model-00006-of-00006.safetensors",
1994
+ "model.visual.blocks.11.attn.proj.weight_packed": "model-00006-of-00006.safetensors",
1995
  "model.visual.blocks.11.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
1996
  "model.visual.blocks.11.attn.qkv.bias": "model-00006-of-00006.safetensors",
1997
+ "model.visual.blocks.11.attn.qkv.input_global_scale": "model-00006-of-00006.safetensors",
1998
+ "model.visual.blocks.11.attn.qkv.weight_global_scale": "model-00006-of-00006.safetensors",
1999
+ "model.visual.blocks.11.attn.qkv.weight_packed": "model-00006-of-00006.safetensors",
2000
  "model.visual.blocks.11.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2001
  "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2002
  "model.visual.blocks.11.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
 
2018
  "model.visual.blocks.12.attn.proj.weight_packed": "model-00006-of-00006.safetensors",
2019
  "model.visual.blocks.12.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
2020
  "model.visual.blocks.12.attn.qkv.bias": "model-00006-of-00006.safetensors",
2021
+ "model.visual.blocks.12.attn.qkv.input_global_scale": "model-00006-of-00006.safetensors",
2022
+ "model.visual.blocks.12.attn.qkv.weight_global_scale": "model-00006-of-00006.safetensors",
2023
+ "model.visual.blocks.12.attn.qkv.weight_packed": "model-00006-of-00006.safetensors",
2024
  "model.visual.blocks.12.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2025
  "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2026
  "model.visual.blocks.12.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
 
2037
  "model.visual.blocks.12.norm2.bias": "model-00006-of-00006.safetensors",
2038
  "model.visual.blocks.12.norm2.weight": "model-00006-of-00006.safetensors",
2039
  "model.visual.blocks.13.attn.proj.bias": "model-00006-of-00006.safetensors",
2040
+ "model.visual.blocks.13.attn.proj.input_global_scale": "model-00006-of-00006.safetensors",
2041
+ "model.visual.blocks.13.attn.proj.weight_global_scale": "model-00006-of-00006.safetensors",
2042
+ "model.visual.blocks.13.attn.proj.weight_packed": "model-00006-of-00006.safetensors",
2043
  "model.visual.blocks.13.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
2044
  "model.visual.blocks.13.attn.qkv.bias": "model-00006-of-00006.safetensors",
2045
  "model.visual.blocks.13.attn.qkv.weight": "model-00006-of-00006.safetensors",
2046
+ "model.visual.blocks.13.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2047
  "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2048
  "model.visual.blocks.13.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
2049
  "model.visual.blocks.13.mlp.linear_fc1.weight_global_scale": "model-00006-of-00006.safetensors",
 
2064
  "model.visual.blocks.14.attn.proj.weight_packed": "model-00006-of-00006.safetensors",
2065
  "model.visual.blocks.14.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
2066
  "model.visual.blocks.14.attn.qkv.bias": "model-00006-of-00006.safetensors",
2067
+ "model.visual.blocks.14.attn.qkv.input_global_scale": "model-00006-of-00006.safetensors",
2068
+ "model.visual.blocks.14.attn.qkv.weight_global_scale": "model-00006-of-00006.safetensors",
2069
+ "model.visual.blocks.14.attn.qkv.weight_packed": "model-00006-of-00006.safetensors",
2070
  "model.visual.blocks.14.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2071
  "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2072
  "model.visual.blocks.14.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
 
2320
  "model.visual.blocks.24.mlp.linear_fc1.weight_packed": "model-00006-of-00006.safetensors",
2321
  "model.visual.blocks.24.mlp.linear_fc1.weight_scale": "model-00006-of-00006.safetensors",
2322
  "model.visual.blocks.24.mlp.linear_fc2.bias": "model-00006-of-00006.safetensors",
2323
+ "model.visual.blocks.24.mlp.linear_fc2.input_global_scale": "model-00006-of-00006.safetensors",
2324
+ "model.visual.blocks.24.mlp.linear_fc2.weight_global_scale": "model-00006-of-00006.safetensors",
2325
+ "model.visual.blocks.24.mlp.linear_fc2.weight_packed": "model-00006-of-00006.safetensors",
2326
+ "model.visual.blocks.24.mlp.linear_fc2.weight_scale": "model-00006-of-00006.safetensors",
2327
  "model.visual.blocks.24.norm1.bias": "model-00006-of-00006.safetensors",
2328
  "model.visual.blocks.24.norm1.weight": "model-00006-of-00006.safetensors",
2329
  "model.visual.blocks.24.norm2.bias": "model-00006-of-00006.safetensors",
 
2399
  "model.visual.blocks.5.attn.qkv.bias": "model-00006-of-00006.safetensors",
2400
  "model.visual.blocks.5.attn.qkv.weight": "model-00006-of-00006.safetensors",
2401
  "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2402
+ "model.visual.blocks.5.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
2403
+ "model.visual.blocks.5.mlp.linear_fc1.weight_global_scale": "model-00006-of-00006.safetensors",
2404
+ "model.visual.blocks.5.mlp.linear_fc1.weight_packed": "model-00006-of-00006.safetensors",
2405
+ "model.visual.blocks.5.mlp.linear_fc1.weight_scale": "model-00006-of-00006.safetensors",
2406
  "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00006-of-00006.safetensors",
2407
  "model.visual.blocks.5.mlp.linear_fc2.input_global_scale": "model-00006-of-00006.safetensors",
2408
  "model.visual.blocks.5.mlp.linear_fc2.weight_global_scale": "model-00006-of-00006.safetensors",
 
2433
  "model.visual.blocks.6.norm2.weight": "model-00006-of-00006.safetensors",
2434
  "model.visual.blocks.7.attn.proj.bias": "model-00006-of-00006.safetensors",
2435
  "model.visual.blocks.7.attn.proj.weight": "model-00006-of-00006.safetensors",
2436
+ "model.visual.blocks.7.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
2437
  "model.visual.blocks.7.attn.qkv.bias": "model-00006-of-00006.safetensors",
2438
+ "model.visual.blocks.7.attn.qkv.input_global_scale": "model-00006-of-00006.safetensors",
2439
+ "model.visual.blocks.7.attn.qkv.weight_global_scale": "model-00006-of-00006.safetensors",
2440
+ "model.visual.blocks.7.attn.qkv.weight_packed": "model-00006-of-00006.safetensors",
2441
  "model.visual.blocks.7.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2442
  "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2443
  "model.visual.blocks.7.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
 
2457
  "model.visual.blocks.8.attn.proj.weight": "model-00006-of-00006.safetensors",
2458
  "model.visual.blocks.8.attn.qkv.bias": "model-00006-of-00006.safetensors",
2459
  "model.visual.blocks.8.attn.qkv.weight": "model-00006-of-00006.safetensors",
2460
+ "model.visual.blocks.8.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",
2461
  "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00006-of-00006.safetensors",
2462
  "model.visual.blocks.8.mlp.linear_fc1.input_global_scale": "model-00006-of-00006.safetensors",
2463
  "model.visual.blocks.8.mlp.linear_fc1.weight_global_scale": "model-00006-of-00006.safetensors",
 
2474
  "model.visual.blocks.8.norm2.weight": "model-00006-of-00006.safetensors",
2475
  "model.visual.blocks.9.attn.proj.bias": "model-00006-of-00006.safetensors",
2476
  "model.visual.blocks.9.attn.proj.weight": "model-00006-of-00006.safetensors",
2477
+ "model.visual.blocks.9.attn.proj.weight_scale": "model-00006-of-00006.safetensors",
2478
  "model.visual.blocks.9.attn.qkv.bias": "model-00006-of-00006.safetensors",
2479
  "model.visual.blocks.9.attn.qkv.weight": "model-00006-of-00006.safetensors",
2480
  "model.visual.blocks.9.attn.qkv.weight_scale": "model-00006-of-00006.safetensors",