rdtand commited on
Commit
bd4f6c5
·
verified ·
1 Parent(s): a0c02b0

PrismaQuant Phase 2 incremental: body AWQ-safe GPTQ+act-round, visual Fisher-allocated (108/111 Linears via per-Linear DP)

Browse files
config.json CHANGED
@@ -2,271 +2,12 @@
2
  "architectures": [
3
  "Qwen3_5MoeForConditionalGeneration"
4
  ],
5
- "dtype": "bfloat16",
6
  "image_token_id": 248056,
7
  "model_type": "qwen3_5_moe",
8
- "quantization_config": {
9
- "config_groups": {
10
- "group_0": {
11
- "format": "nvfp4-pack-quantized",
12
- "input_activations": {
13
- "actorder": null,
14
- "block_structure": null,
15
- "dynamic": "local",
16
- "group_size": 16,
17
- "num_bits": 4,
18
- "observer": "static_minmax",
19
- "observer_kwargs": {},
20
- "scale_dtype": "torch.float8_e4m3fn",
21
- "strategy": "tensor_group",
22
- "symmetric": true,
23
- "type": "float",
24
- "zp_dtype": null
25
- },
26
- "output_activations": null,
27
- "targets": [
28
- "Linear"
29
- ],
30
- "weights": {
31
- "actorder": null,
32
- "block_structure": null,
33
- "dynamic": false,
34
- "group_size": 16,
35
- "num_bits": 4,
36
- "observer": "memoryless_minmax",
37
- "observer_kwargs": {},
38
- "scale_dtype": "torch.float8_e4m3fn",
39
- "strategy": "tensor_group",
40
- "symmetric": true,
41
- "type": "float",
42
- "zp_dtype": null
43
- }
44
- }
45
- },
46
- "format": "nvfp4-pack-quantized",
47
- "global_compression_ratio": null,
48
- "ignore": [
49
- "model.visual.blocks.0.attn.qkv",
50
- "model.visual.blocks.0.attn.proj",
51
- "model.visual.blocks.0.mlp.linear_fc1",
52
- "model.visual.blocks.0.mlp.linear_fc2",
53
- "model.visual.blocks.1.attn.qkv",
54
- "model.visual.blocks.1.attn.proj",
55
- "model.visual.blocks.1.mlp.linear_fc1",
56
- "model.visual.blocks.1.mlp.linear_fc2",
57
- "model.visual.blocks.2.attn.qkv",
58
- "model.visual.blocks.2.attn.proj",
59
- "model.visual.blocks.2.mlp.linear_fc1",
60
- "model.visual.blocks.2.mlp.linear_fc2",
61
- "model.visual.blocks.3.attn.qkv",
62
- "model.visual.blocks.3.attn.proj",
63
- "model.visual.blocks.3.mlp.linear_fc1",
64
- "model.visual.blocks.3.mlp.linear_fc2",
65
- "model.visual.blocks.4.attn.qkv",
66
- "model.visual.blocks.4.attn.proj",
67
- "model.visual.blocks.4.mlp.linear_fc1",
68
- "model.visual.blocks.4.mlp.linear_fc2",
69
- "model.visual.blocks.5.attn.qkv",
70
- "model.visual.blocks.5.attn.proj",
71
- "model.visual.blocks.5.mlp.linear_fc1",
72
- "model.visual.blocks.5.mlp.linear_fc2",
73
- "model.visual.blocks.6.attn.qkv",
74
- "model.visual.blocks.6.attn.proj",
75
- "model.visual.blocks.6.mlp.linear_fc1",
76
- "model.visual.blocks.6.mlp.linear_fc2",
77
- "model.visual.blocks.7.attn.qkv",
78
- "model.visual.blocks.7.attn.proj",
79
- "model.visual.blocks.7.mlp.linear_fc1",
80
- "model.visual.blocks.7.mlp.linear_fc2",
81
- "model.visual.blocks.8.attn.qkv",
82
- "model.visual.blocks.8.attn.proj",
83
- "model.visual.blocks.8.mlp.linear_fc1",
84
- "model.visual.blocks.8.mlp.linear_fc2",
85
- "model.visual.blocks.9.attn.qkv",
86
- "model.visual.blocks.9.attn.proj",
87
- "model.visual.blocks.9.mlp.linear_fc1",
88
- "model.visual.blocks.9.mlp.linear_fc2",
89
- "model.visual.blocks.10.attn.qkv",
90
- "model.visual.blocks.10.attn.proj",
91
- "model.visual.blocks.10.mlp.linear_fc1",
92
- "model.visual.blocks.10.mlp.linear_fc2",
93
- "model.visual.blocks.11.attn.qkv",
94
- "model.visual.blocks.11.attn.proj",
95
- "model.visual.blocks.11.mlp.linear_fc1",
96
- "model.visual.blocks.11.mlp.linear_fc2",
97
- "model.visual.blocks.12.attn.qkv",
98
- "model.visual.blocks.12.attn.proj",
99
- "model.visual.blocks.12.mlp.linear_fc1",
100
- "model.visual.blocks.12.mlp.linear_fc2",
101
- "model.visual.blocks.13.attn.qkv",
102
- "model.visual.blocks.13.attn.proj",
103
- "model.visual.blocks.13.mlp.linear_fc1",
104
- "model.visual.blocks.13.mlp.linear_fc2",
105
- "model.visual.blocks.14.attn.qkv",
106
- "model.visual.blocks.14.attn.proj",
107
- "model.visual.blocks.14.mlp.linear_fc1",
108
- "model.visual.blocks.14.mlp.linear_fc2",
109
- "model.visual.blocks.15.attn.qkv",
110
- "model.visual.blocks.15.attn.proj",
111
- "model.visual.blocks.15.mlp.linear_fc1",
112
- "model.visual.blocks.15.mlp.linear_fc2",
113
- "model.visual.blocks.16.attn.qkv",
114
- "model.visual.blocks.16.attn.proj",
115
- "model.visual.blocks.16.mlp.linear_fc1",
116
- "model.visual.blocks.16.mlp.linear_fc2",
117
- "model.visual.blocks.17.attn.qkv",
118
- "model.visual.blocks.17.attn.proj",
119
- "model.visual.blocks.17.mlp.linear_fc1",
120
- "model.visual.blocks.17.mlp.linear_fc2",
121
- "model.visual.blocks.18.attn.qkv",
122
- "model.visual.blocks.18.attn.proj",
123
- "model.visual.blocks.18.mlp.linear_fc1",
124
- "model.visual.blocks.18.mlp.linear_fc2",
125
- "model.visual.blocks.19.attn.qkv",
126
- "model.visual.blocks.19.attn.proj",
127
- "model.visual.blocks.19.mlp.linear_fc1",
128
- "model.visual.blocks.19.mlp.linear_fc2",
129
- "model.visual.blocks.20.attn.qkv",
130
- "model.visual.blocks.20.attn.proj",
131
- "model.visual.blocks.20.mlp.linear_fc1",
132
- "model.visual.blocks.20.mlp.linear_fc2",
133
- "model.visual.blocks.21.attn.qkv",
134
- "model.visual.blocks.21.attn.proj",
135
- "model.visual.blocks.21.mlp.linear_fc1",
136
- "model.visual.blocks.21.mlp.linear_fc2",
137
- "model.visual.blocks.22.attn.qkv",
138
- "model.visual.blocks.22.attn.proj",
139
- "model.visual.blocks.22.mlp.linear_fc1",
140
- "model.visual.blocks.22.mlp.linear_fc2",
141
- "model.visual.blocks.23.attn.qkv",
142
- "model.visual.blocks.23.attn.proj",
143
- "model.visual.blocks.23.mlp.linear_fc1",
144
- "model.visual.blocks.23.mlp.linear_fc2",
145
- "model.visual.blocks.24.attn.qkv",
146
- "model.visual.blocks.24.attn.proj",
147
- "model.visual.blocks.24.mlp.linear_fc1",
148
- "model.visual.blocks.24.mlp.linear_fc2",
149
- "model.visual.blocks.25.attn.qkv",
150
- "model.visual.blocks.25.attn.proj",
151
- "model.visual.blocks.25.mlp.linear_fc1",
152
- "model.visual.blocks.25.mlp.linear_fc2",
153
- "model.visual.blocks.26.attn.qkv",
154
- "model.visual.blocks.26.attn.proj",
155
- "model.visual.blocks.26.mlp.linear_fc1",
156
- "model.visual.blocks.26.mlp.linear_fc2",
157
- "model.visual.merger.linear_fc1",
158
- "model.visual.merger.linear_fc2",
159
- "model.language_model.layers.0.mlp.gate",
160
- "model.language_model.layers.0.mlp.shared_expert_gate",
161
- "model.language_model.layers.1.mlp.gate",
162
- "model.language_model.layers.1.mlp.shared_expert_gate",
163
- "model.language_model.layers.2.mlp.gate",
164
- "model.language_model.layers.2.mlp.shared_expert_gate",
165
- "model.language_model.layers.3.mlp.gate",
166
- "model.language_model.layers.3.mlp.shared_expert_gate",
167
- "model.language_model.layers.4.mlp.gate",
168
- "model.language_model.layers.4.mlp.shared_expert_gate",
169
- "model.language_model.layers.5.mlp.gate",
170
- "model.language_model.layers.5.mlp.shared_expert_gate",
171
- "model.language_model.layers.6.mlp.gate",
172
- "model.language_model.layers.6.mlp.shared_expert_gate",
173
- "model.language_model.layers.7.mlp.gate",
174
- "model.language_model.layers.7.mlp.shared_expert_gate",
175
- "model.language_model.layers.8.mlp.gate",
176
- "model.language_model.layers.8.mlp.shared_expert_gate",
177
- "model.language_model.layers.9.mlp.gate",
178
- "model.language_model.layers.9.mlp.shared_expert_gate",
179
- "model.language_model.layers.10.mlp.gate",
180
- "model.language_model.layers.10.mlp.shared_expert_gate",
181
- "model.language_model.layers.11.mlp.gate",
182
- "model.language_model.layers.11.mlp.shared_expert_gate",
183
- "model.language_model.layers.12.mlp.gate",
184
- "model.language_model.layers.12.mlp.shared_expert_gate",
185
- "model.language_model.layers.13.mlp.gate",
186
- "model.language_model.layers.13.mlp.shared_expert_gate",
187
- "model.language_model.layers.14.mlp.gate",
188
- "model.language_model.layers.14.mlp.shared_expert_gate",
189
- "model.language_model.layers.15.mlp.gate",
190
- "model.language_model.layers.15.mlp.shared_expert_gate",
191
- "model.language_model.layers.16.mlp.gate",
192
- "model.language_model.layers.16.mlp.shared_expert_gate",
193
- "model.language_model.layers.17.mlp.gate",
194
- "model.language_model.layers.17.mlp.shared_expert_gate",
195
- "model.language_model.layers.18.mlp.gate",
196
- "model.language_model.layers.18.mlp.shared_expert_gate",
197
- "model.language_model.layers.19.mlp.gate",
198
- "model.language_model.layers.19.mlp.shared_expert_gate",
199
- "model.language_model.layers.20.mlp.gate",
200
- "model.language_model.layers.20.mlp.shared_expert_gate",
201
- "model.language_model.layers.21.mlp.gate",
202
- "model.language_model.layers.21.mlp.shared_expert_gate",
203
- "model.language_model.layers.22.mlp.gate",
204
- "model.language_model.layers.22.mlp.shared_expert_gate",
205
- "model.language_model.layers.23.mlp.gate",
206
- "model.language_model.layers.23.mlp.shared_expert_gate",
207
- "model.language_model.layers.24.mlp.gate",
208
- "model.language_model.layers.24.mlp.shared_expert_gate",
209
- "model.language_model.layers.25.mlp.gate",
210
- "model.language_model.layers.25.mlp.shared_expert_gate",
211
- "model.language_model.layers.26.mlp.gate",
212
- "model.language_model.layers.26.mlp.shared_expert_gate",
213
- "model.language_model.layers.27.mlp.gate",
214
- "model.language_model.layers.27.mlp.shared_expert_gate",
215
- "model.language_model.layers.28.mlp.gate",
216
- "model.language_model.layers.28.mlp.shared_expert_gate",
217
- "model.language_model.layers.29.mlp.gate",
218
- "model.language_model.layers.29.mlp.shared_expert_gate",
219
- "model.language_model.layers.30.mlp.gate",
220
- "model.language_model.layers.30.mlp.shared_expert_gate",
221
- "model.language_model.layers.31.mlp.gate",
222
- "model.language_model.layers.31.mlp.shared_expert_gate",
223
- "model.language_model.layers.32.mlp.gate",
224
- "model.language_model.layers.32.mlp.shared_expert_gate",
225
- "model.language_model.layers.33.mlp.gate",
226
- "model.language_model.layers.33.mlp.shared_expert_gate",
227
- "model.language_model.layers.34.mlp.gate",
228
- "model.language_model.layers.34.mlp.shared_expert_gate",
229
- "model.language_model.layers.35.mlp.gate",
230
- "model.language_model.layers.35.mlp.shared_expert_gate",
231
- "model.language_model.layers.36.mlp.gate",
232
- "model.language_model.layers.36.mlp.shared_expert_gate",
233
- "model.language_model.layers.37.mlp.gate",
234
- "model.language_model.layers.37.mlp.shared_expert_gate",
235
- "model.language_model.layers.38.mlp.gate",
236
- "model.language_model.layers.38.mlp.shared_expert_gate",
237
- "model.language_model.layers.39.mlp.gate",
238
- "model.language_model.layers.39.mlp.shared_expert_gate",
239
- "model.language_model.layers.40.mlp.gate",
240
- "model.language_model.layers.40.mlp.shared_expert_gate",
241
- "model.language_model.layers.41.mlp.gate",
242
- "model.language_model.layers.41.mlp.shared_expert_gate",
243
- "model.language_model.layers.42.mlp.gate",
244
- "model.language_model.layers.42.mlp.shared_expert_gate",
245
- "model.language_model.layers.43.mlp.gate",
246
- "model.language_model.layers.43.mlp.shared_expert_gate",
247
- "model.language_model.layers.44.mlp.gate",
248
- "model.language_model.layers.44.mlp.shared_expert_gate",
249
- "model.language_model.layers.45.mlp.gate",
250
- "model.language_model.layers.45.mlp.shared_expert_gate",
251
- "model.language_model.layers.46.mlp.gate",
252
- "model.language_model.layers.46.mlp.shared_expert_gate",
253
- "model.language_model.layers.47.mlp.gate",
254
- "model.language_model.layers.47.mlp.shared_expert_gate",
255
- "lm_head",
256
- "re:^mtp.*"
257
- ],
258
- "kv_cache_scheme": null,
259
- "quant_method": "compressed-tensors",
260
- "quantization_status": "compressed",
261
- "sparsity_config": {},
262
- "transform_config": {},
263
- "version": "0.14.1.dev30+gbf783b1"
264
- },
265
  "text_config": {
266
  "attention_bias": false,
267
  "attention_dropout": 0.0,
268
  "attn_output_gate": true,
269
- "bos_token_id": null,
270
  "dtype": "bfloat16",
271
  "eos_token_id": 248044,
272
  "full_attention_interval": 4,
@@ -329,7 +70,6 @@
329
  "linear_num_key_heads": 16,
330
  "linear_num_value_heads": 64,
331
  "linear_value_head_dim": 128,
332
- "mamba_ssm_dtype": "float32",
333
  "max_position_embeddings": 262144,
334
  "mlp_only_layers": [],
335
  "model_type": "qwen3_5_moe_text",
@@ -341,10 +81,12 @@
341
  "num_experts_per_tok": 8,
342
  "num_hidden_layers": 48,
343
  "num_key_value_heads": 2,
344
- "output_router_logits": false,
345
- "pad_token_id": null,
346
- "partial_rotary_factor": 0.25,
347
  "rms_norm_eps": 1e-06,
 
 
 
 
 
348
  "rope_parameters": {
349
  "mrope_interleaved": true,
350
  "mrope_section": [
@@ -352,23 +94,17 @@
352
  11,
353
  10
354
  ],
355
- "partial_rotary_factor": 0.25,
356
  "rope_theta": 10000000,
357
- "rope_type": "default"
358
- },
359
- "router_aux_loss_coef": 0.001,
360
- "shared_expert_intermediate_size": 1024,
361
- "tie_word_embeddings": false,
362
- "use_cache": true,
363
- "vocab_size": 248320
364
  },
365
  "tie_word_embeddings": false,
366
- "transformers_version": "5.6.0.dev0",
367
  "video_token_id": 248057,
368
  "vision_config": {
369
  "deepstack_visual_indexes": [],
370
  "depth": 27,
371
- "dtype": "bfloat16",
372
  "hidden_act": "gelu_pytorch_tanh",
373
  "hidden_size": 1152,
374
  "in_channels": 3,
@@ -383,5 +119,893 @@
383
  "temporal_patch_size": 2
384
  },
385
  "vision_end_token_id": 248054,
386
- "vision_start_token_id": 248053
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  }
 
2
  "architectures": [
3
  "Qwen3_5MoeForConditionalGeneration"
4
  ],
 
5
  "image_token_id": 248056,
6
  "model_type": "qwen3_5_moe",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  "text_config": {
8
  "attention_bias": false,
9
  "attention_dropout": 0.0,
10
  "attn_output_gate": true,
 
11
  "dtype": "bfloat16",
12
  "eos_token_id": 248044,
13
  "full_attention_interval": 4,
 
70
  "linear_num_key_heads": 16,
71
  "linear_num_value_heads": 64,
72
  "linear_value_head_dim": 128,
 
73
  "max_position_embeddings": 262144,
74
  "mlp_only_layers": [],
75
  "model_type": "qwen3_5_moe_text",
 
81
  "num_experts_per_tok": 8,
82
  "num_hidden_layers": 48,
83
  "num_key_value_heads": 2,
 
 
 
84
  "rms_norm_eps": 1e-06,
85
+ "router_aux_loss_coef": 0.001,
86
+ "shared_expert_intermediate_size": 1024,
87
+ "use_cache": true,
88
+ "vocab_size": 248320,
89
+ "mamba_ssm_dtype": "float32",
90
  "rope_parameters": {
91
  "mrope_interleaved": true,
92
  "mrope_section": [
 
94
  11,
95
  10
96
  ],
97
+ "rope_type": "default",
98
  "rope_theta": 10000000,
99
+ "partial_rotary_factor": 0.25
100
+ }
 
 
 
 
 
101
  },
102
  "tie_word_embeddings": false,
103
+ "transformers_version": "4.57.0.dev0",
104
  "video_token_id": 248057,
105
  "vision_config": {
106
  "deepstack_visual_indexes": [],
107
  "depth": 27,
 
108
  "hidden_act": "gelu_pytorch_tanh",
109
  "hidden_size": 1152,
110
  "in_channels": 3,
 
119
  "temporal_patch_size": 2
120
  },
121
  "vision_end_token_id": 248054,
122
+ "vision_start_token_id": 248053,
123
+ "quantization_config": {
124
+ "quant_method": "compressed-tensors",
125
+ "format": "mixed-precision",
126
+ "config_groups": {
127
+ "group_0": {
128
+ "format": "mxfp8-quantized",
129
+ "weights": {
130
+ "num_bits": 8,
131
+ "type": "float",
132
+ "strategy": "group",
133
+ "group_size": 32,
134
+ "symmetric": true,
135
+ "dynamic": false,
136
+ "scale_dtype": "torch.uint8",
137
+ "zp_dtype": "torch.uint8",
138
+ "observer": "memoryless_minmax"
139
+ },
140
+ "input_activations": {
141
+ "num_bits": 8,
142
+ "type": "float",
143
+ "strategy": "group",
144
+ "group_size": 32,
145
+ "symmetric": true,
146
+ "dynamic": true,
147
+ "scale_dtype": "torch.uint8",
148
+ "zp_dtype": "torch.uint8"
149
+ },
150
+ "targets": [
151
+ "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_qkv$",
152
+ "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_qkvz$",
153
+ "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]in_proj_z$",
154
+ "re:^language_model[.]model[.]layers[.]30[.]linear_attn[.]in_proj_qkv$",
155
+ "re:^language_model[.]model[.]layers[.]30[.]linear_attn[.]in_proj_qkvz$",
156
+ "re:^language_model[.]model[.]layers[.]30[.]linear_attn[.]in_proj_z$",
157
+ "re:^language_model[.]model[.]layers[.]34[.]linear_attn[.]in_proj_qkv$",
158
+ "re:^language_model[.]model[.]layers[.]34[.]linear_attn[.]in_proj_qkvz$",
159
+ "re:^language_model[.]model[.]layers[.]34[.]linear_attn[.]in_proj_z$",
160
+ "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_qkv$",
161
+ "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_qkvz$",
162
+ "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]in_proj_z$",
163
+ "re:^language_model[.]model[.]layers[.]37[.]linear_attn[.]in_proj_qkv$",
164
+ "re:^language_model[.]model[.]layers[.]37[.]linear_attn[.]in_proj_qkvz$",
165
+ "re:^language_model[.]model[.]layers[.]37[.]linear_attn[.]in_proj_z$",
166
+ "re:^language_model[.]model[.]layers[.]38[.]linear_attn[.]in_proj_qkv$",
167
+ "re:^language_model[.]model[.]layers[.]38[.]linear_attn[.]in_proj_qkvz$",
168
+ "re:^language_model[.]model[.]layers[.]38[.]linear_attn[.]in_proj_z$"
169
+ ]
170
+ },
171
+ "group_1": {
172
+ "format": "nvfp4-pack-quantized",
173
+ "weights": {
174
+ "num_bits": 4,
175
+ "type": "float",
176
+ "strategy": "tensor_group",
177
+ "group_size": 16,
178
+ "symmetric": true,
179
+ "dynamic": false,
180
+ "scale_dtype": "torch.float8_e4m3fn",
181
+ "zp_dtype": "torch.float8_e4m3fn",
182
+ "observer": "memoryless_minmax"
183
+ },
184
+ "input_activations": {
185
+ "num_bits": 4,
186
+ "type": "float",
187
+ "strategy": "tensor_group",
188
+ "group_size": 16,
189
+ "symmetric": true,
190
+ "dynamic": "local",
191
+ "observer": "static_minmax",
192
+ "scale_dtype": "torch.float8_e4m3fn",
193
+ "zp_dtype": "torch.float8_e4m3fn"
194
+ },
195
+ "targets": [
196
+ "re:^language_model[.]lm_head$",
197
+ "re:^language_model[.]model[.]layers[.]0[.]linear_attn[.]in_proj_qkv$",
198
+ "re:^language_model[.]model[.]layers[.]0[.]linear_attn[.]in_proj_qkvz$",
199
+ "re:^language_model[.]model[.]layers[.]0[.]linear_attn[.]in_proj_z$",
200
+ "re:^language_model[.]model[.]layers[.]0[.]linear_attn[.]out_proj$",
201
+ "re:^language_model[.]model[.]layers[.]0[.]mlp[.]experts[.]down_proj$",
202
+ "re:^language_model[.]model[.]layers[.]0[.]mlp[.]experts[.]gate_up_proj$",
203
+ "re:^language_model[.]model[.]layers[.]1[.]linear_attn[.]in_proj_qkv$",
204
+ "re:^language_model[.]model[.]layers[.]1[.]linear_attn[.]in_proj_qkvz$",
205
+ "re:^language_model[.]model[.]layers[.]1[.]linear_attn[.]in_proj_z$",
206
+ "re:^language_model[.]model[.]layers[.]1[.]linear_attn[.]out_proj$",
207
+ "re:^language_model[.]model[.]layers[.]1[.]mlp[.]experts[.]down_proj$",
208
+ "re:^language_model[.]model[.]layers[.]1[.]mlp[.]experts[.]gate_up_proj$",
209
+ "re:^language_model[.]model[.]layers[.]10[.]linear_attn[.]in_proj_qkv$",
210
+ "re:^language_model[.]model[.]layers[.]10[.]linear_attn[.]in_proj_qkvz$",
211
+ "re:^language_model[.]model[.]layers[.]10[.]linear_attn[.]in_proj_z$",
212
+ "re:^language_model[.]model[.]layers[.]10[.]linear_attn[.]out_proj$",
213
+ "re:^language_model[.]model[.]layers[.]10[.]mlp[.]experts[.]down_proj$",
214
+ "re:^language_model[.]model[.]layers[.]10[.]mlp[.]experts[.]gate_up_proj$",
215
+ "re:^language_model[.]model[.]layers[.]11[.]mlp[.]experts[.]down_proj$",
216
+ "re:^language_model[.]model[.]layers[.]11[.]mlp[.]experts[.]gate_up_proj$",
217
+ "re:^language_model[.]model[.]layers[.]11[.]self_attn[.]o_proj$",
218
+ "re:^language_model[.]model[.]layers[.]12[.]linear_attn[.]in_proj_qkv$",
219
+ "re:^language_model[.]model[.]layers[.]12[.]linear_attn[.]in_proj_qkvz$",
220
+ "re:^language_model[.]model[.]layers[.]12[.]linear_attn[.]in_proj_z$",
221
+ "re:^language_model[.]model[.]layers[.]12[.]linear_attn[.]out_proj$",
222
+ "re:^language_model[.]model[.]layers[.]12[.]mlp[.]experts[.]down_proj$",
223
+ "re:^language_model[.]model[.]layers[.]12[.]mlp[.]experts[.]gate_up_proj$",
224
+ "re:^language_model[.]model[.]layers[.]13[.]linear_attn[.]in_proj_qkv$",
225
+ "re:^language_model[.]model[.]layers[.]13[.]linear_attn[.]in_proj_qkvz$",
226
+ "re:^language_model[.]model[.]layers[.]13[.]linear_attn[.]in_proj_z$",
227
+ "re:^language_model[.]model[.]layers[.]13[.]linear_attn[.]out_proj$",
228
+ "re:^language_model[.]model[.]layers[.]13[.]mlp[.]experts[.]down_proj$",
229
+ "re:^language_model[.]model[.]layers[.]13[.]mlp[.]experts[.]gate_up_proj$",
230
+ "re:^language_model[.]model[.]layers[.]14[.]linear_attn[.]in_proj_qkv$",
231
+ "re:^language_model[.]model[.]layers[.]14[.]linear_attn[.]in_proj_qkvz$",
232
+ "re:^language_model[.]model[.]layers[.]14[.]linear_attn[.]in_proj_z$",
233
+ "re:^language_model[.]model[.]layers[.]14[.]linear_attn[.]out_proj$",
234
+ "re:^language_model[.]model[.]layers[.]14[.]mlp[.]experts[.]down_proj$",
235
+ "re:^language_model[.]model[.]layers[.]14[.]mlp[.]experts[.]gate_up_proj$",
236
+ "re:^language_model[.]model[.]layers[.]15[.]mlp[.]experts[.]down_proj$",
237
+ "re:^language_model[.]model[.]layers[.]15[.]mlp[.]experts[.]gate_up_proj$",
238
+ "re:^language_model[.]model[.]layers[.]15[.]self_attn[.]o_proj$",
239
+ "re:^language_model[.]model[.]layers[.]16[.]linear_attn[.]in_proj_qkv$",
240
+ "re:^language_model[.]model[.]layers[.]16[.]linear_attn[.]in_proj_qkvz$",
241
+ "re:^language_model[.]model[.]layers[.]16[.]linear_attn[.]in_proj_z$",
242
+ "re:^language_model[.]model[.]layers[.]16[.]linear_attn[.]out_proj$",
243
+ "re:^language_model[.]model[.]layers[.]16[.]mlp[.]experts[.]down_proj$",
244
+ "re:^language_model[.]model[.]layers[.]16[.]mlp[.]experts[.]gate_up_proj$",
245
+ "re:^language_model[.]model[.]layers[.]17[.]linear_attn[.]in_proj_qkv$",
246
+ "re:^language_model[.]model[.]layers[.]17[.]linear_attn[.]in_proj_qkvz$",
247
+ "re:^language_model[.]model[.]layers[.]17[.]linear_attn[.]in_proj_z$",
248
+ "re:^language_model[.]model[.]layers[.]17[.]linear_attn[.]out_proj$",
249
+ "re:^language_model[.]model[.]layers[.]17[.]mlp[.]experts[.]down_proj$",
250
+ "re:^language_model[.]model[.]layers[.]17[.]mlp[.]experts[.]gate_up_proj$",
251
+ "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_qkv$",
252
+ "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_qkvz$",
253
+ "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]in_proj_z$",
254
+ "re:^language_model[.]model[.]layers[.]18[.]linear_attn[.]out_proj$",
255
+ "re:^language_model[.]model[.]layers[.]18[.]mlp[.]experts[.]down_proj$",
256
+ "re:^language_model[.]model[.]layers[.]18[.]mlp[.]experts[.]gate_up_proj$",
257
+ "re:^language_model[.]model[.]layers[.]19[.]mlp[.]experts[.]down_proj$",
258
+ "re:^language_model[.]model[.]layers[.]19[.]mlp[.]experts[.]gate_up_proj$",
259
+ "re:^language_model[.]model[.]layers[.]19[.]self_attn[.]o_proj$",
260
+ "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_qkv$",
261
+ "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_qkvz$",
262
+ "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]in_proj_z$",
263
+ "re:^language_model[.]model[.]layers[.]2[.]linear_attn[.]out_proj$",
264
+ "re:^language_model[.]model[.]layers[.]2[.]mlp[.]experts[.]down_proj$",
265
+ "re:^language_model[.]model[.]layers[.]2[.]mlp[.]experts[.]gate_up_proj$",
266
+ "re:^language_model[.]model[.]layers[.]20[.]linear_attn[.]out_proj$",
267
+ "re:^language_model[.]model[.]layers[.]20[.]mlp[.]experts[.]down_proj$",
268
+ "re:^language_model[.]model[.]layers[.]20[.]mlp[.]experts[.]gate_up_proj$",
269
+ "re:^language_model[.]model[.]layers[.]21[.]linear_attn[.]out_proj$",
270
+ "re:^language_model[.]model[.]layers[.]21[.]mlp[.]experts[.]down_proj$",
271
+ "re:^language_model[.]model[.]layers[.]21[.]mlp[.]experts[.]gate_up_proj$",
272
+ "re:^language_model[.]model[.]layers[.]22[.]mlp[.]experts[.]down_proj$",
273
+ "re:^language_model[.]model[.]layers[.]22[.]mlp[.]experts[.]gate_up_proj$",
274
+ "re:^language_model[.]model[.]layers[.]23[.]mlp[.]experts[.]down_proj$",
275
+ "re:^language_model[.]model[.]layers[.]23[.]mlp[.]experts[.]gate_up_proj$",
276
+ "re:^language_model[.]model[.]layers[.]24[.]linear_attn[.]out_proj$",
277
+ "re:^language_model[.]model[.]layers[.]24[.]mlp[.]experts[.]down_proj$",
278
+ "re:^language_model[.]model[.]layers[.]24[.]mlp[.]experts[.]gate_up_proj$",
279
+ "re:^language_model[.]model[.]layers[.]25[.]linear_attn[.]out_proj$",
280
+ "re:^language_model[.]model[.]layers[.]25[.]mlp[.]experts[.]down_proj$",
281
+ "re:^language_model[.]model[.]layers[.]25[.]mlp[.]experts[.]gate_up_proj$",
282
+ "re:^language_model[.]model[.]layers[.]26[.]mlp[.]experts[.]down_proj$",
283
+ "re:^language_model[.]model[.]layers[.]26[.]mlp[.]experts[.]gate_up_proj$",
284
+ "re:^language_model[.]model[.]layers[.]27[.]mlp[.]experts[.]down_proj$",
285
+ "re:^language_model[.]model[.]layers[.]27[.]mlp[.]experts[.]gate_up_proj$",
286
+ "re:^language_model[.]model[.]layers[.]28[.]mlp[.]experts[.]down_proj$",
287
+ "re:^language_model[.]model[.]layers[.]28[.]mlp[.]experts[.]gate_up_proj$",
288
+ "re:^language_model[.]model[.]layers[.]29[.]linear_attn[.]out_proj$",
289
+ "re:^language_model[.]model[.]layers[.]29[.]mlp[.]experts[.]down_proj$",
290
+ "re:^language_model[.]model[.]layers[.]29[.]mlp[.]experts[.]gate_up_proj$",
291
+ "re:^language_model[.]model[.]layers[.]3[.]mlp[.]experts[.]down_proj$",
292
+ "re:^language_model[.]model[.]layers[.]3[.]mlp[.]experts[.]gate_up_proj$",
293
+ "re:^language_model[.]model[.]layers[.]3[.]self_attn[.]o_proj$",
294
+ "re:^language_model[.]model[.]layers[.]30[.]linear_attn[.]out_proj$",
295
+ "re:^language_model[.]model[.]layers[.]30[.]mlp[.]experts[.]down_proj$",
296
+ "re:^language_model[.]model[.]layers[.]30[.]mlp[.]experts[.]gate_up_proj$",
297
+ "re:^language_model[.]model[.]layers[.]31[.]mlp[.]experts[.]down_proj$",
298
+ "re:^language_model[.]model[.]layers[.]31[.]mlp[.]experts[.]gate_up_proj$",
299
+ "re:^language_model[.]model[.]layers[.]32[.]linear_attn[.]in_proj_qkv$",
300
+ "re:^language_model[.]model[.]layers[.]32[.]linear_attn[.]in_proj_qkvz$",
301
+ "re:^language_model[.]model[.]layers[.]32[.]linear_attn[.]in_proj_z$",
302
+ "re:^language_model[.]model[.]layers[.]32[.]linear_attn[.]out_proj$",
303
+ "re:^language_model[.]model[.]layers[.]32[.]mlp[.]experts[.]down_proj$",
304
+ "re:^language_model[.]model[.]layers[.]32[.]mlp[.]experts[.]gate_up_proj$",
305
+ "re:^language_model[.]model[.]layers[.]33[.]linear_attn[.]in_proj_qkv$",
306
+ "re:^language_model[.]model[.]layers[.]33[.]linear_attn[.]in_proj_qkvz$",
307
+ "re:^language_model[.]model[.]layers[.]33[.]linear_attn[.]in_proj_z$",
308
+ "re:^language_model[.]model[.]layers[.]33[.]linear_attn[.]out_proj$",
309
+ "re:^language_model[.]model[.]layers[.]33[.]mlp[.]experts[.]down_proj$",
310
+ "re:^language_model[.]model[.]layers[.]33[.]mlp[.]experts[.]gate_up_proj$",
311
+ "re:^language_model[.]model[.]layers[.]34[.]linear_attn[.]out_proj$",
312
+ "re:^language_model[.]model[.]layers[.]34[.]mlp[.]experts[.]down_proj$",
313
+ "re:^language_model[.]model[.]layers[.]34[.]mlp[.]experts[.]gate_up_proj$",
314
+ "re:^language_model[.]model[.]layers[.]35[.]mlp[.]experts[.]down_proj$",
315
+ "re:^language_model[.]model[.]layers[.]35[.]mlp[.]experts[.]gate_up_proj$",
316
+ "re:^language_model[.]model[.]layers[.]36[.]linear_attn[.]out_proj$",
317
+ "re:^language_model[.]model[.]layers[.]36[.]mlp[.]experts[.]down_proj$",
318
+ "re:^language_model[.]model[.]layers[.]36[.]mlp[.]experts[.]gate_up_proj$",
319
+ "re:^language_model[.]model[.]layers[.]37[.]linear_attn[.]out_proj$",
320
+ "re:^language_model[.]model[.]layers[.]37[.]mlp[.]experts[.]down_proj$",
321
+ "re:^language_model[.]model[.]layers[.]37[.]mlp[.]experts[.]gate_up_proj$",
322
+ "re:^language_model[.]model[.]layers[.]38[.]linear_attn[.]out_proj$",
323
+ "re:^language_model[.]model[.]layers[.]38[.]mlp[.]experts[.]down_proj$",
324
+ "re:^language_model[.]model[.]layers[.]38[.]mlp[.]experts[.]gate_up_proj$",
325
+ "re:^language_model[.]model[.]layers[.]39[.]mlp[.]experts[.]down_proj$",
326
+ "re:^language_model[.]model[.]layers[.]39[.]mlp[.]experts[.]gate_up_proj$",
327
+ "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_qkv$",
328
+ "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_qkvz$",
329
+ "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]in_proj_z$",
330
+ "re:^language_model[.]model[.]layers[.]4[.]linear_attn[.]out_proj$",
331
+ "re:^language_model[.]model[.]layers[.]4[.]mlp[.]experts[.]down_proj$",
332
+ "re:^language_model[.]model[.]layers[.]4[.]mlp[.]experts[.]gate_up_proj$",
333
+ "re:^language_model[.]model[.]layers[.]40[.]linear_attn[.]out_proj$",
334
+ "re:^language_model[.]model[.]layers[.]40[.]mlp[.]experts[.]down_proj$",
335
+ "re:^language_model[.]model[.]layers[.]40[.]mlp[.]experts[.]gate_up_proj$",
336
+ "re:^language_model[.]model[.]layers[.]41[.]linear_attn[.]out_proj$",
337
+ "re:^language_model[.]model[.]layers[.]41[.]mlp[.]experts[.]down_proj$",
338
+ "re:^language_model[.]model[.]layers[.]41[.]mlp[.]experts[.]gate_up_proj$",
339
+ "re:^language_model[.]model[.]layers[.]42[.]linear_attn[.]out_proj$",
340
+ "re:^language_model[.]model[.]layers[.]42[.]mlp[.]experts[.]down_proj$",
341
+ "re:^language_model[.]model[.]layers[.]42[.]mlp[.]experts[.]gate_up_proj$",
342
+ "re:^language_model[.]model[.]layers[.]43[.]mlp[.]experts[.]down_proj$",
343
+ "re:^language_model[.]model[.]layers[.]43[.]mlp[.]experts[.]gate_up_proj$",
344
+ "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_qkv$",
345
+ "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_qkvz$",
346
+ "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]in_proj_z$",
347
+ "re:^language_model[.]model[.]layers[.]44[.]linear_attn[.]out_proj$",
348
+ "re:^language_model[.]model[.]layers[.]44[.]mlp[.]experts[.]down_proj$",
349
+ "re:^language_model[.]model[.]layers[.]44[.]mlp[.]experts[.]gate_up_proj$",
350
+ "re:^language_model[.]model[.]layers[.]45[.]mlp[.]experts[.]down_proj$",
351
+ "re:^language_model[.]model[.]layers[.]45[.]mlp[.]experts[.]gate_up_proj$",
352
+ "re:^language_model[.]model[.]layers[.]46[.]mlp[.]experts[.]down_proj$",
353
+ "re:^language_model[.]model[.]layers[.]46[.]mlp[.]experts[.]gate_up_proj$",
354
+ "re:^language_model[.]model[.]layers[.]47[.]mlp[.]experts[.]down_proj$",
355
+ "re:^language_model[.]model[.]layers[.]47[.]mlp[.]experts[.]gate_up_proj$",
356
+ "re:^language_model[.]model[.]layers[.]5[.]linear_attn[.]in_proj_qkv$",
357
+ "re:^language_model[.]model[.]layers[.]5[.]linear_attn[.]in_proj_qkvz$",
358
+ "re:^language_model[.]model[.]layers[.]5[.]linear_attn[.]in_proj_z$",
359
+ "re:^language_model[.]model[.]layers[.]5[.]linear_attn[.]out_proj$",
360
+ "re:^language_model[.]model[.]layers[.]5[.]mlp[.]experts[.]down_proj$",
361
+ "re:^language_model[.]model[.]layers[.]5[.]mlp[.]experts[.]gate_up_proj$",
362
+ "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_qkv$",
363
+ "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_qkvz$",
364
+ "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]in_proj_z$",
365
+ "re:^language_model[.]model[.]layers[.]6[.]linear_attn[.]out_proj$",
366
+ "re:^language_model[.]model[.]layers[.]6[.]mlp[.]experts[.]down_proj$",
367
+ "re:^language_model[.]model[.]layers[.]6[.]mlp[.]experts[.]gate_up_proj$",
368
+ "re:^language_model[.]model[.]layers[.]7[.]mlp[.]experts[.]down_proj$",
369
+ "re:^language_model[.]model[.]layers[.]7[.]mlp[.]experts[.]gate_up_proj$",
370
+ "re:^language_model[.]model[.]layers[.]7[.]self_attn[.]o_proj$",
371
+ "re:^language_model[.]model[.]layers[.]8[.]linear_attn[.]in_proj_qkv$",
372
+ "re:^language_model[.]model[.]layers[.]8[.]linear_attn[.]in_proj_qkvz$",
373
+ "re:^language_model[.]model[.]layers[.]8[.]linear_attn[.]in_proj_z$",
374
+ "re:^language_model[.]model[.]layers[.]8[.]linear_attn[.]out_proj$",
375
+ "re:^language_model[.]model[.]layers[.]8[.]mlp[.]experts[.]down_proj$",
376
+ "re:^language_model[.]model[.]layers[.]8[.]mlp[.]experts[.]gate_up_proj$",
377
+ "re:^language_model[.]model[.]layers[.]9[.]linear_attn[.]in_proj_qkv$",
378
+ "re:^language_model[.]model[.]layers[.]9[.]linear_attn[.]in_proj_qkvz$",
379
+ "re:^language_model[.]model[.]layers[.]9[.]linear_attn[.]in_proj_z$",
380
+ "re:^language_model[.]model[.]layers[.]9[.]linear_attn[.]out_proj$",
381
+ "re:^language_model[.]model[.]layers[.]9[.]mlp[.]experts[.]down_proj$",
382
+ "re:^language_model[.]model[.]layers[.]9[.]mlp[.]experts[.]gate_up_proj$",
383
+ "re:^mtp[.]layers[.]0[.]mlp[.]experts[.]down_proj$",
384
+ "re:^mtp[.]layers[.]0[.]mlp[.]experts[.]gate_up_proj$",
385
+ "re:^language_model[.]model[.]layers[.][0-9]+[.]mlp[.]experts[.][0-9]+[.](gate|up|down)_proj$",
386
+ "re:^mtp[.]layers[.][0-9]+[.]mlp[.]experts[.][0-9]+[.](gate|up|down)_proj$"
387
+ ]
388
+ }
389
+ },
390
+ "ignore": [
391
+ "language_model.lm_head",
392
+ "language_model.model.embed_tokens",
393
+ "language_model.model.layers.0.linear_attn.in_proj_a",
394
+ "language_model.model.layers.0.linear_attn.in_proj_b",
395
+ "language_model.model.layers.0.linear_attn.in_proj_ba",
396
+ "language_model.model.layers.0.mlp.gate",
397
+ "language_model.model.layers.0.mlp.shared_expert.down_proj",
398
+ "language_model.model.layers.0.mlp.shared_expert.gate_proj",
399
+ "language_model.model.layers.0.mlp.shared_expert.gate_up_proj",
400
+ "language_model.model.layers.0.mlp.shared_expert.up_proj",
401
+ "language_model.model.layers.0.mlp.shared_expert_gate",
402
+ "language_model.model.layers.1.linear_attn.in_proj_a",
403
+ "language_model.model.layers.1.linear_attn.in_proj_b",
404
+ "language_model.model.layers.1.linear_attn.in_proj_ba",
405
+ "language_model.model.layers.1.mlp.gate",
406
+ "language_model.model.layers.1.mlp.shared_expert.down_proj",
407
+ "language_model.model.layers.1.mlp.shared_expert.gate_proj",
408
+ "language_model.model.layers.1.mlp.shared_expert.gate_up_proj",
409
+ "language_model.model.layers.1.mlp.shared_expert.up_proj",
410
+ "language_model.model.layers.1.mlp.shared_expert_gate",
411
+ "language_model.model.layers.10.linear_attn.in_proj_a",
412
+ "language_model.model.layers.10.linear_attn.in_proj_b",
413
+ "language_model.model.layers.10.linear_attn.in_proj_ba",
414
+ "language_model.model.layers.10.mlp.gate",
415
+ "language_model.model.layers.10.mlp.shared_expert.down_proj",
416
+ "language_model.model.layers.10.mlp.shared_expert.gate_proj",
417
+ "language_model.model.layers.10.mlp.shared_expert.gate_up_proj",
418
+ "language_model.model.layers.10.mlp.shared_expert.up_proj",
419
+ "language_model.model.layers.10.mlp.shared_expert_gate",
420
+ "language_model.model.layers.11.mlp.gate",
421
+ "language_model.model.layers.11.mlp.shared_expert.down_proj",
422
+ "language_model.model.layers.11.mlp.shared_expert.gate_proj",
423
+ "language_model.model.layers.11.mlp.shared_expert.gate_up_proj",
424
+ "language_model.model.layers.11.mlp.shared_expert.up_proj",
425
+ "language_model.model.layers.11.mlp.shared_expert_gate",
426
+ "language_model.model.layers.11.self_attn.k_proj",
427
+ "language_model.model.layers.11.self_attn.q_proj",
428
+ "language_model.model.layers.11.self_attn.qkv_proj",
429
+ "language_model.model.layers.11.self_attn.v_proj",
430
+ "language_model.model.layers.12.linear_attn.in_proj_a",
431
+ "language_model.model.layers.12.linear_attn.in_proj_b",
432
+ "language_model.model.layers.12.linear_attn.in_proj_ba",
433
+ "language_model.model.layers.12.mlp.gate",
434
+ "language_model.model.layers.12.mlp.shared_expert.down_proj",
435
+ "language_model.model.layers.12.mlp.shared_expert.gate_proj",
436
+ "language_model.model.layers.12.mlp.shared_expert.gate_up_proj",
437
+ "language_model.model.layers.12.mlp.shared_expert.up_proj",
438
+ "language_model.model.layers.12.mlp.shared_expert_gate",
439
+ "language_model.model.layers.13.linear_attn.in_proj_a",
440
+ "language_model.model.layers.13.linear_attn.in_proj_b",
441
+ "language_model.model.layers.13.linear_attn.in_proj_ba",
442
+ "language_model.model.layers.13.mlp.gate",
443
+ "language_model.model.layers.13.mlp.shared_expert.down_proj",
444
+ "language_model.model.layers.13.mlp.shared_expert.gate_proj",
445
+ "language_model.model.layers.13.mlp.shared_expert.gate_up_proj",
446
+ "language_model.model.layers.13.mlp.shared_expert.up_proj",
447
+ "language_model.model.layers.13.mlp.shared_expert_gate",
448
+ "language_model.model.layers.14.linear_attn.in_proj_a",
449
+ "language_model.model.layers.14.linear_attn.in_proj_b",
450
+ "language_model.model.layers.14.linear_attn.in_proj_ba",
451
+ "language_model.model.layers.14.mlp.gate",
452
+ "language_model.model.layers.14.mlp.shared_expert.down_proj",
453
+ "language_model.model.layers.14.mlp.shared_expert.gate_proj",
454
+ "language_model.model.layers.14.mlp.shared_expert.gate_up_proj",
455
+ "language_model.model.layers.14.mlp.shared_expert.up_proj",
456
+ "language_model.model.layers.14.mlp.shared_expert_gate",
457
+ "language_model.model.layers.15.mlp.gate",
458
+ "language_model.model.layers.15.mlp.shared_expert.down_proj",
459
+ "language_model.model.layers.15.mlp.shared_expert.gate_proj",
460
+ "language_model.model.layers.15.mlp.shared_expert.gate_up_proj",
461
+ "language_model.model.layers.15.mlp.shared_expert.up_proj",
462
+ "language_model.model.layers.15.mlp.shared_expert_gate",
463
+ "language_model.model.layers.15.self_attn.k_proj",
464
+ "language_model.model.layers.15.self_attn.q_proj",
465
+ "language_model.model.layers.15.self_attn.qkv_proj",
466
+ "language_model.model.layers.15.self_attn.v_proj",
467
+ "language_model.model.layers.16.linear_attn.in_proj_a",
468
+ "language_model.model.layers.16.linear_attn.in_proj_b",
469
+ "language_model.model.layers.16.linear_attn.in_proj_ba",
470
+ "language_model.model.layers.16.mlp.gate",
471
+ "language_model.model.layers.16.mlp.shared_expert.down_proj",
472
+ "language_model.model.layers.16.mlp.shared_expert.gate_proj",
473
+ "language_model.model.layers.16.mlp.shared_expert.gate_up_proj",
474
+ "language_model.model.layers.16.mlp.shared_expert.up_proj",
475
+ "language_model.model.layers.16.mlp.shared_expert_gate",
476
+ "language_model.model.layers.17.linear_attn.in_proj_a",
477
+ "language_model.model.layers.17.linear_attn.in_proj_b",
478
+ "language_model.model.layers.17.linear_attn.in_proj_ba",
479
+ "language_model.model.layers.17.mlp.gate",
480
+ "language_model.model.layers.17.mlp.shared_expert.down_proj",
481
+ "language_model.model.layers.17.mlp.shared_expert.gate_proj",
482
+ "language_model.model.layers.17.mlp.shared_expert.gate_up_proj",
483
+ "language_model.model.layers.17.mlp.shared_expert.up_proj",
484
+ "language_model.model.layers.17.mlp.shared_expert_gate",
485
+ "language_model.model.layers.18.linear_attn.in_proj_a",
486
+ "language_model.model.layers.18.linear_attn.in_proj_b",
487
+ "language_model.model.layers.18.linear_attn.in_proj_ba",
488
+ "language_model.model.layers.18.mlp.gate",
489
+ "language_model.model.layers.18.mlp.shared_expert.down_proj",
490
+ "language_model.model.layers.18.mlp.shared_expert.gate_proj",
491
+ "language_model.model.layers.18.mlp.shared_expert.gate_up_proj",
492
+ "language_model.model.layers.18.mlp.shared_expert.up_proj",
493
+ "language_model.model.layers.18.mlp.shared_expert_gate",
494
+ "language_model.model.layers.19.mlp.gate",
495
+ "language_model.model.layers.19.mlp.shared_expert.down_proj",
496
+ "language_model.model.layers.19.mlp.shared_expert.gate_proj",
497
+ "language_model.model.layers.19.mlp.shared_expert.gate_up_proj",
498
+ "language_model.model.layers.19.mlp.shared_expert.up_proj",
499
+ "language_model.model.layers.19.mlp.shared_expert_gate",
500
+ "language_model.model.layers.19.self_attn.k_proj",
501
+ "language_model.model.layers.19.self_attn.q_proj",
502
+ "language_model.model.layers.19.self_attn.qkv_proj",
503
+ "language_model.model.layers.19.self_attn.v_proj",
504
+ "language_model.model.layers.2.linear_attn.in_proj_a",
505
+ "language_model.model.layers.2.linear_attn.in_proj_b",
506
+ "language_model.model.layers.2.linear_attn.in_proj_ba",
507
+ "language_model.model.layers.2.mlp.gate",
508
+ "language_model.model.layers.2.mlp.shared_expert.down_proj",
509
+ "language_model.model.layers.2.mlp.shared_expert.gate_proj",
510
+ "language_model.model.layers.2.mlp.shared_expert.gate_up_proj",
511
+ "language_model.model.layers.2.mlp.shared_expert.up_proj",
512
+ "language_model.model.layers.2.mlp.shared_expert_gate",
513
+ "language_model.model.layers.20.linear_attn.in_proj_a",
514
+ "language_model.model.layers.20.linear_attn.in_proj_b",
515
+ "language_model.model.layers.20.linear_attn.in_proj_ba",
516
+ "language_model.model.layers.20.linear_attn.in_proj_qkv",
517
+ "language_model.model.layers.20.linear_attn.in_proj_qkvz",
518
+ "language_model.model.layers.20.linear_attn.in_proj_z",
519
+ "language_model.model.layers.20.mlp.gate",
520
+ "language_model.model.layers.20.mlp.shared_expert.down_proj",
521
+ "language_model.model.layers.20.mlp.shared_expert.gate_proj",
522
+ "language_model.model.layers.20.mlp.shared_expert.gate_up_proj",
523
+ "language_model.model.layers.20.mlp.shared_expert.up_proj",
524
+ "language_model.model.layers.20.mlp.shared_expert_gate",
525
+ "language_model.model.layers.21.linear_attn.in_proj_a",
526
+ "language_model.model.layers.21.linear_attn.in_proj_b",
527
+ "language_model.model.layers.21.linear_attn.in_proj_ba",
528
+ "language_model.model.layers.21.linear_attn.in_proj_qkv",
529
+ "language_model.model.layers.21.linear_attn.in_proj_qkvz",
530
+ "language_model.model.layers.21.linear_attn.in_proj_z",
531
+ "language_model.model.layers.21.mlp.gate",
532
+ "language_model.model.layers.21.mlp.shared_expert.down_proj",
533
+ "language_model.model.layers.21.mlp.shared_expert.gate_proj",
534
+ "language_model.model.layers.21.mlp.shared_expert.gate_up_proj",
535
+ "language_model.model.layers.21.mlp.shared_expert.up_proj",
536
+ "language_model.model.layers.21.mlp.shared_expert_gate",
537
+ "language_model.model.layers.22.linear_attn.in_proj_a",
538
+ "language_model.model.layers.22.linear_attn.in_proj_b",
539
+ "language_model.model.layers.22.linear_attn.in_proj_ba",
540
+ "language_model.model.layers.22.linear_attn.in_proj_qkv",
541
+ "language_model.model.layers.22.linear_attn.in_proj_qkvz",
542
+ "language_model.model.layers.22.linear_attn.in_proj_z",
543
+ "language_model.model.layers.22.linear_attn.out_proj",
544
+ "language_model.model.layers.22.mlp.gate",
545
+ "language_model.model.layers.22.mlp.shared_expert.down_proj",
546
+ "language_model.model.layers.22.mlp.shared_expert.gate_proj",
547
+ "language_model.model.layers.22.mlp.shared_expert.gate_up_proj",
548
+ "language_model.model.layers.22.mlp.shared_expert.up_proj",
549
+ "language_model.model.layers.22.mlp.shared_expert_gate",
550
+ "language_model.model.layers.23.mlp.gate",
551
+ "language_model.model.layers.23.mlp.shared_expert.down_proj",
552
+ "language_model.model.layers.23.mlp.shared_expert.gate_proj",
553
+ "language_model.model.layers.23.mlp.shared_expert.gate_up_proj",
554
+ "language_model.model.layers.23.mlp.shared_expert.up_proj",
555
+ "language_model.model.layers.23.mlp.shared_expert_gate",
556
+ "language_model.model.layers.23.self_attn.k_proj",
557
+ "language_model.model.layers.23.self_attn.o_proj",
558
+ "language_model.model.layers.23.self_attn.q_proj",
559
+ "language_model.model.layers.23.self_attn.qkv_proj",
560
+ "language_model.model.layers.23.self_attn.v_proj",
561
+ "language_model.model.layers.24.linear_attn.in_proj_a",
562
+ "language_model.model.layers.24.linear_attn.in_proj_b",
563
+ "language_model.model.layers.24.linear_attn.in_proj_ba",
564
+ "language_model.model.layers.24.mlp.gate",
565
+ "language_model.model.layers.24.mlp.shared_expert.down_proj",
566
+ "language_model.model.layers.24.mlp.shared_expert.gate_proj",
567
+ "language_model.model.layers.24.mlp.shared_expert.gate_up_proj",
568
+ "language_model.model.layers.24.mlp.shared_expert.up_proj",
569
+ "language_model.model.layers.24.mlp.shared_expert_gate",
570
+ "language_model.model.layers.25.linear_attn.in_proj_a",
571
+ "language_model.model.layers.25.linear_attn.in_proj_b",
572
+ "language_model.model.layers.25.linear_attn.in_proj_ba",
573
+ "language_model.model.layers.25.linear_attn.in_proj_qkv",
574
+ "language_model.model.layers.25.linear_attn.in_proj_qkvz",
575
+ "language_model.model.layers.25.linear_attn.in_proj_z",
576
+ "language_model.model.layers.25.mlp.gate",
577
+ "language_model.model.layers.25.mlp.shared_expert.down_proj",
578
+ "language_model.model.layers.25.mlp.shared_expert.gate_proj",
579
+ "language_model.model.layers.25.mlp.shared_expert.gate_up_proj",
580
+ "language_model.model.layers.25.mlp.shared_expert.up_proj",
581
+ "language_model.model.layers.25.mlp.shared_expert_gate",
582
+ "language_model.model.layers.26.linear_attn.in_proj_a",
583
+ "language_model.model.layers.26.linear_attn.in_proj_b",
584
+ "language_model.model.layers.26.linear_attn.in_proj_ba",
585
+ "language_model.model.layers.26.linear_attn.in_proj_qkv",
586
+ "language_model.model.layers.26.linear_attn.in_proj_qkvz",
587
+ "language_model.model.layers.26.linear_attn.in_proj_z",
588
+ "language_model.model.layers.26.linear_attn.out_proj",
589
+ "language_model.model.layers.26.mlp.gate",
590
+ "language_model.model.layers.26.mlp.shared_expert.down_proj",
591
+ "language_model.model.layers.26.mlp.shared_expert.gate_proj",
592
+ "language_model.model.layers.26.mlp.shared_expert.gate_up_proj",
593
+ "language_model.model.layers.26.mlp.shared_expert.up_proj",
594
+ "language_model.model.layers.26.mlp.shared_expert_gate",
595
+ "language_model.model.layers.27.mlp.gate",
596
+ "language_model.model.layers.27.mlp.shared_expert.down_proj",
597
+ "language_model.model.layers.27.mlp.shared_expert.gate_proj",
598
+ "language_model.model.layers.27.mlp.shared_expert.gate_up_proj",
599
+ "language_model.model.layers.27.mlp.shared_expert.up_proj",
600
+ "language_model.model.layers.27.mlp.shared_expert_gate",
601
+ "language_model.model.layers.27.self_attn.k_proj",
602
+ "language_model.model.layers.27.self_attn.o_proj",
603
+ "language_model.model.layers.27.self_attn.q_proj",
604
+ "language_model.model.layers.27.self_attn.qkv_proj",
605
+ "language_model.model.layers.27.self_attn.v_proj",
606
+ "language_model.model.layers.28.linear_attn.in_proj_a",
607
+ "language_model.model.layers.28.linear_attn.in_proj_b",
608
+ "language_model.model.layers.28.linear_attn.in_proj_ba",
609
+ "language_model.model.layers.28.linear_attn.in_proj_qkv",
610
+ "language_model.model.layers.28.linear_attn.in_proj_qkvz",
611
+ "language_model.model.layers.28.linear_attn.in_proj_z",
612
+ "language_model.model.layers.28.linear_attn.out_proj",
613
+ "language_model.model.layers.28.mlp.gate",
614
+ "language_model.model.layers.28.mlp.shared_expert.down_proj",
615
+ "language_model.model.layers.28.mlp.shared_expert.gate_proj",
616
+ "language_model.model.layers.28.mlp.shared_expert.gate_up_proj",
617
+ "language_model.model.layers.28.mlp.shared_expert.up_proj",
618
+ "language_model.model.layers.28.mlp.shared_expert_gate",
619
+ "language_model.model.layers.29.linear_attn.in_proj_a",
620
+ "language_model.model.layers.29.linear_attn.in_proj_b",
621
+ "language_model.model.layers.29.linear_attn.in_proj_ba",
622
+ "language_model.model.layers.29.linear_attn.in_proj_qkv",
623
+ "language_model.model.layers.29.linear_attn.in_proj_qkvz",
624
+ "language_model.model.layers.29.linear_attn.in_proj_z",
625
+ "language_model.model.layers.29.mlp.gate",
626
+ "language_model.model.layers.29.mlp.shared_expert.down_proj",
627
+ "language_model.model.layers.29.mlp.shared_expert.gate_proj",
628
+ "language_model.model.layers.29.mlp.shared_expert.gate_up_proj",
629
+ "language_model.model.layers.29.mlp.shared_expert.up_proj",
630
+ "language_model.model.layers.29.mlp.shared_expert_gate",
631
+ "language_model.model.layers.3.mlp.gate",
632
+ "language_model.model.layers.3.mlp.shared_expert.down_proj",
633
+ "language_model.model.layers.3.mlp.shared_expert.gate_proj",
634
+ "language_model.model.layers.3.mlp.shared_expert.gate_up_proj",
635
+ "language_model.model.layers.3.mlp.shared_expert.up_proj",
636
+ "language_model.model.layers.3.mlp.shared_expert_gate",
637
+ "language_model.model.layers.3.self_attn.k_proj",
638
+ "language_model.model.layers.3.self_attn.q_proj",
639
+ "language_model.model.layers.3.self_attn.qkv_proj",
640
+ "language_model.model.layers.3.self_attn.v_proj",
641
+ "language_model.model.layers.30.linear_attn.in_proj_a",
642
+ "language_model.model.layers.30.linear_attn.in_proj_b",
643
+ "language_model.model.layers.30.linear_attn.in_proj_ba",
644
+ "language_model.model.layers.30.mlp.gate",
645
+ "language_model.model.layers.30.mlp.shared_expert.down_proj",
646
+ "language_model.model.layers.30.mlp.shared_expert.gate_proj",
647
+ "language_model.model.layers.30.mlp.shared_expert.gate_up_proj",
648
+ "language_model.model.layers.30.mlp.shared_expert.up_proj",
649
+ "language_model.model.layers.30.mlp.shared_expert_gate",
650
+ "language_model.model.layers.31.mlp.gate",
651
+ "language_model.model.layers.31.mlp.shared_expert.down_proj",
652
+ "language_model.model.layers.31.mlp.shared_expert.gate_proj",
653
+ "language_model.model.layers.31.mlp.shared_expert.gate_up_proj",
654
+ "language_model.model.layers.31.mlp.shared_expert.up_proj",
655
+ "language_model.model.layers.31.mlp.shared_expert_gate",
656
+ "language_model.model.layers.31.self_attn.k_proj",
657
+ "language_model.model.layers.31.self_attn.o_proj",
658
+ "language_model.model.layers.31.self_attn.q_proj",
659
+ "language_model.model.layers.31.self_attn.qkv_proj",
660
+ "language_model.model.layers.31.self_attn.v_proj",
661
+ "language_model.model.layers.32.linear_attn.in_proj_a",
662
+ "language_model.model.layers.32.linear_attn.in_proj_b",
663
+ "language_model.model.layers.32.linear_attn.in_proj_ba",
664
+ "language_model.model.layers.32.mlp.gate",
665
+ "language_model.model.layers.32.mlp.shared_expert.down_proj",
666
+ "language_model.model.layers.32.mlp.shared_expert.gate_proj",
667
+ "language_model.model.layers.32.mlp.shared_expert.gate_up_proj",
668
+ "language_model.model.layers.32.mlp.shared_expert.up_proj",
669
+ "language_model.model.layers.32.mlp.shared_expert_gate",
670
+ "language_model.model.layers.33.linear_attn.in_proj_a",
671
+ "language_model.model.layers.33.linear_attn.in_proj_b",
672
+ "language_model.model.layers.33.linear_attn.in_proj_ba",
673
+ "language_model.model.layers.33.mlp.gate",
674
+ "language_model.model.layers.33.mlp.shared_expert.down_proj",
675
+ "language_model.model.layers.33.mlp.shared_expert.gate_proj",
676
+ "language_model.model.layers.33.mlp.shared_expert.gate_up_proj",
677
+ "language_model.model.layers.33.mlp.shared_expert.up_proj",
678
+ "language_model.model.layers.33.mlp.shared_expert_gate",
679
+ "language_model.model.layers.34.linear_attn.in_proj_a",
680
+ "language_model.model.layers.34.linear_attn.in_proj_b",
681
+ "language_model.model.layers.34.linear_attn.in_proj_ba",
682
+ "language_model.model.layers.34.mlp.gate",
683
+ "language_model.model.layers.34.mlp.shared_expert.down_proj",
684
+ "language_model.model.layers.34.mlp.shared_expert.gate_proj",
685
+ "language_model.model.layers.34.mlp.shared_expert.gate_up_proj",
686
+ "language_model.model.layers.34.mlp.shared_expert.up_proj",
687
+ "language_model.model.layers.34.mlp.shared_expert_gate",
688
+ "language_model.model.layers.35.mlp.gate",
689
+ "language_model.model.layers.35.mlp.shared_expert.down_proj",
690
+ "language_model.model.layers.35.mlp.shared_expert.gate_proj",
691
+ "language_model.model.layers.35.mlp.shared_expert.gate_up_proj",
692
+ "language_model.model.layers.35.mlp.shared_expert.up_proj",
693
+ "language_model.model.layers.35.mlp.shared_expert_gate",
694
+ "language_model.model.layers.35.self_attn.k_proj",
695
+ "language_model.model.layers.35.self_attn.o_proj",
696
+ "language_model.model.layers.35.self_attn.q_proj",
697
+ "language_model.model.layers.35.self_attn.qkv_proj",
698
+ "language_model.model.layers.35.self_attn.v_proj",
699
+ "language_model.model.layers.36.linear_attn.in_proj_a",
700
+ "language_model.model.layers.36.linear_attn.in_proj_b",
701
+ "language_model.model.layers.36.linear_attn.in_proj_ba",
702
+ "language_model.model.layers.36.mlp.gate",
703
+ "language_model.model.layers.36.mlp.shared_expert.down_proj",
704
+ "language_model.model.layers.36.mlp.shared_expert.gate_proj",
705
+ "language_model.model.layers.36.mlp.shared_expert.gate_up_proj",
706
+ "language_model.model.layers.36.mlp.shared_expert.up_proj",
707
+ "language_model.model.layers.36.mlp.shared_expert_gate",
708
+ "language_model.model.layers.37.linear_attn.in_proj_a",
709
+ "language_model.model.layers.37.linear_attn.in_proj_b",
710
+ "language_model.model.layers.37.linear_attn.in_proj_ba",
711
+ "language_model.model.layers.37.mlp.gate",
712
+ "language_model.model.layers.37.mlp.shared_expert.down_proj",
713
+ "language_model.model.layers.37.mlp.shared_expert.gate_proj",
714
+ "language_model.model.layers.37.mlp.shared_expert.gate_up_proj",
715
+ "language_model.model.layers.37.mlp.shared_expert.up_proj",
716
+ "language_model.model.layers.37.mlp.shared_expert_gate",
717
+ "language_model.model.layers.38.linear_attn.in_proj_a",
718
+ "language_model.model.layers.38.linear_attn.in_proj_b",
719
+ "language_model.model.layers.38.linear_attn.in_proj_ba",
720
+ "language_model.model.layers.38.mlp.gate",
721
+ "language_model.model.layers.38.mlp.shared_expert.down_proj",
722
+ "language_model.model.layers.38.mlp.shared_expert.gate_proj",
723
+ "language_model.model.layers.38.mlp.shared_expert.gate_up_proj",
724
+ "language_model.model.layers.38.mlp.shared_expert.up_proj",
725
+ "language_model.model.layers.38.mlp.shared_expert_gate",
726
+ "language_model.model.layers.39.mlp.gate",
727
+ "language_model.model.layers.39.mlp.shared_expert.down_proj",
728
+ "language_model.model.layers.39.mlp.shared_expert.gate_proj",
729
+ "language_model.model.layers.39.mlp.shared_expert.gate_up_proj",
730
+ "language_model.model.layers.39.mlp.shared_expert.up_proj",
731
+ "language_model.model.layers.39.mlp.shared_expert_gate",
732
+ "language_model.model.layers.39.self_attn.k_proj",
733
+ "language_model.model.layers.39.self_attn.o_proj",
734
+ "language_model.model.layers.39.self_attn.q_proj",
735
+ "language_model.model.layers.39.self_attn.qkv_proj",
736
+ "language_model.model.layers.39.self_attn.v_proj",
737
+ "language_model.model.layers.4.linear_attn.in_proj_a",
738
+ "language_model.model.layers.4.linear_attn.in_proj_b",
739
+ "language_model.model.layers.4.linear_attn.in_proj_ba",
740
+ "language_model.model.layers.4.mlp.gate",
741
+ "language_model.model.layers.4.mlp.shared_expert.down_proj",
742
+ "language_model.model.layers.4.mlp.shared_expert.gate_proj",
743
+ "language_model.model.layers.4.mlp.shared_expert.gate_up_proj",
744
+ "language_model.model.layers.4.mlp.shared_expert.up_proj",
745
+ "language_model.model.layers.4.mlp.shared_expert_gate",
746
+ "language_model.model.layers.40.linear_attn.in_proj_a",
747
+ "language_model.model.layers.40.linear_attn.in_proj_b",
748
+ "language_model.model.layers.40.linear_attn.in_proj_ba",
749
+ "language_model.model.layers.40.linear_attn.in_proj_qkv",
750
+ "language_model.model.layers.40.linear_attn.in_proj_qkvz",
751
+ "language_model.model.layers.40.linear_attn.in_proj_z",
752
+ "language_model.model.layers.40.mlp.gate",
753
+ "language_model.model.layers.40.mlp.shared_expert.down_proj",
754
+ "language_model.model.layers.40.mlp.shared_expert.gate_proj",
755
+ "language_model.model.layers.40.mlp.shared_expert.gate_up_proj",
756
+ "language_model.model.layers.40.mlp.shared_expert.up_proj",
757
+ "language_model.model.layers.40.mlp.shared_expert_gate",
758
+ "language_model.model.layers.41.linear_attn.in_proj_a",
759
+ "language_model.model.layers.41.linear_attn.in_proj_b",
760
+ "language_model.model.layers.41.linear_attn.in_proj_ba",
761
+ "language_model.model.layers.41.linear_attn.in_proj_qkv",
762
+ "language_model.model.layers.41.linear_attn.in_proj_qkvz",
763
+ "language_model.model.layers.41.linear_attn.in_proj_z",
764
+ "language_model.model.layers.41.mlp.gate",
765
+ "language_model.model.layers.41.mlp.shared_expert.down_proj",
766
+ "language_model.model.layers.41.mlp.shared_expert.gate_proj",
767
+ "language_model.model.layers.41.mlp.shared_expert.gate_up_proj",
768
+ "language_model.model.layers.41.mlp.shared_expert.up_proj",
769
+ "language_model.model.layers.41.mlp.shared_expert_gate",
770
+ "language_model.model.layers.42.linear_attn.in_proj_a",
771
+ "language_model.model.layers.42.linear_attn.in_proj_b",
772
+ "language_model.model.layers.42.linear_attn.in_proj_ba",
773
+ "language_model.model.layers.42.linear_attn.in_proj_qkv",
774
+ "language_model.model.layers.42.linear_attn.in_proj_qkvz",
775
+ "language_model.model.layers.42.linear_attn.in_proj_z",
776
+ "language_model.model.layers.42.mlp.gate",
777
+ "language_model.model.layers.42.mlp.shared_expert.down_proj",
778
+ "language_model.model.layers.42.mlp.shared_expert.gate_proj",
779
+ "language_model.model.layers.42.mlp.shared_expert.gate_up_proj",
780
+ "language_model.model.layers.42.mlp.shared_expert.up_proj",
781
+ "language_model.model.layers.42.mlp.shared_expert_gate",
782
+ "language_model.model.layers.43.mlp.gate",
783
+ "language_model.model.layers.43.mlp.shared_expert.down_proj",
784
+ "language_model.model.layers.43.mlp.shared_expert.gate_proj",
785
+ "language_model.model.layers.43.mlp.shared_expert.gate_up_proj",
786
+ "language_model.model.layers.43.mlp.shared_expert.up_proj",
787
+ "language_model.model.layers.43.mlp.shared_expert_gate",
788
+ "language_model.model.layers.43.self_attn.k_proj",
789
+ "language_model.model.layers.43.self_attn.o_proj",
790
+ "language_model.model.layers.43.self_attn.q_proj",
791
+ "language_model.model.layers.43.self_attn.qkv_proj",
792
+ "language_model.model.layers.43.self_attn.v_proj",
793
+ "language_model.model.layers.44.linear_attn.in_proj_a",
794
+ "language_model.model.layers.44.linear_attn.in_proj_b",
795
+ "language_model.model.layers.44.linear_attn.in_proj_ba",
796
+ "language_model.model.layers.44.mlp.gate",
797
+ "language_model.model.layers.44.mlp.shared_expert.down_proj",
798
+ "language_model.model.layers.44.mlp.shared_expert.gate_proj",
799
+ "language_model.model.layers.44.mlp.shared_expert.gate_up_proj",
800
+ "language_model.model.layers.44.mlp.shared_expert.up_proj",
801
+ "language_model.model.layers.44.mlp.shared_expert_gate",
802
+ "language_model.model.layers.45.linear_attn.in_proj_a",
803
+ "language_model.model.layers.45.linear_attn.in_proj_b",
804
+ "language_model.model.layers.45.linear_attn.in_proj_ba",
805
+ "language_model.model.layers.45.linear_attn.in_proj_qkv",
806
+ "language_model.model.layers.45.linear_attn.in_proj_qkvz",
807
+ "language_model.model.layers.45.linear_attn.in_proj_z",
808
+ "language_model.model.layers.45.linear_attn.out_proj",
809
+ "language_model.model.layers.45.mlp.gate",
810
+ "language_model.model.layers.45.mlp.shared_expert.down_proj",
811
+ "language_model.model.layers.45.mlp.shared_expert.gate_proj",
812
+ "language_model.model.layers.45.mlp.shared_expert.gate_up_proj",
813
+ "language_model.model.layers.45.mlp.shared_expert.up_proj",
814
+ "language_model.model.layers.45.mlp.shared_expert_gate",
815
+ "language_model.model.layers.46.linear_attn.in_proj_a",
816
+ "language_model.model.layers.46.linear_attn.in_proj_b",
817
+ "language_model.model.layers.46.linear_attn.in_proj_ba",
818
+ "language_model.model.layers.46.linear_attn.in_proj_qkv",
819
+ "language_model.model.layers.46.linear_attn.in_proj_qkvz",
820
+ "language_model.model.layers.46.linear_attn.in_proj_z",
821
+ "language_model.model.layers.46.linear_attn.out_proj",
822
+ "language_model.model.layers.46.mlp.gate",
823
+ "language_model.model.layers.46.mlp.shared_expert.down_proj",
824
+ "language_model.model.layers.46.mlp.shared_expert.gate_proj",
825
+ "language_model.model.layers.46.mlp.shared_expert.gate_up_proj",
826
+ "language_model.model.layers.46.mlp.shared_expert.up_proj",
827
+ "language_model.model.layers.46.mlp.shared_expert_gate",
828
+ "language_model.model.layers.47.mlp.gate",
829
+ "language_model.model.layers.47.mlp.shared_expert.down_proj",
830
+ "language_model.model.layers.47.mlp.shared_expert.gate_proj",
831
+ "language_model.model.layers.47.mlp.shared_expert.gate_up_proj",
832
+ "language_model.model.layers.47.mlp.shared_expert.up_proj",
833
+ "language_model.model.layers.47.mlp.shared_expert_gate",
834
+ "language_model.model.layers.47.self_attn.k_proj",
835
+ "language_model.model.layers.47.self_attn.o_proj",
836
+ "language_model.model.layers.47.self_attn.q_proj",
837
+ "language_model.model.layers.47.self_attn.qkv_proj",
838
+ "language_model.model.layers.47.self_attn.v_proj",
839
+ "language_model.model.layers.5.linear_attn.in_proj_a",
840
+ "language_model.model.layers.5.linear_attn.in_proj_b",
841
+ "language_model.model.layers.5.linear_attn.in_proj_ba",
842
+ "language_model.model.layers.5.mlp.gate",
843
+ "language_model.model.layers.5.mlp.shared_expert.down_proj",
844
+ "language_model.model.layers.5.mlp.shared_expert.gate_proj",
845
+ "language_model.model.layers.5.mlp.shared_expert.gate_up_proj",
846
+ "language_model.model.layers.5.mlp.shared_expert.up_proj",
847
+ "language_model.model.layers.5.mlp.shared_expert_gate",
848
+ "language_model.model.layers.6.linear_attn.in_proj_a",
849
+ "language_model.model.layers.6.linear_attn.in_proj_b",
850
+ "language_model.model.layers.6.linear_attn.in_proj_ba",
851
+ "language_model.model.layers.6.mlp.gate",
852
+ "language_model.model.layers.6.mlp.shared_expert.down_proj",
853
+ "language_model.model.layers.6.mlp.shared_expert.gate_proj",
854
+ "language_model.model.layers.6.mlp.shared_expert.gate_up_proj",
855
+ "language_model.model.layers.6.mlp.shared_expert.up_proj",
856
+ "language_model.model.layers.6.mlp.shared_expert_gate",
857
+ "language_model.model.layers.7.mlp.gate",
858
+ "language_model.model.layers.7.mlp.shared_expert.down_proj",
859
+ "language_model.model.layers.7.mlp.shared_expert.gate_proj",
860
+ "language_model.model.layers.7.mlp.shared_expert.gate_up_proj",
861
+ "language_model.model.layers.7.mlp.shared_expert.up_proj",
862
+ "language_model.model.layers.7.mlp.shared_expert_gate",
863
+ "language_model.model.layers.7.self_attn.k_proj",
864
+ "language_model.model.layers.7.self_attn.q_proj",
865
+ "language_model.model.layers.7.self_attn.qkv_proj",
866
+ "language_model.model.layers.7.self_attn.v_proj",
867
+ "language_model.model.layers.8.linear_attn.in_proj_a",
868
+ "language_model.model.layers.8.linear_attn.in_proj_b",
869
+ "language_model.model.layers.8.linear_attn.in_proj_ba",
870
+ "language_model.model.layers.8.mlp.gate",
871
+ "language_model.model.layers.8.mlp.shared_expert.down_proj",
872
+ "language_model.model.layers.8.mlp.shared_expert.gate_proj",
873
+ "language_model.model.layers.8.mlp.shared_expert.gate_up_proj",
874
+ "language_model.model.layers.8.mlp.shared_expert.up_proj",
875
+ "language_model.model.layers.8.mlp.shared_expert_gate",
876
+ "language_model.model.layers.9.linear_attn.in_proj_a",
877
+ "language_model.model.layers.9.linear_attn.in_proj_b",
878
+ "language_model.model.layers.9.linear_attn.in_proj_ba",
879
+ "language_model.model.layers.9.mlp.gate",
880
+ "language_model.model.layers.9.mlp.shared_expert.down_proj",
881
+ "language_model.model.layers.9.mlp.shared_expert.gate_proj",
882
+ "language_model.model.layers.9.mlp.shared_expert.gate_up_proj",
883
+ "language_model.model.layers.9.mlp.shared_expert.up_proj",
884
+ "language_model.model.layers.9.mlp.shared_expert_gate",
885
+ "mtp.fc",
886
+ "mtp.layers.0.mlp.gate",
887
+ "mtp.layers.0.mlp.shared_expert.down_proj",
888
+ "mtp.layers.0.mlp.shared_expert.gate_proj",
889
+ "mtp.layers.0.mlp.shared_expert.gate_up_proj",
890
+ "mtp.layers.0.mlp.shared_expert.up_proj",
891
+ "mtp.layers.0.mlp.shared_expert_gate",
892
+ "mtp.layers.0.self_attn.k_proj",
893
+ "mtp.layers.0.self_attn.o_proj",
894
+ "mtp.layers.0.self_attn.q_proj",
895
+ "mtp.layers.0.self_attn.qkv_proj",
896
+ "mtp.layers.0.self_attn.v_proj",
897
+ "visual.blocks.0.attn.proj",
898
+ "visual.blocks.0.attn.qkv",
899
+ "visual.blocks.0.mlp.linear_fc1",
900
+ "visual.blocks.0.mlp.linear_fc2",
901
+ "visual.blocks.1.attn.proj",
902
+ "visual.blocks.1.attn.qkv",
903
+ "visual.blocks.1.mlp.linear_fc1",
904
+ "visual.blocks.1.mlp.linear_fc2",
905
+ "visual.blocks.10.attn.proj",
906
+ "visual.blocks.10.attn.qkv",
907
+ "visual.blocks.10.mlp.linear_fc1",
908
+ "visual.blocks.10.mlp.linear_fc2",
909
+ "visual.blocks.11.attn.proj",
910
+ "visual.blocks.11.attn.qkv",
911
+ "visual.blocks.11.mlp.linear_fc1",
912
+ "visual.blocks.11.mlp.linear_fc2",
913
+ "visual.blocks.12.attn.proj",
914
+ "visual.blocks.12.attn.qkv",
915
+ "visual.blocks.12.mlp.linear_fc1",
916
+ "visual.blocks.12.mlp.linear_fc2",
917
+ "visual.blocks.13.attn.proj",
918
+ "visual.blocks.13.attn.qkv",
919
+ "visual.blocks.13.mlp.linear_fc1",
920
+ "visual.blocks.13.mlp.linear_fc2",
921
+ "visual.blocks.14.attn.proj",
922
+ "visual.blocks.14.attn.qkv",
923
+ "visual.blocks.14.mlp.linear_fc1",
924
+ "visual.blocks.14.mlp.linear_fc2",
925
+ "visual.blocks.15.attn.proj",
926
+ "visual.blocks.15.attn.qkv",
927
+ "visual.blocks.15.mlp.linear_fc1",
928
+ "visual.blocks.15.mlp.linear_fc2",
929
+ "visual.blocks.16.attn.proj",
930
+ "visual.blocks.16.attn.qkv",
931
+ "visual.blocks.16.mlp.linear_fc1",
932
+ "visual.blocks.16.mlp.linear_fc2",
933
+ "visual.blocks.17.attn.proj",
934
+ "visual.blocks.17.attn.qkv",
935
+ "visual.blocks.17.mlp.linear_fc1",
936
+ "visual.blocks.17.mlp.linear_fc2",
937
+ "visual.blocks.18.attn.proj",
938
+ "visual.blocks.18.attn.qkv",
939
+ "visual.blocks.18.mlp.linear_fc1",
940
+ "visual.blocks.18.mlp.linear_fc2",
941
+ "visual.blocks.19.attn.proj",
942
+ "visual.blocks.19.attn.qkv",
943
+ "visual.blocks.19.mlp.linear_fc1",
944
+ "visual.blocks.19.mlp.linear_fc2",
945
+ "visual.blocks.2.attn.proj",
946
+ "visual.blocks.2.attn.qkv",
947
+ "visual.blocks.2.mlp.linear_fc1",
948
+ "visual.blocks.2.mlp.linear_fc2",
949
+ "visual.blocks.20.attn.proj",
950
+ "visual.blocks.20.attn.qkv",
951
+ "visual.blocks.20.mlp.linear_fc1",
952
+ "visual.blocks.20.mlp.linear_fc2",
953
+ "visual.blocks.21.attn.proj",
954
+ "visual.blocks.21.attn.qkv",
955
+ "visual.blocks.21.mlp.linear_fc1",
956
+ "visual.blocks.21.mlp.linear_fc2",
957
+ "visual.blocks.22.attn.proj",
958
+ "visual.blocks.22.attn.qkv",
959
+ "visual.blocks.22.mlp.linear_fc1",
960
+ "visual.blocks.22.mlp.linear_fc2",
961
+ "visual.blocks.23.attn.proj",
962
+ "visual.blocks.23.attn.qkv",
963
+ "visual.blocks.23.mlp.linear_fc1",
964
+ "visual.blocks.23.mlp.linear_fc2",
965
+ "visual.blocks.24.attn.proj",
966
+ "visual.blocks.24.attn.qkv",
967
+ "visual.blocks.24.mlp.linear_fc1",
968
+ "visual.blocks.24.mlp.linear_fc2",
969
+ "visual.blocks.25.attn.proj",
970
+ "visual.blocks.25.attn.qkv",
971
+ "visual.blocks.25.mlp.linear_fc1",
972
+ "visual.blocks.25.mlp.linear_fc2",
973
+ "visual.blocks.26.attn.proj",
974
+ "visual.blocks.26.attn.qkv",
975
+ "visual.blocks.26.mlp.linear_fc1",
976
+ "visual.blocks.26.mlp.linear_fc2",
977
+ "visual.blocks.3.attn.proj",
978
+ "visual.blocks.3.attn.qkv",
979
+ "visual.blocks.3.mlp.linear_fc1",
980
+ "visual.blocks.3.mlp.linear_fc2",
981
+ "visual.blocks.4.attn.proj",
982
+ "visual.blocks.4.attn.qkv",
983
+ "visual.blocks.4.mlp.linear_fc1",
984
+ "visual.blocks.4.mlp.linear_fc2",
985
+ "visual.blocks.5.attn.proj",
986
+ "visual.blocks.5.attn.qkv",
987
+ "visual.blocks.5.mlp.linear_fc1",
988
+ "visual.blocks.5.mlp.linear_fc2",
989
+ "visual.blocks.6.attn.proj",
990
+ "visual.blocks.6.attn.qkv",
991
+ "visual.blocks.6.mlp.linear_fc1",
992
+ "visual.blocks.6.mlp.linear_fc2",
993
+ "visual.blocks.7.attn.proj",
994
+ "visual.blocks.7.attn.qkv",
995
+ "visual.blocks.7.mlp.linear_fc1",
996
+ "visual.blocks.7.mlp.linear_fc2",
997
+ "visual.blocks.8.attn.proj",
998
+ "visual.blocks.8.attn.qkv",
999
+ "visual.blocks.8.mlp.linear_fc1",
1000
+ "visual.blocks.8.mlp.linear_fc2",
1001
+ "visual.blocks.9.attn.proj",
1002
+ "visual.blocks.9.attn.qkv",
1003
+ "visual.blocks.9.mlp.linear_fc1",
1004
+ "visual.blocks.9.mlp.linear_fc2",
1005
+ "visual.merger.linear_fc1",
1006
+ "visual.merger.linear_fc2",
1007
+ "visual.pos_embed"
1008
+ ],
1009
+ "quantization_status": "compressed"
1010
+ }
1011
  }
generation_config.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "bos_token_id": 248044,
3
- "do_sample": true,
4
- "eos_token_id": [
5
- 248046,
6
- 248044
7
- ],
8
- "pad_token_id": 248044,
9
- "temperature": 0.6,
10
- "top_k": 20,
11
- "top_p": 0.95,
12
- "transformers_version": "5.6.0.dev0"
13
- }
 
1
  {
2
+ "bos_token_id": 248044,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 248046,
6
+ 248044
7
+ ],
8
+ "pad_token_id": 248044,
9
+ "temperature": 0.6,
10
+ "top_k": 20,
11
+ "top_p": 0.95,
12
+ "transformers_version": "4.57.0.dev0"
13
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
mixed_native_manifest.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "source_model": "/root/.cache/huggingface/hub/models--Qwen--Qwen3.5-122B-A10B/snapshots/b000b2eb18a7f4cdf3153c4215842da339e09d99",
3
+ "source_recipe": "/work/artifacts/layer_config.json",
4
+ "format_histogram": {
5
+ "head_passthrough/BF16": 3,
6
+ "linear/NVFP4": 72,
7
+ "linear/BF16": 336,
8
+ "packed_moe_per_expert/NVFP4": 96,
9
+ "layer_passthrough/BF16": 312,
10
+ "linear/MXFP8": 12,
11
+ "mtp_linear/BF16": 8,
12
+ "mtp_packed_moe_per_expert/NVFP4": 2,
13
+ "mtp_passthrough/BF16": 9
14
+ },
15
+ "n_assignment_entries": 590,
16
+ "ignore": [
17
+ "lm_head"
18
+ ]
19
+ }
model-00001-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c4fcff082526d1da5b7679802b5d3bf4dd111264c60cb5aa2c6cd2fc86f9a0e
3
+ size 5114226512
model-00002-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a10e9c7164c651ab540d55d0762fd990a8edbf8858783af14438e5970135aaa
3
+ size 5114009072
model-00003-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f6c8c7222397fd84adf82aa855ac9f52e3eab27d3379be62cb36889f28552a0
3
+ size 5113912496
model-00004-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f368238d4f6b103617baab434c8bf4497ee751d532b99c6f0cf24650511edf85
3
+ size 5106839032
model-00005-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a060f0705b7d59387fc2250307c8ec4d6d9217d29856961e65e514ebb53a6d1d
3
+ size 5115003872
model-00006-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3fdecc210de7448eb05a3d19b0ead3fb40d115b698c8c9ef1b6ab9953ab1477
3
+ size 5114352912
model-00007-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fecaad8e04d4eaaf54daa1e5735ed37de2e625f0dccad00846ab575380d7c1b
3
+ size 5114968200
model-00008-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:493e8e9a8a2ffdda2db8f54dbb266f7267949ceaee0f96f6a501eb06b5d47c86
3
+ size 5114109968
model-00009-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:933bdb77f37c90a9177daabbca0c6f0ee406af1fe92c774f0ffd6ddd4997d664
3
+ size 5114086280
model-00010-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:695dde7cda4d948f1cb73ce12fc1f9c11c66667200f77c046aefd2b7628553fb
3
+ size 5113661128
model-00011-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf95284b270316d34ade4c012c2a0cbdd3f5473a1cc54b4d911e03dbfd8b6085
3
+ size 5114752408
model-00012-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27254df34465a5d06109103fad04f3e7554aed9ca85c8fa9280085ecfa968cec
3
+ size 5115020720
model-00013-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35f52f075258f944d906a640d5179e962b9d10414f9c57123e302ee6758ab609
3
+ size 5114947608
model-00014-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb7079e13fca9937723fcdf331a6d9c65c32fcf016e184fc82aa293720c88ace
3
+ size 5113656704
model-00015-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec8db6fd3477452bea9e784c16f9169b040aecbf84b8cd03fff7914008229872
3
+ size 5029797224
model-00016-of-00016.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab8c907b436995f7ac37b12be820bcacfb02daa7c48ae92de179690a18e553eb
3
+ size 103828008
model.safetensors.index.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbb2d314d4e025ff43d5f34c9e740c0a5cf077069299114b8675e01d2baac58c
3
- size 17283445
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e7d71867e482556c2c1f4bff9486585871d9666c26ab14fcee802802d282e4
3
+ size 17414423
preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 16777216,
4
+ "shortest_edge": 65536
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "image_processor_type": "Qwen2VLImageProcessorFast"
21
+ }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87a7830d63fcf43bf241c3c5242e96e62dd3fdc29224ca26fed8ea333db72de4
3
- size 19989343
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f9e4d4901a92b997e463c1f46055088b6cca5ca61a6522d1b9f64c4bb81cb42
3
+ size 12807982
tokenizer_config.json CHANGED
@@ -1,32 +1,305 @@
1
  {
2
- "add_prefix_space": false,
3
- "audio_bos_token": "<|audio_start|>",
4
- "audio_eos_token": "<|audio_end|>",
5
- "audio_token": "<|audio_pad|>",
6
- "backend": "tokenizers",
7
- "bos_token": null,
8
- "clean_up_tokenization_spaces": false,
9
- "eos_token": "<|im_end|>",
10
- "errors": "replace",
11
- "image_token": "<|image_pad|>",
12
- "is_local": false,
13
- "model_max_length": 262144,
14
- "model_specific_special_tokens": {
15
- "audio_bos_token": "<|audio_start|>",
16
- "audio_eos_token": "<|audio_end|>",
17
- "audio_token": "<|audio_pad|>",
18
- "image_token": "<|image_pad|>",
19
- "video_token": "<|video_pad|>",
20
- "vision_bos_token": "<|vision_start|>",
21
- "vision_eos_token": "<|vision_end|>"
22
- },
23
- "pad_token": "<|endoftext|>",
24
- "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
25
- "processor_class": "Qwen3VLProcessor",
26
- "split_special_tokens": false,
27
- "tokenizer_class": "TokenizersBackend",
28
- "unk_token": null,
29
- "video_token": "<|video_pad|>",
30
- "vision_bos_token": "<|vision_start|>",
31
- "vision_eos_token": "<|vision_end|>"
32
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "248044": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "248045": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "248046": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "248047": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "248048": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "248049": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "248050": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "248051": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "248052": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "248053": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "248054": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "248055": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "248056": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "248057": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "248058": {
117
+ "content": "<tool_call>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "248059": {
125
+ "content": "</tool_call>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "248060": {
133
+ "content": "<|fim_prefix|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "248061": {
141
+ "content": "<|fim_middle|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "248062": {
149
+ "content": "<|fim_suffix|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "248063": {
157
+ "content": "<|fim_pad|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "248064": {
165
+ "content": "<|repo_name|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "248065": {
173
+ "content": "<|file_sep|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ },
180
+ "248066": {
181
+ "content": "<tool_response>",
182
+ "lstrip": false,
183
+ "normalized": false,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": false
187
+ },
188
+ "248067": {
189
+ "content": "</tool_response>",
190
+ "lstrip": false,
191
+ "normalized": false,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": false
195
+ },
196
+ "248068": {
197
+ "content": "<think>",
198
+ "lstrip": false,
199
+ "normalized": false,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": false
203
+ },
204
+ "248069": {
205
+ "content": "</think>",
206
+ "lstrip": false,
207
+ "normalized": false,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": false
211
+ },
212
+ "248070": {
213
+ "content": "<|audio_start|>",
214
+ "lstrip": false,
215
+ "normalized": false,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": true
219
+ },
220
+ "248071": {
221
+ "content": "<|audio_end|>",
222
+ "lstrip": false,
223
+ "normalized": false,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": true
227
+ },
228
+ "248072": {
229
+ "content": "<tts_pad>",
230
+ "lstrip": false,
231
+ "normalized": false,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": true
235
+ },
236
+ "248073": {
237
+ "content": "<tts_text_bos>",
238
+ "lstrip": false,
239
+ "normalized": false,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": true
243
+ },
244
+ "248074": {
245
+ "content": "<tts_text_eod>",
246
+ "lstrip": false,
247
+ "normalized": false,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": true
251
+ },
252
+ "248075": {
253
+ "content": "<tts_text_bos_single>",
254
+ "lstrip": false,
255
+ "normalized": false,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": true
259
+ },
260
+ "248076": {
261
+ "content": "<|audio_pad|>",
262
+ "lstrip": false,
263
+ "normalized": false,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": true
267
+ }
268
+ },
269
+ "additional_special_tokens": [
270
+ "<|im_start|>",
271
+ "<|im_end|>",
272
+ "<|object_ref_start|>",
273
+ "<|object_ref_end|>",
274
+ "<|box_start|>",
275
+ "<|box_end|>",
276
+ "<|quad_start|>",
277
+ "<|quad_end|>",
278
+ "<|vision_start|>",
279
+ "<|vision_end|>",
280
+ "<|vision_pad|>",
281
+ "<|image_pad|>",
282
+ "<|video_pad|>"
283
+ ],
284
+ "bos_token": null,
285
+ "chat_template": "{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- macro render_content(content, do_vision_count, is_system_content=false) %}\n {%- if content is string %}\n {{- content }}\n {%- elif content is iterable and content is not mapping %}\n {%- for item in content %}\n {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}\n {%- if is_system_content %}\n {{- raise_exception('System message cannot contain images.') }}\n {%- endif %}\n {%- if do_vision_count %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- endif %}\n {%- if add_vision_id %}\n {{- 'Picture ' ~ image_count.value ~ ': ' }}\n {%- endif %}\n {{- '<|vision_start|><|image_pad|><|vision_end|>' }}\n {%- elif 'video' in item or item.type == 'video' %}\n {%- if is_system_content %}\n {{- raise_exception('System message cannot contain videos.') }}\n {%- endif %}\n {%- if do_vision_count %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- endif %}\n {%- if add_vision_id %}\n {{- 'Video ' ~ video_count.value ~ ': ' }}\n {%- endif %}\n {{- '<|vision_start|><|video_pad|><|vision_end|>' }}\n {%- elif 'text' in item %}\n {{- item.text }}\n {%- else %}\n {{- raise_exception('Unexpected item type in content.') }}\n {%- endif %}\n {%- endfor %}\n {%- elif content is none or content is undefined %}\n {{- '' }}\n {%- else %}\n {{- raise_exception('Unexpected content type.') }}\n {%- endif %}\n{%- endmacro %}\n{%- if not messages %}\n {{- raise_exception('No messages provided.') }}\n{%- endif %}\n{%- if tools and tools is iterable and tools is not mapping %}\n {{- '<|im_start|>system\\n' }}\n {{- \"# Tools\\n\\nYou have access to the following functions:\\n\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\" }}\n {{- '\\n\\nIf you choose to call a function ONLY reply in the following format with NO suffix:\\n\\n<tool_call>\\n<function=example_function_name>\\n<parameter=example_parameter_1>\\nvalue_1\\n</parameter>\\n<parameter=example_parameter_2>\\nThis is the value for the second parameter\\nthat can span\\nmultiple lines\\n</parameter>\\n</function>\\n</tool_call>\\n\\n<IMPORTANT>\\nReminder:\\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\\n- Required parameters MUST be specified\\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\\n</IMPORTANT>' }}\n {%- if messages[0].role == 'system' %}\n {%- set content = render_content(messages[0].content, false, true)|trim %}\n {%- if content %}\n {{- '\\n\\n' + content }}\n {%- endif %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {%- set content = render_content(messages[0].content, false, true)|trim %}\n {{- '<|im_start|>system\\n' + content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" %}\n {%- set content = render_content(message.content, false)|trim %}\n {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if ns.multi_step_tool %}\n {{- raise_exception('No user query found in messages.') }}\n{%- endif %}\n{%- for message in messages %}\n {%- set content = render_content(message.content, true)|trim %}\n {%- if message.role == \"system\" %}\n {%- if not loop.first %}\n {{- raise_exception('System message must be at the beginning.') }}\n {%- endif %}\n {%- elif message.role == \"user\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- set reasoning_content = reasoning_content|trim %}\n {%- if loop.index0 > ns.last_query_index %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content + '\\n</think>\\n\\n' + content }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls and message.tool_calls is iterable and message.tool_calls is not mapping %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {%- if loop.first %}\n {%- if content|trim %}\n {{- '\\n\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- else %}\n {{- '<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- endif %}\n {%- else %}\n {{- '\\n<tool_call>\\n<function=' + tool_call.name + '>\\n' }}\n {%- endif %}\n {%- if tool_call.arguments is defined %}\n {%- for args_name, args_value in tool_call.arguments|items %}\n {{- '<parameter=' + args_name + '>\\n' }}\n {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}\n {{- args_value }}\n {{- '\\n</parameter>\\n' }}\n {%- endfor %}\n {%- endif %}\n {{- '</function>\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.previtem and loop.previtem.role != \"tool\" %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if not loop.last and loop.nextitem.role != \"tool\" %}\n {{- '<|im_end|>\\n' }}\n {%- elif loop.last %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- else %}\n {{- raise_exception('Unexpected message role.') }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- else %}\n {{- '<think>\\n' }}\n {%- endif %}\n{%- endif %}",
286
+ "clean_up_tokenization_spaces": false,
287
+ "eos_token": "<|im_end|>",
288
+ "errors": "replace",
289
+ "model_max_length": 262144,
290
+ "pad_token": "<|endoftext|>",
291
+ "split_special_tokens": false,
292
+ "tokenizer_class": "Qwen2Tokenizer",
293
+ "unk_token": null,
294
+ "add_bos_token": false,
295
+ "pretokenize_regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?[\\p{L}\\p{M}]+|\\p{N}| ?[^\\s\\p{L}\\p{M}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
296
+ "extra_special_tokens": {
297
+ "audio_bos_token": "<|audio_start|>",
298
+ "audio_eos_token": "<|audio_end|>",
299
+ "audio_token": "<|audio_pad|>",
300
+ "image_token": "<|image_pad|>",
301
+ "video_token": "<|video_pad|>",
302
+ "vision_bos_token": "<|vision_start|>",
303
+ "vision_eos_token": "<|vision_end|>"
304
+ }
305
+ }
video_preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 25165824,
4
+ "shortest_edge": 4096
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "video_processor_type": "Qwen3VLVideoProcessor"
21
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff