nm-research committed on
Commit
4cfc814
·
verified ·
1 Parent(s): ad12997

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +167 -49
README.md CHANGED
@@ -87,47 +87,46 @@ This model was created by applying [LLM Compressor with calibration samples from
87
  <summary>Model Creation Code</summary>
88
 
89
  ```python
 
 
90
  from transformers import Llama4ForConditionalGeneration, Llama4Processor
 
91
  from llmcompressor import oneshot
92
  from llmcompressor.modifiers.quantization import QuantizationModifier
93
- from llmcompressor.modeling.prepare import replace_modules_for_calibration
94
- from datasets import load_dataset
95
- import torch
96
- import gc
97
 
98
- # --- Load model ---
99
  model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
100
-
101
- model = Llama4ForConditionalGeneration.from_pretrained(
102
- model_id, torch_dtype="auto", device_map=None
103
- )
104
  processor = Llama4Processor.from_pretrained(model_id)
 
 
 
 
 
 
105
 
106
- # --- Patch MoE layers to run all experts during calibration ---
107
- model = replace_modules_for_calibration(model, calibrate_all_experts=True)
108
-
109
- # Oneshot arguments
110
  DATASET_ID = "neuralmagic/calibration"
111
- NUM_CALIBRATION_SAMPLES = 512
112
- MAX_SEQUENCE_LENGTH = 2048
113
 
114
  ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
115
 
 
116
  def preprocess_function(example):
117
  messgages = []
118
  for message in example["messages"]:
119
  messgages.append(
120
  {
121
- "role": message["role"],
122
- "content": [{"type": "text", "text": message["content"]}]
123
  }
124
  )
125
-
126
  return processor.apply_chat_template(
127
- messgages,
128
- return_tensors="pt",
129
- padding=False,
130
- truncation=True,
131
  max_length=MAX_SEQUENCE_LENGTH,
132
  tokenize=True,
133
  add_special_tokens=False,
@@ -135,52 +134,53 @@ def preprocess_function(example):
135
  add_generation_prompt=False,
136
  )
137
 
138
- ds = ds.map(
139
- preprocess_function,
140
- batched=False,
141
- remove_columns=ds.column_names
142
- )
143
 
144
 
145
- # Define a oneshot data collator for multimodal inputs.
146
  def data_collator(batch):
147
  assert len(batch) == 1
148
  return {
149
- key: torch.tensor(value) if key != "pixel_values" else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
 
 
 
 
150
  for key, value in batch[0].items()
151
  }
152
 
153
- recipe = QuantizationModifier(targets="Linear", scheme="NVFP4",
154
- ignore=[
155
- 're:.*lm_head',
156
- 're:.*self_attn',
157
- 're:.*router',
158
- 're:.*vision_model',
159
- 're:.*multi_modal_projector',
160
- "Llama4TextAttention",
161
- ],
162
- )
163
 
164
- MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
- # Perform oneshot
 
 
167
  oneshot(
168
  model=model,
169
- tokenizer=model_id,
170
  dataset=ds,
171
  recipe=recipe,
172
  max_seq_length=MAX_SEQUENCE_LENGTH,
173
  num_calibration_samples=NUM_CALIBRATION_SAMPLES,
174
- trust_remote_code_model=True,
175
- data_collator=data_collator,
176
- output_dir=SAVE_DIR,
177
- pipeline="sequential",
178
  sequential_targets=["Llama4TextMLP"],
 
179
  )
180
 
181
- # --- Save compressed model ---
182
- print("Saving compressed model...")
183
- model.save_pretrained(SAVE_DIR, save_compressed=True)
 
184
  processor.save_pretrained(SAVE_DIR)
185
 
186
  ```
@@ -192,6 +192,124 @@ This model was evaluated on the well-known OpenLLM v1, OpenLLM v2 and HumanEval_
192
 
193
  ### Accuracy
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  ### Reproduction
196
 
197
  The results were obtained using the following commands:
 
87
  <summary>Model Creation Code</summary>
88
 
89
  ```python
90
+ import torch
91
+ from datasets import load_dataset
92
  from transformers import Llama4ForConditionalGeneration, Llama4Processor
93
+
94
  from llmcompressor import oneshot
95
  from llmcompressor.modifiers.quantization import QuantizationModifier
 
 
 
 
96
 
97
+ # Select model and load it.
98
  model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
99
+ model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
 
 
 
100
  processor = Llama4Processor.from_pretrained(model_id)
101
+ # MoE calibration is now handled automatically by the pipeline.
102
+ # The `SequentialLlama4TextMoe` modules (from `llmcompressor.modeling.llama4`)
103
+ # will be applied during calibration to enable
104
+ # proper expert calibration and vLLM compatibility.
105
+ # These replace the original `Llama4TextMoe` class from
106
+ # `transformers.models.llama4.modeling_llama4`.
107
 
 
 
 
 
108
  DATASET_ID = "neuralmagic/calibration"
109
+ NUM_CALIBRATION_SAMPLES = 20
110
+ MAX_SEQUENCE_LENGTH = 8192
111
 
112
  ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
113
 
114
+
115
  def preprocess_function(example):
116
  messgages = []
117
  for message in example["messages"]:
118
  messgages.append(
119
  {
120
+ "role": message["role"],
121
+ "content": [{"type": "text", "text": message["content"]}],
122
  }
123
  )
124
+
125
  return processor.apply_chat_template(
126
+ messgages,
127
+ return_tensors="pt",
128
+ padding=False,
129
+ truncation=True,
130
  max_length=MAX_SEQUENCE_LENGTH,
131
  tokenize=True,
132
  add_special_tokens=False,
 
134
  add_generation_prompt=False,
135
  )
136
 
137
+
138
+ ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
 
 
 
139
 
140
 
 
141
  def data_collator(batch):
142
  assert len(batch) == 1
143
  return {
144
+ key: (
145
+ torch.tensor(value)
146
+ if key != "pixel_values"
147
+ else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
148
+ )
149
  for key, value in batch[0].items()
150
  }
151
 
 
 
 
 
 
 
 
 
 
 
152
 
153
+ # Configure the quantization algorithm to run.
154
+ recipe = QuantizationModifier(
155
+ targets="Linear",
156
+ scheme="NVFP4",
157
+ ignore=[
158
+ "re:.*lm_head",
159
+ "re:.*self_attn",
160
+ "re:.*router",
161
+ "re:.*vision_model.*",
162
+ "re:.*multi_modal_projector.*",
163
+ "Llama4TextAttention",
164
+ ],
165
+ )
166
 
167
+ # Apply algorithms.
168
+ # Due to the large size of Llama4, we specify sequential targets so that
169
+ # only one MLP is loaded into GPU memory at a time.
170
  oneshot(
171
  model=model,
 
172
  dataset=ds,
173
  recipe=recipe,
174
  max_seq_length=MAX_SEQUENCE_LENGTH,
175
  num_calibration_samples=NUM_CALIBRATION_SAMPLES,
 
 
 
 
176
  sequential_targets=["Llama4TextMLP"],
177
+ data_collator=data_collator,
178
  )
179
 
180
+
181
+ # Save to disk compressed.
182
+ SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
183
+ model.save_pretrained(SAVE_DIR)
184
  processor.save_pretrained(SAVE_DIR)
185
 
186
  ```
 
192
 
193
  ### Accuracy
194
 
195
+ <table>
196
+ <thead>
197
+ <tr>
198
+ <th>Category</th>
199
+ <th>Metric</th>
200
+ <th>Llama-4-Maverick-17B-128E-Instruct</th>
201
+ <th>Llama-4-Maverick-17B-128E-Instruct-NVFP4 (this model)</th>
202
+ <th>Recovery</th>
203
+ </tr>
204
+ </thead>
205
+ <tbody>
206
+ <!-- OpenLLM -->
207
+ <tr>
208
+ <td rowspan="8"><b>OpenLLM V1</b></td>
209
+ <td>arc_challenge_llama</td>
210
+ <td>95.97</td>
211
+ <td>95.88</td>
212
+ <td>99.91</td>
213
+ </tr>
214
+ <tr>
215
+ <td>gsm8k_llama</td>
216
+ <td>96.13</td>
217
+ <td>96.06</td>
218
+ <td>99.93</td>
219
+ </tr>
220
+ <tr>
221
+ <td>mmlu_llama</td>
222
+ <td>86.77</td>
223
+ <td>85.49</td>
224
+ <td>98.53</td>
225
+ </tr>
226
+ <tr>
227
+ <td>mmlu_cot_llama</td>
228
+ <td>89.49</td>
229
+ <td>88.72</td>
230
+ <td>99.14</td>
231
+ </tr>
232
+ <tr>
233
+ <td>truthfulqa_mc2</td>
234
+ <td>68.23</td>
235
+ <td>68.42</td>
236
+ <td>100.28</td>
237
+ </tr>
238
+ <tr>
239
+ <td>winogrande</td>
240
+ <td>77.98</td>
241
+ <td>77.74</td>
242
+ <td>99.69</td>
243
+ </tr>
244
+ <tr>
245
+ <td>hellaswag</td>
246
+ <td></td>
247
+ <td></td>
248
+ <td></td>
249
+ </tr>
250
+ <tr>
251
+ <td><b>Average</b></td>
252
+ <td><b></b></td>
253
+ <td><b>85.23</b></td>
254
+ <td><b></b></td>
255
+ </tr>
256
+ <!-- Leaderboard (vLLM 0.11.0) -->
257
+ <tr>
258
+ <td rowspan="7"><b>OpenLLM V2</b></td>
259
+ <td>BBH</td>
260
+ <td></td>
261
+ <td>69.52</td>
262
+ <td></td>
263
+ </tr>
264
+ <tr>
265
+ <td>MMLU-Pro</td>
266
+ <td></td>
267
+ <td>62.83</td>
268
+ <td></td>
269
+ </tr>
270
+ <tr>
271
+ <td>MuSR</td>
272
+ <td></td>
273
+ <td>45.77</td>
274
+ <td></td>
275
+ </tr>
276
+ <tr>
277
+ <td>IFEval</td>
278
+ <td></td>
279
+ <td>89.45</td>
280
+ <td></td>
281
+ </tr>
282
+ <tr>
283
+ <td>GPQA</td>
284
+ <td></td>
285
+ <td>30.54</td>
286
+ <td></td>
287
+ </tr>
288
+ <tr>
289
+ <td>Math-Hard</td>
290
+ <td></td>
291
+ <td>64.95</td>
292
+ <td></td>
293
+ </tr>
294
+ <tr>
295
+ <td><b>Average</b></td>
296
+ <td></td>
297
+ <td><b>60.51</b></td>
298
+ <td></td>
299
+ </tr>
300
+ <!-- Coding -->
301
+ <tr>
302
+ <td rowspan="1"><b>Coding</b></td>
303
+ <td>HumanEval_64 (pass@2)</td>
304
+ <td></td>
305
+ <td>88.88</td>
306
+ <td></td>
307
+ </tr>
308
+
309
+ </tbody>
310
+ </table>
311
+
312
+
313
  ### Reproduction
314
 
315
  The results were obtained using the following commands: