RedHatAI
/

Llama-4-Maverick-17B-128E-Instruct-NVFP4

@@ -87,47 +87,46 @@ This model was created by applying [LLM Compressor with calibration samples from
 <summary>Model Creation Code</summary>
 ```python
 from transformers import Llama4ForConditionalGeneration, Llama4Processor
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
-from llmcompressor.modeling.prepare import replace_modules_for_calibration
-from datasets import load_dataset
-import torch
-import gc
-# --- Load model ---
 model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
-model = Llama4ForConditionalGeneration.from_pretrained(
-    model_id, torch_dtype="auto", device_map=None
-)
 processor = Llama4Processor.from_pretrained(model_id)
-# --- Patch MoE layers to run all experts during calibration ---
-model = replace_modules_for_calibration(model, calibrate_all_experts=True)
-# Oneshot arguments
 DATASET_ID = "neuralmagic/calibration"
-NUM_CALIBRATION_SAMPLES = 512
-MAX_SEQUENCE_LENGTH = 2048
 ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
 def preprocess_function(example):
     messgages = []
     for message in example["messages"]:
         messgages.append(
             {
-                "role": message["role"],
-                "content": [{"type": "text", "text": message["content"]}]
             }
         )
     return processor.apply_chat_template(
-        messgages,
-        return_tensors="pt",
-        padding=False,
-        truncation=True,
         max_length=MAX_SEQUENCE_LENGTH,
         tokenize=True,
         add_special_tokens=False,
@@ -135,52 +134,53 @@ def preprocess_function(example):
         add_generation_prompt=False,
     )
-ds = ds.map(
-    preprocess_function,
-    batched=False,
-    remove_columns=ds.column_names
-)
-# Define a oneshot data collator for multimodal inputs.
 def data_collator(batch):
     assert len(batch) == 1
     return {
-        key: torch.tensor(value) if key != "pixel_values" else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
         for key, value in batch[0].items()
     }
-recipe = QuantizationModifier(targets="Linear", scheme="NVFP4",
-            ignore=[
-                're:.*lm_head',
-                're:.*self_attn',
-                're:.*router',
-                're:.*vision_model',
-                're:.*multi_modal_projector',
-                "Llama4TextAttention",
-            ],
-        )
-MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
-# Perform oneshot
 oneshot(
     model=model,
-    tokenizer=model_id,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    trust_remote_code_model=True,
-    data_collator=data_collator,
-    output_dir=SAVE_DIR,
-    pipeline="sequential",
     sequential_targets=["Llama4TextMLP"],
 )
-# --- Save compressed model ---
-print("Saving compressed model...")
-model.save_pretrained(SAVE_DIR, save_compressed=True)
 processor.save_pretrained(SAVE_DIR)
 ```
@@ -192,6 +192,124 @@ This model was evaluated on the well-known OpenLLM v1, OpenLLM v2 and HumanEval_
 ### Accuracy
 ### Reproduction
 The results were obtained using the following commands:

 <summary>Model Creation Code</summary>
 ```python
+import torch
+from datasets import load_dataset
 from transformers import Llama4ForConditionalGeneration, Llama4Processor
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import QuantizationModifier
+# Select model and load it.
 model_id = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
+model = Llama4ForConditionalGeneration.from_pretrained(model_id, torch_dtype="auto")
 processor = Llama4Processor.from_pretrained(model_id)
+# MoE calibration is now handled automatically by the pipeline.
+# During calibration, the original `Llama4TextMoe` modules (from
+# `transformers.models.llama4.modeling_llama4`) are replaced with
+# `SequentialLlama4TextMoe` modules (from `llmcompressor.modeling.llama4`),
+# which enables proper calibration of every expert and keeps the
+# resulting checkpoint compatible with vLLM.
 DATASET_ID = "neuralmagic/calibration"
+NUM_CALIBRATION_SAMPLES = 20
+MAX_SEQUENCE_LENGTH = 8192
 ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
 def preprocess_function(example):
     messgages = []
     for message in example["messages"]:
         messgages.append(
             {
+                "role": message["role"],
+                "content": [{"type": "text", "text": message["content"]}],
             }
         )
     return processor.apply_chat_template(
+        messgages,
+        return_tensors="pt",
+        padding=False,
+        truncation=True,
         max_length=MAX_SEQUENCE_LENGTH,
         tokenize=True,
         add_special_tokens=False,
         add_generation_prompt=False,
     )
+ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
 def data_collator(batch):
     assert len(batch) == 1
     return {
+        key: (
+            torch.tensor(value)
+            if key != "pixel_values"
+            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        )
         for key, value in batch[0].items()
     }
+# Configure the quantization algorithm to run.
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="NVFP4",
+    ignore=[
+        "re:.*lm_head",
+        "re:.*self_attn",
+        "re:.*router",
+        "re:.*vision_model.*",
+        "re:.*multi_modal_projector.*",
+        "Llama4TextAttention",
+    ],
+)
+# Apply algorithms.
+# Due to the large size of Llama 4, we specify sequential targets so that
+# only one MLP is loaded into GPU memory at a time.
 oneshot(
     model=model,
     dataset=ds,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     sequential_targets=["Llama4TextMLP"],
+    data_collator=data_collator,
 )
+# Save the compressed model to disk.
+SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-NVFP4"
+model.save_pretrained(SAVE_DIR)
 processor.save_pretrained(SAVE_DIR)
 ```
 ### Accuracy
+<table>
+  <thead>
+    <tr>
+      <th>Category</th>
+      <th>Metric</th>
+      <th>Llama-4-Maverick-17B-128E-Instruct</th>
+      <th>Llama-4-Maverick-17B-128E-Instruct-NVFP4 (this model)</th>
+      <th>Recovery</th>
+    </tr>
+  </thead>
+  <tbody>
+    <!-- OpenLLM V1 -->
+    <tr>
+      <td rowspan="8"><b>OpenLLM V1</b></td>
+      <td>arc_challenge_llama</td>
+      <td>95.97</td>
+      <td>95.88</td>
+      <td>99.91</td>
+    </tr>
+    <tr>
+      <td>gsm8k_llama</td>
+      <td>96.13</td>
+      <td>96.06</td>
+      <td>99.93</td>
+    </tr>
+    <tr>
+      <td>mmlu_llama</td>
+      <td>86.77</td>
+      <td>85.49</td>
+      <td>98.53</td>
+    </tr>
+    <tr>
+      <td>mmlu_cot_llama</td>
+      <td>89.49</td>
+      <td>88.72</td>
+      <td>99.14</td>
+    </tr>
+    <tr>
+      <td>truthfulqa_mc2</td>
+      <td>68.23</td>
+      <td>68.42</td>
+      <td>100.28</td>
+    </tr>
+    <tr>
+      <td>winogrande</td>
+      <td>77.98</td>
+      <td>77.74</td>
+      <td>99.69</td>
+    </tr>
+    <tr>
+      <td>hellaswag</td>
+      <td></td>
+      <td></td>
+      <td></td>
+    </tr>
+    <tr>
+      <td><b>Average</b></td>
+      <td><b></b></td>
+      <td><b>85.23</b></td>
+      <td><b></b></td>
+    </tr>
+    <!-- Leaderboard (vLLM 0.11.0) -->
+    <tr>
+      <td rowspan="7"><b>OpenLLM V2</b></td>
+      <td>BBH</td>
+      <td></td>
+      <td>69.52</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>MMLU-Pro</td>
+      <td></td>
+      <td>62.83</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>MuSR</td>
+      <td></td>
+      <td>45.77</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>IFEval</td>
+      <td></td>
+      <td>89.45</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>GPQA</td>
+      <td></td>
+      <td>30.54</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td>Math-Hard</td>
+      <td></td>
+      <td>64.95</td>
+      <td></td>
+    </tr>
+    <tr>
+      <td><b>Average</b></td>
+      <td></td>
+      <td><b>60.51</b></td>
+      <td></td>
+    </tr>
+    <!-- Coding -->
+    <tr>
+      <td rowspan="1"><b>Coding</b></td>
+      <td>HumanEval_64 (pass@2)</td>
+      <td></td>
+      <td>88.88</td>
+      <td></td>
+    </tr>
+  </tbody>
+</table>
 ### Reproduction
 The results were obtained using the following commands: