Regarding the correctness of the int4 quantization script
Can I use the following script to quantize the model to int4 w4a16? Is it correct?
import torch
from compressed_tensors.utils import save_mtp_tensors_to_checkpoint
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3_5MoeForConditionalGeneration
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
# NOTE: transformers >= v5 is required to run this example.
MODEL_ID = "/data/model-cache/Qwen3.6-35B-A3B"

# Build the model and its processor from the same local checkpoint.
model = Qwen3_5MoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)
# MTP layers need not be included here, since they are not loaded
# through Qwen3_5MoeForConditionalGeneration.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="W4A16",
    weight_observer="mse",
    # Module-name patterns that are left out of quantization.
    ignore=[
        "re:.*lm_head",
        "re:visual.*",
        "re:model.visual.*",
        "re:.*mlp.gate$",
        "re:.*embed_tokens$",
        "re:.*shared_expert_gate$",
        "re:.*linear_attn.*",
    ],
)
# Number of calibration rows to load and the maximum sequence length.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 4096

# Take the first NUM_CALIBRATION_SAMPLES rows of the train_sft split, keep only
# the "messages" column, then shuffle with a fixed seed.
ds = (
    load_dataset(
        "/data/model-cache/ultrachat_200k",
        split=f"train_sft[:{NUM_CALIBRATION_SAMPLES}]",
    )
    .select_columns(["messages"])
    .shuffle(seed=42)
)
def preprocess_function(example):
    """Tokenize one chat example with the processor's chat template.

    Each entry of ``example["messages"]`` is rewritten so that its
    ``content`` is a one-item list holding a ``{"type": "text", ...}`` part,
    and the resulting conversation is handed to
    ``processor.apply_chat_template``.

    Args:
        example: A dataset row with a ``"messages"`` list whose items carry
            ``"role"`` and ``"content"`` keys.

    Returns:
        The result of ``processor.apply_chat_template`` for the conversation,
        requested with ``tokenize=True`` and ``return_dict=True``.
    """
    # Wrap the plain-string content of every message in a typed text part.
    messages = [
        {"role": m["role"], "content": [{"type": "text", "text": m["content"]}]}
        for m in example["messages"]
    ]
    return processor.apply_chat_template(
        messages,
        tokenize=True,
        return_dict=True,
        add_generation_prompt=False,
        processor_kwargs={
            "return_tensors": "pt",
            "padding": False,
            "truncation": True,
            "max_length": MAX_SEQUENCE_LENGTH,
            "add_special_tokens": False,
        },
    )
# Run preprocess_function on each row individually (batched=False) and drop
# the columns the dataset had before mapping.
ds = ds.map(
    preprocess_function,
    batched=False,
    remove_columns=ds.column_names,
)
def data_collator(batch):
    """Convert a batch holding exactly one sample into a dict of tensors.

    Args:
        batch: A list containing a single mapping from feature name to a
            value accepted by ``torch.tensor``.

    Returns:
        A dict with the same keys as the sample, each value converted with
        ``torch.tensor``.

    Raises:
        AssertionError: If ``batch`` does not contain exactly one sample.
    """
    assert len(batch) == 1, "data_collator only supports a batch size of 1"
    return {key: torch.tensor(value) for key, value in batch[0].items()}
# Run the one-shot quantization pass with the recipe and the tokenized dataset.
oneshot(
    model=model,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    moe_calibrate_all_experts=True,
    data_collator=data_collator,
)
# Write the model and processor to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID + "-W4A16"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)

# Qwen3_5MoeForConditionalGeneration excludes the MTP layers from the model,
# so copy them unchanged from the original checkpoint into the quantized output.
save_mtp_tensors_to_checkpoint(
    source_model=MODEL_ID,
    dest_dir=SAVE_DIR,
)
Hey!
If you're interested in quantizing your model to W4A16, you will get better performance if you apply GPTQ or AWQ - see the LLM Compressor documentation for further details: https://docs.vllm.ai/projects/llm-compressor/en/latest/steps/choosing-algo/
Using the QuantizationModifier will just apply a round-to-nearest method, which usually provides subpar accuracy for int4. However, it is quick to run, whereas the other algorithms tend to be more expensive in both compute and time and also require calibration data. Since round-to-nearest does not need data, you can actually remove all the code in your file that loads, processes, or passes in the dataset.
If you run into any issues, please feel free to open an issue with LLM Compressor: https://github.com/vllm-project/llm-compressor/issues