Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +2 -0
- README.md +341 -0
- calibrate_software_engineer.yaml +442 -0
- chat_template.jinja +159 -0
- config.json +223 -0
- configuration_minimax_m2.py +200 -0
- generation_config.json +8 -0
- merges.txt +0 -0
- model-00000-of-00126.safetensors +3 -0
- model-00001-of-00126.safetensors +3 -0
- model-00002-of-00126.safetensors +3 -0
- model-00003-of-00126.safetensors +3 -0
- model-00004-of-00126.safetensors +3 -0
- model-00005-of-00126.safetensors +3 -0
- model-00006-of-00126.safetensors +3 -0
- model-00007-of-00126.safetensors +3 -0
- model-00008-of-00126.safetensors +3 -0
- model-00009-of-00126.safetensors +3 -0
- model-00010-of-00126.safetensors +3 -0
- model-00011-of-00126.safetensors +3 -0
- model-00012-of-00126.safetensors +3 -0
- model-00013-of-00126.safetensors +3 -0
- model-00014-of-00126.safetensors +3 -0
- model-00015-of-00126.safetensors +3 -0
- model-00016-of-00126.safetensors +3 -0
- model-00017-of-00126.safetensors +3 -0
- model-00018-of-00126.safetensors +3 -0
- model-00019-of-00126.safetensors +3 -0
- model-00020-of-00126.safetensors +3 -0
- model-00021-of-00126.safetensors +3 -0
- model-00022-of-00126.safetensors +3 -0
- model-00023-of-00126.safetensors +3 -0
- model-00024-of-00126.safetensors +3 -0
- model-00025-of-00126.safetensors +3 -0
- model-00026-of-00126.safetensors +3 -0
- model-00027-of-00126.safetensors +3 -0
- model-00028-of-00126.safetensors +3 -0
- model-00029-of-00126.safetensors +3 -0
- model-00030-of-00126.safetensors +3 -0
- model-00031-of-00126.safetensors +3 -0
- model-00032-of-00126.safetensors +3 -0
- model-00033-of-00126.safetensors +3 -0
- model-00034-of-00126.safetensors +3 -0
- model-00035-of-00126.safetensors +3 -0
- model-00036-of-00126.safetensors +3 -0
- model-00037-of-00126.safetensors +3 -0
- model-00038-of-00126.safetensors +3 -0
- model-00039-of-00126.safetensors +3 -0
- model-00040-of-00126.safetensors +3 -0
- model-00041-of-00126.safetensors +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
pipeline_tag: text-generation
|
| 3 |
+
license: other
|
| 4 |
+
license_name: modified-mit
|
| 5 |
+
license_link: https://github.com/MiniMax-AI/MiniMax-M2.5/blob/main/LICENSE
|
| 6 |
+
library_name: llm-compressor
|
| 7 |
+
tags:
|
| 8 |
+
- fp8
|
| 9 |
+
- awq
|
| 10 |
+
- conversational
|
| 11 |
+
- vllm
|
| 12 |
+
- code
|
| 13 |
+
- devops
|
| 14 |
+
- software engineering
|
| 15 |
+
- engineer
|
| 16 |
+
- developer
|
| 17 |
+
- architect
|
| 18 |
+
- stem
|
| 19 |
+
- agent
|
| 20 |
+
datasets:
|
| 21 |
+
- HuggingFaceH4/ultrachat_200k
|
| 22 |
+
- databricks/databricks-dolly-15k
|
| 23 |
+
- neuralmagic/calibration
|
| 24 |
+
- HuggingFaceH4/no_robots
|
| 25 |
+
- nvidia/HelpSteer
|
| 26 |
+
- garage-bAInd/Open-Platypus
|
| 27 |
+
- PJMixers/grimulkan_physical-reasoning-ShareGPT
|
| 28 |
+
- PJMixers/grimulkan_theory-of-mind-ShareGPT
|
| 29 |
+
- HuggingFaceH4/Multilingual-Thinking
|
| 30 |
+
- ServiceNow-AI/M2Lingual
|
| 31 |
+
- droussis/euroblocks_sft_1sample_per_lang
|
| 32 |
+
- interstellarninja/hermes_reasoning_tool_use
|
| 33 |
+
- deepmind/code_contests
|
| 34 |
+
- dh02391735/stackoverflow-kubernetes-questions
|
| 35 |
+
- diversoailab/humaneval-rust
|
| 36 |
+
- ammarnasr/the-stack-rust-clean
|
| 37 |
+
- CSJianYang/CodeArena
|
| 38 |
+
- nvidia/OpenCodeInstruct
|
| 39 |
+
- nvidia/Llama-Nemotron-Post-Training-Dataset
|
| 40 |
+
- nvidia/Nemotron-Competitive-Programming-v1
|
| 41 |
+
- rombodawg/code_bagel_hermes-2.5
|
| 42 |
+
- MathArena/project_euler
|
| 43 |
+
- nvidia/Nemotron-Math-Proofs-v1
|
| 44 |
+
- nvidia/OpenMathInstruct-2
|
| 45 |
+
- nvidia/OpenScienceReasoning-2
|
| 46 |
+
- MegaScience/MegaScience
|
| 47 |
+
- OpenMed/Medical-Reasoning-SFT-GPT-OSS-120B
|
| 48 |
+
- ccdv/pubmed-summarization
|
| 49 |
+
- gbharti/finance-alpaca
|
| 50 |
+
- vladlen32230/summarization-yahoo-stock-finance-article-text
|
| 51 |
+
- fka/awesome-chatgpt-prompts
|
| 52 |
+
- theoldmandthesea/17k_business_book
|
| 53 |
+
- ruggsea/stanford-encyclopedia-of-philosophy_instruct
|
| 54 |
+
- mlfoundations-dev/stackexchange_philosophy
|
| 55 |
+
- FreedomIntelligence/SocraticChat
|
| 56 |
+
- Gryphe/Opus-WritingPrompts
|
| 57 |
+
- anthracite-org/nopm_claude_writing_fixed
|
| 58 |
+
- zerofata/Roleplay-Anime-Characters
|
| 59 |
+
- zerofata/Instruct-Anime
|
| 60 |
+
- zerofata/Instruct-Anime-CreativeWriting
|
| 61 |
+
- sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo
|
| 62 |
+
- PocketDoc/Dans-Prosemaxx-Adventure
|
| 63 |
+
- anthracite-org/stheno-filtered-v1.1
|
| 64 |
+
- KaraKaraWitch/TvTroper-2025
|
| 65 |
+
- AquaV/US-Army-Survival-Sharegpt
|
| 66 |
+
- AquaV/Interrogation-Sharegpt
|
| 67 |
+
- AquaV/Multi-Environment-Operations-Sharegpt
|
| 68 |
+
- AquaV/Resistance-Sharegpt
|
| 69 |
+
- PocketDoc/Dans-Kinomaxx-VanillaBackrooms
|
| 70 |
+
base_model:
|
| 71 |
+
- MiniMaxAI/MiniMax-M2.5
|
| 72 |
+
---
|
| 73 |
+
|
| 74 |
+
# MiniMax M2.5 (Mixed-Precision FP8 + INT4 AWQ FrankenQuant)
|
| 75 |
+
|
| 76 |
+
This strives to be the highest quality quant that can run on 192GiB VRAM
|
| 77 |
+
|
| 78 |
+
> [!TIP]
|
| 79 |
+
> 💡 A non-FP8 version is available at [mratsim/MiniMax-M2.5-BF16-INT4-AWQ](https://huggingface.co/mratsim/MiniMax-M2.5-BF16-INT4-AWQ) \
|
| 80 |
+
> That version is compatible with 8x RTX 3090s and with SGLang (which doesn't support mixed quantization yet) for an extra 3GiB in VRAM. \
|
| 81 |
+
> This FP8+INT4 AWQ was built by merging the original FP8 self-attention weights and [mratsim/MiniMax-M2.5-BF16-INT4-AWQ](https://huggingface.co/mratsim/MiniMax-M2.5-BF16-INT4-AWQ) experts.
|
| 82 |
+
|
| 83 |
+
It features:
|
| 84 |
+
- Calibration of all experts is ensured; not doing so is extremely detrimental. PR: https://github.com/vllm-project/llm-compressor/pull/2171
|
| 85 |
+
<details>
|
| 86 |
+
<summary>**[Click me!]** Visual showcase of why ensuring quantization of all MoE experts is important</summary>
|
| 87 |
+
|
| 88 |
+
- Source: https://avtc.github.io/aquarium-side-by-side/
|
| 89 |
+
- Context: https://github.com/ModelCloud/GPTQModel/pull/2235
|
| 90 |
+
|
| 91 |
+

|
| 92 |
+
|
| 93 |
+
</details>
|
| 94 |
+
- Mixed precision with:
|
| 95 |
+
- self-attention weights copied directly from the official version (default FP8 with 2D-blocks)
|
| 96 |
+
- experts weights quantized using AWQ W4A16G32 scheme (4-bit weights, 16-bit activations, scaling factor per group of 32 weights)
|
| 97 |
+
- High-quality large and diverse dataset with programming and devops focus
|
| 98 |
+
as well as domain-specific knowledge (math, sciences, medical, finance, business, humanities, philosophy, creative writing), general knowledge, pop culture and behavioral situations because we never code in a vacuum. And we want to make sure all experts are calibrated to the full range of their activations.
|
| 99 |
+
- Calibration explicitly tests multilingual capabilities:
|
| 100 |
+
- Asia: Chinese, Hindi, Korean, Japanese
|
| 101 |
+
- Europe: French, German, Portuguese, Russian, Spanish
|
| 102 |
+
- Middle-East: Arabic, Hebrew, Turkish
|
| 103 |
+
- Calibration explicitly tests 60 programming languages and not just Python:
|
| 104 |
+
- Imperative programming: C, C++, Go, Zig, ...
|
| 105 |
+
- Functional programming: Haskell, F#, OCaml, Erlang, Lisp, Clojure ...
|
| 106 |
+
- Web-focused: HTML/CSS, Typescript, PHP, ...
|
| 107 |
+
- Mixed paradigm: D, Kotlin, Nim, Rust, Swift, ...
|
| 108 |
+
- Theorem provers: Coq, Lean
|
| 109 |
+
- Low-level: ARM64 assembly, x86-64 assembly, LLVM IR
|
| 110 |
+
- GPU Programming: Cuda, Vulkan, Apple Metal
|
| 111 |
+
- Game Programming: GDScript, GLSL
|
| 112 |
+
- Domain-specific: MATLAB, Julia, Solidity, R
|
| 113 |
+
- Calibration tries to ensure coverage for a wide variety of experience (from explaining concepts to your grandmother to debugging Kubernetes logs)
|
| 114 |
+
- Built by a dev, for devs (and it looks very good for STEM as well)
|
| 115 |
+
|
| 116 |
+
It uses my new declarative quantization framework https://github.com/mratsim/quantizers which facilitates highly-tuned calibration sets: [calibrate_software_engineer.yaml](./calibrate_software_engineer.yaml)
|
| 117 |
+
|
| 118 |
+
<details>
|
| 119 |
+
<summary>This took several days, along with contributions and bug reports to the ecosystem. I hope you find it useful.</summary>
|
| 120 |
+
|
| 121 |
+
- https://github.com/vllm-project/llm-compressor/pull/2171
|
| 122 |
+
- https://github.com/vllm-project/llm-compressor/issues/2172
|
| 123 |
+
- https://github.com/vllm-project/vllm/issues/31623
|
| 124 |
+
- https://github.com/sgl-project/sglang/issues/16276
|
| 125 |
+
- https://github.com/sgl-project/sglang/issues/16295
|
| 126 |
+
|
| 127 |
+
</details>
|
| 128 |
+
|
| 129 |
+
## 📥 Usage & Running Instructions
|
| 130 |
+
|
| 131 |
+
The model was tested with vLLM on 2x RTX Pro 6000. Here is a script suitable for such a configuration, with the maximum 196,608 context length. This uses 92.5GiB of VRAM with the flashinfer backend.
|
| 132 |
+
|
| 133 |
+
> [!WARNING]
|
| 134 |
+
> ⚠️ Due to rope_parameters change, at the moment this model is incompatible with transformers V5.\
|
| 135 |
+
> This makes it incompatible with GLM-4.6V which requires transformers V5. Use different Docker images.
|
| 136 |
+
|
| 137 |
+
> [!WARNING]
|
| 138 |
+
> ⚠️ SGLang does not support this model due to missing mixed precision support. Feature request raised at https://github.com/sgl-project/sglang/issues/16276.\
|
| 139 |
+
> Please use [mratsim/MiniMax-M2.5-BF16-INT4-AWQ](https://huggingface.co/mratsim/MiniMax-M2.5-BF16-INT4-AWQ) in the meantime.
|
| 140 |
+
|
| 141 |
+
### Running script
|
| 142 |
+
|
| 143 |
+
`--trust-remote-code` is necessary until the transformers team merges github.com/huggingface/transformers/pull/42028
|
| 144 |
+
|
| 145 |
+
You have 2 reasoning parsers:
|
| 146 |
+
- `minimax_m2`, puts the reasoning content in a special field like DeepSeek models that is usually rendered in a specific manner in frontends.
|
| 147 |
+
- `minimax_m2_append_think`, puts the reasoning into `<think>reasoning_content</think>` and that is sent as normal text. Few frontends properly render that, I'm aware of [Cherry Studio](https://github.com/CherryHQ/cherry-studio) on Desktop and [ChatterUI](https://github.com/Vali-98/ChatterUI) on Android.
|
| 148 |
+
|
| 149 |
+
The reason why `minimax_m2_append_think` was introduced was Interleaved Thinking and having the model build upon its previous thinking (frontends usually discard the thinking trace).
|
| 150 |
+
|
| 151 |
+
> [!TIP]
|
| 152 |
+
> 💡With the recommended parameters the model tends to get stuck in repetition loops.\
|
| 153 |
+
> It seems like repetition_penalty: 1.10, frequency_penalty: 0.40 avoids that
|
| 154 |
+
|
| 155 |
+
```bash
|
| 156 |
+
# Model configuration (Mandatory)
|
| 157 |
+
MODEL="mratsim/MiniMax-M2.5-FP8-INT4-AWQ"
|
| 158 |
+
MODELNAME="MiniMax-M2.5"
|
| 159 |
+
GPU_UTIL=0.93
|
| 160 |
+
SAMPLER_OVERRIDE='{"temperature": 1, "top_p": 0.95, "top_k": 40, "repetition_penalty": 1.1, "frequency_penalty": 0.40}'
|
| 161 |
+
|
| 162 |
+
# Prevent memory fragmentation
|
| 163 |
+
export PYTORCH_ALLOC_CONF=expandable_segments:True,max_split_size_mb:512
|
| 164 |
+
|
| 165 |
+
# Prevent vLLM from using 100% CPU when idle (Very Recommended)
|
| 166 |
+
export VLLM_SLEEP_WHEN_IDLE=1
|
| 167 |
+
|
| 168 |
+
vllm serve "${MODEL}" \
|
| 169 |
+
--served-model-name "${MODELNAME}" \
|
| 170 |
+
--trust-remote-code \
|
| 171 |
+
--gpu-memory-utilization ${GPU_UTIL} \
|
| 172 |
+
--tp 2 \
|
| 173 |
+
--override-generation-config "${SAMPLER_OVERRIDE}" \
|
| 174 |
+
--enable-auto-tool-choice \
|
| 175 |
+
--tool-call-parser minimax_m2 \
|
| 176 |
+
--reasoning-parser minimax_m2
|
| 177 |
+
# --reasoning-parser minimax_m2_append_think
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
## Performance
|
| 181 |
+
|
| 182 |
+
On dual RTX Pro 6000, I can reach over 5500 tok/s prefill (prompt/context processing) and over 100 tok/s token generation for a single request.
|
| 183 |
+
|
| 184 |
+

|
| 185 |
+
|
| 186 |
+
With PagedAttention in action you can reach over 25000 tok/s in prompt processing speed.
|
| 187 |
+
|
| 188 |
+

|
| 189 |
+
|
| 190 |
+
When batching, with default config, you can reach over 6000, even 8000, tok/s prompt processing and 1200 tok/s generation speed.\
|
| 191 |
+
Tune prefill vs decode prioritization with `--max_num_batched_tokens`, see [Performance & Tuning | vLLM](https://docs.vllm.ai/en/v0.4.2/models/performance.html)
|
| 192 |
+
|
| 193 |
+

|
| 194 |
+
|
| 195 |
+
In a steady state with interleaved prefill and decode requests that interrupt each other, you can get ~2400 tok/s context processing and 800 tok/s generation
|
| 196 |
+
|
| 197 |
+

|
| 198 |
+
|
| 199 |
+
Note: vLLM supports prefill-decode disaggregation for high throughput serving if you have double the minimum hardware:
|
| 200 |
+
- https://pytorch.org/blog/disaggregated-inference-at-scale-with-pytorch-vllm/
|
| 201 |
+
- https://github.com/vllm-project/production-stack
|
| 202 |
+
- Prefill/decode disaggregation
|
| 203 |
+
- Multi-Tier KV-cache via [LMCache](https://github.com/LMCache/LMCache) (GPU > CPU > Local Disk)
|
| 204 |
+
- Cache aware router
|
| 205 |
+
- Multi-model dispatch via single interface
|
| 206 |
+
|
| 207 |
+
## 🔬 Quantization method
|
| 208 |
+
|
| 209 |
+
Quantization was quite complex for this model and was done in 3 steps:
|
| 210 |
+
1. Original weights are in FP8, they were dequantized to FP16 due to llm-compressor not being able to process FP8.
|
| 211 |
+
2. llm-compressor was used to quantize the MLP experts projection using AWQ, with [PR #2171](https://github.com/vllm-project/llm-compressor/pull/2171) to ensure they were all activated.
|
| 212 |
+
3. Stitching the FrankenQuant: I combined the original weights, including the 2D-block FP8, with the experts-only AWQ weights.
|
| 213 |
+
|
| 214 |
+
The llmcompressor library was used with the following recipe:
|
| 215 |
+
|
| 216 |
+
```yaml
|
| 217 |
+
default_stage:
|
| 218 |
+
default_modifiers:
|
| 219 |
+
AWQModifier:
|
| 220 |
+
config_groups:
|
| 221 |
+
mlp_experts_projections:
|
| 222 |
+
# Include only MLP expert weights for 4-bit quantization
|
| 223 |
+
targets: ["re:.*block_sparse_moe\\.experts\\.\\d+\\.(w1|w2|w3)$"]
|
| 224 |
+
weights:
|
| 225 |
+
num_bits: 4
|
| 226 |
+
type: int
|
| 227 |
+
symmetric: true
|
| 228 |
+
group_size: 32
|
| 229 |
+
strategy: group
|
| 230 |
+
dynamic: false
|
| 231 |
+
# actorder: group
|
| 232 |
+
observer: memoryless_minmax
|
| 233 |
+
|
| 234 |
+
mappings:
|
| 235 |
+
- smooth_layer: re:.*post_attention_layernorm$
|
| 236 |
+
balance_layers: ["re:.*w1$", "re:.*w3$"]
|
| 237 |
+
- smooth_layer: re:.*w3$
|
| 238 |
+
balance_layers: ["re:.*w2$"]
|
| 239 |
+
duo_scaling: true
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
The calibration set had 590 examples, 8192 sequence length, 60 programming languages, 12 spoken languages and is detailed at [calibrate_software_engineer.yaml](./calibrate_software_engineer.yaml)
|
| 243 |
+
|
| 244 |
+
## Quantization theory and heuristics for manual tuning
|
| 245 |
+
|
| 246 |
+
<details>
|
| 247 |
+
<summary>In-depth overview of quantization theory and heuristics for manual tuning</summary>
|
| 248 |
+
|
| 249 |
+
### Layers to quantize
|
| 250 |
+
|
| 251 |
+
Quantization should be focused on Linear layers (also called Dense or Fully-Connected layers i.e. MatMul+Bias)
|
| 252 |
+
In particular quantizing LayerNorm/RMSnorm layer is strongly discouraged, see [1]
|
| 253 |
+
> LayerNorm in Quantization. Kovaleva et al. (2021); Wei et al. (2022) find that outliers in the
|
| 254 |
+
> LayerNorm parameters of BERT (Devlin et al., 2019) cause difficulties in model compression.
|
| 255 |
+
> Given the importance of LayerNorm, all the quantization methods we discuss above leave LayerNorm unquantized.
|
| 256 |
+
|
| 257 |
+
This is also reported in the Intel and Nvidia repositories:
|
| 258 |
+
- https://github.com/intel/neural-compressor/issues/1963#issuecomment-2274873441
|
| 259 |
+
- https://github.com/NVIDIA/TensorRT/issues/4084#issuecomment-2294513950
|
| 260 |
+
|
| 261 |
+
### Tensors to up-quantize
|
| 262 |
+
|
| 263 |
+
If there are enough bits, down projections should be prioritized.
|
| 264 |
+
|
| 265 |
+
According to [4]
|
| 266 |
+
> Fig. 3: Maximum absolute value over layers for a LLaMA3-8B.
|
| 267 |
+
> Each color represent a different projection and we clearly see that down_proj has the biggest
|
| 268 |
+
> spikes in input and output. We also observe that RMSNorm propagate spikes through the entire model
|
| 269 |
+
|
| 270 |
+
According to [5]
|
| 271 |
+
> Figure 5(a) illustrates the extremal ratio across layers and modules in LLaMA2-7B, highlighting
|
| 272 |
+
> that weight outliers are concentrated in the down-projection matrices $W^{\ell}_{\text{down}}$
|
| 273 |
+
> of the second layer and
|
| 274 |
+
> the last two layers. Figures 5(b) and 5(c) provide detailed visualizations of these outliers in the last
|
| 275 |
+
> two layers.
|
| 276 |
+
|
| 277 |
+
### Mixture-of-Experts quantization (MoE)
|
| 278 |
+
|
| 279 |
+
Mixture-of-Experts models require specific quantization techniques.
|
| 280 |
+
|
| 281 |
+
#### Mixed-precision quantization
|
| 282 |
+
|
| 283 |
+
Some layers have a higher impact on LLM performance.
|
| 284 |
+
According to [2], spending more bits in attention layers results in large gain compared to spending them in FFN layers.
|
| 285 |
+
According to [3] on 2-bit quantization:
|
| 286 |
+
- quantizing expert FFN layers does not seriously impact model quality
|
| 287 |
+
- quantizing cross-attention has some impact
|
| 288 |
+
- quantizing self-attention has a large impact
|
| 289 |
+
- quantizing dense FFN has a very significant impact
|
| 290 |
+
|
| 291 |
+
Hence to preserve model quality we should choose not to quantize dense FFN layers and self-attention layers.
|
| 292 |
+
|
| 293 |
+
We notice that:
|
| 294 |
+
- official MXFP4 weights of gpt-oss-120b from OpenAI keep self-attention in BF16:
|
| 295 |
+
- https://huggingface.co/openai/gpt-oss-120b/blob/main/model.safetensors.index.json
|
| 296 |
+
- NVFP4 weights of DeepSeek-R1 quantized by Nvidia also keep self-attention in BF16:
|
| 297 |
+
- https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4/blob/main/model.safetensors.index.json
|
| 298 |
+
|
| 299 |
+
#### Layers with high-impact
|
| 300 |
+
|
| 301 |
+
According to [2], giving more bits to the first `k` blocks has a significantly higher impact on model quality than giving them to the last `k` blocks.
|
| 302 |
+
|
| 303 |
+
#### Expert quantization
|
| 304 |
+
|
| 305 |
+
When quantizing MoE, quantizing activations is tricky as only a subset of experts are activated per request. You have to make sure all experts are calibrated.
|
| 306 |
+
|
| 307 |
+
<details>
|
| 308 |
+
<summary>Visual showcase of why ensuring quantization of all MoE experts is important</summary>
|
| 309 |
+
|
| 310 |
+
- Source: https://avtc.github.io/aquarium-side-by-side/
|
| 311 |
+
- Context: https://github.com/ModelCloud/GPTQModel/pull/2235
|
| 312 |
+
|
| 313 |
+

|
| 314 |
+
|
| 315 |
+
</details>
|
| 316 |
+
|
| 317 |
+
## References
|
| 318 |
+
|
| 319 |
+
1. Why Do Some Inputs Break Low-Bit LLM Quantization? (2025)\
|
| 320 |
+
Ting-Yun Chang, Muru Zhang, Jesse Thomason, Robin Jia\
|
| 321 |
+
https://arxiv.org/pdf/2506.12044
|
| 322 |
+
|
| 323 |
+
2. Examining Post-Training Quantization for Mixture-of-Experts: A Benchmark (2024)\
|
| 324 |
+
Pingzhi Li, Xiaolong Jin, Yu Cheng, Tianlong Chen\
|
| 325 |
+
https://arxiv.org/pdf/2406.08155v1
|
| 326 |
+
|
| 327 |
+
3. Mixture of Quantized Experts (MoQE): Complementary Effect of Low-bit Quantization and Robustness (2023)\
|
| 328 |
+
Young Jin Kim, Raffy Fahim, Hany Hassan Awadalla\
|
| 329 |
+
https://arxiv.org/pdf/2310.02410
|
| 330 |
+
|
| 331 |
+
4. Precision Where It Matters: A Novel Spike\
|
| 332 |
+
Aware Mixed-Precision Quantization Strategy for\
|
| 333 |
+
LLaMA-based Language Models (2025)\
|
| 334 |
+
Lucas Maisonnave, Cyril Moineau, Olivier Bichler, and Fabrice Rastello\
|
| 335 |
+
https://arxiv.org/pdf/2504.21553
|
| 336 |
+
|
| 337 |
+
5. Systematic Outliers in Large Language Models (2025)\
|
| 338 |
+
Yongqi An, Xu Zhao, Tao Yu, Ming Tang, Jinqiao Wang\
|
| 339 |
+
https://arxiv.org/pdf/2502.06415v2
|
| 340 |
+
|
| 341 |
+
</details>
|
calibrate_software_engineer.yaml
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
calibration_set:
|
| 2 |
+
_templates:
|
| 3 |
+
programming_languages: &programming_languages "Solve the following problem using {{ ['Zephyr', 'Prolog', 'Cobol', 'Apex', 'Crystal', 'Fortran', 'Nim', 'Delphi', 'Ada', 'Objective-C', 'VBA', 'Perl', 'Groovy', 'MATLAB', 'Solidity', 'Visual Basic', 'OCaml', 'Erlang', 'Julia', 'Lisp', 'F#', 'Clojure', 'GDScript', 'Scala', 'R', 'Haskell', 'Ruby', 'Elixir', 'Lua', 'Zig', 'Dart', 'Swift', 'Metal', 'PowerShell', 'PHP', 'Kotlin', 'C', 'Java', 'C++', 'C#', 'Bash/Shell', 'Go', 'Rust', 'TypeScript', 'HTML/CSS', 'SQL', 'JavaScript', 'Python', 'Lean', 'Coq', 'Pony', 'D', 'Racket', 'Haxe', 'x86-64 ASM', 'ARM-64 ASM', 'LLVM IR', 'GLSL', 'CUDA', 'Vulkan'][hash(row|string) % 60] }}\n***\n"
|
| 4 |
+
spoken_languages: &spoken_languages "Answer in {{ ['Arabic', 'Chinese', 'French', 'German', 'Greek', 'Hebrew', 'Hindi', 'Japanese', 'Korean', 'Portuguese', 'Russian', 'Spanish', 'Turkish'][hash(row|string) % 13] }}\n***\n"
|
| 5 |
+
max_seq_length: 8192
|
| 6 |
+
shuffle: true
|
| 7 |
+
seed: 42
|
| 8 |
+
datasets:
|
| 9 |
+
|
| 10 |
+
# Category Summary (Total: 624 samples)
|
| 11 |
+
# =====================================================
|
| 12 |
+
# General chat (24 samples - 3.85%)
|
| 13 |
+
# Instruction and Reasoning tuning (14 samples - 2.24%)
|
| 14 |
+
# Multilingual (70 samples - 11.22%)
|
| 15 |
+
# Tool use (100 samples - 16.03%)
|
| 16 |
+
# Code / Programming / Software Engineering / Devops (328 samples - 52.56%)
|
| 17 |
+
# Math (12 samples - 1.92%)
|
| 18 |
+
# Sciences (16 samples - 2.56%)
|
| 19 |
+
# Medical (8 samples - 1.28%)
|
| 20 |
+
# Finance (8 samples - 1.28%)
|
| 21 |
+
# Business (16 samples - 2.56%)
|
| 22 |
+
# Humanities and Philosophy (8 samples - 1.28%)
|
| 23 |
+
# Creative Writing, Adventure, Roleplay (13 samples - 2.08%)
|
| 24 |
+
# General Knowledge and Pop Culture (2 samples - 0.32%)
|
| 25 |
+
# Behavioral skills (4 samples - 0.64%)
|
| 26 |
+
# Misc (1 sample - 0.16%)
|
| 27 |
+
# =====================================================
|
| 28 |
+
|
| 29 |
+
# Research
|
| 30 |
+
# =====================================================
|
| 31 |
+
# According to this presentation https://minjiazhang.github.io/courses/fall24-resource/slides/awq.pdf
|
| 32 |
+
# AWQ only needs 64 samples to identify salient weights that need to be preserved.
|
| 33 |
+
#
|
| 34 |
+
# This research predates the boom of MoE (Mixture-of-Experts) models
|
| 35 |
+
# and it's safer to assume that 64 samples of a general dataset
|
| 36 |
+
# cannot properly identify salient weights of experts.
|
| 37 |
+
|
| 38 |
+
# General chat (24 samples)
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
- dataset: HuggingFaceH4/ultrachat_200k
|
| 41 |
+
columns: [messages]
|
| 42 |
+
split: train_sft
|
| 43 |
+
formatter: chat_completion
|
| 44 |
+
num_samples: 8
|
| 45 |
+
streaming: true
|
| 46 |
+
|
| 47 |
+
- dataset: databricks/databricks-dolly-15k
|
| 48 |
+
split: train
|
| 49 |
+
columns: [instruction, response]
|
| 50 |
+
formatter: prompt_answer
|
| 51 |
+
num_samples: 8
|
| 52 |
+
|
| 53 |
+
- dataset: neuralmagic/calibration
|
| 54 |
+
subset: LLM
|
| 55 |
+
split: train
|
| 56 |
+
columns: [messages]
|
| 57 |
+
formatter: chat_completion
|
| 58 |
+
num_samples: 8
|
| 59 |
+
|
| 60 |
+
# Instruction and Reasoning tuning (14 samples)
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
- dataset: HuggingFaceH4/no_robots
|
| 63 |
+
split: train
|
| 64 |
+
columns: [messages]
|
| 65 |
+
formatter: chat_completion
|
| 66 |
+
num_samples: 2
|
| 67 |
+
|
| 68 |
+
- dataset: nvidia/HelpSteer
|
| 69 |
+
split: train
|
| 70 |
+
columns: [prompt, response]
|
| 71 |
+
formatter: prompt_answer
|
| 72 |
+
num_samples: 2
|
| 73 |
+
streaming: true
|
| 74 |
+
|
| 75 |
+
- dataset: garage-bAInd/Open-Platypus
|
| 76 |
+
split: train
|
| 77 |
+
columns: [instruction, output]
|
| 78 |
+
formatter: prompt_answer
|
| 79 |
+
num_samples: 2
|
| 80 |
+
|
| 81 |
+
- dataset: PJMixers/grimulkan_physical-reasoning-ShareGPT
|
| 82 |
+
split: train
|
| 83 |
+
columns: [conversations]
|
| 84 |
+
formatter: sharegpt
|
| 85 |
+
num_samples: 4
|
| 86 |
+
|
| 87 |
+
- dataset: PJMixers/grimulkan_theory-of-mind-ShareGPT
|
| 88 |
+
split: train
|
| 89 |
+
columns: [conversations]
|
| 90 |
+
formatter: sharegpt
|
| 91 |
+
num_samples: 4
|
| 92 |
+
|
| 93 |
+
# Multilingual (70 samples)
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
- dataset: HuggingFaceH4/Multilingual-Thinking
|
| 96 |
+
split: train
|
| 97 |
+
columns: [user]
|
| 98 |
+
formatter: raw_text
|
| 99 |
+
num_samples: 32
|
| 100 |
+
formatter_params:
|
| 101 |
+
prefix: *spoken_languages
|
| 102 |
+
|
| 103 |
+
- dataset: ServiceNow-AI/M2Lingual
|
| 104 |
+
subset: full_data
|
| 105 |
+
split: train
|
| 106 |
+
columns: [conversation]
|
| 107 |
+
formatter: chat_completion
|
| 108 |
+
num_samples: 4
|
| 109 |
+
streaming: true
|
| 110 |
+
|
| 111 |
+
- dataset: droussis/euroblocks_sft_1sample_per_lang
|
| 112 |
+
split: train
|
| 113 |
+
columns: [conversations]
|
| 114 |
+
formatter: chat_completion
|
| 115 |
+
num_samples: 34
|
| 116 |
+
|
| 117 |
+
# Tool use (include commented out ToolAce) (100 samples)
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
# Fails with MiniMax!
|
| 121 |
+
# jinja2.exceptions.TemplateError: Message has tool role, but there was no previous assistant message with a tool call!
|
| 122 |
+
# - dataset: Team-ACE/ToolACE
|
| 123 |
+
# split: train
|
| 124 |
+
# columns: [system, conversations]
|
| 125 |
+
# formatter: chat_completion_with_sysprompt
|
| 126 |
+
# num_samples: 100
|
| 127 |
+
|
| 128 |
+
- dataset: interstellarninja/hermes_reasoning_tool_use
|
| 129 |
+
split: train
|
| 130 |
+
columns: [conversations]
|
| 131 |
+
formatter: sharegpt
|
| 132 |
+
num_samples: 100
|
| 133 |
+
streaming: true
|
| 134 |
+
|
| 135 |
+
# Code / Programming / Software Engineering / Devops (336 samples)
|
| 136 |
+
# ---------------------------------------------------------------------------
|
| 137 |
+
|
| 138 |
+
- dataset: deepmind/code_contests
|
| 139 |
+
split: train
|
| 140 |
+
columns: [name]
|
| 141 |
+
formatter: deepmind_code_contests
|
| 142 |
+
num_samples: 50
|
| 143 |
+
streaming: true
|
| 144 |
+
|
| 145 |
+
- dataset: dh02391735/stackoverflow-kubernetes-questions
|
| 146 |
+
split: train
|
| 147 |
+
columns: [instruction]
|
| 148 |
+
formatter: raw_text
|
| 149 |
+
num_samples: 8
|
| 150 |
+
streaming: true
|
| 151 |
+
|
| 152 |
+
- dataset: diversoailab/humaneval-rust
|
| 153 |
+
split: train
|
| 154 |
+
columns: [prompt]
|
| 155 |
+
formatter: raw_text
|
| 156 |
+
num_samples: 100
|
| 157 |
+
formatter_params: # The dataset actually doesn't hardcode the language
|
| 158 |
+
prefix: *programming_languages
|
| 159 |
+
|
| 160 |
+
- dataset: ammarnasr/the-stack-rust-clean
|
| 161 |
+
split: train
|
| 162 |
+
columns: [content]
|
| 163 |
+
formatter: raw_text
|
| 164 |
+
num_samples: 8
|
| 165 |
+
streaming: true
|
| 166 |
+
formatter_params:
|
| 167 |
+
prefix: "Explain this code and comment it for a junior dev.\n***\n"
|
| 168 |
+
|
| 169 |
+
- dataset: CSJianYang/CodeArena
|
| 170 |
+
split: test
|
| 171 |
+
columns: [messages]
|
| 172 |
+
formatter: chat_completion
|
| 173 |
+
num_samples: 8
|
| 174 |
+
|
| 175 |
+
- dataset: nvidia/OpenCodeInstruct
|
| 176 |
+
split: train
|
| 177 |
+
columns: [input, output]
|
| 178 |
+
formatter: prompt_answer
|
| 179 |
+
num_samples: 8
|
| 180 |
+
streaming: true
|
| 181 |
+
|
| 182 |
+
- dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
|
| 183 |
+
split: code
|
| 184 |
+
columns: [input]
|
| 185 |
+
formatter: chat_completion
|
| 186 |
+
num_samples: 8
|
| 187 |
+
streaming: true
|
| 188 |
+
|
| 189 |
+
- dataset: nvidia/Nemotron-Competitive-Programming-v1
|
| 190 |
+
split: competitive_coding_cpp_part00
|
| 191 |
+
columns: [messages]
|
| 192 |
+
formatter: chat_completion
|
| 193 |
+
num_samples: 8
|
| 194 |
+
streaming: true
|
| 195 |
+
|
| 196 |
+
# The "conversations" column contains another nested "conversations" field :/
|
| 197 |
+
# - dataset: sr5434/CodegebraGPT_data
|
| 198 |
+
# subset: 100k-text
|
| 199 |
+
# split: train
|
| 200 |
+
# columns: [conversations]
|
| 201 |
+
# formatter: sharegpt
|
| 202 |
+
# num_samples: 8
|
| 203 |
+
|
| 204 |
+
- dataset: rombodawg/code_bagel_hermes-2.5
|
| 205 |
+
split: train
|
| 206 |
+
columns: [input, output]
|
| 207 |
+
formatter: prompt_answer
|
| 208 |
+
num_samples: 100
|
| 209 |
+
streaming: true
|
| 210 |
+
|
| 211 |
+
- dataset: MathArena/project_euler
|
| 212 |
+
split: train
|
| 213 |
+
columns: [problem]
|
| 214 |
+
formatter: raw_text
|
| 215 |
+
num_samples: 30
|
| 216 |
+
formatter_params:
|
| 217 |
+
prefix: *programming_languages
|
| 218 |
+
|
| 219 |
+
# Math (12 samples)
|
| 220 |
+
# ---------------------------------------------------------------------------
|
| 221 |
+
|
| 222 |
+
- dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
|
| 223 |
+
split: math
|
| 224 |
+
columns: [input]
|
| 225 |
+
formatter: chat_completion
|
| 226 |
+
num_samples: 4
|
| 227 |
+
streaming: true
|
| 228 |
+
|
| 229 |
+
- dataset: nvidia/Nemotron-Math-Proofs-v1
|
| 230 |
+
split: lean
|
| 231 |
+
columns: [formal_statement]
|
| 232 |
+
formatter: raw_text
|
| 233 |
+
num_samples: 4
|
| 234 |
+
streaming: true
|
| 235 |
+
formatter_params:
|
| 236 |
+
prefix: "Can you improve, document and add comment to this Lean proof for a non-mathematician?\n***\n"
|
| 237 |
+
|
| 238 |
+
- dataset: nvidia/OpenMathInstruct-2
|
| 239 |
+
split: train
|
| 240 |
+
columns: [problem, generated_solution]
|
| 241 |
+
formatter: prompt_answer
|
| 242 |
+
num_samples: 4
|
| 243 |
+
streaming: true
|
| 244 |
+
|
| 245 |
+
# Sciences (16 samples)
|
| 246 |
+
# ---------------------------------------------------------------------------
|
| 247 |
+
|
| 248 |
+
- dataset: nvidia/Llama-Nemotron-Post-Training-Dataset
|
| 249 |
+
split: science
|
| 250 |
+
columns: [input]
|
| 251 |
+
formatter: chat_completion
|
| 252 |
+
num_samples: 4
|
| 253 |
+
streaming: true
|
| 254 |
+
|
| 255 |
+
- dataset: nvidia/OpenScienceReasoning-2
|
| 256 |
+
split: train
|
| 257 |
+
columns: [input, output]
|
| 258 |
+
formatter: prompt_answer
|
| 259 |
+
num_samples: 8
|
| 260 |
+
streaming: true
|
| 261 |
+
|
| 262 |
+
- dataset: MegaScience/MegaScience
|
| 263 |
+
split: train
|
| 264 |
+
columns: [question, answer]
|
| 265 |
+
formatter: prompt_answer
|
| 266 |
+
num_samples: 4
|
| 267 |
+
streaming: true
|
| 268 |
+
|
| 269 |
+
# Medical (8 samples)
|
| 270 |
+
# ---------------------------------------------------------------------------
|
| 271 |
+
|
| 272 |
+
- dataset: OpenMed/Medical-Reasoning-SFT-GPT-OSS-120B
|
| 273 |
+
split: train
|
| 274 |
+
columns: [messages]
|
| 275 |
+
formatter: chat_completion
|
| 276 |
+
num_samples: 4
|
| 277 |
+
streaming: true
|
| 278 |
+
|
| 279 |
+
- dataset: ccdv/pubmed-summarization
|
| 280 |
+
subset: section
|
| 281 |
+
split: train
|
| 282 |
+
columns: [article]
|
| 283 |
+
formatter: raw_text
|
| 284 |
+
num_samples: 4
|
| 285 |
+
streaming: true
|
| 286 |
+
formatter_params:
|
| 287 |
+
prefix: "Summarize this:\n***\n"
|
| 288 |
+
|
| 289 |
+
# Finance (8 samples)
|
| 290 |
+
# ---------------------------------------------------------------------------
|
| 291 |
+
|
| 292 |
+
- dataset: gbharti/finance-alpaca
|
| 293 |
+
split: train
|
| 294 |
+
columns: [instruction, output]
|
| 295 |
+
formatter: prompt_answer
|
| 296 |
+
num_samples: 4
|
| 297 |
+
|
| 298 |
+
- dataset: vladlen32230/summarization-yahoo-stock-finance-article-text
|
| 299 |
+
split: train
|
| 300 |
+
columns: [text]
|
| 301 |
+
formatter: raw_text
|
| 302 |
+
num_samples: 4
|
| 303 |
+
formatter_params:
|
| 304 |
+
prefix: "Summarize this:\n***\n"
|
| 305 |
+
|
| 306 |
+
# Business (16 samples)
|
| 307 |
+
# ---------------------------------------------------------------------------
|
| 308 |
+
|
| 309 |
+
- dataset: fka/awesome-chatgpt-prompts
|
| 310 |
+
split: train
|
| 311 |
+
columns: [prompt]
|
| 312 |
+
formatter: raw_text
|
| 313 |
+
num_samples: 8
|
| 314 |
+
|
| 315 |
+
- dataset: theoldmandthesea/17k_business_book
|
| 316 |
+
split: train
|
| 317 |
+
columns: [question, answer]
|
| 318 |
+
formatter: prompt_answer
|
| 319 |
+
num_samples: 8
|
| 320 |
+
|
| 321 |
+
# Humanities and Philosophy (8 samples)
|
| 322 |
+
# ---------------------------------------------------------------------------
|
| 323 |
+
|
| 324 |
+
- dataset: ruggsea/stanford-encyclopedia-of-philosophy_instruct
|
| 325 |
+
split: train
|
| 326 |
+
columns: [question, answer]
|
| 327 |
+
formatter: prompt_answer
|
| 328 |
+
num_samples: 2
|
| 329 |
+
streaming: true
|
| 330 |
+
|
| 331 |
+
- dataset: mlfoundations-dev/stackexchange_philosophy
|
| 332 |
+
split: train
|
| 333 |
+
columns: [conversations]
|
| 334 |
+
formatter: sharegpt
|
| 335 |
+
num_samples: 2
|
| 336 |
+
|
| 337 |
+
- dataset: FreedomIntelligence/SocraticChat
|
| 338 |
+
split: train
|
| 339 |
+
columns: [conversations]
|
| 340 |
+
formatter: sharegpt
|
| 341 |
+
num_samples: 4
|
| 342 |
+
streaming: true
|
| 343 |
+
|
| 344 |
+
# Creative Writing, Adventure, Roleplay (13 samples)
|
| 345 |
+
# ---------------------------------------------------------------------------
|
| 346 |
+
|
| 347 |
+
- dataset: Gryphe/Opus-WritingPrompts
|
| 348 |
+
split: train
|
| 349 |
+
columns: [conversations]
|
| 350 |
+
formatter: sharegpt
|
| 351 |
+
num_samples: 2
|
| 352 |
+
|
| 353 |
+
- dataset: anthracite-org/nopm_claude_writing_fixed
|
| 354 |
+
split: train
|
| 355 |
+
columns: [conversations]
|
| 356 |
+
formatter: sharegpt
|
| 357 |
+
num_samples: 2
|
| 358 |
+
|
| 359 |
+
- dataset: zerofata/Roleplay-Anime-Characters
|
| 360 |
+
split: train
|
| 361 |
+
columns: [messages]
|
| 362 |
+
formatter: chat_completion
|
| 363 |
+
num_samples: 1
|
| 364 |
+
|
| 365 |
+
- dataset: zerofata/Instruct-Anime
|
| 366 |
+
split: train
|
| 367 |
+
columns: [messages]
|
| 368 |
+
formatter: chat_completion
|
| 369 |
+
num_samples: 1
|
| 370 |
+
|
| 371 |
+
- dataset: zerofata/Instruct-Anime-CreativeWriting
|
| 372 |
+
split: train
|
| 373 |
+
columns: [messages]
|
| 374 |
+
formatter: chat_completion
|
| 375 |
+
num_samples: 1
|
| 376 |
+
|
| 377 |
+
- dataset: sam-paech/gutenberg3-generalfiction-scifi-fantasy-romance-adventure-dpo
|
| 378 |
+
split: train
|
| 379 |
+
columns: [chosen]
|
| 380 |
+
formatter: chat_completion
|
| 381 |
+
num_samples: 2
|
| 382 |
+
|
| 383 |
+
- dataset: PocketDoc/Dans-Prosemaxx-Adventure
|
| 384 |
+
split: train
|
| 385 |
+
columns: [conversations]
|
| 386 |
+
formatter: sharegpt
|
| 387 |
+
num_samples: 2
|
| 388 |
+
|
| 389 |
+
- dataset: anthracite-org/stheno-filtered-v1.1
|
| 390 |
+
split: train
|
| 391 |
+
columns: [conversations]
|
| 392 |
+
formatter: sharegpt
|
| 393 |
+
num_samples: 2
|
| 394 |
+
streaming: true
|
| 395 |
+
|
| 396 |
+
# General Knowledge and Pop Culture (2 samples)
|
| 397 |
+
# ---------------------------------------------------------------------------
|
| 398 |
+
|
| 399 |
+
- dataset: KaraKaraWitch/TvTroper-2025
|
| 400 |
+
split: train
|
| 401 |
+
columns: [article]
|
| 402 |
+
formatter: raw_text
|
| 403 |
+
num_samples: 2
|
| 404 |
+
streaming: true
|
| 405 |
+
formatter_params:
|
| 406 |
+
prefix: "Explain this trope like I'm your grandmother\n***\n"
|
| 407 |
+
|
| 408 |
+
# Behavioral skills (4 samples)
|
| 409 |
+
# ---------------------------------------------------------------------------
|
| 410 |
+
|
| 411 |
+
- dataset: AquaV/US-Army-Survival-Sharegpt
|
| 412 |
+
split: train
|
| 413 |
+
columns: [conversations]
|
| 414 |
+
formatter: sharegpt
|
| 415 |
+
num_samples: 1
|
| 416 |
+
|
| 417 |
+
- dataset: AquaV/Interrogation-Sharegpt
|
| 418 |
+
split: train
|
| 419 |
+
columns: [conversations]
|
| 420 |
+
formatter: sharegpt
|
| 421 |
+
num_samples: 1
|
| 422 |
+
|
| 423 |
+
- dataset: AquaV/Multi-Environment-Operations-Sharegpt
|
| 424 |
+
split: train
|
| 425 |
+
columns: [conversations]
|
| 426 |
+
formatter: sharegpt
|
| 427 |
+
num_samples: 1
|
| 428 |
+
|
| 429 |
+
- dataset: AquaV/Resistance-Sharegpt
|
| 430 |
+
split: train
|
| 431 |
+
columns: [conversations]
|
| 432 |
+
formatter: sharegpt
|
| 433 |
+
num_samples: 1
|
| 434 |
+
|
| 435 |
+
# Misc (1 sample)
|
| 436 |
+
# ---------------------------------------------------------------------------
|
| 437 |
+
|
| 438 |
+
- dataset: PocketDoc/Dans-Kinomaxx-VanillaBackrooms
|
| 439 |
+
split: train
|
| 440 |
+
columns: [conversations]
|
| 441 |
+
formatter: sharegpt
|
| 442 |
+
num_samples: 1
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{# ----------‑‑‑ special token variables ‑‑‑---------- #}
|
| 2 |
+
{%- set toolcall_begin_token = '<minimax:tool_call>' -%}
|
| 3 |
+
{%- set toolcall_end_token = '</minimax:tool_call>' -%}
|
| 4 |
+
{#- Tool Rendering Functions ============================================== -#}
|
| 5 |
+
{%- macro render_tool_namespace(namespace_name, tool_list) -%}
|
| 6 |
+
{%- for tool in tool_list -%}
|
| 7 |
+
<tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>
|
| 8 |
+
{% endfor -%}
|
| 9 |
+
{%- endmacro -%}
|
| 10 |
+
{%- macro visible_text(content) -%}
|
| 11 |
+
{%- if content is string -%}
|
| 12 |
+
{{ content }}
|
| 13 |
+
{%- elif content is iterable and content is not mapping -%}
|
| 14 |
+
{%- for item in content -%}
|
| 15 |
+
{%- if item is mapping and item.type == 'text' -%}
|
| 16 |
+
{{- item.text }}
|
| 17 |
+
{%- elif item is string -%}
|
| 18 |
+
{{- item }}
|
| 19 |
+
{%- endif -%}
|
| 20 |
+
{%- endfor -%}
|
| 21 |
+
{%- else -%}
|
| 22 |
+
{{- content }}
|
| 23 |
+
{%- endif -%}
|
| 24 |
+
{%- endmacro -%}
|
| 25 |
+
{#- System Message Construction ============================================ -#}
|
| 26 |
+
{%- macro build_system_message(system_message) -%}
|
| 27 |
+
{%- if system_message and system_message.content -%}
|
| 28 |
+
{{- visible_text(system_message.content) }}
|
| 29 |
+
{%- else -%}
|
| 30 |
+
{%- if model_identity is not defined -%}
|
| 31 |
+
{%- set model_identity = "You are a helpful assistant. Your name is MiniMax-M2.5 and is built by MiniMax." -%}
|
| 32 |
+
{%- endif -%}
|
| 33 |
+
{{- model_identity }}
|
| 34 |
+
{%- endif -%}
|
| 35 |
+
|
| 36 |
+
{#- Handle current_date -#}
|
| 37 |
+
{%- if system_message and system_message.current_date -%}
|
| 38 |
+
{{- '\n' ~ 'Current date: ' + system_message.current_date }}
|
| 39 |
+
{%- endif -%}
|
| 40 |
+
{#- Handle current_location -#}
|
| 41 |
+
{%- if system_message and system_message.current_location -%}
|
| 42 |
+
{{- '\n' ~ 'Current location: ' + system_message.current_location }}
|
| 43 |
+
{%- endif -%}
|
| 44 |
+
{%- endmacro -%}
|
| 45 |
+
{#- Main Template Logic ================================================= -#}
|
| 46 |
+
{#- Extract system message (only first message if it's system) -#}
|
| 47 |
+
{%- set system_message = none -%}
|
| 48 |
+
{%- set conversation_messages = messages -%}
|
| 49 |
+
{%- if messages and messages[0].role == "system" -%}
|
| 50 |
+
{%- set system_message = messages[0] -%}
|
| 51 |
+
{%- set conversation_messages = messages[1:] -%}
|
| 52 |
+
{%- endif -%}
|
| 53 |
+
{#- Get the last user message turn, for interleaved thinking -#}
|
| 54 |
+
{%- set ns = namespace(last_user_index=-1) %}
|
| 55 |
+
{% for m in conversation_messages %}
|
| 56 |
+
{%- if m.role == 'user' %}
|
| 57 |
+
{% set ns.last_user_index = loop.index0 -%}
|
| 58 |
+
{%- endif %}
|
| 59 |
+
{%- endfor %}
|
| 60 |
+
{#- Render system message -#}
|
| 61 |
+
{{- ']~!b[' ~ ']~b]system' ~ '\n' }}
|
| 62 |
+
{{- build_system_message(system_message) }}
|
| 63 |
+
{#- Render tools if available -#}
|
| 64 |
+
{%- if tools -%}
|
| 65 |
+
{{- '\n\n' ~ '# Tools' ~ '\n' ~ 'You may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:' ~ '\n' }}
|
| 66 |
+
{{- '\n' ~ '<tools>' ~ '\n' }}
|
| 67 |
+
{{- render_tool_namespace("functions", tools) }}
|
| 68 |
+
{{- '</tools>' ~ '\n\n' }}
|
| 69 |
+
{{- 'When making tool calls, use XML format to invoke tools and pass parameters:' ~ '\n' }}
|
| 70 |
+
{{- '\n' ~ toolcall_begin_token }}
|
| 71 |
+
<invoke name="tool-name-1">
|
| 72 |
+
<parameter name="param-key-1">param-value-1</parameter>
|
| 73 |
+
<parameter name="param-key-2">param-value-2</parameter>
|
| 74 |
+
...
|
| 75 |
+
</invoke>
|
| 76 |
+
{{- '\n' ~ toolcall_end_token }}
|
| 77 |
+
{%- endif -%}
|
| 78 |
+
{{- '[e~[\n' }}
|
| 79 |
+
|
| 80 |
+
{#- Render messages -#}
|
| 81 |
+
{%- set last_tool_call = namespace(name=none) -%}
|
| 82 |
+
{%- for message in conversation_messages -%}
|
| 83 |
+
{%- if message.role == 'assistant' -%}
|
| 84 |
+
{#- Only render reasoning_content if no user message follows -#}
|
| 85 |
+
{{- ']~b]ai' ~ '\n' }}
|
| 86 |
+
|
| 87 |
+
{%- set reasoning_content = '' %}
|
| 88 |
+
{%- set content = visible_text(message.content) %}
|
| 89 |
+
{%- if message.reasoning_content is string %}
|
| 90 |
+
{%- set reasoning_content = message.reasoning_content %}
|
| 91 |
+
{%- else %}
|
| 92 |
+
{%- if '</think>' in content %}
|
| 93 |
+
{%- set reasoning_content = content.split('</think>')[0].strip('\n').split('<think>')[-1].strip('\n') %}
|
| 94 |
+
{%- set content = content.split('</think>')[-1].strip('\n') %}
|
| 95 |
+
{%- endif %}
|
| 96 |
+
{%- endif %}
|
| 97 |
+
{%- if reasoning_content and loop.index0 > ns.last_user_index -%}
|
| 98 |
+
{{- '<think>' ~ '\n' ~ reasoning_content ~ '\n' ~ '</think>' ~ '\n\n' }}
|
| 99 |
+
{%- endif -%}
|
| 100 |
+
{%- if content -%}
|
| 101 |
+
{{- content }}
|
| 102 |
+
{%- endif -%}
|
| 103 |
+
{%- if message.tool_calls -%}
|
| 104 |
+
{{- '\n' ~ toolcall_begin_token ~ '\n' }}
|
| 105 |
+
|
| 106 |
+
{%- for tool_call in message.tool_calls -%}
|
| 107 |
+
{%- if tool_call.function %}
|
| 108 |
+
{%- set tool_call = tool_call.function %}
|
| 109 |
+
{%- endif %}
|
| 110 |
+
{{- '<invoke name="' + tool_call.name + '">' }}
|
| 111 |
+
{% set _args = tool_call.arguments %}
|
| 112 |
+
{%- for k, v in _args.items() %}
|
| 113 |
+
{{- '<parameter name="' + k + '">' }}
|
| 114 |
+
{{- v | tojson(ensure_ascii=False) if v is not string else v }}
|
| 115 |
+
{{- '</parameter>' }}
|
| 116 |
+
{% endfor %}
|
| 117 |
+
{{- '</invoke>' ~ '\n' }}
|
| 118 |
+
{%- endfor -%}
|
| 119 |
+
|
| 120 |
+
{{- toolcall_end_token}}
|
| 121 |
+
{%- set last_tool_call.name = message.tool_calls[-1].name -%}
|
| 122 |
+
{%- else -%}
|
| 123 |
+
{%- set last_tool_call.name = none -%}
|
| 124 |
+
{%- endif -%}
|
| 125 |
+
{{- '[e~[' ~ '\n' }}
|
| 126 |
+
|
| 127 |
+
{%- elif message.role == 'tool' -%}
|
| 128 |
+
{%- if last_tool_call.name is none -%}
|
| 129 |
+
{{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
|
| 130 |
+
{%- endif -%}
|
| 131 |
+
{%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}
|
| 132 |
+
{{- ']~b]tool' }}
|
| 133 |
+
{%- endif -%}
|
| 134 |
+
{%- if message.content is string -%}
|
| 135 |
+
{{- '\n<response>' }}
|
| 136 |
+
{{- message.content }}
|
| 137 |
+
{{- '</response>' }}
|
| 138 |
+
{%- else -%}
|
| 139 |
+
{%- for tr in message.content -%}
|
| 140 |
+
{{- '\n<response>' }}
|
| 141 |
+
{{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}
|
| 142 |
+
{{- '\n</response>' }}
|
| 143 |
+
{%- endfor -%}
|
| 144 |
+
{%- endif -%}
|
| 145 |
+
{%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}
|
| 146 |
+
{{- '[e~[\n' -}}
|
| 147 |
+
{%- endif -%}
|
| 148 |
+
|
| 149 |
+
{%- elif message.role == 'user' -%}
|
| 150 |
+
{{- ']~b]user' ~ '\n' }}
|
| 151 |
+
{{- visible_text(message.content) }}
|
| 152 |
+
{{- '[e~[' ~ '\n' }}
|
| 153 |
+
{%- endif -%}
|
| 154 |
+
{%- endfor -%}
|
| 155 |
+
|
| 156 |
+
{#- Generation prompt -#}
|
| 157 |
+
{%- if add_generation_prompt -%}
|
| 158 |
+
{{- ']~b]ai' ~ '\n' ~ '<think>' ~ '\n' }}
|
| 159 |
+
{%- endif -%}
|
config.json
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"MiniMaxM2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attn_type_list": [
|
| 6 |
+
1,
|
| 7 |
+
1,
|
| 8 |
+
1,
|
| 9 |
+
1,
|
| 10 |
+
1,
|
| 11 |
+
1,
|
| 12 |
+
1,
|
| 13 |
+
1,
|
| 14 |
+
1,
|
| 15 |
+
1,
|
| 16 |
+
1,
|
| 17 |
+
1,
|
| 18 |
+
1,
|
| 19 |
+
1,
|
| 20 |
+
1,
|
| 21 |
+
1,
|
| 22 |
+
1,
|
| 23 |
+
1,
|
| 24 |
+
1,
|
| 25 |
+
1,
|
| 26 |
+
1,
|
| 27 |
+
1,
|
| 28 |
+
1,
|
| 29 |
+
1,
|
| 30 |
+
1,
|
| 31 |
+
1,
|
| 32 |
+
1,
|
| 33 |
+
1,
|
| 34 |
+
1,
|
| 35 |
+
1,
|
| 36 |
+
1,
|
| 37 |
+
1,
|
| 38 |
+
1,
|
| 39 |
+
1,
|
| 40 |
+
1,
|
| 41 |
+
1,
|
| 42 |
+
1,
|
| 43 |
+
1,
|
| 44 |
+
1,
|
| 45 |
+
1,
|
| 46 |
+
1,
|
| 47 |
+
1,
|
| 48 |
+
1,
|
| 49 |
+
1,
|
| 50 |
+
1,
|
| 51 |
+
1,
|
| 52 |
+
1,
|
| 53 |
+
1,
|
| 54 |
+
1,
|
| 55 |
+
1,
|
| 56 |
+
1,
|
| 57 |
+
1,
|
| 58 |
+
1,
|
| 59 |
+
1,
|
| 60 |
+
1,
|
| 61 |
+
1,
|
| 62 |
+
1,
|
| 63 |
+
1,
|
| 64 |
+
1,
|
| 65 |
+
1,
|
| 66 |
+
1,
|
| 67 |
+
1
|
| 68 |
+
],
|
| 69 |
+
"auto_map": {
|
| 70 |
+
"AutoConfig": "configuration_minimax_m2.MiniMaxM2Config",
|
| 71 |
+
"AutoModelForCausalLM": "modeling_minimax_m2.MiniMaxM2ForCausalLM"
|
| 72 |
+
},
|
| 73 |
+
"head_dim": 128,
|
| 74 |
+
"hidden_act": "silu",
|
| 75 |
+
"hidden_size": 3072,
|
| 76 |
+
"intermediate_size": 1536,
|
| 77 |
+
"max_position_embeddings": 196608,
|
| 78 |
+
"model_type": "minimax_m2",
|
| 79 |
+
"mtp_transformer_layers": 1,
|
| 80 |
+
"num_attention_heads": 48,
|
| 81 |
+
"num_experts_per_tok": 8,
|
| 82 |
+
"num_hidden_layers": 62,
|
| 83 |
+
"num_key_value_heads": 8,
|
| 84 |
+
"num_local_experts": 256,
|
| 85 |
+
"num_mtp_modules": 3,
|
| 86 |
+
"qk_norm_type": "per_layer",
|
| 87 |
+
"rms_norm_eps": 1e-06,
|
| 88 |
+
"rope_theta": 5000000,
|
| 89 |
+
"rotary_dim": 64,
|
| 90 |
+
"scoring_func": "sigmoid",
|
| 91 |
+
"shared_intermediate_size": 0,
|
| 92 |
+
"tie_word_embeddings": false,
|
| 93 |
+
"transformers_version": "4.46.1",
|
| 94 |
+
"use_cache": true,
|
| 95 |
+
"use_mtp": true,
|
| 96 |
+
"use_qk_norm": true,
|
| 97 |
+
"use_routing_bias": true,
|
| 98 |
+
"vocab_size": 200064,
|
| 99 |
+
"quantization_config": {
|
| 100 |
+
"quant_method": "compressed-tensors",
|
| 101 |
+
"format": "mixed-precision",
|
| 102 |
+
"quantization_status": "compressed",
|
| 103 |
+
"config_groups": {
|
| 104 |
+
"self_attention_projections": {
|
| 105 |
+
"targets": [
|
| 106 |
+
"Linear",
|
| 107 |
+
"re:.*self_attn\\.(k|q|o|v)_proj$",
|
| 108 |
+
"re:.*self_attn\\.qkv_proj$"
|
| 109 |
+
],
|
| 110 |
+
"weights": {
|
| 111 |
+
"type": "float",
|
| 112 |
+
"num_bits": 8,
|
| 113 |
+
"strategy": "block",
|
| 114 |
+
"block_structure": [
|
| 115 |
+
128,
|
| 116 |
+
128
|
| 117 |
+
],
|
| 118 |
+
"symmetric": true,
|
| 119 |
+
"dynamic": false
|
| 120 |
+
},
|
| 121 |
+
"input_activations": {
|
| 122 |
+
"type": "float",
|
| 123 |
+
"num_bits": 8,
|
| 124 |
+
"strategy": "token",
|
| 125 |
+
"symmetric": true,
|
| 126 |
+
"dynamic": true
|
| 127 |
+
},
|
| 128 |
+
"format": "float-quantized"
|
| 129 |
+
},
|
| 130 |
+
"mlp_experts_projections": {
|
| 131 |
+
"format": "pack-quantized",
|
| 132 |
+
"input_activations": null,
|
| 133 |
+
"output_activations": null,
|
| 134 |
+
"targets": [
|
| 135 |
+
"Linear",
|
| 136 |
+
"re:.*block_sparse_moe\\.experts\\.\\d+\\.(w1|w2|w3)$"
|
| 137 |
+
],
|
| 138 |
+
"weights": {
|
| 139 |
+
"actorder": null,
|
| 140 |
+
"block_structure": null,
|
| 141 |
+
"dynamic": false,
|
| 142 |
+
"group_size": 32,
|
| 143 |
+
"num_bits": 4,
|
| 144 |
+
"observer": "memoryless_minmax",
|
| 145 |
+
"observer_kwargs": {},
|
| 146 |
+
"strategy": "group",
|
| 147 |
+
"symmetric": true,
|
| 148 |
+
"type": "int"
|
| 149 |
+
}
|
| 150 |
+
}
|
| 151 |
+
},
|
| 152 |
+
"ignore": [
|
| 153 |
+
"model.layers.0.block_sparse_moe.gate",
|
| 154 |
+
"model.layers.1.block_sparse_moe.gate",
|
| 155 |
+
"model.layers.2.block_sparse_moe.gate",
|
| 156 |
+
"model.layers.3.block_sparse_moe.gate",
|
| 157 |
+
"model.layers.4.block_sparse_moe.gate",
|
| 158 |
+
"model.layers.5.block_sparse_moe.gate",
|
| 159 |
+
"model.layers.6.block_sparse_moe.gate",
|
| 160 |
+
"model.layers.7.block_sparse_moe.gate",
|
| 161 |
+
"model.layers.8.block_sparse_moe.gate",
|
| 162 |
+
"model.layers.9.block_sparse_moe.gate",
|
| 163 |
+
"model.layers.10.block_sparse_moe.gate",
|
| 164 |
+
"model.layers.11.block_sparse_moe.gate",
|
| 165 |
+
"model.layers.12.block_sparse_moe.gate",
|
| 166 |
+
"model.layers.13.block_sparse_moe.gate",
|
| 167 |
+
"model.layers.14.block_sparse_moe.gate",
|
| 168 |
+
"model.layers.15.block_sparse_moe.gate",
|
| 169 |
+
"model.layers.16.block_sparse_moe.gate",
|
| 170 |
+
"model.layers.17.block_sparse_moe.gate",
|
| 171 |
+
"model.layers.18.block_sparse_moe.gate",
|
| 172 |
+
"model.layers.19.block_sparse_moe.gate",
|
| 173 |
+
"model.layers.20.block_sparse_moe.gate",
|
| 174 |
+
"model.layers.21.block_sparse_moe.gate",
|
| 175 |
+
"model.layers.22.block_sparse_moe.gate",
|
| 176 |
+
"model.layers.23.block_sparse_moe.gate",
|
| 177 |
+
"model.layers.24.block_sparse_moe.gate",
|
| 178 |
+
"model.layers.25.block_sparse_moe.gate",
|
| 179 |
+
"model.layers.26.block_sparse_moe.gate",
|
| 180 |
+
"model.layers.27.block_sparse_moe.gate",
|
| 181 |
+
"model.layers.28.block_sparse_moe.gate",
|
| 182 |
+
"model.layers.29.block_sparse_moe.gate",
|
| 183 |
+
"model.layers.30.block_sparse_moe.gate",
|
| 184 |
+
"model.layers.31.block_sparse_moe.gate",
|
| 185 |
+
"model.layers.32.block_sparse_moe.gate",
|
| 186 |
+
"model.layers.33.block_sparse_moe.gate",
|
| 187 |
+
"model.layers.34.block_sparse_moe.gate",
|
| 188 |
+
"model.layers.35.block_sparse_moe.gate",
|
| 189 |
+
"model.layers.36.block_sparse_moe.gate",
|
| 190 |
+
"model.layers.37.block_sparse_moe.gate",
|
| 191 |
+
"model.layers.38.block_sparse_moe.gate",
|
| 192 |
+
"model.layers.39.block_sparse_moe.gate",
|
| 193 |
+
"model.layers.40.block_sparse_moe.gate",
|
| 194 |
+
"model.layers.41.block_sparse_moe.gate",
|
| 195 |
+
"model.layers.42.block_sparse_moe.gate",
|
| 196 |
+
"model.layers.43.block_sparse_moe.gate",
|
| 197 |
+
"model.layers.44.block_sparse_moe.gate",
|
| 198 |
+
"model.layers.45.block_sparse_moe.gate",
|
| 199 |
+
"model.layers.46.block_sparse_moe.gate",
|
| 200 |
+
"model.layers.47.block_sparse_moe.gate",
|
| 201 |
+
"model.layers.48.block_sparse_moe.gate",
|
| 202 |
+
"model.layers.49.block_sparse_moe.gate",
|
| 203 |
+
"model.layers.50.block_sparse_moe.gate",
|
| 204 |
+
"model.layers.51.block_sparse_moe.gate",
|
| 205 |
+
"model.layers.52.block_sparse_moe.gate",
|
| 206 |
+
"model.layers.53.block_sparse_moe.gate",
|
| 207 |
+
"model.layers.54.block_sparse_moe.gate",
|
| 208 |
+
"model.layers.55.block_sparse_moe.gate",
|
| 209 |
+
"model.layers.56.block_sparse_moe.gate",
|
| 210 |
+
"model.layers.57.block_sparse_moe.gate",
|
| 211 |
+
"model.layers.58.block_sparse_moe.gate",
|
| 212 |
+
"model.layers.59.block_sparse_moe.gate",
|
| 213 |
+
"model.layers.60.block_sparse_moe.gate",
|
| 214 |
+
"model.layers.61.block_sparse_moe.gate",
|
| 215 |
+
"lm_head"
|
| 216 |
+
],
|
| 217 |
+
"kv_cache_scheme": null,
|
| 218 |
+
"global_compression_ratio": null,
|
| 219 |
+
"sparsity_config": {},
|
| 220 |
+
"transform_config": {},
|
| 221 |
+
"version": "0.13.1.dev0+g797d301.d20251228"
|
| 222 |
+
}
|
| 223 |
+
}
|
configuration_minimax_m2.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
| 2 |
+
# This file was automatically generated from src/transformers/models/minimax_m2/modular_minimax_m2.py.
|
| 3 |
+
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
| 4 |
+
# the file from the modular. If any change should be done, please apply the change to the
|
| 5 |
+
# modular_minimax_m2.py file directly. One of our CI enforces this.
|
| 6 |
+
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
| 7 |
+
# coding=utf-8
|
| 8 |
+
# Copyright 2025 the HuggingFace Team. All rights reserved.
|
| 9 |
+
#
|
| 10 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 11 |
+
# you may not use this file except in compliance with the License.
|
| 12 |
+
# You may obtain a copy of the License at
|
| 13 |
+
#
|
| 14 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 15 |
+
#
|
| 16 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 17 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 18 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 19 |
+
# See the License for the specific language governing permissions and
|
| 20 |
+
# limitations under the License.
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
from transformers.configuration_utils import PretrainedConfig
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class MiniMaxM2Config(PretrainedConfig):
|
| 27 |
+
r"""
|
| 28 |
+
This is the configuration class to store the configuration of a [`MiniMaxM2Model`]. It is used to instantiate an
|
| 29 |
+
MiniMaxM2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
| 30 |
+
with the defaults will yield a similar configuration to that of the MiniMaxM2-7B-v0.1 or MiniMaxM2-7B-Instruct-v0.1.
|
| 31 |
+
|
| 32 |
+
[minimax_m2ai/MiniMaxM2-8x7B](https://huggingface.co/minimax_m2ai/MiniMaxM2-8x7B)
|
| 33 |
+
[minimax_m2ai/MiniMaxM2-7B-Instruct-v0.1](https://huggingface.co/minimax_m2ai/MiniMaxM2-7B-Instruct-v0.1)
|
| 34 |
+
|
| 35 |
+
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 36 |
+
documentation from [`PretrainedConfig`] for more information.
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
Args:
|
| 40 |
+
vocab_size (`int`, *optional*, defaults to 32000):
|
| 41 |
+
Vocabulary size of the MiniMaxM2 model. Defines the number of different tokens that can be represented by the
|
| 42 |
+
`input_ids` passed when calling [`MiniMaxM2Model`].
|
| 43 |
+
hidden_size (`int`, *optional*, defaults to 4096):
|
| 44 |
+
Dimension of the hidden representations.
|
| 45 |
+
intermediate_size (`int`, *optional*, defaults to 14336):
|
| 46 |
+
Dimension of the MLP representations.
|
| 47 |
+
num_hidden_layers (`int`, *optional*, defaults to 32):
|
| 48 |
+
Number of hidden layers in the Transformer decoder.
|
| 49 |
+
num_attention_heads (`int`, *optional*, defaults to 32):
|
| 50 |
+
Number of attention heads for each attention layer in the Transformer decoder.
|
| 51 |
+
num_key_value_heads (`int`, *optional*, defaults to 8):
|
| 52 |
+
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
|
| 53 |
+
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
|
| 54 |
+
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
|
| 55 |
+
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
|
| 56 |
+
by mean-pooling all the original heads within that group. For more details, check out [this
|
| 57 |
+
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
|
| 58 |
+
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
|
| 59 |
+
The attention head dimension.
|
| 60 |
+
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
| 61 |
+
The non-linear activation function (function or string) in the decoder.
|
| 62 |
+
max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
|
| 63 |
+
The maximum sequence length that this model might ever be used with. MiniMaxM2's sliding window attention
|
| 64 |
+
allows sequences of up to 4096*32 tokens.
|
| 65 |
+
initializer_range (`float`, *optional*, defaults to 0.02):
|
| 66 |
+
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
| 67 |
+
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
| 68 |
+
The epsilon used by the rms normalization layers.
|
| 69 |
+
use_cache (`bool`, *optional*, defaults to `True`):
|
| 70 |
+
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
| 71 |
+
relevant if `config.is_decoder=True`.
|
| 72 |
+
pad_token_id (`int`, *optional*):
|
| 73 |
+
The id of the padding token.
|
| 74 |
+
bos_token_id (`int`, *optional*, defaults to 1):
|
| 75 |
+
The id of the "beginning-of-sequence" token.
|
| 76 |
+
eos_token_id (`int`, *optional*, defaults to 2):
|
| 77 |
+
The id of the "end-of-sequence" token.
|
| 78 |
+
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
|
| 79 |
+
Whether the model's input and output word embeddings should be tied.
|
| 80 |
+
rope_theta (`float`, *optional*, defaults to 1000000.0):
|
| 81 |
+
The base period of the RoPE embeddings.
|
| 82 |
+
sliding_window (`int`, *optional*):
|
| 83 |
+
Sliding window attention window size. If not specified, it defaults to `None`.
|
| 84 |
+
attention_dropout (`float`, *optional*, defaults to 0.0):
|
| 85 |
+
The dropout ratio for the attention probabilities.
|
| 86 |
+
num_experts_per_tok (`int`, *optional*, defaults to 2):
|
| 87 |
+
The number of experts to route per token; can also be interpreted as the `top-k` routing
|
| 88 |
+
parameter
|
| 89 |
+
num_local_experts (`int`, *optional*, defaults to 8):
|
| 90 |
+
Number of experts per Sparse MLP layer.
|
| 91 |
+
output_router_logits (`bool`, *optional*, defaults to `False`):
|
| 92 |
+
Whether or not the router logits should be returned by the model. Enabling this will also
|
| 93 |
+
allow the model to output the auxiliary loss.
|
| 94 |
+
router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
|
| 95 |
+
The aux loss factor for the total loss.
|
| 96 |
+
router_jitter_noise (`float`, *optional*, defaults to 0.0):
|
| 97 |
+
Amount of noise to add to the router.
|
| 98 |
+
|
| 99 |
+
```python
|
| 100 |
+
>>> from transformers import MiniMaxM2Model, MiniMaxM2Config
|
| 101 |
+
|
| 102 |
+
>>> # Initializing a MiniMaxM2 7B style configuration
|
| 103 |
+
>>> configuration = MiniMaxM2Config()
|
| 104 |
+
|
| 105 |
+
>>> # Initializing a model from the MiniMaxM2 7B style configuration
|
| 106 |
+
>>> model = MiniMaxM2Model(configuration)
|
| 107 |
+
|
| 108 |
+
>>> # Accessing the model configuration
|
| 109 |
+
>>> configuration = model.config
|
| 110 |
+
```"""
|
| 111 |
+
|
| 112 |
+
model_type = "minimax_m2"
|
| 113 |
+
keys_to_ignore_at_inference = ["past_key_values"]
|
| 114 |
+
base_model_tp_plan = {
|
| 115 |
+
"layers.*.self_attn.q_proj": "colwise",
|
| 116 |
+
"layers.*.self_attn.k_proj": "colwise",
|
| 117 |
+
"layers.*.self_attn.v_proj": "colwise",
|
| 118 |
+
"layers.*.self_attn.o_proj": "rowwise",
|
| 119 |
+
"layers.*.block_sparse_moe.gate": "colwise_rep", # we need to replicate here to correctly route experts
|
| 120 |
+
"layers.*.block_sparse_moe.experts.*.w1": "colwise",
|
| 121 |
+
"layers.*.block_sparse_moe.experts.*.w2": "rowwise",
|
| 122 |
+
"layers.*.block_sparse_moe.experts.*.w3": "colwise",
|
| 123 |
+
}
|
| 124 |
+
base_model_pp_plan = {
|
| 125 |
+
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
| 126 |
+
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
| 127 |
+
"norm": (["hidden_states"], ["hidden_states"]),
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
def __init__(
|
| 131 |
+
self,
|
| 132 |
+
vocab_size=32000,
|
| 133 |
+
hidden_size=4096,
|
| 134 |
+
intermediate_size=14336,
|
| 135 |
+
num_hidden_layers=32,
|
| 136 |
+
num_attention_heads=32,
|
| 137 |
+
num_key_value_heads=8,
|
| 138 |
+
head_dim=None,
|
| 139 |
+
hidden_act="silu",
|
| 140 |
+
max_position_embeddings=4096 * 32,
|
| 141 |
+
initializer_range=0.02,
|
| 142 |
+
rms_norm_eps=1e-5,
|
| 143 |
+
use_cache=True,
|
| 144 |
+
pad_token_id=None,
|
| 145 |
+
bos_token_id=1,
|
| 146 |
+
eos_token_id=2,
|
| 147 |
+
tie_word_embeddings=False,
|
| 148 |
+
rope_theta=1e6,
|
| 149 |
+
sliding_window=None,
|
| 150 |
+
attention_dropout=0.0,
|
| 151 |
+
num_experts_per_tok=2,
|
| 152 |
+
num_local_experts=8,
|
| 153 |
+
output_router_logits=False,
|
| 154 |
+
router_aux_loss_coef=0.001,
|
| 155 |
+
router_jitter_noise=0.0,
|
| 156 |
+
**kwargs,
|
| 157 |
+
):
|
| 158 |
+
self.vocab_size = vocab_size
|
| 159 |
+
self.max_position_embeddings = max_position_embeddings
|
| 160 |
+
self.hidden_size = hidden_size
|
| 161 |
+
self.intermediate_size = intermediate_size
|
| 162 |
+
self.num_hidden_layers = num_hidden_layers
|
| 163 |
+
self.num_attention_heads = num_attention_heads
|
| 164 |
+
self.sliding_window = sliding_window
|
| 165 |
+
|
| 166 |
+
# for backward compatibility
|
| 167 |
+
if num_key_value_heads is None:
|
| 168 |
+
num_key_value_heads = num_attention_heads
|
| 169 |
+
|
| 170 |
+
self.num_key_value_heads = num_key_value_heads
|
| 171 |
+
self.hidden_act = hidden_act
|
| 172 |
+
self.initializer_range = initializer_range
|
| 173 |
+
self.rms_norm_eps = rms_norm_eps
|
| 174 |
+
self.use_cache = use_cache
|
| 175 |
+
self.rope_theta = rope_theta
|
| 176 |
+
self.attention_dropout = attention_dropout
|
| 177 |
+
self.head_dim = head_dim
|
| 178 |
+
|
| 179 |
+
self.num_experts_per_tok = num_experts_per_tok
|
| 180 |
+
self.num_local_experts = num_local_experts
|
| 181 |
+
self.output_router_logits = output_router_logits
|
| 182 |
+
self.router_aux_loss_coef = router_aux_loss_coef
|
| 183 |
+
self.router_jitter_noise = router_jitter_noise
|
| 184 |
+
|
| 185 |
+
self.use_qk_norm = kwargs.pop("use_qk_norm", False)
|
| 186 |
+
self.rotary_dim = kwargs.pop("rotary_dim", self.head_dim)
|
| 187 |
+
self.partial_rotary_factor = kwargs.pop("partial_rotary_factor", 1)
|
| 188 |
+
if self.head_dim is not None:
|
| 189 |
+
self.partial_rotary_factor = self.rotary_dim / self.head_dim
|
| 190 |
+
|
| 191 |
+
super().__init__(
|
| 192 |
+
pad_token_id=pad_token_id,
|
| 193 |
+
bos_token_id=bos_token_id,
|
| 194 |
+
eos_token_id=eos_token_id,
|
| 195 |
+
tie_word_embeddings=tie_word_embeddings,
|
| 196 |
+
**kwargs,
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
__all__ = ["MiniMaxM2Config"]
|
generation_config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 200019,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": 200020,
|
| 5 |
+
"top_k": 40,
|
| 6 |
+
"top_p": 0.95,
|
| 7 |
+
"transformers_version": "4.57.3"
|
| 8 |
+
}
|
merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model-00000-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ab3425a58e19c5937397c2e650fa5c2eca31d8510854b85f9f1f86038e61d01
|
| 3 |
+
size 2635579896
|
model-00001-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2216aa0d02dc9f9b24af9f43044af5c1cf6c1afa667b3d1c7390e0be59d825b5
|
| 3 |
+
size 679579608
|
model-00002-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:397e30f6587539cf2d8ab054333e02c89a95308b68c5fec1f657cd5c815546e7
|
| 3 |
+
size 1406386576
|
model-00003-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c23c95eeaab36e3041ddf9c1d931f97bf5fcdc77ad3f3720a9fde9f1c10fcc30
|
| 3 |
+
size 679579608
|
model-00004-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f8007a0d220e3cad06535d61e9af55319a7c8e16f5f3f7a6a0d536634a2c053f
|
| 3 |
+
size 1406386576
|
model-00005-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8357379b03591695f22dd8623e85f9ee344dc50e2adaac9182446fa6678451c9
|
| 3 |
+
size 679579608
|
model-00006-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2304172be40d611560eec65b92a8e67dbdd76db6242f4b1a8f70dbc0dc119ea
|
| 3 |
+
size 1406386576
|
model-00007-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b847fcbbfac12ea04d89aab6f9bf8fa9f8a0ca43afb037d10ce0b03761d88c96
|
| 3 |
+
size 679579608
|
model-00008-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49b7716786362a2bda42cc37e74de9246c0da8bae3b8dd541e223a4f4a80f9cc
|
| 3 |
+
size 1406386576
|
model-00009-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a7178b968a409f64445ddba49d5ac42ffca93d30a5271649285f8350a81b61a
|
| 3 |
+
size 679579608
|
model-00010-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5153cab8d648eee39c3164cb8ae60437a2865fd57788ace734d0a8e4452ae76a
|
| 3 |
+
size 1406386576
|
model-00011-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8989077984e9fd1036d9848373daf2b76c2def9da0a7e8db81b068b59a20465b
|
| 3 |
+
size 679579608
|
model-00012-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f361ed1b4fc43c71a0ec6df3298828cd5b39ff28486717663497b22e8d2cec3a
|
| 3 |
+
size 1406386576
|
model-00013-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c16d7af5f4a82c8a6f0879faf883ea1e21deff74f7b3477ed46909f9dab8bf0
|
| 3 |
+
size 679579608
|
model-00014-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95ee92624cc8f902a33a46d921babd224c00602736dc43b0d80438e62497b703
|
| 3 |
+
size 1406386576
|
model-00015-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fac258378c41b643c8a6533ddfb19f80f28b2e4ad895b161a9259951044d573f
|
| 3 |
+
size 679579608
|
model-00016-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0de568683b7f823d37b706beab45de331687399dd1e07afc94c7dd8dc725dca2
|
| 3 |
+
size 1406386576
|
model-00017-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6c5b094ae52fa3cbd8cbcaf955cf49bc5a8b262ae21f7bc0b98397717f609c1
|
| 3 |
+
size 679579608
|
model-00018-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db9b60e07f20da7bbccfcc7869b25f9502330fda3c9600ee555d2c1fc8c39771
|
| 3 |
+
size 1406386576
|
model-00019-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e4663e6ea7dac4dd0b49bde8eca5e2c3b96d2913c0997371a383cd80855ec14
|
| 3 |
+
size 679579608
|
model-00020-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97de867f6abe61daaa1feae6767c1d3c8db4c2e888124b834cd3bd9a32b269c6
|
| 3 |
+
size 1406388128
|
model-00021-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70996131b5f441319a24e5b04fffd21fcf16fccf16246ef1d04c5094d481ac07
|
| 3 |
+
size 679580376
|
model-00022-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f23f667295c6e4623859e5e9e68554a48c15614106fb076ffdef57a302ac70df
|
| 3 |
+
size 1406388128
|
model-00023-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe6d49a0a4da5c96c6e44f8bb3976b859cbd5e3f806144d19c7a5d2971bc3562
|
| 3 |
+
size 679580376
|
model-00024-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:615dfb553e208187841c3f49af967f1537d32ea7915e78873515819b9c10d203
|
| 3 |
+
size 1406388128
|
model-00025-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b631abe29100d76512c54b30e845ac077e19740d20d5384ec3576ca1c20981b
|
| 3 |
+
size 679580376
|
model-00026-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6aff9daaeaf1b44541286daba889264169b4327dff893e1e681ceeae0ef4b971
|
| 3 |
+
size 1406388128
|
model-00027-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c67d132dff423f512c986e461a13e4bc32debe8d479f9a37cbba8235051f14d0
|
| 3 |
+
size 679580376
|
model-00028-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b164fb2b5121eb460a1f450f98bce94dfffb8948c987634c0fbb6db789ce1160
|
| 3 |
+
size 1406388128
|
model-00029-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0fb7b93d05a8853e03ddc65070e73be1bbd1742c8da7b31ae05efe7e7af0d7b
|
| 3 |
+
size 679580376
|
model-00030-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6aac5cf99a35483dffef5b3f7c88446932cb99a5ada0eee3cfb42737b46c332f
|
| 3 |
+
size 1406388128
|
model-00031-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b00d17d98b3340d90acb59bd7bd333623dc86b9ed0bbafcef1c08a5afe2084dc
|
| 3 |
+
size 679580376
|
model-00032-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:65791b5f25d12df6cc5b96528445190f9d041ca9a8f5ada6d205c4c2d4f2abbe
|
| 3 |
+
size 1406388128
|
model-00033-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17ba972e688ad1dea052a414993664ffd1cf7d977d86bba977176bd88cf8e616
|
| 3 |
+
size 679580376
|
model-00034-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c2b98a81ee0bebf55808833fef1f75664402ade20952fe14f618adc6c13b953
|
| 3 |
+
size 1406388128
|
model-00035-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9bc11127c7dd9cb7731a6045edac6eabb26c378ce7990d14de83507bea841dff
|
| 3 |
+
size 679580376
|
model-00036-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ee6346f9fd286195a7801240a5d334bee6e234714d7965f46c057a0cc41fe5c
|
| 3 |
+
size 1406388128
|
model-00037-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60ad909251a4af09afe66716632e80cd70df295f6c12d6e7ffa86d5037b6e208
|
| 3 |
+
size 679580376
|
model-00038-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8fd4b5d62565d0f41a75b22b43ede83d170fd013da124c1b1878bf5e995ad68d
|
| 3 |
+
size 1406388128
|
model-00039-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:172835f6c4dee07b34d92de53c942e27066e2b61bfad2e3cef892d652245f9ab
|
| 3 |
+
size 679580376
|
model-00040-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8fb479c10ef909d12ee2cfcc73a6661104a24c9c30f6f42df2945213a857572b
|
| 3 |
+
size 1406388128
|
model-00041-of-00126.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c0abbd93045df3f1409ed3a8e18bfce814123d3c2d52108a3bd67c57f72e27dc
|
| 3 |
+
size 679580376
|