rungalileo committed on
Commit
83a16a6
·
verified ·
1 Parent(s): 6e9f9be

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. .gitattributes +0 -34
  2. README.md +102 -0
  3. config.json +83 -0
  4. rank0.safetensors +3 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
  *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: llama3.2
3
+ tags:
4
+ - tensorrt-llm
5
+ - nvfp4
6
+ - fp4
7
+ - kv-cache-quantization
8
+ - text-generation
9
+ - llama
10
+ base_model: meta-llama/Llama-3.2-3B-Instruct
11
+ ---
12
+
13
+ # Llama-3.2-3B-Instruct TensorRT-LLM checkpoint (NVFP4 weight + FP8 KV)
14
+
15
+ TensorRT-LLM **checkpoint** for **Llama-3.2-3B-Instruct**, with **NVFP4 (W4A4)** weight quantization and **FP8** KV cache. Use with `trtllm-build` to produce an engine for inference.
16
+
17
+ ## Model details
18
+
19
+ | Item | Value |
20
+ |------|--------|
21
+ | **Base model** | Llama-3.2-3B-Instruct |
22
+ | **Framework** | TensorRT-LLM (checkpoint format) |
23
+ | **Weight quantization** | NVFP4 (W4A4) |
24
+ | **KV cache** | FP8 |
25
+ | **Producer** | TensorRT-Model-Optimizer llm_ptq + TensorRT-LLM convert_checkpoint (--use_nvfp4, --fp8_kv_cache) |
26
+ | **Architecture** | LlamaForCausalLM (decoder-only) |
27
+
28
+ ## Build (how to produce this checkpoint)
29
+
30
+ NVFP4 requires a two-step pipeline: (1) run Model Optimizer llm_ptq to quantize the Hugging Face model to NVFP4; (2) run TensorRT-LLM convert_checkpoint with the PTQ output to produce this checkpoint.
31
+
32
+ ### 1. Environment and dependencies
33
+
34
+ ```bash
35
+ sudo apt install git-lfs
36
+ git lfs install
37
+
38
+ pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com
39
+ # Install TensorRT-Model-Optimizer (required for NVFP4 quantization)
40
+ # See https://github.com/NVIDIA/TensorRT-Model-Optimizer
41
+ ```
42
+
43
+ ### 2. Quantize base model to NVFP4 (llm_ptq) and convert the checkpoint
44
+
45
+ Clone the base model, then run Model Optimizer's llm_ptq on it to produce an NVFP4-quantized, HF-format output directory. Next, pass that directory to TensorRT-LLM's convert_checkpoint as `--model_dir`:
46
+
47
+ ```bash
48
+ # Example: after llm_ptq has produced PTQ output (NVFP4 weights),
49
+ # run convert_checkpoint with that directory as --model_dir:
50
+ python TensorRT-LLM/examples/llama/convert_checkpoint.py \
51
+ --model_dir ./path/to/ptq_output \
52
+ --output_dir ./llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8 \
53
+ --dtype float16 \
54
+ --use_nvfp4 \
55
+ --fp8_kv_cache
56
+ ```
57
+
58
+ ### 3. Output
59
+
60
+ After conversion, the directory given as `--output_dir` contains `config.json` and `rank0.safetensors`; these two files are the checkpoint published in this repo.
61
+
62
+ ## Upload (how to upload to Hugging Face)
63
+
64
+ ```bash
65
+ cd ./llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8
66
+
67
+ huggingface-cli repo create rungalileo/llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8 --repo-type model
68
+ huggingface-cli upload rungalileo/llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8 . --repo-type model
69
+ ```
70
+
71
+ ## How to use
72
+
73
+ ### 1. Build engine
74
+
75
+ Requires the [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) Python package (`tensorrt_llm`) to be installed:
76
+
77
+ ```bash
78
+ git clone https://huggingface.co/rungalileo/llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8
79
+ cd llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8
80
+
81
+ trtllm-build --checkpoint_dir . --output_dir ./engine \
82
+ --max_batch_size 1 --max_input_len 512 --max_seq_len 1024
83
+ ```
84
+
85
+ ### 2. Run inference
86
+
87
+ Use a tokenizer from the base model (e.g. `meta-llama/Llama-3.2-3B-Instruct`):
88
+
89
+ ```bash
90
+ trtllm-serve ./engine --tokenizer meta-llama/Llama-3.2-3B-Instruct --port 8000
91
+ # OpenAI-compatible API: http://localhost:8000/v1/completions
92
+ ```
93
+
94
+ ## Files in this repo
95
+
96
+ - `config.json` – TensorRT-LLM model config
97
+ - `rank0.safetensors` – Rank 0 weights (single-GPU)
98
+
99
+ ## References
100
+
101
+ - [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM)
102
+ - [Llama 3.2](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)
config.json ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mlp_bias": false,
3
+ "attn_bias": false,
4
+ "rotary_base": 500000.0,
5
+ "rotary_scaling": {
6
+ "factor": 32.0,
7
+ "high_freq_factor": 4.0,
8
+ "low_freq_factor": 1.0,
9
+ "original_max_position_embeddings": 8192,
10
+ "rope_type": "llama3"
11
+ },
12
+ "residual_mlp": false,
13
+ "disable_weight_only_quant_plugin": false,
14
+ "moe": {
15
+ "num_experts": 0,
16
+ "shared_expert_intermediate_size": 0,
17
+ "top_k": 0,
18
+ "normalization_mode": null,
19
+ "sparse_mixer_epsilon": 0.01,
20
+ "tp_mode": 0,
21
+ "device_limited_n_group": 0,
22
+ "device_limited_topk_group": 0,
23
+ "device_limited_routed_scaling_factor": 1.0
24
+ },
25
+ "remove_duplicated_kv_heads": false,
26
+ "fc_after_embed": false,
27
+ "use_input_layernorm_in_first_layer": true,
28
+ "use_last_layernorm": true,
29
+ "layer_idx_offset": 0,
30
+ "embedding_multiplier": 1.0,
31
+ "attention_multiplier": 1.0,
32
+ "residual_multiplier": 1.0,
33
+ "output_multiplier_scale": 1.0,
34
+ "has_partial_lora_mask": false,
35
+ "architecture": "LlamaForCausalLM",
36
+ "dtype": "float16",
37
+ "vocab_size": 128256,
38
+ "hidden_size": 3072,
39
+ "num_hidden_layers": 28,
40
+ "num_attention_heads": 24,
41
+ "hidden_act": "silu",
42
+ "logits_dtype": "float32",
43
+ "norm_epsilon": 1e-05,
44
+ "runtime_defaults": null,
45
+ "position_embedding_type": "rope_gpt_neox",
46
+ "num_key_value_heads": 8,
47
+ "intermediate_size": 8192,
48
+ "max_position_embeddings": 131072,
49
+ "mapping": {
50
+ "world_size": 1,
51
+ "gpus_per_node": 8,
52
+ "cp_size": 1,
53
+ "tp_size": 1,
54
+ "pp_size": 1,
55
+ "moe_tp_size": 1,
56
+ "moe_cluster_size": 1,
57
+ "moe_ep_size": 1,
58
+ "attn_tp_size": -1,
59
+ "attn_cp_size": -1,
60
+ "cp_config": {},
61
+ "auto_parallel": false,
62
+ "enable_attention_dp": false,
63
+ "enable_lm_head_tp_in_adp": false
64
+ },
65
+ "quantization": {
66
+ "quant_algo": "NVFP4",
67
+ "kv_cache_quant_algo": "FP8",
68
+ "group_size": 128,
69
+ "smoothquant_val": 0.5,
70
+ "clamp_val": null,
71
+ "use_meta_recipe": false,
72
+ "has_zero_point": false,
73
+ "pre_quant_scale": false,
74
+ "exclude_modules": null,
75
+ "mamba_ssm_cache_dtype": null
76
+ },
77
+ "use_parallel_embedding": false,
78
+ "embedding_sharding_dim": 0,
79
+ "head_size": 128,
80
+ "qk_layernorm": false,
81
+ "rotary_embedding_dim": 128,
82
+ "tie_word_embeddings": true
83
+ }
rank0.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e788dd862deaaeb31a65171f9af7ed98a3200cb38a20c44eed9b3eef4e4fb7cb
3
+ size 3472310504