Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

.gitattributes +0 -34
README.md +102 -0
config.json +83 -0
rank0.safetensors +3 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text


























1	*.safetensors filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,102 @@

+---
+license: llama3.2
+tags:
+  - tensorrt-llm
+  - nvfp4
+  - fp4
+  - kv-cache-quantization
+  - text-generation
+  - llama
+base_model: meta-llama/Llama-3.2-3B-Instruct
+---
+# Llama-3.2-3B-Instruct TensorRT-LLM checkpoint (NVFP4 weight + FP8 KV)
+TensorRT-LLM **checkpoint** for **Llama-3.2-3B-Instruct**, with **NVFP4 (W4A4)** weight quantization and **FP8** KV cache. Use with `trtllm-build` to produce an engine for inference.
+## Model details
+| Item | Value |
+|------|--------|
+| **Base model** | Llama-3.2-3B-Instruct |
+| **Framework** | TensorRT-LLM (checkpoint format) |
+| **Weight quantization** | NVFP4 (W4A4) |
+| **KV cache** | FP8 |
+| **Producer** | TensorRT-Model-Optimizer llm_ptq + TensorRT-LLM convert_checkpoint (--use_nvfp4, --fp8_kv_cache) |
+| **Architecture** | LlamaForCausalLM (decoder-only) |
+## Build (how to produce this checkpoint)
+NVFP4 requires a two-step pipeline: (1) run Model Optimizer llm_ptq to quantize the Hugging Face model to NVFP4; (2) run TensorRT-LLM convert_checkpoint with the PTQ output to produce this checkpoint.
+### 1. Environment and dependencies
+```bash
+sudo apt install git-lfs
+git lfs install
+pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com
+# Install TensorRT-Model-Optimizer (required for NVFP4 quantization)
+# See https://github.com/NVIDIA/TensorRT-Model-Optimizer
+```
+### 2. Quantize base model to NVFP4 (llm_ptq) and convert the checkpoint
+Clone the base model and run Model Optimizer's llm_ptq to produce an NVFP4-quantized HF-format directory. Then run TensorRT-LLM's `convert_checkpoint.py`, passing that PTQ output directory as `--model_dir`:
+```bash
+# Example: after llm_ptq has produced PTQ output (NVFP4 weights),
+# run convert_checkpoint with that directory as --model_dir:
+python TensorRT-LLM/examples/llama/convert_checkpoint.py \
+  --model_dir ./path/to/ptq_output \
+  --output_dir ./llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8 \
+  --dtype float16 \
+  --use_nvfp4 \
+  --fp8_kv_cache
+```
+### 3. Output
+After conversion, the directory passed as `--output_dir` contains `config.json` and `rank0.safetensors`; these two files make up the checkpoint in this repo.
+## Upload (how to upload to Hugging Face)
+```bash
+cd ./llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8
+huggingface-cli repo create rungalileo/llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8 --repo-type model
+huggingface-cli upload rungalileo/llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8 . --repo-type model
+```
+## How to use
+### 1. Build engine
+Requires [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) (the `tensorrt_llm` Python package) to be installed:
+```bash
+git clone https://huggingface.co/rungalileo/llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8
+cd llama-3.2-3b-instruct-trtllm-ckpt-wq_nvfp4-kv_fp8
+trtllm-build --checkpoint_dir . --output_dir ./engine \
+  --max_batch_size 1 --max_input_len 512 --max_seq_len 1024
+```
+### 2. Run inference
+Use the base model's tokenizer (e.g. `meta-llama/Llama-3.2-3B-Instruct`):
+```bash
+trtllm-serve ./engine --tokenizer meta-llama/Llama-3.2-3B-Instruct --port 8000
+# OpenAI-compatible API: http://localhost:8000/v1/completions
+```
+## Files in this repo
+- `config.json` – TensorRT-LLM model config
+- `rank0.safetensors` – Rank 0 weights (single-GPU)
+## References
+- [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM)
+- [Llama 3.2](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct)

config.json ADDED Viewed

	@@ -0,0 +1,83 @@

+{
+    "mlp_bias": false,
+    "attn_bias": false,
+    "rotary_base": 500000.0,
+    "rotary_scaling": {
+        "factor": 32.0,
+        "high_freq_factor": 4.0,
+        "low_freq_factor": 1.0,
+        "original_max_position_embeddings": 8192,
+        "rope_type": "llama3"
+    },
+    "residual_mlp": false,
+    "disable_weight_only_quant_plugin": false,
+    "moe": {
+        "num_experts": 0,
+        "shared_expert_intermediate_size": 0,
+        "top_k": 0,
+        "normalization_mode": null,
+        "sparse_mixer_epsilon": 0.01,
+        "tp_mode": 0,
+        "device_limited_n_group": 0,
+        "device_limited_topk_group": 0,
+        "device_limited_routed_scaling_factor": 1.0
+    },
+    "remove_duplicated_kv_heads": false,
+    "fc_after_embed": false,
+    "use_input_layernorm_in_first_layer": true,
+    "use_last_layernorm": true,
+    "layer_idx_offset": 0,
+    "embedding_multiplier": 1.0,
+    "attention_multiplier": 1.0,
+    "residual_multiplier": 1.0,
+    "output_multiplier_scale": 1.0,
+    "has_partial_lora_mask": false,
+    "architecture": "LlamaForCausalLM",
+    "dtype": "float16",
+    "vocab_size": 128256,
+    "hidden_size": 3072,
+    "num_hidden_layers": 28,
+    "num_attention_heads": 24,
+    "hidden_act": "silu",
+    "logits_dtype": "float32",
+    "norm_epsilon": 1e-05,
+    "runtime_defaults": null,
+    "position_embedding_type": "rope_gpt_neox",
+    "num_key_value_heads": 8,
+    "intermediate_size": 8192,
+    "max_position_embeddings": 131072,
+    "mapping": {
+        "world_size": 1,
+        "gpus_per_node": 8,
+        "cp_size": 1,
+        "tp_size": 1,
+        "pp_size": 1,
+        "moe_tp_size": 1,
+        "moe_cluster_size": 1,
+        "moe_ep_size": 1,
+        "attn_tp_size": -1,
+        "attn_cp_size": -1,
+        "cp_config": {},
+        "auto_parallel": false,
+        "enable_attention_dp": false,
+        "enable_lm_head_tp_in_adp": false
+    },
+    "quantization": {
+        "quant_algo": "NVFP4",
+        "kv_cache_quant_algo": "FP8",
+        "group_size": 128,
+        "smoothquant_val": 0.5,
+        "clamp_val": null,
+        "use_meta_recipe": false,
+        "has_zero_point": false,
+        "pre_quant_scale": false,
+        "exclude_modules": null,
+        "mamba_ssm_cache_dtype": null
+    },
+    "use_parallel_embedding": false,
+    "embedding_sharding_dim": 0,
+    "head_size": 128,
+    "qk_layernorm": false,
+    "rotary_embedding_dim": 128,
+    "tie_word_embeddings": true
+}

rank0.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e788dd862deaaeb31a65171f9af7ed98a3200cb38a20c44eed9b3eef4e4fb7cb
+size 3472310504