#!/usr/bin/env bash
# Source: GLM-5.2-NVFP4-REAP-Recall-N172 / serve_glm52_reap_recall.sh
# (Hugging Face file-viewer residue, kept as comments so the script parses:)
# brandonmusic's picture
# Upload serve_glm52_reap_recall.sh with huggingface_hub
# 68a218d verified
# Raw
# History Blame Contribute Delete
# 9.53 kB
# =============================================================================
# GLM-5.2-NVFP4-REAP-Recall (N=172) — serve on 4x RTX PRO 6000 (sm120) / TP4
# Model: https://huggingface.co/brandonmusic/GLM-5.2-NVFP4-REAP-Recall-N172
# Image: docker.io/verdictai/gloriousluminousmonotheism:latest
# Recipe: https://github.com/brandonmmusic-max/GLM-5.2-Reap
#
# This is the EXACT verified config (Brandon Music, 2026-06-21):
# util 0.95, DCP=4, MTP3, max-num-batched-tokens=2048, gtopk=1, nvfp4_ds_mla KV
# Measured on 4x RTX PRO 6000 96GB:
# KV pool : 542,857 tokens (max_conc@250K = 2.17x)
# Single-user : 80.7 t/s decode @ 0 ctx (synth padded text, MTP3 acc=3.87)
# ~58 t/s decode on natural English (Marbury essay, x5 runs)
# Aggregate @ C=4 : 218.6 t/s @ 128K ctx
# Prefill 128K : 1,477 t/s (87s TTFT for full 128K)
# VRAM peak : 97.93% (no headroom waste, no OOM in benchmark)
#
# Hardware fixes carried over (from GLM-5.2-REAP-fixes):
# 1) MOE_A16=1 (B12X_MOE_FORCE_A16) for long-ctx correctness (w4a4 accumulates error)
# 2) HF_OVERRIDES.index_topk_pattern: DSA sparse-indexer pattern (derived from config.json)
# 3) NCCL: unset NCCL_GRAPH_FILE inside the container
# 4) -cc.cudagraph_mode=PIECEWISE (NOT JSON form): required for long-context decode.
# Under FULL cudagraph capture, the CuTe-DSL JIT machinery's cooperative-grid
# spin-barrier deadlocks because it's invoked from a captured stream — the
# _dcp_pack_topk_candidates_kernel JIT-compiles at inference time on the first
# long-prompt decode and hangs (sample_tokens RPC times out). PIECEWISE breaks
# the graph at vllm::sparse_attn_indexer (already in splitting_ops) so the
# indexer runs eagerly between captured pieces. The JSON form
# `--compilation-config '{"cudagraph_mode":"PIECEWISE"}'` silently drops to
# None — the CLI shortcut `-cc.cudagraph_mode=PIECEWISE` is the only working form.
# =============================================================================
set -euo pipefail

# Every knob below may be overridden from the environment; a value that is
# unset OR empty falls back to the default shown.

# --- Model location -----------------------------------------------------------
# MODEL_HOST_DIR is the host directory that CONTAINS the model folder (it is the
# directory bind-mounted at /models-archive by the launch step); the model
# itself is expected at $MODEL_HOST_DIR/$MODEL_DIRNAME. Fetch it with:
#   huggingface-cli download brandonmusic/GLM-5.2-NVFP4-REAP-Recall-N172 \
#     --local-dir /path/to/models/GLM-5.2-NVFP4-REAP-Recall-N172
: "${MODEL_HOST_DIR:=$HOME/models}"
: "${MODEL_DIRNAME:=GLM-5.2-NVFP4-REAP-Recall-N172}"
# Path of the model as seen from INSIDE the container.
: "${MODEL_PATH:=/models-archive/$MODEL_DIRNAME}"

# --- Serving identity ---------------------------------------------------------
: "${IMAGE:=verdictai/gloriousluminousmonotheism:latest}"
: "${NAME:=glm52-recall}"
: "${SERVED_NAME:=glm-5.2-nvfp4}"
: "${PORT:=9402}"

# --- Verified parallelism / batch / KV config (the WORKING combo) --------------
# TP          : tensor-parallel size, one rank per RTX PRO 6000 96GB (4 cards)
# DCP         : decode-context-parallel size; set equal to TP for max KV
# GPU_UTIL    : 0.95 together with MAX_BATCHED=2048 leaves GPU1 ~1.1GB of
#               remap headroom
# MAXLEN      : well above 128K; still fits the 542K-token KV pool
# MAX_BATCHED : smaller chunks shrink the DCP-global-topk remap (576 -> 144 MiB)
# KV_DTYPE    : 4-bit MLA KV cache (+1.47x context vs fp8)
: "${TP:=4}"
: "${DCP:=4}"
: "${GPU_UTIL:=0.95}"
: "${MAXLEN:=250000}"
: "${MAX_SEQS:=16}"
: "${MAX_BATCHED:=2048}"
: "${KV_DTYPE:=nvfp4_ds_mla}"

# --- MTP / speculative decode --------------------------------------------------
# MTP      : 1 enables MTP speculative decoding, anything else disables it
# NUM_SPEC : speculative tokens per step; mean accept ~3.87 under sustained load
: "${MTP:=1}"
: "${NUM_SPEC:=3}"

# --- MoE / DSA / quantization knobs -------------------------------------------
# LINEAR_BACKEND  : must stay "auto" for MTP (b12x has no NVFP4 nextn eh_proj
#                   kernel)
# MOE_A16         : long-context fix (w4a16 MoE decode)
# DCP_GLOBAL_TOPK : DCP top-k remap, required whenever DCP>1
# SHARD_DRAFT     : shard the MTP/draft KV across DCP ranks
: "${MOE_BACKEND:=b12x}"
: "${LINEAR_BACKEND:=auto}"
: "${ATTN_BACKEND:=B12X_MLA_SPARSE}"
: "${SPARSE_INDEXER:=1}"
: "${FUSED_TOPK:=1}"
: "${MOE_A16:=1}"
: "${DCP_GLOBAL_TOPK:=1}"
: "${SHARD_DRAFT:=1}"
: "${GRAPH_CAP:=64}"

# --- Reasoning / parsing -------------------------------------------------------
: "${REASONING_PARSER:=glm45}"
: "${TOOL_PARSER:=glm47}"
# REASONING_CONFIG is opt-in and empty by default. Set it ONLY when a
# thinking_token_budget is needed: otherwise it overrides the glm45 parser's
# <think> priming and degrades thinking quality.
: "${REASONING_CONFIG:=}"

# --- DSA sparse-indexer pattern ------------------------------------------------
# One character per entry of the model's config.json "indexer_types"
# ('F' = full, 'S' = anything else). Regenerate with:
#   python -c "import json;c=json.load(open('config.json'));print(''.join('F' if t=='full' else 'S' for t in c['indexer_types']))"
: "${INDEX_PATTERN:=FFFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSS}"
# Build the default --hf-overrides JSON only when the caller supplied none.
[[ -n "${HF_OVERRIDES:-}" ]] ||
  printf -v HF_OVERRIDES '{"use_index_cache":true,"index_topk_pattern":"%s"}' "$INDEX_PATTERN"

# --- Caches --------------------------------------------------------------------
# Host directory mounted into the container at /cache; created on demand.
: "${CACHE_DIR:=$HOME/.cache/glm52-recall}"
mkdir -p "$CACHE_DIR"
# --- Preflight -----------------------------------------------------------------
# Before any container is touched, make sure that:
#   1. $IMAGE is available locally (it is pulled if not), and
#   2. $MODEL_HOST_DIR/$MODEL_DIRNAME exists on the host and holds at least one
#      *.safetensors file.
# On failure a FATAL message goes to stderr and the script exits with status 1.
echo "== preflight =="
if ! docker image inspect "$IMAGE" >/dev/null 2>&1; then
  echo "image not local; pulling $IMAGE ..."
  docker pull "$IMAGE"
fi

model_src="$MODEL_HOST_DIR/$MODEL_DIRNAME"
if [[ ! -d "$model_src" ]]; then
  {
    echo "FATAL: model dir missing: $model_src"
    echo " Download with:"
    echo " huggingface-cli download brandonmusic/GLM-5.2-NVFP4-REAP-Recall-N172 \\"
    echo " --local-dir $model_src"
  } >&2
  exit 1
fi

# Count weight shards with a glob instead of `ls … | wc -l`. This script runs
# under `set -euo pipefail`, so when nothing matches, `ls` exits non-zero, the
# pipeline fails, and the assignment aborts the script silently -- the FATAL
# message below would never be printed. nullglob makes a non-matching pattern
# expand to zero words instead of the literal pattern.
shopt -s nullglob
shards=("$model_src"/*.safetensors)
shopt -u nullglob
ST=${#shards[@]}
if (( ST < 1 )); then
  echo "FATAL: no safetensors in $model_src" >&2
  exit 1
fi

echo "image=$IMAGE model=$MODEL_DIRNAME util=$GPU_UTIL maxlen=$MAXLEN dcp=$DCP mtp=$MTP num_spec=$NUM_SPEC max_batched=$MAX_BATCHED moe_a16=$MOE_A16 gtopk=$DCP_GLOBAL_TOPK kv=$KV_DTYPE"
# --- Launch --------------------------------------------------------------------
# Starts the vLLM server in a detached container named $NAME.
#
# Container setup (see the `docker run` flags below):
#   * $MODEL_HOST_DIR is mounted read-only at /models-archive and $CACHE_DIR at
#     /cache.
#   * --network host: the server's --port $PORT is bound directly on the host.
#   * VLLM_USE_B12X_MOE is set to 1 only when MOE_BACKEND is exactly "b12x",
#     otherwise 0.
#   * NOTE(review): CUDA_VISIBLE_DEVICES is fixed at 0,1,2,3 while TP is
#     configurable -- raising TP above 4 presumably also needs this list
#     changed; confirm before using a different GPU count.
#
# Quoting of the inner script passed to `/bin/bash -lc "…"`:
#   * The whole script is ONE double-quoted string, so plain $VAR references
#     are expanded by THIS (host) shell before the container starts.
#   * \" and \$ are escaped so that they reach the container shell literally;
#     the three "${…_ARGS[@]}" array expansions are therefore evaluated inside
#     the container.
#   * Host values are spliced in between single quotes ('$VAR'); a value that
#     itself contains a single quote would break the inner script.
#
# Inner script, in order: unsets NCCL_GRAPH_FILE, NCCL_GRAPH_DUMP_FILE and
# VLLM_B12X_MLA_EXTEND_MAX_CHUNKS; builds SPEC_ARGS (only when MTP is "1"),
# HF_ARGS and PARSER_ARGS (each option added only when its value is non-empty);
# then exec's `vllm serve` so the server replaces the shell as the container's
# main process.

# Remove any previous container with the same name. Output is discarded and a
# failure (e.g. no such container) is deliberately ignored.
docker rm -f "$NAME" >/dev/null 2>&1 || true
docker run -d --name "$NAME" \
--gpus all --runtime nvidia --ipc host --shm-size 32g --network host \
--ulimit memlock=-1 --ulimit stack=67108864 \
-v "$MODEL_HOST_DIR":/models-archive:ro -v "$CACHE_DIR":/cache \
-e CUDA_VISIBLE_DEVICES=0,1,2,3 -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUTE_DSL_ARCH=sm_120a \
-e HF_HUB_OFFLINE=1 -e NCCL_IB_DISABLE=1 -e NCCL_P2P_LEVEL=SYS -e NCCL_PROTO=LL,LL128,Simple \
-e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
-e VLLM_USE_AOT_COMPILE=1 -e VLLM_USE_BREAKABLE_CUDAGRAPH=0 \
-e VLLM_MEMORY_PROFILE_INCLUDE_ATTN=1 -e B12X_MHC_MAX_TOKENS=16384 -e VLLM_USE_FLASHINFER_SAMPLER=1 \
-e VLLM_USE_B12X_WO_PROJECTION=1 -e VLLM_USE_B12X_MHC=1 -e VLLM_USE_B12X_FP8_GEMM=1 \
-e VLLM_USE_B12X_MOE=$([ "$MOE_BACKEND" = "b12x" ] && echo 1 || echo 0) \
-e VLLM_USE_B12X_SPARSE_INDEXER=$SPARSE_INDEXER -e VLLM_USE_V2_MODEL_RUNNER=1 -e VLLM_USE_FUSED_MOE_GROUPED_TOPK=$FUSED_TOPK \
-e VLLM_PCIE_ALLREDUCE_BACKEND=b12x -e VLLM_ENABLE_PCIE_ALLREDUCE=1 \
-e B12X_MLA_SM120_UNIFIED=1 -e USES_B12X=True -e B12X_DENSE_SPLITK_TURBO=1 -e B12X_W4A16_TC_DECODE=1 \
-e B12X_MOE_FORCE_A16=$MOE_A16 \
-e VLLM_DCP_GLOBAL_TOPK=$DCP_GLOBAL_TOPK -e VLLM_DCP_SHARD_DRAFT=$SHARD_DRAFT \
"$IMAGE" \
/bin/bash -lc "
set -euo pipefail
unset NCCL_GRAPH_FILE NCCL_GRAPH_DUMP_FILE VLLM_B12X_MLA_EXTEND_MAX_CHUNKS
SPEC_ARGS=()
if [ '$MTP' = '1' ]; then
SPEC_ARGS=(--speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":$NUM_SPEC,\"draft_sample_method\":\"probabilistic\",\"moe_backend\":\"$MOE_BACKEND\",\"use_local_argmax_reduction\":true}')
fi
HF_ARGS=()
if [ -n '$HF_OVERRIDES' ]; then HF_ARGS=(--hf-overrides '$HF_OVERRIDES'); fi
PARSER_ARGS=()
if [ -n '$REASONING_PARSER' ]; then PARSER_ARGS+=(--reasoning-parser '$REASONING_PARSER'); fi
if [ -n '$TOOL_PARSER' ]; then PARSER_ARGS+=(--tool-call-parser '$TOOL_PARSER' --enable-auto-tool-choice); fi
if [ -n '$REASONING_CONFIG' ]; then PARSER_ARGS+=(--reasoning-config '$REASONING_CONFIG'); fi
cd /
exec /opt/venv/bin/python -m vllm.entrypoints.cli.main serve '$MODEL_PATH' \
--served-model-name '$SERVED_NAME' --host 0.0.0.0 --port '$PORT' \
--cpu-offload-gb 0 \
--kv-cache-dtype '$KV_DTYPE' --block-size 256 --load-format safetensors \
--tensor-parallel-size '$TP' --decode-context-parallel-size '$DCP' --moe-backend '$MOE_BACKEND' --linear-backend '$LINEAR_BACKEND' \
--gpu-memory-utilization '$GPU_UTIL' --max-model-len '$MAXLEN' --max-num-seqs '$MAX_SEQS' \
--enable-chunked-prefill --max-num-batched-tokens '$MAX_BATCHED' \
--max-cudagraph-capture-size '$GRAPH_CAP' --attention-backend '$ATTN_BACKEND' \
-cc.cudagraph_mode=PIECEWISE \
--compilation-config '{\"custom_ops\":[\"all\"]}' \
--enable-flashinfer-autotune \
\"\${HF_ARGS[@]}\" \"\${PARSER_ARGS[@]}\" \"\${SPEC_ARGS[@]}\"
"
# --- Post-launch hints ---------------------------------------------------------
# Print a one-line summary of the launched configuration followed by
# copy-pasteable commands for tailing the boot log and probing the API.
echo "Launched $NAME (tp=$TP util=$GPU_UTIL maxlen=$MAXLEN dcp=$DCP mtp=$MTP num_spec=$NUM_SPEC batched=$MAX_BATCHED)"
echo "watch boot: docker logs -f $NAME"
echo "smoke test: curl -s http://127.0.0.1:$PORT/v1/models"
# The explicit Content-Type header is required: `curl -d` alone sends
# application/x-www-form-urlencoded, which the OpenAI-compatible chat endpoint
# does not accept as a JSON request body.
echo "completion: curl -s http://127.0.0.1:$PORT/v1/chat/completions -H 'Content-Type: application/json' -d '{\"model\":\"$SERVED_NAME\",\"messages\":[{\"role\":\"user\",\"content\":\"what is the capital of kentucky?\"}]}'"