#!/usr/bin/env bash
# =============================================================================
# GLM-5.2-NVFP4-REAP-Recall (N=172) — serve on 4x RTX PRO 6000 (sm120) / TP4
#   Model:  https://huggingface.co/brandonmusic/GLM-5.2-NVFP4-REAP-Recall-N172
#   Image:  docker.io/verdictai/gloriousluminousmonotheism:latest
#   Recipe: https://github.com/brandonmmusic-max/GLM-5.2-Reap
#
# This is the EXACT verified config (Brandon Music, 2026-06-21):
#   util 0.95, DCP=4, MTP3, max-num-batched-tokens=2048, gtopk=1, nvfp4_ds_mla KV
# Measured on 4x RTX PRO 6000 96GB:
#   KV pool         : 542,857 tokens (max_conc@250K = 2.17x)
#   Single-user     : 80.7 t/s decode @ 0 ctx (synth padded text, MTP3 acc=3.87)
#                     ~58 t/s decode on natural English (Marbury essay, x5 runs)
#   Aggregate @ C=4 : 218.6 t/s @ 128K ctx
#   Prefill 128K    : 1,477 t/s (87s TTFT for full 128K)
#   VRAM peak       : 97.93% (no headroom waste, no OOM in benchmark)
#
# Hardware fixes carried over (from GLM-5.2-REAP-fixes):
#   1) MOE_A16=1 (B12X_MOE_FORCE_A16) for long-ctx correctness (w4a4 accumulates error)
#   2) HF_OVERRIDES.index_topk_pattern: DSA sparse-indexer pattern (derived from config.json)
#   3) NCCL: unset NCCL_GRAPH_FILE inside the container
#   4) -cc.cudagraph_mode=PIECEWISE (NOT JSON form): required for long-context decode.
#      Under FULL cudagraph capture, the CuTe-DSL JIT machinery's cooperative-grid
#      spin-barrier deadlocks because it's invoked from a captured stream — the
#      _dcp_pack_topk_candidates_kernel JIT-compiles at inference time on the first
#      long-prompt decode and hangs (sample_tokens RPC times out). PIECEWISE breaks
#      the graph at vllm::sparse_attn_indexer (already in splitting_ops) so the
#      indexer runs eagerly between captured pieces. The JSON form
#      `--compilation-config '{"cudagraph_mode":"PIECEWISE"}'` silently drops to
#      None — the CLI shortcut `-cc.cudagraph_mode=PIECEWISE` is the only working form.
#
# Usage: every setting below can be overridden from the environment, e.g.
#   MODEL_HOST_DIR=/data/models PORT=9000 ./this-script.sh
# =============================================================================
set -euo pipefail

# Print a fatal message (plus optional follow-up lines) to stderr and exit 1.
die() {
  printf 'FATAL: %s\n' "$1" >&2
  shift
  if (($#)); then
    printf '%s\n' "$@" >&2
  fi
  exit 1
}

# --- Required: path to the downloaded HF model directory on the host ----------
# Download with:
#   huggingface-cli download brandonmusic/GLM-5.2-NVFP4-REAP-Recall-N172 \
#     --local-dir /path/to/models/GLM-5.2-NVFP4-REAP-Recall-N172
MODEL_HOST_DIR="${MODEL_HOST_DIR:-$HOME/models}"
MODEL_DIRNAME="${MODEL_DIRNAME:-GLM-5.2-NVFP4-REAP-Recall-N172}"
# In-container path: MODEL_HOST_DIR is bind-mounted read-only at /models-archive.
MODEL_PATH="${MODEL_PATH:-/models-archive/$MODEL_DIRNAME}"

# --- Serving identity ---------------------------------------------------------
IMAGE="${IMAGE:-verdictai/gloriousluminousmonotheism:latest}"
NAME="${NAME:-glm52-recall}"
SERVED_NAME="${SERVED_NAME:-glm-5.2-nvfp4}"
PORT="${PORT:-9402}"

# --- Verified parallelism / batch / KV config (the WORKING combo) --------------
TP="${TP:-4}"                         # 4x RTX PRO 6000 96GB
DCP="${DCP:-4}"                       # decode-context-parallel = TP for max KV
GPU_UTIL="${GPU_UTIL:-0.95}"          # 0.95 with batched=2048 leaves GPU1 ~1.1GB remap headroom
MAXLEN="${MAXLEN:-250000}"            # well above 128K; fits the 542K KV pool
MAX_SEQS="${MAX_SEQS:-16}"
MAX_BATCHED="${MAX_BATCHED:-2048}"    # smaller chunks halve DCP-global-topk remap (576 -> 144 MiB)
KV_DTYPE="${KV_DTYPE:-nvfp4_ds_mla}"  # 4-bit MLA KV cache (+1.47x context vs fp8)

# --- MTP / speculative decode --------------------------------------------------
MTP="${MTP:-1}"                       # MTP on/off (only the exact value "1" enables it)
NUM_SPEC="${NUM_SPEC:-3}"             # 3 speculative tokens; mean accept ~3.87 under sustained load

# --- MoE / DSA / quantization knobs -------------------------------------------
MOE_BACKEND="${MOE_BACKEND:-b12x}"
LINEAR_BACKEND="${LINEAR_BACKEND:-auto}"     # required for MTP (b12x has no NVFP4 nextn eh_proj kernel)
ATTN_BACKEND="${ATTN_BACKEND:-B12X_MLA_SPARSE}"
SPARSE_INDEXER="${SPARSE_INDEXER:-1}"
FUSED_TOPK="${FUSED_TOPK:-1}"
MOE_A16="${MOE_A16:-1}"                      # LONG-CTX FIX (w4a16 MoE decode)
DCP_GLOBAL_TOPK="${DCP_GLOBAL_TOPK:-1}"      # DCP top-k remap (required with DCP>1)
SHARD_DRAFT="${SHARD_DRAFT:-1}"              # shard MTP/draft KV across DCP ranks
GRAPH_CAP="${GRAPH_CAP:-64}"

# --- Reasoning / parsing -------------------------------------------------------
REASONING_PARSER="${REASONING_PARSER:-glm45}"
TOOL_PARSER="${TOOL_PARSER:-glm47}"
# REASONING_CONFIG is opt-in: ONLY set it for thinking_token_budget; otherwise it
# CLOBBERS the glm45 parser's priming and degrades thinking quality. Default OFF.
REASONING_CONFIG="${REASONING_CONFIG:-}"

# --- DSA sparse-indexer pattern. Derived from the model's config.json ----------
#   python -c "import json;c=json.load(open('config.json'));print(''.join('F' if t=='full' else 'S' for t in c['indexer_types']))"
INDEX_PATTERN="${INDEX_PATTERN:-FFFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSSFSSS}"
# An explicitly supplied (non-empty) HF_OVERRIDES wins over the generated default.
if [[ -z "${HF_OVERRIDES:-}" ]]; then
  HF_OVERRIDES='{"use_index_cache":true,"index_topk_pattern":"'"$INDEX_PATTERN"'"}'
fi

# --- Caches --------------------------------------------------------------------
CACHE_DIR="${CACHE_DIR:-$HOME/.cache/glm52-recall}"
mkdir -p "$CACHE_DIR"

# --- Preflight -----------------------------------------------------------------
echo "== preflight =="
command -v docker >/dev/null 2>&1 || die "docker not found in PATH"

if ! docker image inspect "$IMAGE" >/dev/null 2>&1; then
  echo "image not local; pulling $IMAGE ..."
  docker pull "$IMAGE"
fi

model_dir="$MODEL_HOST_DIR/$MODEL_DIRNAME"
[[ -d "$model_dir" ]] || die "model dir missing: $model_dir" \
  "  Download with:" \
  "    huggingface-cli download brandonmusic/GLM-5.2-NVFP4-REAP-Recall-N172 \\" \
  "      --local-dir $model_dir"

# Count weight shards with a glob instead of `ls | wc -l`: under
# `set -e -o pipefail` a failing `ls` (no matches) aborted the script at the
# assignment, before the FATAL message below could ever be printed.
shopt -s nullglob
shards=("$model_dir"/*.safetensors)
shopt -u nullglob
((${#shards[@]} >= 1)) || die "no safetensors in $model_dir"

# NUM_SPEC is spliced unquoted into the speculative-config JSON, so reject
# anything that is not a plain non-negative integer before launching.
if [[ "$MTP" == "1" && ! "$NUM_SPEC" =~ ^[0-9]+$ ]]; then
  die "NUM_SPEC must be a non-negative integer (got: '$NUM_SPEC')"
fi

echo "image=$IMAGE model=$MODEL_DIRNAME util=$GPU_UTIL maxlen=$MAXLEN dcp=$DCP mtp=$MTP num_spec=$NUM_SPEC max_batched=$MAX_BATCHED moe_a16=$MOE_A16 gtopk=$DCP_GLOBAL_TOPK kv=$KV_DTYPE"

# --- Build the `vllm serve` argument vector ------------------------------------
# The arguments are assembled here as an array and handed to the container as
# positional parameters, so values containing quotes or spaces (JSON overrides,
# paths) reach vLLM verbatim instead of being re-parsed inside a string-built
# shell script.
serve_args=(
  "$MODEL_PATH"
  --served-model-name "$SERVED_NAME" --host 0.0.0.0 --port "$PORT"
  --cpu-offload-gb 0
  --kv-cache-dtype "$KV_DTYPE" --block-size 256 --load-format safetensors
  --tensor-parallel-size "$TP" --decode-context-parallel-size "$DCP"
  --moe-backend "$MOE_BACKEND" --linear-backend "$LINEAR_BACKEND"
  --gpu-memory-utilization "$GPU_UTIL" --max-model-len "$MAXLEN" --max-num-seqs "$MAX_SEQS"
  --enable-chunked-prefill --max-num-batched-tokens "$MAX_BATCHED"
  --max-cudagraph-capture-size "$GRAPH_CAP" --attention-backend "$ATTN_BACKEND"
  # cudagraph mode goes through the -cc shortcut (see header, fix 4).
  -cc.cudagraph_mode=PIECEWISE
  # NOTE(review): this JSON flag only carries custom_ops; it is kept exactly as
  # in the verified config. Confirm it does not reset cudagraph_mode if the
  # flag order or vLLM version changes.
  --compilation-config '{"custom_ops":["all"]}'
  --enable-flashinfer-autotune
  # HF_OVERRIDES is always non-empty at this point (defaulted above).
  --hf-overrides "$HF_OVERRIDES"
)

if [[ -n "$REASONING_PARSER" ]]; then
  serve_args+=(--reasoning-parser "$REASONING_PARSER")
fi
if [[ -n "$TOOL_PARSER" ]]; then
  serve_args+=(--tool-call-parser "$TOOL_PARSER" --enable-auto-tool-choice)
fi
if [[ -n "$REASONING_CONFIG" ]]; then
  serve_args+=(--reasoning-config "$REASONING_CONFIG")
fi
if [[ "$MTP" == "1" ]]; then
  spec_config='{"method":"mtp","num_speculative_tokens":'"$NUM_SPEC"',"draft_sample_method":"probabilistic","moe_backend":"'"$MOE_BACKEND"'","use_local_argmax_reduction":true}'
  serve_args+=(--speculative-config "$spec_config")
fi

# VLLM_USE_B12X_MOE mirrors the MoE backend choice.
if [[ "$MOE_BACKEND" == "b12x" ]]; then
  use_b12x_moe=1
else
  use_b12x_moe=0
fi

# Script run inside the container. It is a fixed literal (no host-side
# interpolation); the serve arguments arrive as "$@".
# shellcheck disable=SC2016  # "$@" must expand inside the container, not here
container_script='
set -euo pipefail
unset NCCL_GRAPH_FILE NCCL_GRAPH_DUMP_FILE VLLM_B12X_MLA_EXTEND_MAX_CHUNKS
cd /
exec /opt/venv/bin/python -m vllm.entrypoints.cli.main serve "$@"
'

# --- Launch --------------------------------------------------------------------
# Best-effort removal of a previous container with the same name; failure
# (e.g. no such container) is expected and intentionally ignored.
docker rm -f "$NAME" >/dev/null 2>&1 || true

docker run -d --name "$NAME" \
  --gpus all --runtime nvidia --ipc host --shm-size 32g --network host \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  -v "$MODEL_HOST_DIR":/models-archive:ro -v "$CACHE_DIR":/cache \
  -e CUDA_VISIBLE_DEVICES=0,1,2,3 -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e CUTE_DSL_ARCH=sm_120a \
  -e HF_HUB_OFFLINE=1 -e NCCL_IB_DISABLE=1 -e NCCL_P2P_LEVEL=SYS -e NCCL_PROTO=LL,LL128,Simple \
  -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  -e VLLM_USE_AOT_COMPILE=1 -e VLLM_USE_BREAKABLE_CUDAGRAPH=0 \
  -e VLLM_MEMORY_PROFILE_INCLUDE_ATTN=1 -e B12X_MHC_MAX_TOKENS=16384 -e VLLM_USE_FLASHINFER_SAMPLER=1 \
  -e VLLM_USE_B12X_WO_PROJECTION=1 -e VLLM_USE_B12X_MHC=1 -e VLLM_USE_B12X_FP8_GEMM=1 \
  -e "VLLM_USE_B12X_MOE=$use_b12x_moe" \
  -e "VLLM_USE_B12X_SPARSE_INDEXER=$SPARSE_INDEXER" -e VLLM_USE_V2_MODEL_RUNNER=1 \
  -e "VLLM_USE_FUSED_MOE_GROUPED_TOPK=$FUSED_TOPK" \
  -e VLLM_PCIE_ALLREDUCE_BACKEND=b12x -e VLLM_ENABLE_PCIE_ALLREDUCE=1 \
  -e B12X_MLA_SM120_UNIFIED=1 -e USES_B12X=True -e B12X_DENSE_SPLITK_TURBO=1 -e B12X_W4A16_TC_DECODE=1 \
  -e "B12X_MOE_FORCE_A16=$MOE_A16" \
  -e "VLLM_DCP_GLOBAL_TOPK=$DCP_GLOBAL_TOPK" -e "VLLM_DCP_SHARD_DRAFT=$SHARD_DRAFT" \
  "$IMAGE" \
  /bin/bash -lc "$container_script" bash "${serve_args[@]}"

echo "Launched $NAME (tp=$TP util=$GPU_UTIL maxlen=$MAXLEN dcp=$DCP mtp=$MTP num_spec=$NUM_SPEC batched=$MAX_BATCHED)"
echo "watch boot: docker logs -f $NAME"
echo "smoke test: curl -s http://127.0.0.1:$PORT/v1/models"
# curl -d defaults to form-encoding; the JSON content type is stated explicitly
# so the example request is accepted as a JSON body.
echo "completion: curl -s http://127.0.0.1:$PORT/v1/chat/completions -H 'Content-Type: application/json' -d '{\"model\":\"$SERVED_NAME\",\"messages\":[{\"role\":\"user\",\"content\":\"what is the capital of kentucky?\"}]}'"