#!/usr/bin/env bash
# Build Kimi K2.6 JANGTQ_3L (3-bit MXTQ routed + fp16 everything else).
# Optionally with REAP-30 routing-aware expert pruning before quantization.
#
# Self-contained: also installs the jang_tools patches needed to make Kimi
# K2.6 + 3-bit work (per-row pack/unpack + in_features inference fix).
#
# Layout:
#   Source download:       /Volumes/Backup/Kimi-K2.6/source   (~2.05 TB)
#   REAP-30 pruned source: /Volumes/Backup/Kimi-K2.6/REAP-30  (~1.4 TB, optional)
#   Calibration scratch:   /Volumes/Backup/Kimi-K2.6/calib    (~few GB)
#   Final bundle:          /Users/spotted/.cache/huggingface/hub/deviad/Kimi-K2.6-JANGTQ_3L
#
# Why 3L:
#   - 1L/2L use 2-bit routed experts → collapse loops on long generations.
#   - 3L uses 3-bit MXTQ routed + fp16 non-routed → ~3.3 bpw avg, no collapse.
#   - Fits a 512 GB M3 Ultra at ~420 GB raw, or ~295 GB after REAP-30 pruning.
#
# Two pipelines (both start with the `check` stage):
#   all         check -> apply-patches -> download -> convert -> finalize -> patch           (~420 GB)
#   all-pruned  check -> apply-patches -> download -> prune -> convert -> finalize -> patch  (~295 GB)

set -euo pipefail

BACKUP_ROOT="/Volumes/Backup"
KIMI_ROOT="$BACKUP_ROOT/Kimi-K2.6"
SRC_DIR="$KIMI_ROOT/source"
PRUNED_DIR="$KIMI_ROOT/REAP-30"
CALIB_DIR="$KIMI_ROOT/calib"
CALIB_JSONL="$CALIB_DIR/calib_v2.jsonl"
TOKENS_PT="$CALIB_DIR/tokens.safetensors"
PRUNE_PLAN="$CALIB_DIR/prune_plan_30.json"

CACHE_ROOT="/Users/spotted/.cache/huggingface/hub/deviad"
OUT_DIR="$CACHE_ROOT/Kimi-K2.6-JANGTQ_3L"

# Both of these may be overridden from the environment.
HF_REPO="${HF_REPO:-moonshotai/Kimi-K2.6}"
KIMI_2L_DIR="${KIMI_2L_DIR:-/Users/spotted/.cache/huggingface/hub/JANGQ-AI/Kimi-K2.6-Small-JANGTQ}"

PYTHON="/Applications/vMLX.app/Contents/Resources/bundled-python/python/bin/python3"
HF_CLI="/Applications/vMLX.app/Contents/Resources/bundled-python/python/bin/hf"

# jang_tools patch targets (Kimi K2.6 + 3-bit needs per-row pack/unpack + correct in_features)
JANG_PKG="/Applications/vMLX.app/Contents/Resources/bundled-python/python/lib/python3.12/site-packages/jang_tools"
JANG_LINEAR="$JANG_PKG/turboquant/linear.py"
JANG_LOADER="$JANG_PKG/load_jangtq.py"

#######################################
# Print the stage list and the resolved output bundle / repo.
# Globals:  HF_REPO, OUT_DIR (read)
# Outputs:  help text on stdout
# NOTE(review): the first lines of this help text (everything before the
# `all` pipeline's "apply-patches") were reconstructed; confirm the wording.
#######################################
usage() {
  cat <<EOF
Usage: $0 [stage]   (default stage: all)

Full pipelines:
  all               check -> apply-patches -> download -> convert -> finalize -> patch (~420 GB)
  all-pruned        check -> apply-patches -> download -> prune -> convert -> finalize -> patch (~295 GB)

Individual stages:
  check             Verify volumes, free space, tooling, repo accessibility
  apply-patches     Idempotently apply 3 jang_tools patches required for Kimi K2.6 3-bit
  rollback-patches  Restore .jang3l.bak backups created by apply-patches
  download          hf download $HF_REPO -> \$SRC_DIR
  prune             REAP-30 pipeline: build_calib_v2 -> tokenize_calib -> jangreap -> prune
  convert           jang_tools.kimi_prune.convert_kimi_jangtq --profile 3L
  finalize          Copy tokenizer / chat_template into output bundle
  patch             Kimi-correct eos_token_id list + eos_token string
  serve             Launch vmlx_engine.cli serve

Output bundle: $OUT_DIR
Repo: $HF_REPO (override with HF_REPO=... env var)
EOF
}

#######################################
# Pre-flight checks: Backup volume mounted, free space on Backup and $HOME,
# bundled python / hf CLI executable, required jang_tools modules importable,
# and $HF_REPO reachable. Creates $KIMI_ROOT, $CACHE_ROOT and $CALIB_DIR.
# Globals:  BACKUP_ROOT, HOME, PYTHON, HF_CLI, HF_REPO, KIMI_ROOT,
#           CACHE_ROOT, CALIB_DIR (read)
# Returns:  exits 1 with an "ERROR: ..." line on the first failed check
#######################################
check_step() {
  local mount_table backup_free_gb cache_free_gb mod probe_dir

  echo "==> Sanity checks"

  # Capture first, then grep: `mount | grep -q` can fail under pipefail when
  # grep exits early and mount receives SIGPIPE.
  mount_table=$(mount)
  if ! grep -qF -- "on $BACKUP_ROOT " <<<"$mount_table"; then
    echo "ERROR: $BACKUP_ROOT not mounted."; exit 1
  fi

  backup_free_gb=$(df -g "$BACKUP_ROOT" | awk 'NR==2 {print $4}')
  cache_free_gb=$(df -g "$HOME" | awk 'NR==2 {print $4}')
  # A non-numeric value would make `[ -lt ]` error out, which `if` treats as
  # "enough space"; reject it explicitly instead.
  if ! [[ "$backup_free_gb" =~ ^[0-9]+$ && "$cache_free_gb" =~ ^[0-9]+$ ]]; then
    echo "ERROR: could not parse free space from df output."; exit 1
  fi
  echo " Free on Backup: ${backup_free_gb} GB (need ~2200 raw, ~3600 if pruning)"
  echo " Free on ~/.cache: ${cache_free_gb} GB (need ~500)"
  if [ "$backup_free_gb" -lt 2200 ]; then
    echo "ERROR: insufficient Backup free space."; exit 1
  fi
  if [ "$cache_free_gb" -lt 500 ]; then
    echo "ERROR: insufficient ~/.cache free space."; exit 1
  fi

  if [ ! -x "$PYTHON" ] || [ ! -x "$HF_CLI" ]; then
    echo "ERROR: bundled python/hf CLI missing."; exit 1
  fi

  for mod in jang_tools.kimi_prune.convert_kimi_jangtq \
             jang_tools.kimi_prune.build_calib_v2 \
             jang_tools.kimi_prune.tokenize_calib \
             jang_tools.kimi_prune.jangreap \
             jang_tools.kimi_prune.prune \
             jang_tools.turboquant.linear \
             jang_tools.load_jangtq; do
    if ! "$PYTHON" -c "import $mod" 2>/dev/null; then
      echo "ERROR: $mod not importable from bundled python."; exit 1
    fi
  done

  echo " Verifying repo $HF_REPO is reachable..."
  # Probe by fetching only config.json into a private temp dir, removed on
  # both the success and the failure path.
  probe_dir=$(mktemp -d "${TMPDIR:-/tmp}/kimi_repo_probe.XXXXXX")
  if ! "$HF_CLI" download "$HF_REPO" --include "config.json" \
      --local-dir "$probe_dir" --max-workers 1 >/dev/null 2>&1; then
    rm -rf -- "$probe_dir"
    echo "ERROR: cannot reach $HF_REPO. Check name or run \`hf auth login\`."
    exit 1
  fi
  rm -rf -- "$probe_dir"

  mkdir -p "$KIMI_ROOT" "$CACHE_ROOT" "$CALIB_DIR"
  echo " OK"
}

#######################################
# Apply the three jang_tools source patches (per-row pack, per-row unpack,
# in_features taken from the existing module) to $JANG_LINEAR / $JANG_LOADER.
# A file that already contains the marker "# JANG3L_PATCH_v1" is skipped; a
# file that already contains the patched code but no marker only gets the
# marker stamped. A "<file>.jang3l.bak" backup is written once before a file
# is modified.
# The blocks to replace are located by their token sequence: any run of
# whitespace in the pattern matches any run of whitespace in the file, and
# the replacement is re-indented to the matched block's own indentation.
# Globals:  PYTHON, JANG_LINEAR, JANG_LOADER, JANG_PKG (read)
# Returns:  exits 1 if a target file is missing or a block cannot be found
#######################################
apply_jang_patches_step() {
  echo "==> Applying jang_tools patches (idempotent)"
  if [ ! -f "$JANG_LINEAR" ] || [ ! -f "$JANG_LOADER" ]; then
    echo "ERROR: jang_tools files not found at $JANG_PKG"; exit 1
  fi
  "$PYTHON" - "$JANG_LINEAR" "$JANG_LOADER" <<'PY'
import pathlib
import re
import sys
import textwrap

linear_path = pathlib.Path(sys.argv[1])
loader_path = pathlib.Path(sys.argv[2])

MARKER = "# JANG3L_PATCH_v1"


def backup_once(p: pathlib.Path):
    """Copy p to <p>.jang3l.bak unless that backup already exists."""
    bak = p.with_suffix(p.suffix + ".jang3l.bak")
    if not bak.exists():
        bak.write_bytes(p.read_bytes())
        print(f" backup: {bak}")


def ws_regex(snippet):
    """Regex for `snippet` starting on its own line, with every whitespace
    run in the snippet matching any whitespace run in the target."""
    body = r"\s+".join(re.escape(tok) for tok in snippet.split())
    return re.compile(r"^([ \t]*)" + body, re.MULTILINE)


def has_block(text, snippet):
    """True if `snippet` occurs in `text`, ignoring whitespace layout."""
    return ws_regex(snippet).search(text) is not None


def swap_block(text, old, new):
    """Replace the first occurrence of `old` with `new`, indenting `new` to
    the indentation of the matched block. Returns None when `old` is absent."""
    m = ws_regex(old).search(text)
    if m is None:
        return None
    indent = m.group(1)
    lines = textwrap.dedent(new).strip("\n").split("\n")
    block = "\n".join(indent + ln if ln.strip() else ln for ln in lines)
    return text[:m.start()] + block + text[m.end():]


def stamp_marker(text, anchor):
    """Append MARKER after the first `anchor`; prepend it if anchor is absent."""
    if anchor in text:
        return text.replace(anchor, f"{anchor} {MARKER}", 1)
    return f"{MARKER}\n{text}"


# Signature snippets that prove the patches are already applied (with or without marker)
PATCHED_SIG_1 = """
if in_feat % vals_per_u32 == 0:
    packed = pack_bits(indices.reshape(-1), bits).reshape(out_feat, -1)
else:
    row_packed = [pack_bits(indices[r], bits) for r in range(out_feat)]
"""
PATCHED_SIG_2 = """
for r in range(self.out_features):
    row_idx = unpack_bits(packed_e[r], self.bits, self.in_features)
"""
PATCHED_SIG_3 = """
existing_in = getattr(existing, "in_features", None)
if existing_in is None:
    try:
        existing_in = int(existing.input_dims)
"""

# Patch 1: per-row pack fallback in tq_quantize_weight
P1_OLD = """
# Vectorized pack — 117x faster than per-row loop. Safe because all
# GLM/MiniMax/Qwen in_features are divisible by vals_per_u32 (32/bits):
# in_feat=6144/2048 and bits=2/3/4 → no per-row padding needed, so
# flattening before pack gives bit-identical output. Verified on
# (2048, 6144) 2-bit: max abs diff 0 vs per-row; see
# `/tmp/pack_bits_vectorized_test.py`.
vals_per_u32 = 32 // bits
assert in_feat % vals_per_u32 == 0, (
    f"tq_quantize_weight vectorized pack assumes in_feat "
    f"({in_feat}) divisible by vals_per_u32 ({vals_per_u32}); "
    f"fall back to per-row pack if this asserts."
)
packed = pack_bits(indices.reshape(-1), bits).reshape(out_feat, -1)
"""
# Same block without the leading explanatory comment.
P1_ALT_OLD = """
vals_per_u32 = 32 // bits
assert in_feat % vals_per_u32 == 0, (
    f"tq_quantize_weight vectorized pack assumes in_feat "
    f"({in_feat}) divisible by vals_per_u32 ({vals_per_u32}); "
    f"fall back to per-row pack if this asserts."
)
packed = pack_bits(indices.reshape(-1), bits).reshape(out_feat, -1)
"""
P1_NEW = """
# JANG3L_PATCH_v1: per-row pack fallback for Kimi K2.6 3-bit (in_feat=7168/2048
# not divisible by vals_per_u32=10). Whole-tensor flatten+pack+reshape misaligns
# row boundaries; per-row pack pads each row's tail correctly. See per-row
# unpack patch below in _dequant_experts.
vals_per_u32 = 32 // bits
if in_feat % vals_per_u32 == 0:
    packed = pack_bits(indices.reshape(-1), bits).reshape(out_feat, -1)
else:
    row_packed = [pack_bits(indices[r], bits) for r in range(out_feat)]
    packed = mx.stack(row_packed)
"""

# Patch 2: per-row unpack in _dequant_experts
P2_OLD = """
def _dequant_experts(self, expert_indices) -> mx.array:
    \"\"\"Dequant selected experts. Returns (n_selected, out, in) float.\"\"\"
    results = []
    for e in expert_indices:
        packed_e = self.packed[e]
        n_el = self.out_features * self.in_features
        idx = unpack_bits(packed_e.reshape(-1), self.bits, n_el)
        idx = idx.reshape(self.out_features, self.in_features)
        w = mx.take(self.codebook, idx.astype(mx.uint32))
        w = w * self.norms[e][:, None].astype(w.dtype)
        w = hadamard_inverse(w, self.signs)
        results.append(w)
    return mx.stack(results)
"""
P2_NEW = """
def _dequant_experts(self, expert_indices) -> mx.array:
    \"\"\"Dequant selected experts. Returns (n_selected, out, in) float.

    Per-row unpack handles tensors where in_features isn't divisible by
    vals_per_u32 (each packed row has trailing pad bits that must not bleed
    into the next row's data).
    \"\"\"
    # JANG3L_PATCH_v1: per-row unpack to match per-row pack format.
    results = []
    for e in expert_indices:
        packed_e = self.packed[e]  # (out_features, packed_cols)
        rows = []
        for r in range(self.out_features):
            row_idx = unpack_bits(packed_e[r], self.bits, self.in_features)
            rows.append(row_idx)
        idx = mx.stack(rows)
        w = mx.take(self.codebook, idx.astype(mx.uint32))
        w = w * self.norms[e][:, None].astype(w.dtype)
        w = hadamard_inverse(w, self.signs)
        results.append(w)
    return mx.stack(results)
"""

# Patch 3: in_features from the module being replaced (load_jangtq.py)
P3_OLD = """
if packed.ndim == 3:
    n_exp, out_feat, packed_cols = packed.shape
    in_features = packed_cols * vals_per_u32
    new_module = TurboQuantSwitchLinear(
        in_features=in_features, out_features=out_feat,
        num_experts=n_exp, bits=bits, bias=False, seed=mxtq_seed,
    )
else:
    out_feat, packed_cols = packed.shape
    in_features = packed_cols * vals_per_u32
    new_module = TurboQuantLinear(
        in_features=in_features, out_features=out_feat,
        bits=bits, bias=False, seed=mxtq_seed,
    )
"""
P3_NEW = """
# JANG3L_PATCH_v1: prefer existing module's input dim. When the converter
# used per-row pack (in_feat % vals_per_u32 != 0), packed_cols is ceil and
# the inferred value would overshoot by the row pad (e.g. 7168 -> 7170).
# mlx_lm SwitchLinear exposes `input_dims` (property), Linear uses
# `in_features` — try both before falling back.
existing_in = getattr(existing, "in_features", None)
if existing_in is None:
    try:
        existing_in = int(existing.input_dims)
    except (AttributeError, TypeError):
        existing_in = None
if packed.ndim == 3:
    n_exp, out_feat, packed_cols = packed.shape
    in_features = existing_in if existing_in is not None else packed_cols * vals_per_u32
    new_module = TurboQuantSwitchLinear(
        in_features=in_features, out_features=out_feat,
        num_experts=n_exp, bits=bits, bias=False, seed=mxtq_seed,
    )
else:
    out_feat, packed_cols = packed.shape
    in_features = existing_in if existing_in is not None else packed_cols * vals_per_u32
    new_module = TurboQuantLinear(
        in_features=in_features, out_features=out_feat,
        bits=bits, bias=False, seed=mxtq_seed,
    )
"""

# ------------ patch 1+2: linear.py ------------
text = linear_path.read_text(encoding="utf-8")

if MARKER in text:
    print(f" {linear_path.name}: already patched, skipping")
elif has_block(text, PATCHED_SIG_1) and has_block(text, PATCHED_SIG_2):
    backup_once(linear_path)
    linear_path.write_text(stamp_marker(text, "import mlx.core as mx"), encoding="utf-8")
    print(f" {linear_path.name}: patches already present (manual edit) — marker stamped")
else:
    backup_once(linear_path)
    patched = swap_block(text, P1_OLD, P1_NEW)
    if patched is None:
        patched = swap_block(text, P1_ALT_OLD, P1_NEW)
    if patched is None:
        print(" ERROR: linear.py: could not find tq_quantize_weight pack block")
        sys.exit(1)
    patched = swap_block(patched, P2_OLD, P2_NEW)
    if patched is None:
        print(" ERROR: linear.py: could not find _dequant_experts block")
        sys.exit(1)
    # Written only after both blocks were found, so the file is never left
    # half-patched.
    linear_path.write_text(patched, encoding="utf-8")
    print(f" {linear_path.name}: patched (per-row pack + per-row unpack)")

# ------------ patch 3: load_jangtq.py ------------
text = loader_path.read_text(encoding="utf-8")

if MARKER in text:
    print(f" {loader_path.name}: already patched, skipping")
elif has_block(text, PATCHED_SIG_3):
    backup_once(loader_path)
    # Stamp marker near the top of the file
    loader_path.write_text(stamp_marker(text, "import gc"), encoding="utf-8")
    print(f" {loader_path.name}: patch already present (manual edit) — marker stamped")
else:
    backup_once(loader_path)
    patched = swap_block(text, P3_OLD, P3_NEW)
    if patched is None:
        print(" ERROR: load_jangtq.py: could not find module replacement block")
        sys.exit(1)
    loader_path.write_text(patched, encoding="utf-8")
    print(f" {loader_path.name}: patched (in_features uses existing module)")

print(" All patches applied.")
PY
}
# ---------------------------------------------------------------------------
# Pipeline stages (rollback / download / prune / convert / finalize / patch /
# serve) followed by the command dispatcher.
#
# NOTE(review): this region has lost its original line breaks, and two spans
# of text are missing outright. In finalize_step the text after
# `"$PYTHON" - <` jumps straight into the "Applying Kimi-correct
# eos_token_id ..." message, and the next `"$PYTHON" - <` is followed by the
# tail of a comment listing turn tokens. The inline Python of finalize_step,
# the `patch_step() {` header and the start of patch_step's Python (whatever
# defines out, tj, gc, model_type and KIMI_TURN_IDS) are therefore not
# present here. Restore them from version control; they cannot be inferred
# from what remains.
#
# rollback_jang_patches_step
#   For each of $JANG_LINEAR and $JANG_LOADER: if "<file>.jang3l.bak" exists,
#   copy it over the file and delete the backup; otherwise report and skip.
# download_step
#   mkdir -p $SRC_DIR, then `hf download $HF_REPO --local-dir $SRC_DIR` with
#   8 workers under `caffeinate -dimsu`; prints the resulting directory size.
# prune_step
#   Exits 1 unless $SRC_DIR is non-empty and $PRUNED_DIR is absent or empty.
#   Runs build_calib_v2, tokenize_calib and jangreap, each skipped when its
#   output file already exists, then always runs prune into $PRUNED_DIR.
#   The progress labels go [1/4], [2a/4], [2b/4], [4/4]; there is no [3/4].
# convert_step
#   Exits 1 if $OUT_DIR is non-empty. Uses $PRUNED_DIR as the source when it
#   is non-empty, else $SRC_DIR, and runs convert_kimi_jangtq --profile 3L.
# finalize_step (visible part)
#   For each listed tokenizer/config file missing from $OUT_DIR, copies it
#   from $KIMI_2L_DIR if present there, else from $SRC_DIR; a file found in
#   neither place is skipped without a message.
# patch_step (visible part)
#   Collects the ids of the `wanted` turn-boundary tokens from the
#   "added_tokens" list of the JSON file `tj` (presumably tokenizer.json —
#   its definition is in the missing span). If none are found and model_type
#   is "kimi_k25", falls back to KIMI_TURN_IDS. Writes the ids (a list, or a
#   single int when there is only one) as eos_token_id into `gc` (presumably
#   generation_config.json — confirm). Then rewrites eos_token in
#   tokenizer_config.json to "<|im_end|>" when the model is kimi_k25 and the
#   value differs, or when the value is "[EOS]".
# serve_step
#   `pkill -f vmlx_engine.cli` (failure ignored), sleeps 2 s, then execs
#   `vmlx_engine.cli serve $OUT_DIR` bound to 127.0.0.1:8012.
# Dispatcher
#   Stage is $1, defaulting to "all". `download` and `convert` run check_step
#   first; `all` / `all-pruned` run check_step once at the start. Unknown
#   stages print usage and exit 1.
#   NOTE(review): check_step exits when the Backup volume has < 2200 GB
#   free, and `convert` calls it too — confirm that threshold is intended
#   for a standalone `convert` run after the ~2.05 TB source is on disk.
# ---------------------------------------------------------------------------
rollback_jang_patches_step() { echo "==> Rolling back jang_tools patches" for f in "$JANG_LINEAR" "$JANG_LOADER"; do bak="$f.jang3l.bak" if [ -f "$bak" ]; then cp "$bak" "$f" rm "$bak" echo " restored: $f" else echo " no backup found: $bak (skipping)" fi done } download_step() { echo "==> Downloading $HF_REPO into $SRC_DIR (~2.05 TB)" mkdir -p "$SRC_DIR" caffeinate -dimsu \ "$HF_CLI" download "$HF_REPO" \ --local-dir "$SRC_DIR" \ --max-workers 8 echo " Source size: $(du -sh "$SRC_DIR" | cut -f1)" } prune_step() { echo "==> REAP-30 routing-aware expert pruning" echo " Approximate timings on M3 Ultra 512 GB:" echo " [1/4] build_calib_v2 ~5 min" echo " [2a/4] tokenize_calib ~5 min" echo " [2b/4] jangreap (REAP) ~5-6 hours (streaming, ~25 GB peak)" echo " [4/4] prune ~30 min (rewrites bf16 shards)" echo if [ ! -d "$SRC_DIR" ] || [ -z "$(ls -A "$SRC_DIR" 2>/dev/null)" ]; then echo "ERROR: source not downloaded yet. Run \`$0 download\` first."; exit 1 fi if [ -d "$PRUNED_DIR" ] && [ -n "$(ls -A "$PRUNED_DIR" 2>/dev/null)" ]; then echo "ERROR: $PRUNED_DIR already exists and is non-empty. 
Remove or move it."; exit 1 fi mkdir -p "$CALIB_DIR" "$PRUNED_DIR" if [ -f "$CALIB_JSONL" ] && [ -f "$CALIB_DIR/calib_v2.summary.json" ]; then echo " [1/4] build_calib_v2 SKIPPED (already exists: $(du -h "$CALIB_JSONL" | cut -f1))" else echo " [1/4] build_calib_v2 -> $CALIB_JSONL (default 8.6M tokens)" "$PYTHON" -m jang_tools.kimi_prune.build_calib_v2 \ --out "$CALIB_JSONL" \ --seed 42 fi if [ -f "$TOKENS_PT" ]; then echo " [2a/4] tokenize_calib SKIPPED (already exists: $(du -h "$TOKENS_PT" | cut -f1))" else echo " [2a/4] tokenize_calib -> $TOKENS_PT" "$PYTHON" -m jang_tools.kimi_prune.tokenize_calib \ --corpus "$CALIB_JSONL" \ --tokenizer-path "$SRC_DIR" \ --out "$TOKENS_PT" fi if [ -f "$PRUNE_PLAN" ]; then echo " [2b/4] jangreap SKIPPED (already exists)" else echo " [2b/4] jangreap (streaming REAP-30, ~25 GB peak) -> $PRUNE_PLAN" caffeinate -dimsu \ "$PYTHON" -m jang_tools.kimi_prune.jangreap \ --model "$SRC_DIR" \ --tokens "$TOKENS_PT" \ --out-dir "$CALIB_DIR" \ --ratios 0.30 \ --device mps fi echo " [4/4] prune -> $PRUNED_DIR" caffeinate -dimsu \ "$PYTHON" -m jang_tools.kimi_prune.prune \ --src "$SRC_DIR" \ --dst "$PRUNED_DIR" \ --plan "$PRUNE_PLAN" echo " Pruned source at: $PRUNED_DIR ($(du -sh "$PRUNED_DIR" | cut -f1))" } convert_step() { echo "==> Converting -> JANGTQ_3L (profile=3L)" if [ -d "$OUT_DIR" ] && [ -n "$(ls -A "$OUT_DIR" 2>/dev/null)" ]; then echo "ERROR: $OUT_DIR is non-empty. 
Remove it first."; exit 1 fi mkdir -p "$OUT_DIR" src_to_use="$SRC_DIR" if [ -d "$PRUNED_DIR" ] && [ -n "$(ls -A "$PRUNED_DIR" 2>/dev/null)" ]; then echo " pruned source detected -> using $PRUNED_DIR" src_to_use="$PRUNED_DIR" fi caffeinate -dimsu \ "$PYTHON" -m jang_tools.kimi_prune.convert_kimi_jangtq \ --src "$src_to_use" \ --dst "$OUT_DIR" \ --profile 3L echo " Output size: $(du -sh "$OUT_DIR" | cut -f1)" } finalize_step() { echo "==> Finalizing bundle (tokenizer + chat_template)" for f in tokenizer.json tokenizer_config.json chat_template.jinja generation_config.json tiktoken.model tokenization_kimi.py configuration_kimi_k25.py configuration_deepseek.py; do if [ ! -f "$OUT_DIR/$f" ]; then if [ -f "$KIMI_2L_DIR/$f" ]; then cp "$KIMI_2L_DIR/$f" "$OUT_DIR/$f" echo " copied $f from Kimi-2L bundle" elif [ -f "$SRC_DIR/$f" ]; then cp "$SRC_DIR/$f" "$OUT_DIR/$f" echo " copied $f from source" fi fi done "$PYTHON" - < Applying Kimi-correct eos_token_id + chat_template fixes" "$PYTHON" - <, <|im_user|>, <|im_assistant|> ids = [] if tj.exists(): t = json.loads(tj.read_text()) wanted = {"<|im_end|>", "<|endoftext|>", "<|eot_id|>", "[EOS]", "<|im_user|>", "<|im_assistant|>"} for tok in t.get("added_tokens", []): if tok["content"] in wanted: ids.append(tok["id"]) ids = sorted(set(ids)) if not ids and model_type == "kimi_k25": ids = list(KIMI_TURN_IDS) print(f" Kimi K2.6 detected — using known turn-boundary IDs {ids}") if not ids: print(" WARN: no turn-boundary tokens found — leaving eos_token_id as-is") else: d = json.loads(gc.read_text()) if gc.exists() else {} d["eos_token_id"] = ids if len(ids) > 1 else ids[0] gc.write_text(json.dumps(d, indent=2, ensure_ascii=False)) print(f" eos_token_id = {d['eos_token_id']}") # Make sure tokenizer_config has the correct eos_token string for Kimi. 
tc = out / "tokenizer_config.json" if tc.exists(): d = json.loads(tc.read_text()) eos = d.get("eos_token") eos_str = eos["content"] if isinstance(eos, dict) else eos if model_type == "kimi_k25" and eos_str != "<|im_end|>": d["eos_token"] = "<|im_end|>" tc.write_text(json.dumps(d, indent=2, ensure_ascii=False)) print(f" fixed eos_token: {eos_str!r} -> '<|im_end|>'") elif eos_str == "[EOS]": d["eos_token"] = "<|im_end|>" tc.write_text(json.dumps(d, indent=2, ensure_ascii=False)) print(" fixed eos_token: [EOS] -> <|im_end|>") PY } serve_step() { echo "==> Serving $OUT_DIR" pkill -f vmlx_engine.cli || true sleep 2 exec "$PYTHON" -m vmlx_engine.cli serve \ "$OUT_DIR" \ --served-model-name kimi-k2.6-jangtq3l \ --host 127.0.0.1 --port 8012 \ --max-tokens 4096 \ --default-temperature 0.5 \ --default-top-p 0.9 \ --default-repetition-penalty 1.1 \ --tool-call-parser moonshot \ --enable-auto-tool-choice \ --log-level INFO } cmd="${1:-all}" case "$cmd" in check) check_step ;; apply-patches) apply_jang_patches_step ;; rollback-patches) rollback_jang_patches_step ;; download) check_step; download_step ;; prune) prune_step ;; convert) check_step; convert_step ;; finalize) finalize_step ;; patch) patch_step ;; serve) serve_step ;; all) check_step apply_jang_patches_step download_step convert_step finalize_step patch_step echo echo "Done. Bundle at: $OUT_DIR" echo "Serve with: $0 serve" ;; all-pruned) check_step apply_jang_patches_step download_step prune_step convert_step finalize_step patch_step echo echo "Done. Pruned + JANGTQ_3L bundle at: $OUT_DIR" echo "Serve with: $0 serve" ;; -h|--help|help) usage ;; *) usage; exit 1 ;; esac