// GGUF Validity Bridge (v0.9.1 anti-bullshit pack) // // The dozen GGUF/VRAM calculators on HF answer "does this quant fit in my GPU?". // None answer "does it fit AND still work?". This reads a .gguf file's metadata // header directly in the browser (HTTP Range — no full multi-GB download), pulls // rope_theta + context_length + quant scheme + head geometry, then runs TAF's // γ_Padé / d_horizon + the quant-regime γ-shift to emit a quality verdict: // "fits in VRAM but attention collapses past d_horizon, and Q4 worsens γ by …". // // Parser logic is pure; the network fetch is unavoidable I/O. main.js renders. import { gammaPade } from "./gamma_check.js"; import { dHorizon } from "./yarn_planner.js"; import { predictQuantShift } from "./quant_regime.js"; // ── GGUF metadata value types (spec v2/v3) ── const GT = { U8:0, I8:1, U16:2, I16:3, U32:4, I32:5, F32:6, BOOL:7, STR:8, ARR:9, U64:10, I64:11, F64:12 }; const FIXED_SIZE = { 0:1, 1:1, 2:2, 3:2, 4:4, 5:4, 6:4, 7:1, 10:8, 11:8, 12:8 }; // general.file_type enum (llama_ftype) → human label + the quant_regime scheme id // we feed to predictQuantShift. Only the common ones; filename parsing backstops. const FTYPE = { 0: ["F32", null], 1: ["F16", null], 2: ["Q4_0", "gguf_q4_km"], 3: ["Q4_1", "gguf_q4_km"], 7: ["Q8_0", "gguf_q8_0"], 8: ["Q5_0", "gguf_q5_km"], 9: ["Q5_1", "gguf_q5_km"], 10: ["Q2_K", "gguf_q2_k"], 11: ["Q3_K_S", "gguf_q3_km"], 12: ["Q3_K_M", "gguf_q3_km"], 13: ["Q3_K_L", "gguf_q3_km"], 14: ["Q4_K_S", "gguf_q4_km"], 15: ["Q4_K_M", "gguf_q4_km"], 16: ["Q5_K_S", "gguf_q5_km"], 17: ["Q5_K_M", "gguf_q5_km"], 18: ["Q6_K", "gguf_q8_0"], }; // Filename → (label, scheme) backstop when general.file_type is absent/ambiguous. 
/**
 * Guess the quantisation of a GGUF file from its filename.
 *
 * The name is upper-cased and scanned for known quant tags; the first tag that
 * occurs as a substring wins.
 *
 * @param {string|null|undefined} name  Filename (or path) of the .gguf file.
 * @returns {{label: string, scheme: (string|null)}} Matched tag and the
 *   quant-regime scheme id paired with it (null for F16/BF16/F32), or
 *   `{ label: "?", scheme: null }` when no tag matches.
 */
export function quantFromFilename(name) {
  const upper = String(name || "").toUpperCase();
  // Order matters because matching is by substring and first hit wins:
  // "BF16" contains "F16", so it has to be tested before "F16" or every BF16
  // file would be reported as F16.
  const pairs = [
    ["Q2_K", "gguf_q2_k"],
    ["Q3_K", "gguf_q3_km"],
    ["Q4_K", "gguf_q4_km"],
    ["Q5_K", "gguf_q5_km"],
    ["Q6_K", "gguf_q8_0"],
    ["Q8_0", "gguf_q8_0"],
    ["Q4_0", "gguf_q4_km"],
    ["Q4_1", "gguf_q4_km"],
    ["Q5_0", "gguf_q5_km"],
    ["Q5_1", "gguf_q5_km"],
    ["BF16", null],
    ["F16", null],
    ["F32", null],
  ];
  for (const [tag, scheme] of pairs) {
    if (upper.includes(tag)) return { label: tag, scheme };
  }
  return { label: "?", scheme: null };
}

/**
 * List the .gguf files in a HF repo (so the user can pick a quant).
 *
 * @param {string} repo  Repository id, e.g. "org/model". Each path segment is
 *   URL-encoded; the "/" separators are kept literal.
 * @returns {Promise<string[]>} Sorted `rfilename` values ending in ".gguf"
 *   (case-insensitive). Empty when the response has no `siblings` array.
 * @throws {Error} `HTTP <status> — repo not found or private` on a non-OK response.
 */
export async function listGgufFiles(repo) {
  const repoPath = encodeURIComponent(repo).replace(/%2F/g, "/");
  const resp = await fetch(`https://huggingface.co/api/models/${repoPath}`);
  if (!resp.ok) throw new Error(`HTTP ${resp.status} — repo not found or private`);
  const data = await resp.json();
  const siblings = Array.isArray(data.siblings) ? data.siblings : [];
  return siblings
    .map((entry) => entry.rfilename)
    .filter((file) => typeof file === "string" && /\.gguf$/i.test(file))
    .sort();
}

// Incremental Range-fetch reader. GGUF metadata sits at the file head; arch +
// rope fields precede the big tokenizer arrays, so a few MB always suffices.
/**
 * Sequential little-endian reader over a remote file, filled lazily with HTTP
 * Range requests of CHUNK bytes and capped at MAX bytes in total.
 *
 * State: `buf`/`dv` hold every byte fetched so far (from file offset 0),
 * `off` is the read cursor, `fetched` the number of bytes downloaded, and
 * `eof` is set once a response comes back empty or shorter than CHUNK.
 */
class GgufReader {
  /** @param {string} url  URL of the file to read. */
  constructor(url) {
    this.url = url;
    this.buf = new Uint8Array(0);
    this.dv = new DataView(this.buf.buffer);
    this.off = 0;
    this.fetched = 0;
    this.CHUNK = 1 << 20; // 1 MB per range
    this.MAX = 48 << 20; // hard cap 48 MB
    this.eof = false;
  }

  /**
   * Make sure `n` bytes are buffered at the cursor, fetching more chunks if needed.
   * @param {number} n  Bytes required starting at `off`.
   * @throws {Error} `HTTP <status>` on a failed request, or
   *   `gguf_metadata_too_large` when the bytes are unavailable (EOF or MAX reached).
   */
  async ensure(n) {
    while (this.off + n > this.buf.length && !this.eof && this.fetched < this.MAX) {
      const start = this.fetched;
      const end = Math.min(this.fetched + this.CHUNK, this.MAX) - 1;
      const resp = await fetch(this.url, { headers: { Range: `bytes=${start}-${end}` } });
      if (!resp.ok && resp.status !== 206 && resp.status !== 200) throw new Error(`HTTP ${resp.status}`);
      const part = new Uint8Array(await resp.arrayBuffer());
      if (part.length === 0) {
        this.eof = true;
        break;
      }
      // Append to a fresh buffer; the DataView must be rebuilt over the new storage.
      const merged = new Uint8Array(this.buf.length + part.length);
      merged.set(this.buf);
      merged.set(part, this.buf.length);
      this.buf = merged;
      this.dv = new DataView(this.buf.buffer);
      this.fetched += part.length;
      if (part.length < this.CHUNK) this.eof = true; // server returned the tail
    }
    if (this.off + n > this.buf.length) throw new Error("gguf_metadata_too_large");
  }

  async u8() { await this.ensure(1); return this.dv.getUint8(this.off++); }
  async i8() { await this.ensure(1); return this.dv.getInt8(this.off++); }
  async u16() { await this.ensure(2); const v = this.dv.getUint16(this.off, true); this.off += 2; return v; }
  async i16() { await this.ensure(2); const v = this.dv.getInt16(this.off, true); this.off += 2; return v; }
  async u32() { await this.ensure(4); const v = this.dv.getUint32(this.off, true); this.off += 4; return v; }
  async i32() { await this.ensure(4); const v = this.dv.getInt32(this.off, true); this.off += 4; return v; }
  async f32() { await this.ensure(4); const v = this.dv.getFloat32(this.off, true); this.off += 4; return v; }
  async f64() { await this.ensure(8); const v = this.dv.getFloat64(this.off, true); this.off += 8; return v; }

  // u64/i64 as Number — safe for counts/dims well under 2^53.
  async u64() {
    await this.ensure(8);
    const lo = this.dv.getUint32(this.off, true);
    const hi = this.dv.getUint32(this.off + 4, true);
    this.off += 8;
    return hi * 4294967296 + lo;
  }

  // The high word carries the sign, so it is read as a signed 32-bit value;
  // reusing u64() here would turn e.g. -1 into 2^64 - 1.
  async i64() {
    await this.ensure(8);
    const lo = this.dv.getUint32(this.off, true);
    const hi = this.dv.getInt32(this.off + 4, true);
    this.off += 8;
    return hi * 4294967296 + lo;
  }

  /**
   * Advance the cursor by `n` bytes without decoding them.
   * @throws {Error} `gguf_metadata_too_large` when the skipped span is not
   *   fully available within the buffered/fetchable range.
   */
  async skip(n) {
    // skip may exceed current buffer; pull enough then advance offset
    await this.ensure(Math.min(n, this.MAX));
    this.off += n;
    if (this.off > this.buf.length) {
      this.off = this.buf.length;
      throw new Error("gguf_metadata_too_large");
    }
  }

  /** Read a string: u64 byte length followed by that many UTF-8 bytes. */
  async str() {
    const len = await this.u64();
    await this.ensure(len);
    const bytes = this.buf.subarray(this.off, this.off + len);
    this.off += len;
    return new TextDecoder("utf-8").decode(bytes);
  }
}

/**
 * Decode one metadata value of the given GGUF type id from the reader.
 *
 * Scalars and strings are returned as JS values. Arrays are skipped rather
 * than decoded and come back as a stub `{ __array: <length>, elemType }`.
 *
 * @param {GgufReader} r  Reader positioned at the start of the value.
 * @param {number} type  GGUF value type id (one of the GT constants).
 * @throws {Error} `gguf_nested_array` for an array whose element type is
 *   neither fixed-width nor string; `gguf_unknown_type_<id>` for an unknown type.
 */
async function readValue(r, type) {
  switch (type) {
    case GT.U8: return r.u8();
    case GT.I8: return r.i8();
    case GT.U16: return r.u16();
    case GT.I16: return r.i16();
    case GT.U32: return r.u32();
    case GT.I32: return r.i32();
    case GT.F32: return r.f32();
    case GT.BOOL: return (await r.u8()) !== 0;
    case GT.STR: return r.str();
    case GT.U64: return r.u64();
    case GT.I64: return r.i64();
    case GT.F64: return r.f64();
    case GT.ARR: {
      const et = await r.u32();
      const len = await r.u64();
      if (FIXED_SIZE[et]) {
        // Fixed-width elements: jump over the whole payload in one step.
        await r.skip(len * FIXED_SIZE[et]);
        return { __array: len, elemType: et };
      }
      if (et === GT.STR) {
        // Strings are length-prefixed, so each one has to be stepped over individually.
        for (let i = 0; i < len; i++) {
          const sl = await r.u64();
          await r.skip(sl);
        }
        return { __array: len, elemType: et };
      }
      throw new Error("gguf_nested_array");
    }
    default: throw new Error(`gguf_unknown_type_${type}`);
  }
}

// Parse the metadata KV block. Returns a flat { key: value } map (arrays are
// returned as {__array, elemType} stubs — we never need their contents here).
/**
 * Fetch and parse the header + metadata key/value block of a GGUF file.
 *
 * @param {string} url  URL of the .gguf file.
 * @returns {Promise<{version: number, tensorCount: number, kvCount: number,
 *   kv: Object, bytesRead: number}>} Header fields, the flat key → value map,
 *   and the number of bytes the reader downloaded.
 * @throws {Error} `not_a_gguf_file` when the first four bytes are not "GGUF";
 *   reader/value errors propagate unchanged.
 */
export async function fetchGgufMetadata(url) {
  const reader = new GgufReader(url);

  // Assemble the 4-byte magic as a little-endian integer, byte by byte.
  let magic = 0;
  for (const shift of [0, 8, 16, 24]) {
    const byte = await reader.u8();
    magic |= byte << shift;
  }
  if (magic !== 0x46554747 /* 'GGUF' little-endian */) {
    throw new Error("not_a_gguf_file");
  }

  const version = await reader.u32();
  const tensorCount = await reader.u64();
  const kvCount = await reader.u64();

  // Each record is: key string, u32 type id, then the value itself.
  const kv = {};
  let index = 0;
  while (index < kvCount) {
    const key = await reader.str();
    const valueType = await reader.u32();
    kv[key] = await readValue(reader, valueType);
    index += 1;
  }

  return { version, tensorCount, kvCount, kv, bytesRead: reader.fetched };
}

/**
 * Map raw GGUF metadata → HF-style config (so quant_regime + TAF math can reuse it).
 *
 * Architecture-scoped keys are looked up as `<general.architecture>.<suffix>`;
 * anything missing becomes null.
 *
 * @param {{kv?: Object}} meta  Result of fetchGgufMetadata (only `kv` is read).
 * @returns {Object} Config with GGUF-derived fields plus HF-config aliases.
 */
export function ggufToConfig(meta) {
  const kv = meta.kv || {};
  const arch = kv["general.architecture"];

  // Look up an architecture-prefixed key, falling back when the architecture
  // is unknown or the key is not present.
  const archField = (suffix, fallback = null) => {
    if (!arch) return fallback;
    const value = kv[`${arch}.${suffix}`];
    return value === undefined ? fallback : value;
  };

  const headCount = archField("attention.head_count");
  const kvHeadCount = archField("attention.head_count_kv", headCount);
  const embedLen = archField("embedding_length");
  const keyLen = archField("attention.key_length");

  // Prefer the explicit key length; otherwise derive it from hidden / heads.
  let headDim;
  if (typeof keyLen === "number") {
    headDim = keyLen;
  } else if (headCount && embedLen) {
    headDim = embedLen / headCount;
  } else {
    headDim = null;
  }

  const ftypeEnum = kv["general.file_type"];
  const ftype = (typeof ftypeEnum === "number" && FTYPE[ftypeEnum]) || null;
  const [quantLabel, quantScheme] = ftype ? ftype : [null, null];

  return {
    architecture: arch || "?",
    quant_label: quantLabel,
    quant_scheme: quantScheme,
    rope_theta: archField("rope.freq_base", null),
    context_length: archField("context_length", null),
    rope_scaling_type: archField("rope.scaling.type", null),
    rope_scaling_factor: archField("rope.scaling.factor", null),
    rope_orig_ctx: archField("rope.scaling.original_context_length", null),
    // HF-config aliases for predictQuantShift / inferNParams:
    num_attention_heads: headCount ?? null,
    num_key_value_heads: kvHeadCount ?? null,
    hidden_size: embedLen ?? null,
    head_dim: headDim,
    num_hidden_layers: archField("block_count", null),
    sliding_window: archField("attention.sliding_window", null),
    vocab_size: archField("vocab_size", null),
  };
}

/**
 * Bridge verdict: combine GGUF geometry + TAF horizon + quant γ-shift.
 *
 * @param {Object} cfg  ggufToConfig output (may be edited by user / filename backstop).
 * @param {number} [targetCtx]  Optional desired context L to check (else uses context_length).
 * @returns {Object} `{ theta, nCtx, L, gammaTrain, dHoriz, gammaAtL, gammaQuant,
 *   reaches, quant, quantLabel, arch, verdict }` where verdict is one of
 *   "incomplete" | "degrades" | "healthy" | "usable_with_care".
 */
export function analyzeGguf(cfg, targetCtx) {
  const theta = Number(cfg.rope_theta) || 10000;
  const nCtx = Number(cfg.context_length) || null;
  const L = Number(targetCtx) || nCtx;

  // The fp16 horizon is architectural (set by θ) and therefore identical for
  // every quant of the model. d_horizon is defined on the *natural* Padé γ, so
  // it is derived from the fp16 γ only — never from a quant-shifted γ.
  let gammaTrain = null;
  if (nCtx) {
    gammaTrain = gammaPade(theta, nCtx);
  }
  let dHoriz = null;
  if (gammaTrain != null) {
    dHoriz = dHorizon(theta, gammaTrain);
  }

  // Architecture-aware quant γ-shift from the quant-regime model.
  let quant = null;
  if (cfg.quant_scheme) {
    quant = predictQuantShift(cfg, cfg.quant_scheme);
  }

  // γ at the target length: first fp16, then with the quant shift subtracted.
  // This is the axis that worsens monotonically as the quant gets coarser.
  let gammaAtL = null;
  if (theta && L) {
    gammaAtL = gammaPade(theta, L);
  }
  const shift = quant ? quant.gamma_shift : 0;
  let gammaQuant = null;
  if (gammaAtL != null) {
    gammaQuant = gammaAtL - shift;
  }

  // `reaches` is informational only. The verdict intentionally does not gate on
  // L ≤ d_horizon, because the closed-form horizon understates the reach of
  // high-θ models; post-quant γ at L plus the regime band decide instead.
  const reaches = dHoriz != null && L != null && L <= dHoriz;
  const collapsed = !Number.isFinite(gammaQuant) || gammaQuant <= 0.2;
  const quantCliff = quant && quant.regime === "cliff";

  const classify = () => {
    if (nCtx == null || theta == null) return "incomplete";
    if (collapsed || quantCliff) return "degrades";
    const regimeOk = !quant || quant.regime === "safe" || quant.regime === "mild";
    if (gammaQuant >= 0.6 && regimeOk) return "healthy";
    return "usable_with_care";
  };
  const verdict = classify();

  return {
    theta,
    nCtx,
    L,
    gammaTrain,
    dHoriz, // fp16 architectural horizon (shared across quants)
    gammaAtL,
    gammaQuant, // attention at L: fp16 vs after-quant
    reaches, // is L within the fp16 horizon?
    quant, // {gamma_shift, regime, delta_ppl, ...} or null
    quantLabel: cfg.quant_label,
    arch: cfg.architecture,
    verdict,
  };
}