// taf-agent / js / gguf_bridge.js
// Commit 2eb69cb (karlexmarin) — v0.9.1: GGUF Validity Bridge mode + binary header parser
// GGUF Validity Bridge (v0.9.1 anti-bullshit pack)
//
// The dozen GGUF/VRAM calculators on HF answer "does this quant fit in my GPU?".
// None answer "does it fit AND still work?". This reads a .gguf file's metadata
// header directly in the browser (HTTP Range — no full multi-GB download), pulls
// rope_theta + context_length + quant scheme + head geometry, then runs TAF's
// γ_Padé / d_horizon + the quant-regime γ-shift to emit a quality verdict:
// "fits in VRAM but attention collapses past d_horizon, and Q4 worsens γ by …".
//
// Parser logic is pure; the network fetch is unavoidable I/O. main.js renders.
import { gammaPade } from "./gamma_check.js";
import { dHorizon } from "./yarn_planner.js";
import { predictQuantShift } from "./quant_regime.js";
// ── GGUF metadata value types (spec v2/v3) ──
// Numeric type tag that precedes every value in the metadata KV block.
const GT = {
  U8: 0,
  I8: 1,
  U16: 2,
  I16: 3,
  U32: 4,
  I32: 5,
  F32: 6,
  BOOL: 7,
  STR: 8,
  ARR: 9,
  U64: 10,
  I64: 11,
  F64: 12,
};
// Byte width of each fixed-size scalar type, keyed by its GT tag. STR and ARR
// are variable-length and intentionally have no entry.
const FIXED_SIZE = {
  [GT.U8]: 1,
  [GT.I8]: 1,
  [GT.U16]: 2,
  [GT.I16]: 2,
  [GT.U32]: 4,
  [GT.I32]: 4,
  [GT.F32]: 4,
  [GT.BOOL]: 1,
  [GT.U64]: 8,
  [GT.I64]: 8,
  [GT.F64]: 8,
};
// general.file_type enum (llama_ftype) → [human label, quant_regime scheme id].
// The scheme id is what gets handed to predictQuantShift; null means the file
// is unquantised. Only the common enum values are listed — filename parsing
// backstops the rest.
const FTYPE = Object.fromEntries(
  [
    // [enum id, label, scheme]
    [0, "F32", null],
    [1, "F16", null],
    [2, "Q4_0", "gguf_q4_km"],
    [3, "Q4_1", "gguf_q4_km"],
    [7, "Q8_0", "gguf_q8_0"],
    [8, "Q5_0", "gguf_q5_km"],
    [9, "Q5_1", "gguf_q5_km"],
    [10, "Q2_K", "gguf_q2_k"],
    [11, "Q3_K_S", "gguf_q3_km"],
    [12, "Q3_K_M", "gguf_q3_km"],
    [13, "Q3_K_L", "gguf_q3_km"],
    [14, "Q4_K_S", "gguf_q4_km"],
    [15, "Q4_K_M", "gguf_q4_km"],
    [16, "Q5_K_S", "gguf_q5_km"],
    [17, "Q5_K_M", "gguf_q5_km"],
    [18, "Q6_K", "gguf_q8_0"],
  ].map(([id, label, scheme]) => [id, [label, scheme]]),
);
// Filename → (label, scheme) backstop when general.file_type is absent/ambiguous.
/**
 * Guess the quantisation of a GGUF file from its name.
 *
 * The name is upper-cased and scanned for known quant tags; the first tag found
 * (in table order) wins.
 *
 * @param {string} name - File name or path, e.g. "model-Q4_K_M.gguf". Empty or
 *   missing input yields the "unknown" result.
 * @returns {{label: string, scheme: (string|null)}} Matched tag as `label`
 *   ("?" when nothing matched) and the quant_regime scheme id, or null for
 *   unquantised / unknown files.
 */
export function quantFromFilename(name) {
  const upper = String(name || "").toUpperCase();
  // Matching is by substring, so a tag that contains another tag must be
  // listed first: "BF16" contains "F16" and would otherwise be reported as F16.
  const pairs = [
    ["Q2_K", "gguf_q2_k"], ["Q3_K", "gguf_q3_km"], ["Q4_K", "gguf_q4_km"],
    ["Q5_K", "gguf_q5_km"], ["Q6_K", "gguf_q8_0"], ["Q8_0", "gguf_q8_0"],
    ["Q4_0", "gguf_q4_km"], ["Q4_1", "gguf_q4_km"], ["Q5_0", "gguf_q5_km"],
    ["Q5_1", "gguf_q5_km"], ["BF16", null], ["F16", null], ["F32", null],
  ];
  for (const [tag, scheme] of pairs) {
    if (upper.includes(tag)) return { label: tag, scheme };
  }
  return { label: "?", scheme: null };
}
// List the .gguf files in a HF repo (so the user can pick a quant).
/**
 * Query the Hugging Face model API for a repo and return its GGUF file names.
 *
 * @param {string} repo - Repo id such as "org/name". Each character is
 *   URL-encoded except the "/" separators, which are kept literal.
 * @returns {Promise<string[]>} Sorted `rfilename` values ending in ".gguf"
 *   (case-insensitive); empty when the response has no `siblings` array.
 * @throws {Error} "HTTP <status> — repo not found or private" on a non-OK response.
 */
export async function listGgufFiles(repo) {
  // encodeURIComponent escapes "/" as %2F; restore it so "org/name" stays a path.
  const repoPath = encodeURIComponent(repo).replace(/%2F/g, "/");
  const response = await fetch(`https://huggingface.co/api/models/${repoPath}`);
  if (!response.ok) {
    throw new Error(`HTTP ${response.status} — repo not found or private`);
  }
  const payload = await response.json();
  const entries = Array.isArray(payload.siblings) ? payload.siblings : [];
  const ggufNames = [];
  for (const entry of entries) {
    const fileName = entry.rfilename;
    if (/\.gguf$/i.test(fileName)) ggufNames.push(fileName);
  }
  ggufNames.sort();
  return ggufNames;
}
// Incremental Range-fetch reader. GGUF metadata sits at the file head, so the
// header is pulled in 1 MB HTTP Range requests on demand instead of downloading
// the whole file. Reads that would need more than the 48 MB cap (or more bytes
// than the file has) fail with "gguf_metadata_too_large".
class GgufReader {
  /**
   * @param {string} url - URL of the .gguf file. Nothing is fetched until the
   *   first read.
   */
  constructor(url) {
    this.url = url;
    this.buf = new Uint8Array(0); // every byte fetched so far, starting at file offset 0
    this.dv = new DataView(this.buf.buffer);
    this.off = 0; // read cursor into buf
    this.fetched = 0; // number of bytes held in buf
    this.CHUNK = 1 << 20; // 1 MB per range
    this.MAX = 48 << 20; // hard cap 48 MB
    this.eof = false; // set once the server has nothing more to give
  }

  /**
   * Make sure `n` bytes are buffered at the cursor, fetching further ranges as
   * needed.
   *
   * @param {number} n - Bytes required starting at `this.off`.
   * @throws {Error} "HTTP <status>" on a failed request;
   *   "gguf_metadata_too_large" when the bytes cannot be obtained.
   */
  async ensure(n) {
    while (this.off + n > this.buf.length && !this.eof && this.fetched < this.MAX) {
      const start = this.fetched;
      const end = Math.min(this.fetched + this.CHUNK, this.MAX) - 1;
      const resp = await fetch(this.url, { headers: { Range: `bytes=${start}-${end}` } });
      if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
      const part = new Uint8Array(await resp.arrayBuffer());
      if (part.length === 0) { this.eof = true; break; }
      if (resp.status === 200) {
        // A 200 (instead of 206) means the server ignored Range and sent the
        // complete file from byte 0. Appending it at `start` would corrupt the
        // buffer, and asking again would only return the same body — so adopt
        // it as the whole file and stop fetching.
        this.buf = part;
        this.dv = new DataView(this.buf.buffer);
        this.fetched = part.length;
        this.eof = true;
        break;
      }
      const merged = new Uint8Array(this.buf.length + part.length);
      merged.set(this.buf);
      merged.set(part, this.buf.length);
      this.buf = merged;
      this.dv = new DataView(this.buf.buffer);
      this.fetched += part.length;
      if (part.length < this.CHUNK) this.eof = true; // server returned the tail
    }
    if (this.off + n > this.buf.length) throw new Error("gguf_metadata_too_large");
  }

  // Little-endian scalar readers; each one advances the cursor past the value.
  async u8() { await this.ensure(1); return this.dv.getUint8(this.off++); }
  async u16() { await this.ensure(2); const v = this.dv.getUint16(this.off, true); this.off += 2; return v; }
  async i16() { await this.ensure(2); const v = this.dv.getInt16(this.off, true); this.off += 2; return v; }
  async u32() { await this.ensure(4); const v = this.dv.getUint32(this.off, true); this.off += 4; return v; }
  async i32() { await this.ensure(4); const v = this.dv.getInt32(this.off, true); this.off += 4; return v; }
  async f32() { await this.ensure(4); const v = this.dv.getFloat32(this.off, true); this.off += 4; return v; }
  async f64() { await this.ensure(8); const v = this.dv.getFloat64(this.off, true); this.off += 8; return v; }

  // u64/i64 as Number — safe for counts/dims well under 2^53.
  async u64() {
    await this.ensure(8);
    const lo = this.dv.getUint32(this.off, true);
    const hi = this.dv.getUint32(this.off + 4, true);
    this.off += 8;
    return hi * 4294967296 + lo;
  }

  async i64() {
    await this.ensure(8);
    const lo = this.dv.getUint32(this.off, true);
    // The high word carries the sign, so it must be read as signed; reusing
    // u64() here would turn -1 into ~1.8e19.
    const hi = this.dv.getInt32(this.off + 4, true);
    this.off += 8;
    return hi * 4294967296 + lo;
  }

  /**
   * Advance the cursor by `n` bytes, fetching them first so `fetched` keeps
   * pace with the cursor.
   *
   * @param {number} n - Bytes to skip.
   * @throws {Error} "gguf_metadata_too_large" when the skip runs past the
   *   available data.
   */
  async skip(n) {
    // skip may exceed current buffer; pull enough then advance offset
    await this.ensure(Math.min(n, this.MAX));
    this.off += n;
    if (this.off > this.buf.length) {
      this.off = this.buf.length;
      throw new Error("gguf_metadata_too_large");
    }
  }

  /**
   * Read a GGUF string: a u64 byte length followed by that many UTF-8 bytes.
   *
   * @returns {Promise<string>} The decoded string.
   */
  async str() {
    const len = await this.u64();
    await this.ensure(len);
    const bytes = this.buf.subarray(this.off, this.off + len);
    this.off += len;
    return new TextDecoder("utf-8").decode(bytes);
  }
}
/**
 * Read one metadata value of the given GGUF type from the reader.
 *
 * Scalars and strings are decoded and returned. Array contents are skipped,
 * not decoded: the result is a stub `{ __array: <length>, elemType: <type> }`.
 *
 * @param {GgufReader} r - Reader positioned at the start of the value.
 * @param {number} type - GGUF value-type tag (see GT).
 * @returns {Promise<number|boolean|string|{__array: number, elemType: number}>}
 * @throws {Error} "gguf_nested_array" for an array whose elements are neither
 *   fixed-size scalars nor strings; "gguf_unknown_type_<type>" for an
 *   unrecognised tag.
 */
async function readValue(r, type) {
  if (type === GT.U8) return r.u8();
  if (type === GT.I8) {
    // The reader has no signed-byte accessor; reinterpret the unsigned byte.
    const raw = await r.u8();
    return raw > 127 ? raw - 256 : raw;
  }
  if (type === GT.U16) return r.u16();
  if (type === GT.I16) return r.i16();
  if (type === GT.U32) return r.u32();
  if (type === GT.I32) return r.i32();
  if (type === GT.F32) return r.f32();
  if (type === GT.BOOL) {
    const flag = await r.u8();
    return flag !== 0;
  }
  if (type === GT.STR) return r.str();
  if (type === GT.U64) return r.u64();
  if (type === GT.I64) return r.i64();
  if (type === GT.F64) return r.f64();
  if (type === GT.ARR) {
    // Array header: element type (u32) then element count (u64).
    const elemType = await r.u32();
    const count = await r.u64();
    const elemSize = FIXED_SIZE[elemType];
    if (elemSize) {
      // Fixed-width elements: jump over the whole payload in one go.
      await r.skip(count * elemSize);
      return { __array: count, elemType };
    }
    if (elemType !== GT.STR) throw new Error("gguf_nested_array");
    // Strings are length-prefixed, so each one has to be stepped over in turn.
    for (let i = 0; i < count; i++) {
      const byteLength = await r.u64();
      await r.skip(byteLength);
    }
    return { __array: count, elemType };
  }
  throw new Error(`gguf_unknown_type_${type}`);
}
// Parse the metadata KV block. Returns a flat { key: value } map (arrays are
// returned as { __array, elemType } stubs — we never need their contents here).
/**
 * Fetch and parse the header of a remote GGUF file.
 *
 * @param {string} url - URL of the .gguf file.
 * @returns {Promise<{version: number, tensorCount: number, kvCount: number,
 *   kv: Object, bytesRead: number}>} Header fields, the parsed key/value map,
 *   and the number of bytes the reader fetched.
 * @throws {Error} "not_a_gguf_file" when the first four bytes are not "GGUF";
 *   reader and value-parsing errors propagate unchanged.
 */
export async function fetchGgufMetadata(url) {
  const reader = new GgufReader(url);

  // Assemble the 4-byte magic as a little-endian 32-bit integer.
  let magic = 0;
  for (let shift = 0; shift <= 24; shift += 8) {
    const byte = await reader.u8();
    magic |= byte << shift;
  }
  if (magic !== 0x46554747 /* 'GGUF' little-endian */) {
    throw new Error("not_a_gguf_file");
  }

  const version = await reader.u32();
  const tensorCount = await reader.u64();
  const kvCount = await reader.u64();

  // Each entry is: key string, u32 value-type tag, then the value itself.
  const kv = {};
  for (let index = 0; index < kvCount; index += 1) {
    const key = await reader.str();
    const type = await reader.u32();
    const value = await readValue(reader, type);
    kv[key] = value;
  }

  return { version, tensorCount, kvCount, kv, bytesRead: reader.fetched };
}
// Map raw GGUF metadata → HF-style config (so quant_regime + TAF math can reuse it).
/**
 * Translate parsed GGUF metadata into a config object with HF-style field names.
 *
 * Architecture-specific keys are looked up as "<general.architecture>.<suffix>";
 * any key that is missing (or every such key, when the architecture itself is
 * missing) comes back as null.
 *
 * @param {{kv?: Object}} meta - Parsed metadata carrying the key/value map in `kv`.
 * @returns {Object} Config with architecture, quant label/scheme, RoPE and
 *   context fields, and head/layer geometry.
 */
export function ggufToConfig(meta) {
  const kv = meta.kv || {};
  const arch = kv["general.architecture"];

  // Look up "<arch>.<suffix>", falling back when arch or the key is absent.
  const archField = (suffix, fallback = null) => {
    if (!arch) return fallback;
    const value = kv[`${arch}.${suffix}`];
    return value === undefined ? fallback : value;
  };

  const heads = archField("attention.head_count");
  // Without an explicit KV head count, assume one KV head per attention head.
  const kvHeads = archField("attention.head_count_kv", heads);
  const hidden = archField("embedding_length");

  // Prefer the explicit key length; otherwise derive head_dim from hidden / heads.
  const keyLength = archField("attention.key_length");
  let headDim = null;
  if (typeof keyLength === "number") {
    headDim = keyLength;
  } else if (heads && hidden) {
    headDim = hidden / heads;
  }

  // Resolve general.file_type to its [label, scheme] entry; unknown ids → null.
  const fileType = kv["general.file_type"];
  const ftypeEntry = (typeof fileType === "number" && FTYPE[fileType]) || null;

  return {
    architecture: arch || "?",
    quant_label: ftypeEntry ? ftypeEntry[0] : null,
    quant_scheme: ftypeEntry ? ftypeEntry[1] : null,
    rope_theta: archField("rope.freq_base"),
    context_length: archField("context_length"),
    rope_scaling_type: archField("rope.scaling.type"),
    rope_scaling_factor: archField("rope.scaling.factor"),
    rope_orig_ctx: archField("rope.scaling.original_context_length"),
    // HF-config aliases for predictQuantShift / inferNParams:
    num_attention_heads: heads ?? null,
    num_key_value_heads: kvHeads ?? null,
    hidden_size: hidden ?? null,
    head_dim: headDim,
    num_hidden_layers: archField("block_count"),
    sliding_window: archField("attention.sliding_window"),
    vocab_size: archField("vocab_size"),
  };
}
// Bridge verdict: combine GGUF geometry + TAF horizon + quant γ-shift.
/**
 * Produce a quality verdict for a GGUF model at a given context length.
 *
 * @param {Object} cfg - ggufToConfig output (may be edited by user / filename
 *   backstop). Reads rope_theta, context_length, quant_scheme, quant_label and
 *   architecture, and is passed whole to predictQuantShift.
 * @param {number} [targetCtx] - Desired context length L to check; when absent
 *   or not a usable number, cfg.context_length is used.
 * @returns {Object} The inputs that were used (theta, nCtx, L), the fp16
 *   horizon values (gammaTrain, dHoriz), γ at L before and after the quant
 *   shift (gammaAtL, gammaQuant), `reaches`, the raw `quant` result (or null),
 *   quantLabel, arch, and `verdict` — one of "incomplete", "degrades",
 *   "healthy", "usable_with_care".
 */
export function analyzeGguf(cfg, targetCtx) {
  // Missing or unparsable θ falls back to 10000; missing context stays null.
  const theta = Number(cfg.rope_theta) || 10000;
  const nCtx = Number(cfg.context_length) || null;
  const L = Number(targetCtx) || nCtx;

  // fp16 attention horizon — architectural, set by θ, hence identical for every
  // quant of the model. d_horizon is a function of the *natural* Padé γ, so it
  // is derived from the fp16 γ and never from a quant-shifted one.
  let gammaTrain = null;
  if (nCtx) gammaTrain = gammaPade(theta, nCtx);
  let dHoriz = null;
  if (gammaTrain != null) dHoriz = dHorizon(theta, gammaTrain);

  // Quant γ-shift from the quant-regime model; skipped for unquantised files.
  let quant = null;
  if (cfg.quant_scheme) quant = predictQuantShift(cfg, cfg.quant_scheme);

  // γ at the target L: fp16 first, then with the quant shift subtracted. This
  // is the value that worsens as the quant gets more aggressive.
  let gammaAtL = null;
  if (theta && L) gammaAtL = gammaPade(theta, L);
  const shift = quant ? quant.gamma_shift : 0;
  let gammaQuant = null;
  if (gammaAtL != null) gammaQuant = gammaAtL - shift;

  // `reaches` is informational only. The verdict deliberately does NOT gate on
  // L ≤ d_horizon, because the closed-form d_horizon understates the true reach
  // of high-θ models; γ@L after quant is the signal that decides.
  const reaches = dHoriz != null && L != null && L <= dHoriz;
  const collapsed = !Number.isFinite(gammaQuant) || gammaQuant <= 0.2;
  const quantCliff = quant && quant.regime === "cliff";
  const regimeIsGentle = !quant || quant.regime === "safe" || quant.regime === "mild";

  let verdict = "usable_with_care";
  if (nCtx == null || theta == null) {
    verdict = "incomplete";
  } else if (collapsed || quantCliff) {
    verdict = "degrades";
  } else if (gammaQuant >= 0.6 && regimeIsGentle) {
    verdict = "healthy";
  }

  return {
    theta,
    nCtx,
    L,
    gammaTrain, // fp16 γ at the trained context length
    dHoriz, // fp16 architectural horizon (shared across quants)
    gammaAtL, // attention at L, fp16
    gammaQuant, // attention at L, after the quant shift
    reaches, // is L within the fp16 horizon?
    quant, // {gamma_shift, regime, delta_ppl, ...} or null
    quantLabel: cfg.quant_label,
    arch: cfg.architecture,
    verdict,
  };
}