Spaces:

karlexmarin
/

taf-agent

Running

App Files Files Community

taf-agent / js /gguf_bridge.js

karlexmarin's picture

v0.9.1: GGUF Validity Bridge mode + binary header parser

2eb69cb 18 days ago

11.7 kB

	// GGUF Validity Bridge (v0.9.1 anti-bullshit pack)
	//
	// The dozen GGUF/VRAM calculators on HF answer "does this quant fit in my GPU?".
	// None answer "does it fit AND still work?". This reads a .gguf file's metadata
	// header directly in the browser (HTTP Range — no full multi-GB download), pulls
	// rope_theta + context_length + quant scheme + head geometry, then runs TAF's
	// γ_Padé / d_horizon + the quant-regime γ-shift to emit a quality verdict:
	// "fits in VRAM but attention collapses past d_horizon, and Q4 worsens γ by …".
	//
	// Parser logic is pure; the network fetch is unavoidable I/O. main.js renders.

	import { gammaPade } from "./gamma_check.js";
	import { dHorizon } from "./yarn_planner.js";
	import { predictQuantShift } from "./quant_regime.js";

	// ── GGUF metadata value types (spec v2/v3) ──
	// Numeric type tag that precedes every metadata value in the KV block.
	const GT = {
	  U8: 0,
	  I8: 1,
	  U16: 2,
	  I16: 3,
	  U32: 4,
	  I32: 5,
	  F32: 6,
	  BOOL: 7,
	  STR: 8,
	  ARR: 9,
	  U64: 10,
	  I64: 11,
	  F64: 12,
	};
	// Byte width of each fixed-width scalar tag. STR and ARR are variable-length,
	// so they deliberately have no entry here.
	const FIXED_SIZE = {
	  [GT.U8]: 1,
	  [GT.I8]: 1,
	  [GT.U16]: 2,
	  [GT.I16]: 2,
	  [GT.U32]: 4,
	  [GT.I32]: 4,
	  [GT.F32]: 4,
	  [GT.BOOL]: 1,
	  [GT.U64]: 8,
	  [GT.I64]: 8,
	  [GT.F64]: 8,
	};

	// general.file_type enum (llama_ftype) → human label + the quant_regime scheme id
	// we feed to predictQuantShift. Only the common ones; filename parsing backstops.
	// Keyed by the general.file_type enum value; each entry is
	// [human label, quant_regime scheme id]. A null scheme means "no quant shift
	// to model" (full/half precision). Enum values that are not listed have no
	// entry, so callers must fall back to filename parsing for them.
	// Several labels share a scheme id (e.g. Q4_0/Q4_1 use the q4_km id and Q6_K
	// uses the q8_0 id).
	const FTYPE = {
	  0: ["F32", null],
	  1: ["F16", null],
	  2: ["Q4_0", "gguf_q4_km"],
	  3: ["Q4_1", "gguf_q4_km"],
	  7: ["Q8_0", "gguf_q8_0"],
	  8: ["Q5_0", "gguf_q5_km"],
	  9: ["Q5_1", "gguf_q5_km"],
	  10: ["Q2_K", "gguf_q2_k"],
	  11: ["Q3_K_S", "gguf_q3_km"],
	  12: ["Q3_K_M", "gguf_q3_km"],
	  13: ["Q3_K_L", "gguf_q3_km"],
	  14: ["Q4_K_S", "gguf_q4_km"],
	  15: ["Q4_K_M", "gguf_q4_km"],
	  16: ["Q5_K_S", "gguf_q5_km"],
	  17: ["Q5_K_M", "gguf_q5_km"],
	  18: ["Q6_K", "gguf_q8_0"],
	  // BF16 is unquantised like F16/F32; listed so a BF16 file gets a label
	  // from its header instead of depending on the filename.
	  32: ["BF16", null],
	};

	// Filename → (label, scheme) backstop when general.file_type is absent/ambiguous.
	// Guess the quantisation from a file name by case-insensitive substring match.
	//   name    : file name (any value; null/undefined are treated as "")
	//   returns : { label, scheme } — label is the matched tag ("Q4_K", "BF16", …)
	//             and scheme the quant_regime scheme id, or null for unquantised
	//             formats. No match yields { label: "?", scheme: null }.
	export function quantFromFilename(name) {
	  const upper = String(name ?? "").toUpperCase();
	  // The first substring hit wins, so BF16 must be tested before F16:
	  // "BF16" contains "F16" and would otherwise be reported as F16.
	  const pairs = [
	    ["Q2_K", "gguf_q2_k"], ["Q3_K", "gguf_q3_km"], ["Q4_K", "gguf_q4_km"],
	    ["Q5_K", "gguf_q5_km"], ["Q6_K", "gguf_q8_0"], ["Q8_0", "gguf_q8_0"],
	    ["Q4_0", "gguf_q4_km"], ["Q4_1", "gguf_q4_km"], ["Q5_0", "gguf_q5_km"],
	    ["Q5_1", "gguf_q5_km"], ["BF16", null], ["F16", null], ["F32", null],
	  ];
	  for (const [tag, scheme] of pairs) {
	    if (upper.includes(tag)) return { label: tag, scheme };
	  }
	  return { label: "?", scheme: null };
	}

	// List the .gguf files in a HF repo (so the user can pick a quant).
	// Query the Hugging Face model API for `repo` and resolve to the sorted
	// `rfilename`s of its `siblings` entries that end in ".gguf" (case-insensitive).
	// A response without a `siblings` array yields []. Rejects with an Error
	// carrying the HTTP status when the response is not OK.
	export async function listGgufFiles(repo) {
	  // Percent-encode the repo id but keep the "/" separators literal.
	  const repoPath = encodeURIComponent(repo).replace(/%2F/g, "/");
	  const response = await fetch(`https://huggingface.co/api/models/${repoPath}`);
	  if (!response.ok) {
	    throw new Error(`HTTP ${response.status} — repo not found or private`);
	  }
	  const payload = await response.json();
	  if (!Array.isArray(payload.siblings)) {
	    return [];
	  }
	  const ggufNames = [];
	  for (const entry of payload.siblings) {
	    const fileName = entry.rfilename;
	    if (/\.gguf$/i.test(fileName)) {
	      ggufNames.push(fileName);
	    }
	  }
	  ggufNames.sort();
	  return ggufNames;
	}

	// Incremental Range-fetch reader. GGUF metadata sits at the file head; arch +
	// rope fields precede the big tokenizer arrays, so a few MB always suffices.
	// Little-endian cursor over a remote file. Bytes are pulled on demand with
	// HTTP Range requests and appended to one contiguous buffer that always
	// starts at file offset 0; every read method first makes sure the bytes it
	// needs are buffered. Reading past what can be fetched (EOF or the MAX cap)
	// rejects with Error("gguf_metadata_too_large").
	class GgufReader {
	  // url : passed unchanged to fetch() for every range request.
	  constructor(url) {
	    this.url = url;
	    this.buf = new Uint8Array(0); // everything fetched so far
	    this.dv = new DataView(this.buf.buffer);
	    this.off = 0; // read cursor into buf
	    this.fetched = 0; // bytes downloaded; also the start of the next range
	    this.CHUNK = 1 << 20; // 1 MB per range
	    this.MAX = 48 << 20; // hard cap 48 MB
	    this.eof = false; // set once a range comes back empty or short
	  }

	  // Make sure n bytes are available at the cursor, fetching further ranges
	  // as needed. Rejects with "HTTP <status>" on a failed request and with
	  // "gguf_metadata_too_large" when the bytes cannot be obtained.
	  async ensure(n) {
	    while (this.off + n > this.buf.length && !this.eof && this.fetched < this.MAX) {
	      const start = this.fetched;
	      const end = Math.min(this.fetched + this.CHUNK, this.MAX) - 1;
	      const resp = await fetch(this.url, { headers: { Range: `bytes=${start}-${end}` } });
	      // resp.ok already covers 200 and 206.
	      if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
	      const part = new Uint8Array(await resp.arrayBuffer());
	      if (part.length === 0) { this.eof = true; break; }
	      const merged = new Uint8Array(this.buf.length + part.length);
	      merged.set(this.buf);
	      merged.set(part, this.buf.length);
	      this.buf = merged;
	      // The DataView must be rebuilt because merged owns a new ArrayBuffer.
	      this.dv = new DataView(this.buf.buffer);
	      this.fetched += part.length;
	      if (part.length < this.CHUNK) this.eof = true; // server returned the tail
	    }
	    if (this.off + n > this.buf.length) throw new Error("gguf_metadata_too_large");
	  }

	  async u8() { await this.ensure(1); return this.dv.getUint8(this.off++); }
	  async u16() { await this.ensure(2); const v = this.dv.getUint16(this.off, true); this.off += 2; return v; }
	  async i16() { await this.ensure(2); const v = this.dv.getInt16(this.off, true); this.off += 2; return v; }
	  async u32() { await this.ensure(4); const v = this.dv.getUint32(this.off, true); this.off += 4; return v; }
	  async i32() { await this.ensure(4); const v = this.dv.getInt32(this.off, true); this.off += 4; return v; }
	  async f32() { await this.ensure(4); const v = this.dv.getFloat32(this.off, true); this.off += 4; return v; }
	  async f64() { await this.ensure(8); const v = this.dv.getFloat64(this.off, true); this.off += 8; return v; }

	  // u64/i64 as Number — exact only for magnitudes under 2^53, which is
	  // plenty for counts and dimensions.
	  async u64() {
	    await this.ensure(8);
	    const lo = this.dv.getUint32(this.off, true);
	    const hi = this.dv.getUint32(this.off + 4, true);
	    this.off += 8;
	    return hi * 4294967296 + lo;
	  }

	  // The high word is read as signed so that negative values keep their sign
	  // (reading it unsigned would turn -1 into ~1.8e19).
	  async i64() {
	    await this.ensure(8);
	    const lo = this.dv.getUint32(this.off, true);
	    const hi = this.dv.getInt32(this.off + 4, true);
	    this.off += 8;
	    return hi * 4294967296 + lo;
	  }

	  // Advance the cursor by n bytes. The skipped bytes are still fetched,
	  // because the buffer is contiguous from offset 0.
	  async skip(n) {
	    await this.ensure(n);
	    this.off += n;
	  }

	  // Read a string: a u64 byte length followed by that many UTF-8 bytes.
	  async str() {
	    const len = await this.u64();
	    await this.ensure(len);
	    const bytes = this.buf.subarray(this.off, this.off + len);
	    this.off += len;
	    return new TextDecoder("utf-8").decode(bytes);
	  }
	}

	// Read one metadata value of the given type tag from reader `r`.
	// Scalars and strings resolve to their value. Arrays are skipped rather than
	// decoded and resolve to a { __array: <element count>, elemType } stub.
	// Rejects with "gguf_nested_array" for arrays whose elements are neither
	// fixed-width nor strings, and with "gguf_unknown_type_<tag>" for unknown tags.
	async function readValue(r, type) {
	  if (type === GT.ARR) {
	    const elemType = await r.u32();
	    const count = await r.u64();
	    const width = FIXED_SIZE[elemType];
	    if (width) {
	      // Fixed-width elements: the payload size is known up front.
	      await r.skip(count * width);
	    } else if (elemType === GT.STR) {
	      // Strings are length-prefixed, so each one has to be walked.
	      let walked = 0;
	      while (walked < count) {
	        const byteLen = await r.u64();
	        await r.skip(byteLen);
	        walked += 1;
	      }
	    } else {
	      throw new Error("gguf_nested_array");
	    }
	    return { __array: count, elemType };
	  }

	  if (type === GT.I8) {
	    // Read as an unsigned byte, then fold into the signed range.
	    const raw = await r.u8();
	    return raw > 127 ? raw - 256 : raw;
	  }
	  if (type === GT.BOOL) {
	    const flag = await r.u8();
	    return flag !== 0;
	  }

	  // Remaining tags map one-to-one onto a reader method.
	  const readerMethodByTag = new Map([
	    [GT.U8, "u8"],
	    [GT.U16, "u16"],
	    [GT.I16, "i16"],
	    [GT.U32, "u32"],
	    [GT.I32, "i32"],
	    [GT.F32, "f32"],
	    [GT.STR, "str"],
	    [GT.U64, "u64"],
	    [GT.I64, "i64"],
	    [GT.F64, "f64"],
	  ]);
	  const methodName = readerMethodByTag.get(type);
	  if (methodName === undefined) {
	    throw new Error(`gguf_unknown_type_${type}`);
	  }
	  return r[methodName]();
	}

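	// Parse the GGUF header and metadata KV block. Resolves to
	// { version, tensorCount, kvCount, kv, bytesRead }, where kv is a flat
	// { key: value } map (arrays are returned as { __array: <length>, elemType }
	// stubs — we never need their contents here).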
	//   url     : location of the .gguf file; read through GgufReader.
	//   rejects : "not_a_gguf_file" when the magic does not match,
	//             "gguf_big_endian_unsupported" / "gguf_unsupported_version_<n>"
	//             when the layout cannot be parsed by this reader, plus any
	//             error raised by the reader or by readValue.
	export async function fetchGgufMetadata(url) {
	  const r = new GgufReader(url);
	  const magic = await r.u32();
	  if (magic !== 0x46554747 /* 'GGUF' little-endian */) throw new Error("not_a_gguf_file");
	  const version = await r.u32();
	  // A version whose low 16 bits are all zero is a byte-swapped small number:
	  // the file was written big-endian, and this reader is little-endian only.
	  if (version !== 0 && (version & 0xffff) === 0) throw new Error("gguf_big_endian_unsupported");
	  // Versions before 2 use 32-bit counts and lengths; parsing them with the
	  // 64-bit layout below would silently produce garbage.
	  if (version < 2) throw new Error(`gguf_unsupported_version_${version}`);
	  const tensorCount = await r.u64();
	  const kvCount = await r.u64();
	  const kv = {};
	  for (let i = 0; i < kvCount; i++) {
	    const key = await r.str();
	    const type = await r.u32();
	    kv[key] = await readValue(r, type);
	  }
	  return { version, tensorCount, kvCount, kv, bytesRead: r.fetched };
	}

	// Map raw GGUF metadata → HF-style config (so quant_regime + TAF math can reuse it).
	//   meta    : object with a `kv` map as produced by fetchGgufMetadata
	//             (a missing kv is treated as empty).
	//   returns : flat config object; every field that cannot be resolved is
	//             null, except architecture which becomes "?".
	export function ggufToConfig(meta) {
	  const kv = meta.kv || {};
	  const arch = kv["general.architecture"];
	  // Architecture-scoped lookup: reads "<arch>.<suffix>", or yields the
	  // fallback when the architecture or the key is missing.
	  const g = (suffix, fallback = null) => {
	    const value = arch ? kv[`${arch}.${suffix}`] : undefined;
	    return value !== undefined ? value : fallback;
	  };

	  const n_attn = g("attention.head_count");
	  // Without an explicit KV-head count, fall back to the attention head count.
	  const n_kv = g("attention.head_count_kv", n_attn);
	  const hidden = g("embedding_length");
	  const keyLen = g("attention.key_length");
	  // Prefer the explicit key length; otherwise derive it as hidden / heads.
	  const headDim = (typeof keyLen === "number") ? keyLen
	    : (n_attn && hidden ? hidden / n_attn : null);
	  const ftypeEnum = kv["general.file_type"];
	  // Unknown or non-numeric file types leave both quant fields null.
	  const ftype = (typeof ftypeEnum === "number" && FTYPE[ftypeEnum]) ? FTYPE[ftypeEnum] : null;

	  return {
	    architecture: arch || "?",
	    quant_label: ftype ? ftype[0] : null,
	    quant_scheme: ftype ? ftype[1] : null,
	    rope_theta: g("rope.freq_base", null),
	    context_length: g("context_length", null),
	    rope_scaling_type: g("rope.scaling.type", null),
	    rope_scaling_factor: g("rope.scaling.factor", null),
	    rope_orig_ctx: g("rope.scaling.original_context_length", null),
	    // HF-config aliases for predictQuantShift / inferNParams:
	    num_attention_heads: n_attn ?? null,
	    num_key_value_heads: n_kv ?? null,
	    hidden_size: hidden ?? null,
	    head_dim: headDim,
	    num_hidden_layers: g("block_count", null),
	    sliding_window: g("attention.sliding_window", null),
	    vocab_size: g("vocab_size", null),
	  };
	}

	// Bridge verdict: combine GGUF geometry + TAF horizon + quant γ-shift.
	// cfg : ggufToConfig output (may be edited by user / filename backstop)
	// targetCtx : optional desired context L to check (else uses context_length)
	// Returns { theta, nCtx, L, gammaTrain, dHoriz, gammaAtL, gammaQuant, reaches,
	// quant, quantLabel, arch, verdict } where verdict is one of "incomplete",
	// "degrades", "healthy" or "usable_with_care".
	export function analyzeGguf(cfg, targetCtx) {
	  // A missing, zero or non-numeric rope_theta falls back to 10000, so theta
	  // is always a usable number from here on.
	  const theta = Number(cfg.rope_theta) || 10000;
	  const nCtx = Number(cfg.context_length) || null;
	  // Target length: the caller's value when it is a non-zero number, else the
	  // model's own context length (which may itself be null).
	  const L = Number(targetCtx) || nCtx;

	  // fp16 attention horizon — architectural, set by θ. SAME across every quant
	  // of the model (quantisation adds noise, it does not change θ). d_horizon is
	  // a function of the natural Padé γ, so it must be computed from the fp16 γ —
	  // never from a quant-shifted γ (that inverts the formula and is meaningless).
	  const gammaTrain = nCtx ? gammaPade(theta, nCtx) : null;
	  const dHoriz = gammaTrain != null ? dHorizon(theta, gammaTrain) : null;

	  // Quant γ-shift via the existing quant-regime model (architecture-aware).
	  const quant = cfg.quant_scheme ? predictQuantShift(cfg, cfg.quant_scheme) : null;

	  // γ at the target L: fp16, then after the quant shift. This is the quantity
	  // that degrades monotonically with worse quant — the correct comparison axis.
	  const gammaAtL = L ? gammaPade(theta, L) : null;
	  const shift = quant ? quant.gamma_shift : 0;
	  const gammaQuant = (gammaAtL != null) ? gammaAtL - shift : null;

	  // Verdict is driven by γ@L after quant (the direct attention-quality signal
	  // at the target length) plus the quant-regime band. We deliberately do NOT
	  // gate on L ≤ d_horizon: the closed-form d_horizon understates the true reach
	  // for high-θ models (e.g. Qwen θ=1e6 keeps γ healthy far past its d_horizon),
	  // so γ@L is the honest measure. `reaches` is reported for context only.
	  const reaches = dHoriz != null && L != null && L <= dHoriz;
	  const collapsed = !Number.isFinite(gammaQuant) || gammaQuant <= 0.2;
	  const quantCliff = quant && quant.regime === "cliff";
	  const benignQuant = !quant || quant.regime === "safe" || quant.regime === "mild";
	  let verdict;
	  // Only a missing context length makes the input incomplete; theta always
	  // has a value because of the default above.
	  if (nCtx == null) verdict = "incomplete";
	  else if (collapsed || quantCliff) verdict = "degrades";
	  else if (gammaQuant >= 0.6 && benignQuant) verdict = "healthy";
	  else verdict = "usable_with_care";

	  return {
	    theta, nCtx, L,
	    gammaTrain, dHoriz, // fp16 architectural horizon (shared across quants)
	    gammaAtL, gammaQuant, // attention at L: fp16 vs after-quant
	    reaches, // is L within the fp16 horizon?
	    quant, // {gamma_shift, regime, delta_ppl, ...} or null
	    quantLabel: cfg.quant_label,
	    arch: cfg.architecture,
	    verdict,
	  };
	}