Spaces:
Running
Running
| // YaRN / RoPE Context-Extension Planner (v0.9.0 anti-bullshit pack) | |
| // | |
| // Answers the most-asked HF question: "how do I set rope_scaling to extend | |
| // context to L, and will quality actually hold?" The VRAM calculators only | |
| // say "fits in GPU"; this says "fits AND works (or not)" using TAF's γ_Padé / | |
| // d_horizon machinery — all browser-only, no backend. | |
| // | |
| // Pure logic: returns structured codes + numbers. main.js does the i18n render. | |
| import { gammaPade } from "./gamma_check.js"; | |
| const SQRT2 = Math.SQRT2; | |
| // §26.2 — d_horizon = θ(1-γ)√2/(1+γ). null if γ outside (0,1). | |
| export function dHorizon(theta, gamma) { | |
| if (!Number.isFinite(theta) || theta <= 0) return null; | |
| if (!Number.isFinite(gamma) || gamma <= 0 || gamma >= 1) return null; | |
| return theta * (1 - gamma) * SQRT2 / (1 + gamma); | |
| } | |
| // §26.3 — θ needed to land at γ_target at context T (Padé inverse). | |
| export function thetaDesign(gammaTarget, T) { | |
| if (!(gammaTarget > -1 && gammaTarget < 1)) return null; | |
| if (!Number.isFinite(T) || T <= 0) return null; | |
| return T * SQRT2 * (1 + gammaTarget) / (2 * (1 - gammaTarget)); | |
| } | |
| // Effective base after RoPE extension. NTK/YaRN raise the base frequency; | |
| // the canonical NTK-aware bound is θ' = θ·f^(d/(d-2)) ≈ θ·f for typical head | |
| // dims (d≈128 → exponent 1.016). We use the first-order θ·f and flag it as an | |
| // estimate. Linear PI does NOT change the base — it compresses positions, so | |
| // its effect is modelled on the context axis (T/f) instead (see planExtension). | |
| export function thetaEffNTK(theta, factor) { | |
| if (!Number.isFinite(theta) || !Number.isFinite(factor) || factor <= 0) return NaN; | |
| return theta * factor; | |
| } | |
| // Default method pick: linear PI is fine for small stretches; YaRN is the | |
| // community default for ≥2× and degrades far more gracefully past 4×. | |
| export function suggestRopeType(factor) { | |
| if (factor <= 1) return "none"; | |
| if (factor < 2) return "linear"; | |
| return "yarn"; | |
| } | |
| // Build the exact config.json rope_scaling block for transformers ≥4.43. | |
| // `ropeType` ∈ {linear, dynamic, yarn, llama3}. original = trained context. | |
| export function buildRopeScaling(ropeType, factor, originalCtx) { | |
| const block = { | |
| rope_type: ropeType, | |
| factor: Math.round(factor * 1000) / 1000, | |
| original_max_position_embeddings: originalCtx, | |
| }; | |
| // YaRN exposes the interpolation ramp; ship the paper defaults so the snippet | |
| // is copy-paste runnable rather than a stub the user has to complete. | |
| if (ropeType === "yarn") { | |
| block.beta_fast = 32; | |
| block.beta_slow = 1; | |
| } | |
| return block; | |
| } | |
| // Core planner. All inputs numeric; returns numbers + warning/verdict codes. | |
| // originalCtx : model's trained context (max_position_embeddings pre-scaling) | |
| // theta : rope_theta (base). Defaults handled by caller. | |
| // targetCtx : desired context L | |
| // ropeType : optional override; else suggested from factor | |
| export function planExtension({ originalCtx, theta, targetCtx, ropeType }) { | |
| const out = { | |
| ok: false, | |
| originalCtx, theta, targetCtx, | |
| factor: null, | |
| ropeType: null, | |
| config: null, | |
| thetaEff: null, | |
| gammaNaive: null, // γ_Padé(θ, L) — NO extension: shows the problem | |
| gammaEff: null, // γ_Padé after the chosen extension method | |
| dHorizonNaive: null, | |
| dHorizonEff: null, | |
| thetaNeeded: null, // θ to keep γ healthy (0.5) at L — reference target | |
| verdict: "unknown", | |
| warnings: [], | |
| }; | |
| if (!Number.isFinite(originalCtx) || originalCtx <= 0) { | |
| out.verdict = "no_original_ctx"; | |
| return out; | |
| } | |
| if (!Number.isFinite(theta) || theta <= 0) { | |
| out.verdict = "no_theta"; | |
| return out; | |
| } | |
| if (!Number.isFinite(targetCtx) || targetCtx <= 0) { | |
| out.verdict = "no_target"; | |
| return out; | |
| } | |
| const factor = targetCtx / originalCtx; | |
| out.factor = Math.round(factor * 1000) / 1000; | |
| // Baseline (no extension) — this is what naive use at L gives. | |
| out.gammaNaive = gammaPade(theta, targetCtx); | |
| out.dHorizonNaive = dHorizon(theta, out.gammaNaive); | |
| // θ that would keep γ at a healthy 0.5 at L — a reference design target. | |
| out.thetaNeeded = thetaDesign(0.5, targetCtx); | |
| if (targetCtx <= originalCtx) { | |
| out.verdict = "no_extension_needed"; | |
| out.factor = Math.round(factor * 1000) / 1000; | |
| out.ropeType = "none"; | |
| out.gammaEff = out.gammaNaive; | |
| out.dHorizonEff = out.dHorizonNaive; | |
| return out; | |
| } | |
| const type = ropeType || suggestRopeType(factor); | |
| out.ropeType = type; | |
| out.config = buildRopeScaling(type, factor, originalCtx); | |
| if (type === "linear" || type === "dynamic") { | |
| // Linear PI / dynamic-NTK compress positions by `factor`. Modelled on the | |
| // context axis: the attention pattern at L behaves like context L/factor. | |
| out.thetaEff = theta; | |
| out.gammaEff = gammaPade(theta, targetCtx / factor); | |
| out.dHorizonEff = dHorizon(theta, out.gammaEff); | |
| if (out.dHorizonEff != null) out.dHorizonEff *= factor; // back to real-position units | |
| } else { | |
| // YaRN / NTK / llama3: raise the effective base ≈ θ·factor. | |
| out.thetaEff = thetaEffNTK(theta, factor); | |
| out.gammaEff = gammaPade(out.thetaEff, targetCtx); | |
| out.dHorizonEff = dHorizon(out.thetaEff, out.gammaEff); | |
| out.warnings.push({ code: "theta_eff_estimate", params: { thetaEff: out.thetaEff, factor: out.factor } }); | |
| } | |
| // Verdict from how much of the target the effective horizon actually covers. | |
| const horizonCover = (out.dHorizonEff != null && targetCtx > 0) | |
| ? out.dHorizonEff / targetCtx : null; | |
| if (factor > 4) { | |
| out.warnings.push({ code: "aggressive_factor", params: { factor: out.factor } }); | |
| } | |
| // Verdict weighs BOTH reach (does d_horizon cover L?) and sharpness (is γ_eff | |
| // high enough that tokens within the horizon are actually attended?). A horizon | |
| // that just barely reaches L with γ≈0.2 still means heavy decay — not "healthy". | |
| const reaches = horizonCover != null && horizonCover >= 1.0; | |
| const collapsed = !Number.isFinite(out.gammaEff) || out.gammaEff <= 0.2; | |
| if (collapsed || (horizonCover != null && horizonCover < 0.5)) { | |
| out.verdict = "degrades"; | |
| out.warnings.push({ code: "horizon_short", params: { dHorizon: out.dHorizonEff, target: targetCtx, cover: horizonCover, gammaEff: out.gammaEff } }); | |
| } else if (factor > 4) { | |
| out.verdict = "needs_finetune"; | |
| } else if (reaches && out.gammaEff >= 0.6) { | |
| out.verdict = "healthy"; | |
| } else { | |
| out.verdict = "usable_with_care"; | |
| } | |
| // Honesty caveat that always applies to closed-form extension planning. | |
| out.warnings.push({ code: "finetune_note", params: { factor: out.factor, aggressive: factor > 4 } }); | |
| out.ok = true; | |
| return out; | |
| } | |