// TAF Agent i18n — minimal translation system.
// Add languages by extending TRANSLATIONS. Set data-i18n="key" on any element.
// Persist user choice in localStorage.
/**
 * Languages listed by this module.
 *
 * Each entry is `{ code, flag, label }`: a short language code, a flag emoji,
 * and the language's name written in that language.
 * NOTE(review): `code` presumably matches a top-level key of TRANSLATIONS
 * (only `en` is visible in this part of the file) — confirm.
 */
export const LANGUAGES = [
  ["en", "🇬🇧", "English"],
  ["es", "🇪🇸", "Español"],
  ["fr", "🇫🇷", "Français"],
  ["zh", "🇨🇳", "中文"],
].map(([code, flag, label]) => ({ code, flag, label }));
export const TRANSLATIONS = {
en: {
"hero.title": "🔬 TAF Agent",
"hero.tagline": "Diagnose any transformer LLM in 30 seconds. Free. No GPU. No signup.",
"hero.subtitle": "Predicts whether a model will work for your use case before you spend money or time. Everything runs in your browser — your inputs never leave this tab.",
"hero.help": "📘 Manual & examples",
"hero.quickstart_btn": "⚡ Quick start",
"hero.inventory_btn": "🧰 What it gives you",
"hero.about": "Built by an independent researcher. Open source. Not affiliated with any model vendor.",
"modes.title": "🎯 Mode",
"modes.profile": "📇 Profile a model",
"modes.compare": "🆚 Compare models",
"modes.inspector": "🔍 Inspect config",
"modes.ask": "💬 Ask plain English",
"modes.recipe": "📋 Pick recipe",
"modes.diagnose": "🩺 Diagnose CLI",
"diagnose.title": "🩺 Diagnose CLI Command Builder",
"diagnose.tip": "Browser predicts γ from config; the CLI measures γ_obs on real weights. Builder produces the exact command to run locally.",
"diagnose.desc": "Pick options and copy-paste the generated command on your local machine (Python + transformers + numpy). Fast mode ≈5 min CPU; full ≈20–60 min GPU.",
"diagnose.model_label": "HF model id:",
"diagnose.theta_label": "θ (auto if blank):",
"diagnose.n_label": "Context N:",
"diagnose.options_label": "Options:",
"diagnose.opt_fast": "--fast (CPU, ~5 min)",
"diagnose.opt_cpu": "--cpu (force CPU)",
"diagnose.opt_4bit": "--load_in_4bit (≥7B models)",
"diagnose.local_label": "--local path (optional):",
"diagnose.build_btn": "📋 Build command",
"diagnose.cmd_title": "Generated command:",
"diagnose.copy_btn": "📋 Copy to clipboard",
"diagnose.next_steps": "Next steps: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Run the command (4) Result JSON → upload via Inspect mode for full TAF analysis.",
"modes.phase": "📊 Phase diagram",
"phase.title": "📊 Phase diagram (γ × θ)",
"phase.tip": "Each dot is one model from the paper's empirical panel. x-axis log θ; y-axis γ. Hagedorn line γ=1 separates Phase A from Phase B. Hover for details, click to load into the recipe form.",
"phase.desc": "23 models in the panel; Padé curve at T=2000.",
"modes.desc": "Quickest start: paste any HuggingFace model id (e.g. meta-llama/Meta-Llama-3-8B), click Profile. See all 5 recipes scored in seconds.",
"profile.title": "📇 Profile a model",
"profile.desc": "For technicians: when you need a complete viability snapshot of a candidate model. One-click runs all 5 recipes and produces a unified TAF Card.",
"profile.preset_label": "Preset:",
"profile.preset_default": "— or pick from list —",
"profile.hf_label": "HF model id:",
"profile.fetch_btn": "📥 Fetch",
"profile.btn": "🚀 Generate full profile",
"profile.quickstart": "💡 Quick start: pick any preset → click Generate. Or paste a model id from HF Hub trending → 📥 Fetch → Generate.",
"compare.title": "🆚 Compare models side-by-side",
"compare.desc": "For technicians: when choosing between 2-3 candidate models for a specific deployment scenario. Same recipe, multiple models, side-by-side verdicts.",
"compare.recipe_label": "Recipe:",
"compare.T_eval_label": "T_eval (target context):",
"compare.models_title": "Models to compare (add up to 3)",
"compare.btn": "🚀 Compare",
"compare.example": "💡 Try: paste 3 popular 7-8B models (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), pick recipe X-2, T_eval=16000. See which best handles long context.",
"ask.title": "❓ Your question",
"ask.placeholder": "e.g. Will Mistral-7B handle 16K NIAH retrieval? Or: I have $5,000, what model can I train? Or: Cheapest GPU to serve Llama-70B at 100M tokens/day?",
"ask.btn": "🚀 Analyze",
"ask.example_btn": "💡 Try an example",
"recipe.title": "📋 Recipe",
"recipe.default": "— select a recipe —",
"recipe.input_title": "🎯 Inputs",
"verdict.title": "📊 Verdict",
"chain.title": "🔍 Computation Chain",
"chain.desc": "Every number below is deterministic Python. Click a step to expand.",
"answer.title": "💬 Plain-English Answer",
"share.btn": "🔗 Copy share link",
"share.copied": "✅ Copied to clipboard!",
"share.download": "💾 Download JSON",
"share.download_md": "📝 Markdown",
"share.download_tex": "📜 LaTeX",
"share.submit": "📤 Submit to registry",
"share.submit_clip_ok": "↗ Opened GitHub. Body copied to clipboard — paste it into the issue body.",
"share.submit_clip_fail": "↗ Opened GitHub. Clipboard blocked — body logged in browser console (F12).",
"share.import_title": "📂 Import a shared TAF result",
"a11y.skip": "Skip to main content",
// v0.6.2 — landing rework: quick-start + inventory + arch tooltips
"qs.title": "⚡ Quick start",
"qs.step1": "Paste a HuggingFace model ID (e.g. meta-llama/Meta-Llama-3-8B)",
"qs.step2": "Click 📇 Profile a model",
"qs.step3": "Read your TAF Card — verdict per use case + key numbers + math verified by Lean+Mathlib",
"qs.cta": "↓ Start now",
"inv.title": "🧰 What this tool gives you",
"inv.recipes.title": "🎯 8 recipes — does this model fit your use case?",
"inv.recipes.x1.title": "Custom train vs API",
"inv.recipes.x1.body": "which is cheaper for your traffic?",
"inv.recipes.x2.title": "Long context",
"inv.recipes.x2.body": "will it handle 32k / 128k tokens reliably?",
"inv.recipes.x3.title": "Budget",
"inv.recipes.x3.body": "with $X, what model can you train from scratch?",
"inv.recipes.x5.title": "Hardware",
"inv.recipes.x5.body": "which GPU to serve N tokens/day?",
"inv.recipes.x19.title": "KV cache",
"inv.recipes.x19.body": "how to compress without breaking quality?",
"inv.recipes.x21.title": "Imprint purity",
"inv.recipes.x21.body": "how clean is the model's positional encoding?",
"inv.recipes.x22.title": "Compute-context",
"inv.recipes.x22.body": "does the model fit the empirical band?",
"inv.recipes.x23.title": "IH-phase",
"inv.recipes.x23.body": "pre- or post-induction-head?",
"inv.diag.title": "🔬 Diagnostics",
"inv.diag.gamma": "γ predicted vs observed — auto-classifies the model into 5 regimes (normal · fraud / inflated context · compressed · over-Padé · sliding-window)",
"inv.diag.cardy": "Cardy ΔH — entropy shift between observed and nominal context",
"inv.diag.fals": "Falsification dashboard — checks 23 specific predictions (F1–F23)",
"inv.diag.alg": "Algebraic consistency — 8 mathematical identities the model must satisfy",
"inv.verify.title": "✓ Formally verified math",
"inv.verify.count": "37 theorems machine-proven in Lean 4 + Mathlib4",
"inv.verify.click": "Click any badge → opens the source line on GitHub",
"inv.verify.reverify": "Verify yourself: lake build (≈5 s after cache fetch)",
"inv.export.title": "📤 Export & share",
"inv.export.formats": "JSON · Markdown · LaTeX (paper-ready)",
"inv.export.share": "Reproducible share link (state encoded in URL)",
"inv.export.registry": "Submit to community registry on GitHub",
"arch.summary": "Architectures supported",
"arch.anyhf": "✓ Any HuggingFace public model",
"tooltip.mha": "Multi-Head Attention: each token position attends through several parallel heads at once.",
"tooltip.gqa": "Grouped Query Attention: queries share fewer keys/values than heads (saves memory but pushes γ toward Hagedorn).",
"tooltip.alibi": "Attention with Linear Biases: position info is a learned slope added to attention scores, no rotation.",
"tooltip.abspe": "Absolute Position Embeddings: each position has a fixed learned vector added to the token embedding.",
"tooltip.swa": "Sliding Window Attention: each token only attends within a fixed local window (Mistral, gemma-2 use this).",
"tooltip.ssm": "State Space Model: a sequence layer that maintains internal state instead of attention (Mamba, Jamba use this).",
// v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker
"modes.unmask": "🪟 Unmask",
"unmask.title": "🪟 Context Unmasker",
"unmask.tip": "Paste a HuggingFace model id (or raw config.json). The tool checks for sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), and GQA — anything that makes max_position_embeddings larger than the practical effective context. Mistral-7B-v0.1 is the canonical example: declared 32k, attends within ~4-8k.",
"unmask.desc": "Are you about to spend money on a model that won't actually attend that far? Paste an id and find out in 1 second. No GPU, no inference — just config.json arithmetic.",
"unmask.id_label": "HF model id:",
"unmask.fetch_btn": "🔍 Unmask",
"unmask.paste_summary": "Or paste raw config.json (private / in-dev models)",
"unmask.paste_btn": "🔍 Unmask pasted config",
"unmask.label.declared": "Declared context",
"unmask.label.effective": "Effective (estimate)",
"unmask.label.ratio": "Ratio",
"unmask.section.flags": "Architecture flags",
"unmask.section.warnings": "Warnings",
"unmask.section.reco": "Recommendation",
"unmask.flag.swa": "SWA",
"unmask.flag.rope": "RoPE scaling",
"unmask.flag.gqa": "GQA",
"unmask.flag.layers": "Layers",
"unmask.flag.dhead": "d_head",
"unmask.flag.theta": "RoPE θ",
"unmask.flag.yes": "yes",
"unmask.flag.no": "no",
"unmask.flag.full_mha": "no (full MHA, {n} heads)",
"unmask.verdict.honest": "✅ HONEST",
"unmask.verdict.inflated": "⚠ INFLATED",
"unmask.verdict.severely_inflated": "❌ SEVERELY INFLATED",
"unmask.verdict.yarn_extended": "⚠ YARN-EXTENDED",
"unmask.verdict.unknown": "❓ UNKNOWN",
"unmask.warn.swa_window": "SWA window: {window} tokens — each layer only attends within this window.",
"unmask.warn.multihop": "Multi-hop estimate: ~{multiHop} tokens (conservative: window × {factor}).",
"unmask.warn.yarn": "RoPE scaling ({type}) extends context {factor}× from ~{original} to {declared} tokens.",
"unmask.warn.yarn_advice": "RoPE-extended context — verify γ behavior at the full claimed length with the γ_check diagnostic.",
"unmask.warn.gqa_small_dhead": "Small head dim ({d_head}) + GQA: KV cache compression at long context is likely (γ pushed toward Hagedorn).",
"unmask.reco.honest": "Standard full-attention model. Effective context matches declared ({declared} tokens).",
"unmask.reco.inflated": "Effective ~{effective} tokens via SWA. Use γ_check to verify behavior at your target evaluation length.",
"unmask.reco.severely_inflated": "Treat as a ~{effective}-token context model in practice. The {declared}-token claim only applies via cross-layer attention chains, which empirically degrade past ~2× the SWA window.",
"unmask.reco.yarn_extended": "RoPE-extended context. Run a long-context benchmark (NIAH at 8k / 16k / 32k / full) to confirm the extension holds. Use γ_check with T_eval = {declared}.",
"unmask.reco.unknown": "Could not parse config. Verify the URL is a valid HF model with public config.json.",
"unmask.status.empty_id": "⚠ Enter a model id (e.g. mistralai/Mistral-7B-v0.1).",
"unmask.status.fetching": "⏳ Fetching config.json for {modelId}...",
"unmask.status.success": "✅ Analyzed {modelId} (verdict: {verdict})",
"unmask.status.empty_paste": "⚠ Paste a config.json first.",
"unmask.status.invalid_json": "❌ Not valid JSON: {error}",
"unmask.status.success_paste": "✅ Analyzed pasted config (verdict: {verdict})",
"unmask.pasted_label": "(pasted config)",
"mode_desc.ask": "Type a free-form question. The in-browser LLM picks the right recipe and runs it.",
"mode_desc.recipe": "Pick a recipe directly and fill the form. Full manual control.",
"mode_desc.profile": "Quickest start: paste any HuggingFace model id, click Profile. See all 5 recipes scored in seconds.",
"mode_desc.compare": "Pick 2-3 candidate models + one recipe. See verdicts side-by-side in a comparison table.",
"mode_desc.inspector": "Paste a config.json directly. Useful for private/in-development models not on HF Hub.",
"mode_desc.diagnose": "Build the diagnose_model.py CLI command to MEASURE γ_obs on real GPU. Browser predicts; CLI measures.",
"mode_desc.phase": "γ × θ scatter of the paper's empirical panel. Hover a dot for details, click to load into Diagnose / Recipe forms.",
"mode_desc.unmask": "Detects whether max_position_embeddings is misleading (SWA / YaRN / RoPE-scaling). Paste a model id, get a 1-line verdict.",
"profile.preset_loaded": "✅ Loaded preset for {id}. Form pre-filled. (Click 📥 Fetch to override with the latest config from HF Hub.)",
// v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer
"modes.template": "📜 Chat-template",
"mode_desc.template": "Detects which chat-template family a model uses (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Gives the exact CLI flag for lm-eval / vLLM / transformers.",
"template.title": "📜 Chat-template Sniffer",
"template.tip": "Paste an HF model id (or raw tokenizer_config.json). Detects the chat-template family and gives you the exact framework command to use it correctly. lm-eval-harness silently halves accuracy if you forget to apply it (issue #1841).",
"template.desc": "Did you forget --apply_chat_template? Most multi-turn evals fail by ~50% because the chat template wasn't applied. Paste a model id, get the exact CLI flag for your stack.",
"template.id_label": "HF model id:",
"template.fetch_btn": "📜 Sniff",
"template.paste_summary": "Or paste raw tokenizer_config.json (private models)",
"template.paste_btn": "📜 Sniff pasted config",
"template.label.family": "Detected family",
"template.label.markers": "Matched markers",
"template.label.tpl_len": "Template length",
"template.section.warnings": "Warnings",
"template.section.commands": "Commands by framework",
"template.section.raw": "Raw template (preview)",
"template.family.custom": "custom (unknown family)",
"template.family.none": "(no chat_template)",
"template.verdict.ok": "✅ TEMPLATE DETECTED",
"template.verdict.custom": "⚠ CUSTOM TEMPLATE",
"template.verdict.missing": "❌ NO CHAT TEMPLATE",
"template.verdict.base_model": "ℹ BASE MODEL (no chat)",
"template.verdict.unknown": "❓ UNKNOWN",
"template.warn.no_chat_template": "No chat_template field in tokenizer_config.json. This is typical for base / pretrained-only models. If you intended an instruct-tuned model, the wrong file may be loaded.",
"template.warn.custom_template": "Template is non-standard ({length} chars). The tool could not match it against known families. Inspect the raw preview below and verify your eval framework supports it.",
"template.warn.lm_eval_apply": "lm-eval-harness: add --apply_chat_template or your accuracy will silently drop ~50% on multi-turn evals (issue #1841).",
"template.warn.vllm_apply": "vLLM serve: verify --chat-template is set (auto-detection sometimes fails for fine-tuned variants). Suggested: {name}.",
"template.status.empty_id": "⚠ Enter a model id (e.g. mistralai/Mistral-7B-Instruct-v0.3).",
"template.status.fetching": "⏳ Fetching tokenizer_config.json for {modelId}...",
"template.status.success": "✅ Sniffed {modelId} (verdict: {verdict})",
"template.status.empty_paste": "⚠ Paste a tokenizer_config.json first.",
"template.status.invalid_json":"❌ Not valid JSON: {error}",
"template.status.success_paste":"✅ Sniffed pasted config (verdict: {verdict})",
"template.pasted_label": "(pasted tokenizer_config)",
// v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor
"modes.arena": "🎯 Arena CI",
"mode_desc.arena": "Recovers confidence intervals from raw pairwise vote data (Bradley-Terry MLE + bootstrap). Detects statistically tied pairs that the public Arena leaderboard hides.",
"arena.title": "🎯 Arena-Elo CI Reconstructor",
"arena.tip": "Chatbot Arena strips confidence intervals from the public leaderboard. A 5-Elo gap can be statistically meaningless. Paste raw vote data (model_a, model_b, winner) — the tool computes Bradley-Terry MLE + bootstrap CIs and lists statistical ties (CI overlap).",
"arena.desc": "Is GPT-4 actually better than Claude — or are they tied? Paste pairwise vote CSV (or click Load sample). Bradley-Terry MLE + 200-iteration bootstrap → ranked Elos with 95% CIs and statistical-tie detection. All in browser.",
"arena.sample_btn": "📊 Load sample data",
"arena.run_btn": "🎯 Compute CIs",
"arena.clear_btn": "🗑️ Clear",
"arena.csv_summary": "Vote CSV (header: model_a,model_b,winner; winner ∈ a/b/tie)",
"arena.section.ranked": "Ranked Elos with 95% CIs",
"arena.section.ties": "Statistical ties (CI overlap)",
"arena.section.summary": "Summary",
"arena.col.rank": "#",
"arena.col.model": "Model",
"arena.col.elo": "Elo",
"arena.col.ci": "95% CI",
"arena.col.ci_width": "± half-width",
"arena.col.matches": "Matches",
"arena.col.wins": "W / L / T",
"arena.col.tie_pair": "Pair",
"arena.col.tie_diff": "Elo gap",
"arena.col.tie_overlap": "CI overlap",
"arena.no_ties": "No statistical ties — all pairs distinguishable at 95% CI.",
"arena.summary.votes": "Total votes",
"arena.summary.models": "Models",
"arena.summary.ties": "Statistical ties",
"arena.summary.bootstrap": "Bootstrap iters",
"arena.summary.ci_level": "CI level",
"arena.status.empty": "⚠ Paste vote CSV or click Load sample.",
"arena.status.too_few": "⚠ Only {n} valid votes — need at least 10 to fit Bradley-Terry reliably.",
"arena.status.computing": "⏳ Computing Bradley-Terry MLE + bootstrap on {n} votes...",
"arena.status.done": "✅ {n} votes · {models} models · {ties} statistical ties · {ms} ms",
"arena.status.sample_loaded": "✅ Sample loaded (synthetic 6-model Arena data). Click Compute CIs.",
// v0.7.3 — anti-bullshit pack #4: Contamination Prior
"modes.contam": "🧪 Contamination",
"mode_desc.contam": "Bayesian-ish prior on whether a benchmark score is contaminated. Enter your model's training cutoff → rates 20+ popular benchmarks (MMLU, GSM8K, HumanEval, MMLU-Pro…).",
"contam.title": "🧪 Contamination Prior",
"contam.tip": "Computes a Bayesian-ish prior on whether a benchmark score is contaminated, based on (model training cutoff date) × (benchmark release date) × (known corpus inclusion + leak history). Open LLM Leaderboard v1 was killed in 2024 after MMLU/HellaSwag scores became contaminated.",
"contam.desc": "Should you trust your model's MMLU score? Enter the model's training cutoff date — the tool rates 20+ popular benchmarks (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) and tells you which scores are likely contaminated.",
"contam.cutoff_label": "Training cutoff:",
"contam.run_btn": "🧪 Rate all benchmarks",
"contam.section.ranked": "Benchmark contamination priors",
"contam.section.high": "🔴 High-risk benchmarks (treat scores as unreliable)",
"contam.section.medium": "🟡 Medium-risk (verify with alternates)",
"contam.section.low": "🟢 Low-risk (likely clean)",
"contam.col.benchmark": "Benchmark",
"contam.col.released": "Released",
"contam.col.gap": "Gap (months)",
"contam.col.prior": "P(contam)",
"contam.col.level": "Level",
"contam.col.corpora": "In corpora",
"contam.col.category": "Category",
"contam.label.high": "High risk",
"contam.label.medium": "Medium",
"contam.label.low": "Low",
"contam.no_entries": "(none in this category)",
"contam.advice.high": "Treat these scores as unreliable. Replace with newer / private-test alternates (MMLU-Pro, GPQA, MUSR, MATH-500).",
"contam.advice.medium": "Take with caution. Look for replication on a held-out subset or community reproductions.",
"contam.advice.low": "Score likely uncontaminated, but absence of leak is not proof — still cross-check with alternate test.",
"contam.summary.headline": "Cutoff {cutoff} · {n} benchmarks rated",
"contam.status.empty": "⚠ Enter a model training cutoff date (e.g. 2023-12).",
"contam.status.bad_date": "⚠ Bad date format. Use YYYY-MM or YYYY-MM-DD.",
"contam.status.done": "✅ Cutoff {cutoff} · {n} benchmarks rated · {high} high-risk",
// v0.7 — Help modal section
"help.v07.title": "🆕 v0.7 — Anti-bullshit pack (4 new modes)",
"help.v07.intro": "v0.7 (2026-05-06): four new modes that solve concrete pain points reported by the HuggingFace community. Each one runs in your browser with no inference — pure metadata + math.",
"help.v07.unmask.title": "🪟 Context Unmasker",
"help.v07.unmask.body": "Detects when max_position_embeddings is misleading. Mistral-7B-v0.1 declares 32k but attends within ~4-8k via SWA. Paste an HF model id → 1-second verdict (HONEST / INFLATED / SEVERELY INFLATED / YARN-EXTENDED). Catches SWA, RoPE-scaling (YaRN/linear/dynamic NTK), small-d_head + GQA. Use case: before paying GPU for 32k context, verify the model actually attends that far.",
"help.v07.template.title": "📜 Chat-template Sniffer",
"help.v07.template.body": "Detects which chat-template family a model uses (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) and gives you the exact CLI flag for lm-evaluation-harness, vLLM, and transformers. Solves issue #1841 in lm-eval-harness: forgetting --apply_chat_template silently halves multi-turn accuracy. Use case: before reporting a benchmark score, confirm you applied the template correctly.",
"help.v07.arena.title": "🎯 Arena-Elo CI Reconstructor",
"help.v07.arena.body": "Chatbot Arena strips confidence intervals from its public leaderboard — a 5-Elo gap can be statistically meaningless. Paste raw pairwise vote data (model_a, model_b, winner) → Bradley-Terry MLE + 200-iteration bootstrap → ranked Elos with 95% CIs and a \"statistical ties\" panel listing pairs whose CIs overlap. Try the Load sample button. Use case: before declaring \"model A beats model B\", verify their CIs don't overlap.",
"help.v07.contam.title": "🧪 Contamination Prior",
"help.v07.contam.body": "Bayesian-ish prior on whether a benchmark score is contaminated. Enter your model's training cutoff date → tool rates 20+ popular benchmarks (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) by P(contamination) based on time gap, corpus inclusion, and known leak history. Open LLM Leaderboard v1 was killed in 2024 after MMLU/HellaSwag scores became contaminated. Use case: decide which scores to trust when comparing two models.",
"help.v07.quant.title": "⚖️ Quant-regime Classifier",
"help.v07.quant.body": "Predicts γ-shift and ΔPPL for any (model × quant scheme: NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8, …). Architecture-aware: small d_head + aggressive GQA → more sensitive; calibrated schemes (AWQ) absorb shift better than uncalibrated (NF4). Recommends safer alternatives if a cliff is detected. Use case: before quantizing, predict whether your specific architecture × scheme combo will keep PPL acceptable, with a concrete switch-to suggestion otherwise.",
"help.v07.drift.title": "🔀 Cross-framework Drift Bound",
"help.v07.drift.body": "Same model, different scores on different setups. Tool predicts the maximum drift admissible from numerical noise alone (dtype, framework, batch). If the observed gap exceeds it → real bug, typically chat-template mismatch (lm-eval-harness issue #1841) or KV-cache layout. Try the "Load sample" button for the canonical chat-template bug. Use case: before reporting a regression or claiming reproducibility, verify whether the gap between two evals is bigger than what numerical noise can explain.",
"inv.v07.drift": "🔀 Drift — bug or noise? Predict max admissible gap between two evals",
"help.v07.niah.title": "🔍 NIAH → Reasoning Gap",
"help.v07.niah.body": "RULER paper (NVIDIA 2024) shows that long-context models often pass NIAH (needle retrieval) but fail multi-hop reasoning at the same context. Tool predicts both pass rates from architecture (γ_Padé + d_horizon + arch pressure: small d_head, GQA, SWA), reports the gap, and finds your model's \"safe reasoning context\" where reasoning stays ≥65%. Sweep mode shows the curve across 1k/4k/16k/64k/T_train. Use case: before deploying at the claimed context, find out whether the model will actually reason there or just retrieve.",
"inv.v07.niah": "🔍 NIAH→Reason — does your \"128k context\" actually reason there, or just retrieve?",
// v0.7 — Inventory modal 5th card
"inv.v07.title": "🆕 v0.7 anti-bullshit pack",
"inv.v07.unmask": "🪟 Unmask — config.json claims 32k? See if it actually attends that far",
"inv.v07.template": "📜 Chat-template — exact CLI flag so lm-eval doesn't silently halve your accuracy",
"inv.v07.arena": "🎯 Arena CI — recover the confidence intervals Chatbot Arena hides",
"inv.v07.contam": "🧪 Contamination — rate 20+ benchmarks for contamination probability",
"inv.v07.quant": "⚖️ Quant — predict γ shift + ΔPPL for any (model × quant scheme) combo",
// v0.7.3 — anti-bullshit pack #5: Quant-regime classifier
"modes.quant": "⚖️ Quant",
"mode_desc.quant": "Predicts γ-shift and ΔPPL for any (model × quant scheme). Architecture-aware: small d_head + GQA → more sensitive. Recommends safer alternatives if a cliff is detected.",
"quant.title": "⚖️ Quant-regime Classifier",
"quant.tip": "Predicts γ-shift (and downstream ΔPPL) for a given (model × quant scheme). Generic claims like 'AWQ ~95% retention' are too vague — TAF uses d_head, GQA ratio, SWA flag, and model size to give an architecture-specific verdict. Solves: HF community widely reports unpredictable quant cliffs (NF4 -2 PPL on Phi-3 but fine on Llama-3-8B).",
"quant.desc": "Will quantizing your model break it? Paste an HF model id, pick a quant scheme — get predicted γ-shift, expected ΔPPL band, and a recommended alternative if it's a cliff. Browser-only, no GPU, no calibration set required.",
"quant.id_label": "HF model id:",
"quant.fetch_btn": "📥 Fetch config",
"quant.scheme_label": "Quant scheme:",
"quant.run_btn": "⚖️ Predict",
"quant.all_btn": "📊 Compare all schemes",
"quant.regime.safe": "✅ SAFE",
"quant.regime.mild": "✅ MILD COMPRESSION",
"quant.regime.significant": "⚠ SIGNIFICANT DEGRADATION",
"quant.regime.cliff": "❌ HEAVY CLIFF",
"quant.label.gamma_shift": "γ shift",
"quant.label.delta_ppl": "ΔPPL (est.)",
"quant.label.arch_mult": "Arch multiplier",
"quant.section.breakdown": "Breakdown",
"quant.section.reco": "Recommendation",
"quant.section.compare": "All schemes (sorted by safety)",
"quant.field.scheme": "Scheme",
"quant.field.calibrated": "calibrated",
"quant.field.uncalibrated": "uncalibrated",
"quant.field.base_penalty": "Base penalty",
"quant.field.arch_mult_full": "Architecture multiplier",
"quant.field.gamma_shift": "Predicted γ shift",
"quant.field.ppl_band": "ΔPPL band (est.)",
"quant.field.params": "Parameters",
"quant.col.scheme": "Scheme",
"quant.col.bits": "Bits",
"quant.col.gamma_shift": "γ shift",
"quant.col.ppl_band": "ΔPPL band",
"quant.col.regime": "Regime",
"quant.reco.switch_to_awq": "Switch to {scheme} — calibrated 4-bit handles small d_head + GQA much better than NF4. Expected ΔPPL drops ~2-3×.",
"quant.reco.switch_to_q5_km": "Switch to {scheme} — Q5 keeps more head dimensions intact at low cost (only ~25% bigger file).",
"quant.reco.switch_to_q4_km": "Switch to {scheme} — Q3/Q2 are too aggressive for this architecture.",
"quant.reco.consider_awq": "Consider {scheme} — calibration meaningfully reduces γ-shift on this architecture.",
"quant.reco.use_higher_bits": "Use higher-bit alternative — this architecture cannot absorb 4-bit cleanly. Try 5- or 8-bit.",
"quant.reco.verify_with_eval": "Verify with a real eval — predicted shift is borderline. Run NIAH at your target context before deploying.",
"quant.reco.no_action": "No action needed — quantization is safe for this architecture.",
"quant.summary.headline_all": "All schemes for {modelId}",
"quant.status.empty_id": "⚠ Enter a model id (e.g. meta-llama/Llama-3.2-1B).",
"quant.status.fetching": "⏳ Fetching config.json for {modelId}...",
"quant.status.fetched": "✅ Config fetched for {modelId}. Pick a scheme and click Predict (or Compare all schemes).",
"quant.status.no_scheme": "⚠ Pick a quant scheme from the dropdown.",
"quant.status.done": "✅ Predicted regime: {regime}",
"quant.status.done_all": "✅ Compared {n} schemes — sorted by safety.",
// v0.7.4 — HF Hub autocomplete privacy + rate-limit notices
"hf_auto.privacy": "🔒 Queries sent to huggingface.co/api · cached locally 5 min",
"hf_auto.rate_limited": "⚠ HuggingFace rate limit — try again in a moment, or type the full model id manually",
"hf_auto.gated_msg": "is gated. Accept the license here:",
// v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound
"modes.drift": "🔀 Drift",
"mode_desc.drift": "Predicts max-allowable drift between two benchmark scores given (framework, dtype, batch, chat-template). Flags real bugs vs numerical noise.",
"drift.title": "🔀 Cross-framework Drift Bound",
"drift.tip": "Same model, different scores on different setups. Is the gap noise or a real bug? Enter two scores with their (framework, dtype, batch, chat-template) — tool predicts the maximum allowable drift from numerical noise alone. If observed gap exceeds it → real bug, usually chat-template mismatch (lm-eval issue #1841) or KV-cache layout.",
"drift.desc": "Your model gives 67.2 on lm-eval-hf and 65.1 on vLLM-served. Bug or noise? Enter both scores with (framework, dtype, batch, chat-template applied?). Tool predicts the noise band and flags real bugs. arxiv 2506.09501 documents this as a major eval reproducibility problem.",
"drift.setup_a": "Setup A",
"drift.setup_b": "Setup B",
"drift.score": "Score",
"drift.framework": "Framework",
"drift.dtype": "Dtype",
"drift.batch": "Batch",
"drift.template": "Chat-template",
"drift.template.applied": "applied",
"drift.template.not_applied": "not applied",
"drift.template.unknown": "unknown",
"drift.run_btn": "🔀 Compute drift bound",
"drift.sample_btn": "📊 Load sample (chat-template bug)",
"drift.label.observed": "Observed gap",
"drift.label.band": "Numerical band",
"drift.label.ratio": "Gap / band",
"drift.section.setups": "Setups",
"drift.section.breakdown": "Drift contributors (numerical band)",
"drift.section.verdict": "Verdict & recommendation",
"drift.contrib.dtype": "Dtype mismatch",
"drift.contrib.framework": "Framework",
"drift.contrib.batch": "Batch difference",
"drift.contrib.template": "Chat-template MISMATCH",
"drift.dominant_cause": "Dominant cause",
"drift.cause.dtype": "dtype precision difference",
"drift.cause.framework": "framework / kernel difference",
"drift.cause.batch": "batch normalization paths",
"drift.cause.template_mismatch": "chat-template applied on one side but not the other (lm-eval-harness #1841 pattern — typical -50% drop on multi-turn)",
"drift.verdict.noise": "✅ NUMERICAL NOISE",
"drift.verdict.suspicious": "⚠ SUSPICIOUS — verify",
"drift.verdict.bug": "❌ REAL BUG — investigate",
"drift.verdict.bug_template": "❌ CHAT-TEMPLATE BUG",
"drift.reco.noise": "Gap fits within the expected numerical-noise band. No action needed; the difference is consistent with framework/dtype/batch variation alone.",
"drift.reco.suspicious": "Gap is 1–2× the predicted noise band. Borderline — possibly a real bug. Try aligning the dominant contributor (e.g. match framework or dtype) and re-test.",
"drift.reco.bug": "Gap is > 2× the predicted noise band. This is a real bug. Inspect the dominant contributor — most likely tokenizer / chat-template / KV-cache layout difference. Run lm-eval-harness with --apply_chat_template and confirm.",
"drift.reco.bug_template": "Chat-template mismatch detected. This is the most common cause of large eval discrepancies (lm-eval-harness issue #1841). Re-run the "not applied" side with --apply_chat_template (or set vLLM --chat-template <name>) and re-test.",
"drift.status.empty_scores": "⚠ Enter both scores.",
"drift.status.done": "✅ Verdict: {verdict}",
"drift.status.sample_loaded": "✅ Sample loaded (canonical chat-template bug). Click Compute drift bound.",
// v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor
"modes.niah": "🔍 NIAH→Reason",
"mode_desc.niah": "Predicts NIAH (retrieval) and multi-hop reasoning pass rates at any context. Solves: long-context models often pass NIAH but fail reasoning at the same context (RULER paper).",
"modes.saturation": "📈 Saturation",
"mode_desc.saturation": "Tells you whether a benchmark still discriminates frontier models or has saturated (e.g. MMLU 88-94% top, AIME 2025 already 96-100%). Returns top-3 + verdict + recommended replacements.",
"modes.hub": "🧭 Solutions",
"mode_desc.hub": "Map of every documented LLM-eval pain → tafagent mode (if covered) + curated external tools. Find the right solution without rebuilding it. 30+ pains, 7 categories.",
"modes.yarn": "🧵 YaRN Planner",
"mode_desc.yarn": "Generate the exact rope_scaling config to extend a model past its trained context — plus a TAF verdict on whether attention quality actually holds at the target length.",
"modes.gguf": "🧊 GGUF Bridge",
"mode_desc.gguf": "Read a GGUF file's metadata header (rope_theta, context_length, quant) in your browser and get a TAF quality verdict — the question the VRAM calculators skip: fits AND works?",
"gguf.title": "🧊 GGUF Validity Bridge",
"gguf.tip": "Fits in VRAM ≠ works. The GGUF/VRAM calculators read a model's metadata to tell you if a quant fits in your GPU. This reads the SAME metadata (rope_theta, context_length, quant scheme, head geometry) straight from the .gguf header via HTTP Range — no multi-GB download — and answers the question they don't: does attention quality actually hold, and how much does the quant erode it (γ-shift, ΔPPL)?",
"gguf.desc": "Paste a GGUF repo (e.g. Qwen/Qwen2.5-7B-Instruct-GGUF), pick a quant file, and get a TAF quality verdict: the model's effective attention horizon, plus how much the chosen quantization shifts γ for this specific architecture. Reads only the file header in your browser.",
"gguf.repo_label": "GGUF repo id:",
"gguf.list_btn": "📂 List quant files",
"gguf.file_label": "Quant file:",
"gguf.target_label": "Target context L (optional):",
"gguf.analyze_btn": "🧊 Analyze GGUF",
"gguf.all_btn": "📊 Compare all quants",
"gguf.compare_title": "All quants — quality comparison",
"gguf.col.verdict": "Verdict",
"gguf.col.gamma_at_l": "γ @ L (after quant)",
"gguf.need_repo": "Enter a GGUF repo id like 'Qwen/Qwen2.5-7B-Instruct-GGUF'",
"gguf.listing": "Listing .gguf files from HF Hub…",
"gguf.no_files": "No .gguf files found in that repo.",
"gguf.found": "quant files found",
"gguf.pick_hint": "pick one and click Analyze.",
"gguf.reading": "Reading GGUF header via HTTP Range…",
"gguf.read_ok": "Header parsed",
"gguf.verdict.healthy": "HEALTHY — effective horizon reaches L with good γ after quant",
"gguf.verdict.usable_with_care":"USABLE WITH CARE — reaches L but γ is modest after quant",
"gguf.verdict.degrades": "DEGRADES — attention collapses before L (or quant pushes it there)",
"gguf.r.arch": "Architecture",
"gguf.r.ctx_train": "Trained context",
"gguf.r.horizon_fp16": "Attention horizon (fp16)",
"gguf.r.quant": "Quant scheme",
"gguf.r.gamma_shift": "γ-shift from quant",
"gguf.r.after_quant": "(after quant)",
"gguf.r.eff_horizon": "Effective horizon (quantised)",
"gguf.r.no_quant_shift": "— full precision, no γ-shift",
"gguf.r.note": "Horizon from γ_Padé / d_horizon (architecture). Quant γ-shift + ΔPPL from the quant-regime model (calibrated to llama.cpp PPL + AWQ/GPTQ papers). Both are estimates — verify borderline cases with a real eval.",
"gguf.err.not_gguf": "That file isn't a valid GGUF (bad magic).",
"gguf.err.too_large": "Metadata header exceeds the fetch cap — unusually large tokenizer. Try another quant.",
"gguf.err.incomplete": "GGUF metadata is missing rope_theta or context_length — can't compute the horizon.",
"help.v091.gguf.title": "🧊 GGUF Validity Bridge",
"help.v091.gguf.body": "The dozen GGUF/VRAM calculators (NyxKrage, oobabooga, …) read a .gguf header to tell you if a quant fits in your GPU. This reads the same header — via HTTP Range, so no multi-GB download — and answers the question they skip: does it fit AND still work? Paste a GGUF repo, pick a quant file; the bridge pulls rope_theta, context_length, the quant scheme (from general.file_type or the filename), and head geometry, then runs TAF's γ_Padé / d_horizon plus the architecture-aware quant-regime γ-shift. Output: effective attention horizon at the trained context, how far the quant erodes γ (and ΔPPL) for this model, and a verdict. Use case: 'Q4_K_M fits 8GB — but is it brain-dead past 30K?' → see the horizon and the Q4 γ-penalty before you download 6 GB.",
"yarn.title": "🧵 YaRN / RoPE Context-Extension Planner",
"yarn.tip": "Config + verdict, not just VRAM. The GGUF/VRAM calculators tell you if a context length fits in GPU. This tells you the exact rope_scaling block to put in config.json AND whether attention quality will actually hold at that length — using TAF's γ_Padé / d_horizon machinery, all in your browser.",
"yarn.desc": "Want to run a model past its trained context? Enter the model (or its θ + trained context) and your target length L. Get the copy-paste rope_scaling snippet for transformers ≥4.43, plus a TAF verdict: does the effective attention horizon reach L, or will the model just hallucinate past d_horizon?",
"yarn.model_label": "HF model id (optional):",
"yarn.fetch_btn": "📥 Fetch config",
"yarn.orig_label": "Trained context (orig max_position_embeddings):",
"yarn.theta_label": "RoPE θ (rope_theta):",
"yarn.target_label": "Target context L:",
"yarn.type_label": "RoPE scaling method:",
"yarn.type_auto": "Auto (recommended)",
"yarn.plan_btn": "🧵 Plan extension",
"yarn.need_id": "Enter a model id like 'Qwen/Qwen2.5-7B-Instruct'",
"yarn.fetching": "Fetching config.json from HF Hub…",
"yarn.loaded_hint": "Adjust if needed, then click Plan extension.",
"yarn.verdict.healthy": "HEALTHY — effective horizon reaches L with good γ",
"yarn.verdict.usable_with_care":"USABLE WITH CARE — works but γ is modest near L",
"yarn.verdict.needs_finetune": "NEEDS FINE-TUNE — factor too large for closed-form alone",
"yarn.verdict.degrades": "DEGRADES — attention collapses before L",
"yarn.verdict.no_extension_needed":"NO EXTENSION NEEDED — L already inside trained context",
"yarn.r.factor": "Extension factor",
"yarn.r.method": "Method",
"yarn.r.naive": "(no extension)",
"yarn.r.eff": "(after extension)",
"yarn.r.from": "from",
"yarn.r.snippet": "config.json snippet",
"yarn.r.collapsed": "collapsed (past Padé pole)",
"yarn.copy_btn": "Copy config",
"yarn.copied": "Copied",
"yarn.warn.theta_eff_estimate":"θ_eff ≈ θ×factor is a first-order NTK estimate; YaRN's per-band ramp may differ slightly.",
"yarn.warn.aggressive": "Aggressive factor > 4× — quality past d_horizon is unreliable without fine-tuning.",
"yarn.warn.horizon_short": "Effective horizon does not cover L — expect coherence loss past d_horizon.",
"yarn.warn.finetune": "RoPE extension here is a closed-form estimate; transformers docs + the YaRN paper recommend a short fine-tune for factors beyond ~2–4×.",
"yarn.err.no_orig": "Enter the trained context (orig max_position_embeddings), or fetch a model.",
"yarn.err.no_theta": "Enter RoPE θ (rope_theta), or fetch a model.",
"yarn.err.no_target": "Enter a target context length L.",
"help.v09.title": "🆕 v0.9 — YaRN / RoPE Context-Extension Planner",
"help.v09.intro": "v0.9 (2026-05-23): the most-asked HuggingFace question — \"how do I set rope_scaling to extend context, and will it actually work?\" — answered with a copy-paste config snippet AND a TAF quality verdict. Browser-only, no inference.",
"help.v09.yarn.title": "🧵 YaRN / RoPE Context-Extension Planner",
"help.v09.yarn.body": "The dozen GGUF/VRAM calculators on HF (NyxKrage, oobabooga, DavidAU, …) all answer the same question: does context length L fit in my GPU? None answer the harder one: does L fit AND still work? Enter a model id (or its θ + trained context) and a target length L. The planner computes the extension factor, emits the exact rope_scaling block for transformers ≥4.43 (yarn / linear / dynamic / llama3, with paper-default β ramps), then runs TAF's γ_Padé / d_horizon math: γ with no extension (the problem), γ after the chosen method (the fix), the effective attention horizon, and a verdict — HEALTHY / USABLE-WITH-CARE / NEEDS-FINETUNE / DEGRADES. It flags the θ_eff≈θ·factor estimate and the >4× fine-tune requirement honestly. Use case: 'I want Mistral-7B (θ=10k, 8k trained) at 32k' → see γ collapse from naive use, YaRN partially recover it, and get the exact config to paste.",
"niah.title": "🔍 NIAH → Reasoning Gap",
"niah.tip": "NIAH (Needle in a Haystack) tests retrieval: 'find this fact in long text'. Multi-hop reasoning tests inference: 'combine facts X+Y at the start with fact Z at the end'. RULER paper (NVIDIA 2024) shows long-context models often pass NIAH but fail reasoning at the same context. This tool predicts both pass rates from architecture alone.",
"niah.desc": "Your model claims 128k context. Will it actually reason at 64k, or just retrieve? Paste an HF model id and a target eval context — tool predicts NIAH and multi-hop reasoning pass rates, the gap, and a 'safe context' where reasoning stays ≥65%.",
"niah.id_label": "HF model id:",
"niah.fetch_btn": "📥 Fetch config",
"niah.teval_label": "Target context (T_eval):",
"niah.run_btn": "🔍 Predict",
"niah.sweep_btn": "📊 Sweep contexts",
"niah.label.niah": "NIAH pass rate",
"niah.label.reasoning": "Reasoning pass rate",
"niah.label.gap": "Gap",
"niah.label.safe_ctx": "Safe reasoning context",
"niah.section.breakdown": "Architecture breakdown",
"niah.section.reco": "Recommendation",
"niah.calib.heading": "RULER-calibrated (NVIDIA published data)",
"niah.calib.matched": "Matched {alias} → KB row {canonical}.",
"niah.calib.aggregate": "RULER aggregate",
"niah.calib.interp": "interpolated between",
"niah.calib.extrapolated": "extrapolated outside RULER's measured range",
"niah.calib.col.heuristic": "Heuristic",
"niah.calib.col.calibrated": "RULER-calibrated",
"niah.calib.col.delta": "Δ",
"niah.calib.factors": "Per-task factors from RULER paper Appendix Tables 13-16:",
"niah.calib.factors_caveat": "honest range: retrieval 0.95-1.10×, reasoning 0.60-0.85×",
"niah.calib.claimed_vs_effective": "Paper-reported",
"niah.calib.claimed": "claimed",
"niah.calib.effective": "effective",
"niah.calib.source": "Source",
"niah.calib.miss": "RULER calibration unavailable for this model — using architectural heuristic only. Add to data/ruler_kb.json if you have measured numbers.",
"niah.section.sweep": "Pass rate sweep across context lengths",
"niah.field.dhorizon": "d_horizon (effective)",
"niah.field.ratio": "T_eval / d_horizon",
"niah.field.arch_pressure": "Arch pressure (small d_head + GQA + SWA)",
"niah.field.theta": "RoPE θ",
"niah.field.t_train": "T_train (claimed)",
"niah.col.context": "T_eval",
"niah.col.niah": "NIAH",
"niah.col.reasoning": "Reasoning",
"niah.col.gap": "Gap",
"niah.col.verdict": "Verdict",
"niah.verdict.robust": "✅ ROBUST",
"niah.verdict.marginal": "⚠ MARGINAL",
"niah.verdict.degraded": "⚠ DEGRADED",
"niah.verdict.retrieval_only": "❌ RETRIEVAL-ONLY",
"niah.verdict.broken": "❌ BROKEN",
"niah.reco.robust": "Both retrieval and reasoning hold up at this context. Safe to deploy for both lookup and inference tasks.",
"niah.reco.marginal": "Borderline. Retrieval works but reasoning is shaky. Use for fact-lookup, not multi-step inference.",
"niah.reco.degraded": "Significant reasoning drop. The model can find facts but struggles to combine them. Avoid multi-hop tasks at this length.",
"niah.reco.retrieval_only": "Canonical RULER finding: model passes NIAH but fails reasoning. Useful for retrieval-augmented setups (where the LLM only locates facts) but NOT for chained inference. Cut your context to the 'safe' value below.",
"niah.reco.broken": "Model fails even basic retrieval at this context. Treat as out-of-distribution — re-test at a shorter context.",
"niah.safe_context": "≤ {ctx} tokens (reasoning ≥ 65%)",
"niah.safe_context_none": "No safe context found below your target — model fails reasoning even at small contexts.",
"niah.summary.sweep": "{modelId} — pass rates by context",
"niah.status.empty_id": "⚠ Enter a model id (e.g. meta-llama/Llama-3.1-8B-Instruct).",
"niah.status.bad_teval": "⚠ Enter a target context (≥ 512 tokens).",
"niah.status.fetching": "⏳ Fetching config.json for {modelId}...",
"niah.status.fetched": "✅ Config fetched for {modelId}. Set T_eval and click Predict (or Sweep contexts).",
"niah.status.done": "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%",
"niah.status.sweep_done": "✅ Swept {n} context lengths.",
"saturation.title": "📈 Benchmark Saturation Detector",
"saturation.tip": "MMLU is saturated (88-94% all frontier models). Reporting '92% on MMLU' is now meaningless. This tool tells you which benchmarks still discriminate frontier models, which are saturated, and what to use instead. Data: DemandSphere AI Frontier Tracker (CC BY-NC 4.0) refreshed 2026-05.",
"saturation.desc": "Is your benchmark still useful? Pick a benchmark to see top-3 frontier scores, spread, and a verdict (saturated / near-saturated / discriminative) plus recommended replacements.",
"saturation.select_label": "Benchmark:",
"saturation.select.all": "— show all benchmarks —",
"saturation.run_btn": "📈 Classify",
"saturation.all_btn": "📊 Show all",
"saturation.col.spread": "Top-3 spread",
"saturation.col.mean": "Top-3 mean",
"saturation.col.n": "Models",
"saturation.col.bench": "Benchmark",
"saturation.col.verdict": "Verdict",
"saturation.col.reco": "Top reco",
"saturation.col.model": "Model",
"saturation.col.score": "Score",
"saturation.section.top3": "Top-3 frontier scores",
"saturation.section.recommendations": "Recommended alternatives",
"saturation.section.note": "Notes",
"saturation.section.all": "All tracked benchmarks",
"saturation.verdict.saturated": "🚨 SATURATED",
"saturation.verdict.near_saturated": "⚠ NEAR SATURATED",
"saturation.verdict.discriminative": "✅ DISCRIMINATIVE",
"saturation.verdict.sparse_data": "ℹ SPARSE DATA",
"saturation.borderline": "Borderline — within ±1pp of a threshold cutoff. Treat verdict as 'check carefully'.",
"saturation.unknown": "Unknown benchmark.",
"saturation.attribution": "Data: DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (open-weight historical) · last fetch 2026-05-05.",
"saturation.status.live": "✅ Live data loaded — {count} models.",
"saturation.status.baked": "ℹ Using baked snapshot (live fetch unavailable).",
"saturation.status.kb_fail": "⚠ Could not load saturation KB.",
"saturation.status.done": "✅ {name} — {verdict}",
"saturation.status.all_done": "✅ Classified {n} benchmarks.",
"help.v08.saturation.title": "📈 Benchmark Saturation Detector",
"help.v08.saturation.body": "MMLU is saturated (88-94% top), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. Use case: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.",
"inv.v08.saturation": "📈 Saturation — is your benchmark still useful, or are all frontier models tied at the top?",
// v0.8.2 — anti-bullshit pack #8: JSON CoT-aware Linter
"modes.cot": "📋 JSON CoT",
"mode_desc.cot": "Lints a JSON Schema (or example response object) for the answer-before-reasoning anti-pattern. Constrained-decoding engines emit fields in property order — if `answer` comes before `reasoning`, CoT is defeated.",
"cot.title": "📋 JSON CoT-aware Linter",
"cot.tip": "Constrained-decoding engines (llguidance, Outlines, SGLang grammars) emit JSON properties in schema order. If your schema places `answer` before `reasoning`, the model commits to a final answer first and only then writes the rationale to justify it — defeating Chain-of-Thought entirely. Paste a JSON Schema (or example object) and the linter flags the ordering.",
"cot.desc": "Reasoning before answer, always. Paste a JSON Schema or example response object — the linter reports whether reasoning fields come before answer fields and suggests a fix.",
"cot.input.placeholder": "{ \"type\": \"object\", \"properties\": { \"answer\": {\"type\": \"string\"}, \"reasoning\": {\"type\": \"string\"} } }",
"cot.lint_btn": "🔍 Lint",
"cot.example_good_btn": "↳ Example: good order",
"cot.example_bad_btn": "↳ Example: anti-pattern",
"cot.status.done": "✅ {verdict}",
"cot.col.field": "Field",
"cot.col.type": "Role",
"cot.field.reasoning": "reasoning",
"cot.field.answer": "answer",
"cot.field.other": "—",
"cot.field_count": "{n} fields",
"cot.verdict.good_order": "✅ Good order — reasoning before answer",
"cot.verdict.anti_pattern": "❌ Anti-pattern — answer before reasoning",
"cot.verdict.missing_reasoning": "⚠ Missing reasoning field",
"cot.verdict.missing_answer": "ℹ No answer-like field detected",
"cot.verdict.no_cot_fields": "ℹ No reasoning/answer fields detected",
"cot.verdict.invalid_json": "❌ Invalid JSON",
"cot.verdict.non_object": "ℹ Top-level value is not an object",
"cot.verdict.empty_fields": "ℹ No fields to analyse",
"cot.explain.good_order": "Constrained decoding will emit the rationale first, so the model can think before committing. Chain-of-Thought stays honest.",
"cot.explain.anti_pattern": "The model is forced to emit the answer field first; any reasoning that follows can only justify what was already committed. Reorder so reasoning-like fields come before answer-like fields.",
"cot.explain.missing_reasoning": "An answer field is present but no reasoning field. If you want CoT, add a `reasoning` (or `chain_of_thought`, `analysis`, …) field before the answer.",
"cot.explain.missing_answer": "A reasoning field is present but no obvious answer field. Make sure the schema actually requires the model to commit a final value.",
"cot.explain.no_cot_fields": "Object has fields, but none look reasoning- or answer-like by name. The linter is conservative — if the schema is intentional, ignore. Otherwise add explicit reasoning/answer fields.",
"cot.hint.non_object": "Top-level must be a JSON object (`{ … }`) or a JSON Schema with `properties`.",
"cot.hint.empty_fields": "No fields detected. Paste a JSON Schema, an example response, or click an example button below the textarea.",
"cot.suggested_fix.title": "✓ Suggested fix",
"cot.suggested_fix.desc": "Reordered properties — reasoning fields first, then any context fields, then answer fields. `required[]` (if present) is mirrored to match.",
"cot.suggested_fix.copy": "📋 Copy",
"cot.suggested_fix.copied": "✓ Copied",
"cot.attribution": "Refs:",
"inv.v082.cot": "📋 JSON CoT — lints structured-output schemas for the answer-before-reasoning anti-pattern that silently breaks Chain-of-Thought.",
"help.v082.cot.title": "📋 JSON CoT-aware Linter",
"help.v082.cot.body": "Constrained-decoding engines (llguidance, Outlines, SGLang grammars) emit JSON properties in the order your schema declares them. If you write { answer, reasoning } the model commits to answer first and CoT collapses into post-hoc justification. Paste any schema (or example response) — the linter classifies each field as reasoning, answer, or other, flags the ordering, and emits a reordered fix you can copy back. Use case: 'My CoT prompt works in plaintext but degrades under JSON mode' → run linter, find the inverted order, fix.",
// v0.8.3 — anti-bullshit pack #9: PEFT Anti-Pattern Checker
"modes.peft": "🔧 PEFT Lint",
"mode_desc.peft": "Static linter for PEFT/LoRA training scripts. Catches the silent base-model load (peft #2115), QLoRA prepare/get_peft_model ordering, target_modules/arch mismatch, and lora_alpha conventions.",
"peft.title": "🔧 PEFT Anti-Pattern Checker",
"peft.tip": "get_peft_model(base, config) creates a FRESH adapter — it does NOT load saved weights. Users who want to resume from a checkpoint must call PeftModel.from_pretrained(base, path). peft #2115 documents the silent base-model bug. This linter scans your training script for that pattern (and 3 others: QLoRA ordering, target_modules/arch mismatch, lora_alpha ratio).",
"peft.desc": "Don't burn 10 hours of training on a base model. Paste your PEFT setup code — the linter flags silent base-model loads, QLoRA ordering bugs, target_modules/arch mismatches, and lora_alpha conventions.",
"peft.input.placeholder": "from peft import LoraConfig, get_peft_model …",
"peft.lint_btn": "🔍 Lint",
"peft.example_bug_btn": "↳ Example: silent base-load",
"peft.example_qlora_btn": "↳ Example: QLoRA order bug",
"peft.example_clean_btn": "↳ Example: clean",
"peft.status.done": "✅ {verdict} — {n} finding(s)",
"peft.line": "line {n}",
"peft.summary": "{total} finding(s)",
"peft.attribution": "Refs:",
"peft.detected_at_line": "appears at line",
"peft.suggested_fix": "Suggested:",
"peft.detected_arch": "Detected arch",
"peft.from_model_id": "(from model id",
"peft.your_modules": "Your target_modules",
"peft.expected_modules": "Expected for this arch",
"peft.match_ratio": "{hits} of {total} match.",
"peft.ratio": "ratio",
"peft.alpha.convention": "convention is α=2r or α=r",
"peft.qlora_order.detail": "prepare_model_for_kbit_training (line {prepare_line}) runs AFTER get_peft_model (line {get_peft_model_line}). Reverse the order — call prepare FIRST, then get_peft_model.",
"peft.no_peft_calls.detail": "No get_peft_model / PeftModel.from_pretrained / LoraConfig calls detected. Paste a PEFT/LoRA setup snippet.",
"peft.verdict.errors_found": "❌ Errors found",
"peft.verdict.warnings_only": "⚠ Warnings",
"peft.verdict.info_only": "ℹ Info",
"peft.verdict.clean": "✅ Clean — no issues detected",
"peft.verdict.no_peft_calls": "ℹ No PEFT calls detected",
"peft.verdict.empty_input": "ℹ Empty input",
"peft.rule.silent_base_load.label": "Silent base-model load (peft #2115)",
"peft.rule.silent_base_load.explain": "get_peft_model(base, config) creates a NEW adapter — it does NOT load saved weights. The checkpoint hint in your code suggests you want to RESUME training from a saved adapter, but this code path will quietly start fresh and overwrite the run.",
"peft.rule.silent_base_load.fix": "Replace get_peft_model(base, config) with PeftModel.from_pretrained(base, path) when resuming. Verify with model.get_layer_status() after load.",
"peft.rule.qlora_order.label": "QLoRA ordering bug",
"peft.rule.qlora_order.explain": "prepare_model_for_kbit_training must be called BEFORE get_peft_model. Reversed, the kbit prep doesn't apply to the LoRA layers and gradient computation breaks (loss → NaN, or silent training of nothing).",
"peft.rule.qlora_order.fix": "Reorder: base = prepare_model_for_kbit_training(base) then model = get_peft_model(base, config).",
"peft.rule.target_modules_mismatch.label": "target_modules / arch mismatch",
"peft.rule.target_modules_mismatch.explain": "Your target_modules list doesn't match the conventional module names for the architecture detected in your code. PEFT will silently apply LoRA to nothing (or to the wrong layers).",
"peft.rule.target_modules_mismatch.fix": "Verify module names with print([n for n,_ in model.named_modules()]) on the loaded base model, or use the architecture-specific list shown above.",
"peft.rule.alpha_not_2r.label": "lora_alpha ≠ 2r convention",
"peft.rule.alpha_not_2r.explain": "Most published LoRA recipes use either α = 2r (effective unit scale) or α = r (reduced effective LR). A custom ratio works but warrants a sanity check.",
"peft.rule.alpha_not_2r.fix": "Sanity-check the ratio against your reference recipe. If intentional, ignore this finding.",
"peft.rule.no_peft_calls.label": "No PEFT calls detected",
"inv.v083.peft": "🔧 PEFT Lint — catches the silent get_peft_model base-load (peft #2115) + QLoRA order + target_modules / arch mismatch.",
"help.v083.peft.title": "🔧 PEFT Anti-Pattern Checker",
"help.v083.peft.body": "PEFT's get_peft_model(base, config) creates a FRESH adapter — it does not load saved weights from a path. Users who paste tutorial code and try to resume from a checkpoint silently throw away their training. peft #2115 has the canonical bug report. This linter scans your training script for the pattern + 3 related issues (QLoRA ordering, target_modules/arch mismatch, lora_alpha ratio) and reports findings with line numbers and suggested fixes. Use case: before you launch a 10-hour LoRA fine-tune, paste your script — catch the silent bugs in 200ms.",
// v0.8.4 — anti-bullshit pack #10: Prompt-Cache Diff Predictor
"modes.cache": "🔁 Cache Diff",
"mode_desc.cache": "Predicts whether a prompt edit kept the provider's prompt cache alive or invalidated it. Per-provider hit ratio + $ delta vs no-cache.",
"cache.title": "🔁 Prompt-Cache Diff Predictor",
"cache.tip": "Anthropic's cache_control cache breaks at the first token diff in the marked prefix. OpenAI auto-caches prefixes ≥1024 tokens but invalidates on any change. Gemini context cache requires ≥32K tokens. A misplaced edit silently 10x's your bill — and the API never warns you. Paste old + new prompt, see per-provider hit ratio + cost delta.",
"cache.desc": "Don't 10x your bill on a one-character edit. Paste your previous and current prompt — the predictor finds the longest common prefix, estimates tokens, and shows per-provider cache hit ratio + $ delta vs no-cache.",
"cache.old_label": "Old prompt:",
"cache.new_label": "New prompt:",
"cache.old.placeholder": "You are a helpful assistant. …",
"cache.new.placeholder": "You are a helpful assistant. …",
"cache.profile_label": "Tokenizer profile:",
"cache.profile.english": "English (chars/4)",
"cache.profile.code": "Code (chars/3.5)",
"cache.profile.mixed": "CJK / Cyrillic (chars/2)",
"cache.output_label": "Estimated output tokens:",
"cache.diff_btn": "🔍 Predict",
"cache.example_good_btn": "↳ Example: 99% hit",
"cache.example_broken_btn": "↳ Example: cache busted",
"cache.example_belowmin_btn": "↳ Example: below OpenAI min",
"cache.status.done": "✅ {verdict} — {hit}% theoretical hit",
"cache.verdict.identical": "✅ Identical — full cache hit",
"cache.verdict.divergent_can_cache":"⚠ Partial cache hit — providers vary",
"cache.verdict.divergent_below_min":"❌ Below all provider minimums — no caching possible",
"cache.verdict.fully_divergent": "❌ Fully divergent — cache invalidated",
"cache.verdict.empty_input": "ℹ Empty input",
"cache.summary.tokens": "Common prefix {common} / {total} tokens ({pct}% theoretical hit ratio).",
"cache.summary.diff_at": "First difference at line {line}.",
"cache.col.provider": "Provider",
"cache.col.hit": "Hit",
"cache.col.cost": "Base → cached",
"cache.col.savings": "Savings",
"cache.note.requires_marker": "(requires cache_control marker)",
"cache.note.below_min": "(prefix < {min} tokens — provider min)",
"cache.write_surcharge": "+ {cost} cache-write surcharge first time (Anthropic)",
"cache.diff.title": "Where the cache breaks",
"cache.diff.legend": "Green = shared prefix (cacheable). Red = first edit (everything from here is re-billed).",
"cache.hint.empty": "Paste two prompts, then Predict.",
"cache.attribution": "Refs:",
"cache.attribution.snapshot": "Prices snapshot 2026-01; verify against current provider docs before acting on $.",
"inv.v084.cache": "🔁 Cache Diff — predicts whether a prompt edit invalidated the provider's prompt cache. Per-provider hit ratio + $ delta.",
"help.v084.cache.title": "🔁 Prompt-Cache Diff Predictor",
"help.v084.cache.body": "Provider prompt caches each have different rules: Anthropic's cache_control breaks at the first token diff in the marked prefix; OpenAI auto-caches prefixes ≥1024 tokens; Gemini context caches require ≥32K tokens. A misplaced edit silently 10x's your bill — the API never warns you, and the cost only shows up on the next invoice. Paste old + new prompt, the predictor finds the longest common prefix, estimates tokens with three tokenizer profiles (English / code / CJK), and shows per-provider hit ratio + $ delta vs no-cache for Claude Opus/Sonnet/Haiku, GPT-5/mini, and Gemini 2.5 Pro. Use case: 'I tweaked the system prompt and the bill jumped — what broke?' → paste both prompts, see exactly which provider stopped caching.",
// v0.8.5 — anti-bullshit pack #11: Speculative-Decode Compatibility
"modes.speculative": "🔬 Spec-Decode",
"mode_desc.speculative": "Fetches `tokenizer.json` from HF Hub for two model ids and verifies vocab compatibility before you wire up speculative decoding. Catches the silent-mismatch bug that wastes draft compute.",
"speculative.title": "🔬 Speculative-Decode Compatibility",
"speculative.tip": "Speculative decoding (vLLM, SGLang, llama.cpp, transformers) requires the draft and target model to share an EXACT vocabulary. Any token-id disagreement means the target rejects every draft token — you pay BOTH compute costs and get WORSE throughput than baseline. The system reports nominal output (just slower), so the bug is invisible in unit tests. This tool fetches `tokenizer.json` from HF Hub for both ids and compares.",
"speculative.desc": "Don't ship spec-dec with mismatched vocabs. Paste target + draft HF model ids → tool fetches tokenizers, compares vocab type, size, sampled token-ids, special tokens, added tokens → verdict + speedup estimate.",
"speculative.target_label": "Target (large) model id:",
"speculative.draft_label": "Draft (small) model id:",
"speculative.target_label_short": "target",
"speculative.draft_label_short": "draft",
"speculative.check_btn": "🔍 Check compatibility",
"speculative.example_good_btn":"↳ Example: Llama-3.1 8B/70B (gated → mirror)",
"speculative.example_bad_btn": "↳ Example: cross-family (bad)",
"speculative.gated_note": "💡 Gated models (Llama, Mistral, Gemma) trigger an automatic open-mirror fallback (unsloth/...). HF officially discourages browser-side tokens, so the tool can't auth — but mirror tokenizers are typically byte-identical because quantization touches weights, not the tokenizer artifact.",
"speculative.mirror.heading": "Open-mirror fallback",
"speculative.mirror.target_used": "Target {original} was gated; used mirror {mirror}.",
"speculative.mirror.draft_used": "Draft {original} was gated; used mirror {mirror}.",
"speculative.mirror.warn": "Mirror tokenizers (e.g. unsloth/) are usually byte-identical to the gated original because quantization touches weights, not tokens. Verify chat-template if exact match is required (unsloth #880 documents occasional drift).",
"speculative.status.fetching": "🔄 Fetching tokenizer.json from HF Hub for both models…",
"speculative.status.done": "✅ {verdict}",
"speculative.status.error": "❌ Error",
"speculative.type_mismatch_note": "tokenizer types differ; spec-dec impossible",
"speculative.vocab_size": "Vocab size",
"speculative.size_diff": "differ — every reused id is a misalignment",
"speculative.sampled": "Token-id sample match",
"speculative.first_mismatch": "First mismatch",
"speculative.special_diff": "Special-token differences",
"speculative.added_diff": "Added-token differences",
"speculative.added_diff_more": "+ more …",
"speculative.speedup.title": "Estimated speedup band",
"speculative.speedup.params": "target {target} / draft {draft} (param ratio {ratio})",
"speculative.speedup.low": "Low (α=0.50)",
"speculative.speedup.expected":"Expected (α=0.70)",
"speculative.speedup.high": "High (α=0.85)",
"speculative.speedup.disclaimer": "α = draft acceptance rate. Real speedup depends on prompt domain, lookahead K, and engine overhead. Bands assume ideal verifier batching.",
"speculative.speedup.draft_not_smaller": "Draft is not smaller than target — spec-dec is misuse here.",
"speculative.attribution": "Refs:",
"speculative.side.target": "Target",
"speculative.side.draft": "Draft",
"speculative.fetch_error.missing_model_id": "missing model id",
"speculative.fetch_error.gated_or_private": "model is gated or private — can't fetch tokenizer without auth",
"speculative.fetch_error.not_found": "model id not found on HF Hub",
"speculative.fetch_error.fetch_failed": "fetch failed (HTTP error)",
"speculative.fetch_error.parse_failed": "JSON parse failed (file malformed)",
"speculative.fetch_error.timeout": "timeout (>15s, large tokenizer or slow connection)",
"speculative.fetch_error.network": "network error",
"speculative.fetch_error.hint": "Check the model id spelling. For gated models you'll need to view the tokenizer file via your HF account — this tool can't auth.",
"speculative.hint.missing_input": "Enter both target and draft model ids, then Check.",
"speculative.hint.identical_models": "Target and draft are the same model — spec-dec is a no-op (and wasteful).",
"speculative.verdict.compatible": "✅ Compatible — vocabs match",
"speculative.verdict.compatible_with_caveats": "✅ Compatible — but special/added tokens differ (review)",
"speculative.verdict.partial_compatible": "⚠ Partial match (95-99.9% of sampled ids)",
"speculative.verdict.type_mismatch": "❌ Tokenizer types differ — spec-dec impossible",
"speculative.verdict.vocab_size_mismatch": "❌ Vocab sizes differ — id space misaligned",
"speculative.verdict.incompatible": "❌ Incompatible — too many id mismatches",
"speculative.verdict.fetch_failed": "ℹ Couldn't fetch tokenizer",
"speculative.verdict.identical_models": "ℹ Identical models — spec-dec is a no-op",
"speculative.verdict.missing_input": "ℹ Enter both ids",
"inv.v085.speculative": "🔬 Spec-Decode — verifies tokenizer vocab compatibility between target + draft before you ship speculative decoding (the bug that gives WORSE throughput silently).",
"help.v085.speculative.title": "🔬 Speculative-Decode Compatibility",
"help.v085.speculative.body": "Speculative decoding only works if target and draft share the exact same vocabulary. Mismatched vocabs cause every draft token to be rejected — you pay BOTH compute costs and get worse throughput than baseline. Worse, the system still emits correct output (just slower), so the bug is invisible in unit tests. vLLM #4570 / #16757 / #20409 / #12488 all surface variants. This tool fetches `tokenizer.json` from HF Hub for both model ids, compares tokenizer type, vocab size, full token→id map, special tokens, and added tokens, then estimates a speedup band based on param ratio and typical α=0.5/0.7/0.85 acceptance rates. Use case: before you launch a vLLM cluster with spec-dec enabled, verify the pair is actually compatible.",
// v0.8.7 — anti-bullshit pack #13: Multilingual Tokenizer Tax
"modes.tax": "🌍 Token Tax",
"mode_desc.tax": "Real BPE encoding (browser-side via transformers.js) of pasted text across 6 vendor tokenizers. Surfaces the silent cost asymmetry across languages.",
"tax.title": "🌍 Multilingual Tokenizer Tax",
"tax.tip": "Tokenizers tax non-English text asymmetrically. The same paragraph might be 100 tokens in English but 250+ tokens in Chinese on a Latin-trained tokenizer (Llama, Phi). Cost per request and effective context BOTH degrade silently. Paste your text, see actual token counts across vendor tokenizers — no estimation, real BPE encoding via transformers.js in your browser.",
"tax.desc": "Don't 3× your bill on Chinese support. Paste any text → real per-tokenizer BPE encoding across Qwen / Phi / Llama / Gemma / GPT-4 / Claude → see the cost asymmetry vs your baseline.",
"tax.input_label": "Text to tokenize:",
"tax.input.placeholder": "Paste any text — English, Chinese, Arabic, code, …",
"tax.tokenize_btn": "🔬 Tokenize all",
"tax.sample_en_btn": "↳ Sample: English",
"tax.sample_zh_btn": "↳ Sample: 中文",
"tax.sample_ar_btn": "↳ Sample: عربى",
"tax.sample_mixed_btn": "↳ Sample: mixed",
"tax.sample_code_btn": "↳ Sample: code",
"tax.status.loading": "⏳ Loading transformers.js + tokenizers (first run can take 5-15s)…",
"tax.status.done": "✅ {n}/{total} tokenizers ran in {ms}ms",
"tax.col.tokenizer": "Tokenizer",
"tax.col.tokens": "Tokens",
"tax.col.cpt": "Chars/tok",
"tax.col.ratio": "Ratio",
"tax.summary.input": "Input: {chars} chars, {bytes} bytes",
"tax.script_breakdown": "scripts",
"tax.interp.worst": "{label} costs {pct}% more tokens than baseline for this text.",
"tax.interp.uniform": "✓ All tokenizers within ±5% — text is well-handled across vendors.",
"tax.hint.empty": "Paste some text and click Tokenize.",
"tax.all_failed": "All tokenizers failed to load.",
"tax.error.gated": "model gated (HF auth required — try the open mirror)",
"tax.error.not_found": "model id not found",
"tax.error.timeout": "timeout (large tokenizer or slow connection)",
"tax.error.network": "network error",
"tax.error.fetch_failed": "fetch failed",
"tax.error.invalid_input": "invalid input",
"tax.attribution": "Tokenizers via",
"tax.attribution.privacy": "Text is tokenized locally — never leaves the browser.",
"tax.firstload_note": "💡 First-time load: the tool fetches transformers.js (~750 KB) + each tokenizer's vocab on demand (~5-15 MB per tokenizer, cached after). Subsequent runs are instant. All processing is local — your text never leaves the browser.",
"inv.v087.tax": "🌍 Token Tax — real BPE encoding across 6 vendor tokenizers. Surfaces the silent cost asymmetry across languages (CJK / Arabic / mixed).",
"help.v087.tax.title": "🌍 Multilingual Tokenizer Tax",
"help.v087.tax.body": "Tokenizers tax non-English text asymmetrically. The same paragraph might be 100 tokens in English but 250+ in Chinese on a Latin-trained tokenizer (Llama, Phi). Both cost-per-request AND effective context degrade silently. This tool loads HuggingFace transformers.js in your browser (~750 KB CDN) and tokenizes pasted text against 6 preset vendor tokenizers (Qwen2.5, Phi-3.5, Llama-3.1, Gemma-2, GPT-4 cl100k, Claude approx). Output: per-tokenizer token count + chars-per-token + ratio vs baseline + cost-asymmetry interpretation. Auto-detects script blocks (Latin / CJK / Arabic / Cyrillic / Devanagari / Thai / Greek / Hebrew / Korean) so users see why one tokenizer is 3× another. Use case: 'My multilingual support added 30% to the bill — which language costs the most?' → paste real production text, see exact per-tokenizer breakdown.",
// v0.8.8 — anti-bullshit pack #14: LongScore (RULER + HELMET lookup)
"modes.longscore": "🎯 LongScore",
"mode_desc.longscore": "Look up your model's relative degradation past short context. RULER + HELMET KBs (n=93 models). LongScore metric from 100-LongBench (ACL 2025).",
"longscore.title": "🎯 LongScore",
"longscore.tip": "Every model claims a 128K context window, but accuracy degrades long before that. LongScore (peer-reviewed metric from 100-LongBench, ACL 2025) measures relative degradation past short context. Disentangles base ability from true long-ctx capability — so you compare degradation, not raw scores. Lookup against RULER + HELMET KBs (n=93 models).",
"longscore.desc": "How much does your model degrade past short context? Paste an HF model id → see LongScore (relative degradation) + per-length breakdown + HELMET 7-task scores when available. No GPU. No inference. Pure lookup against published benchmarks.",
"longscore.input_label": "Model id:",
"longscore.input.placeholder": "e.g. Qwen2.5-72B-Instruct or meta-llama/Llama-3.1-70B-Instruct",
"longscore.lookup_btn": "🔎 Lookup",
"longscore.example_good_btn": "↳ Example: Jamba-1.5-Large (no degradation)",
"longscore.example_mid_btn": "↳ Example: Llama-3.1-70B (moderate)",
"longscore.example_bad_btn": "↳ Example: dbrx (severe)",
"longscore.formula_note": "💡 LongScore = mean over l ∈ {16K, 32K, 64K, 128K} of (S_l − Base) / Base, where Base = mean(S_4K, S_8K). Source: 100-LongBench, ACL 2025. Data: NVIDIA RULER (per-length, n=33) + HELMET (aggregate at 128K, n=60). 0 = no degradation; -0.30 = severe.",
"longscore.miss.title": "Model not found in KB",
"longscore.miss.body": "Looked up {id}. KB has {n} models. Try a canonical HF id (e.g. Qwen2.5-72B-Instruct, Llama-3.1-70B-Instruct, Jamba-1.5-Mini).",
"longscore.miss.suggest": "Check coverage at",
"longscore.no_ruler": "⚠ No per-length data — LongScore not computable. Showing HELMET aggregate at 128K instead.",
"longscore.score_label": "LongScore",
"longscore.helmet_label": "HELMET 7-task breakdown",
"longscore.col.ctx": "Context",
"longscore.col.score": "Score",
"longscore.col.lc": "LC",
"longscore.col.task": "Task",
"longscore.source_note": "Data source",
"longscore.hint.empty": "⚠ Paste a model id first.",
"longscore.status.lookup": "⏳ Looking up…",
"longscore.status.miss": "ℹ Model not in KB",
"longscore.status.ruler_hit": "✅ RULER per-length data found",
"longscore.status.helmet_only":"ℹ HELMET aggregate only (no per-length data)",
"longscore.verdict.no_degradation": "✅ No degradation past short context",
"longscore.verdict.mild": "🟢 Mild degradation (<10%)",
"longscore.verdict.moderate": "🟠 Moderate degradation (10-20%)",
"longscore.verdict.severe": "🔴 Severe degradation (20-30%)",
"longscore.verdict.extreme": "🚨 Extreme degradation (>30%)",
"inv.v088.longscore": "🎯 LongScore — peer-reviewed degradation metric (100-LongBench, ACL 2025). Lookup any model in RULER + HELMET KBs (n=93). See how much your model actually drops past short context.",
"help.v088.longscore.title": "🎯 LongScore",
"help.v088.longscore.body": "Every long-ctx LLM claims 128K but degrades long before that. The 100-LongBench paper (ACL 2025, arXiv:2505.19293) noticed that raw long-ctx scores are dominated by base ability — a smarter model with a worse long-ctx recipe still scores higher than a less-smart model with a better recipe, masking the actual long-ctx degradation. They propose LongScore: LC_l = (S_l − Base) / Base with Base = mean(S_short), then average over long lengths. Result: a relative-degradation number per model that compares apples to apples. This tafagent mode embeds LongScore-ready data: RULER aggregate per-context (n=33 models, 4K-128K) + HELMET aggregate at 128K (n=60 models, 7 task categories). Lookup is exact-match by HF model id (lowercase, dashes, dots normalized). For models with RULER data, you get the full LongScore + per-length breakdown + verdict (no/mild/moderate/severe/extreme degradation). For HELMET-only models, you get the 7-category aggregate at 128K. Use case: 'I want to use Llama-3.1-70B-Instruct for 100K-token doc summarization — how much accuracy do I actually lose?' → paste id, see -10% LongScore (moderate degradation, mostly the 128K cliff). Decide whether to use it, switch to a model with engineered long-ctx, or chunk your input.",
"inv.v081.hub": "🧭 Solutions Hub — every documented pain mapped to a tafagent mode or curated external tool. Don't reinvent — find.",
"help.v081.hub.title": "🧭 Solutions Hub",
"help.v081.hub.body": "tafagent as integrator, not silo. 30+ pains across 7 categories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), each mapped to (a) the tafagent mode that addresses it, if any, and (b) the best-of-breed external tools the community already trusts (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Search box matches across pain, scenario, and tool name. Use case: 'I have problem X — does tafagent solve it, and if not, who does?'",
"hub.title": "🧭 Solutions Hub",
"hub.tip": "Map of every documented LLM-eval pain we know about: which tafagent mode addresses it (if any), and the best-of-breed external tools the community already trusts. Goal: full coverage. If a canonical tool exists elsewhere, we link rather than rebuild.",
"hub.desc": "Don't reinvent — find. 30+ pains mapped to tafagent modes + curated external tools. Browse by category, search by keyword, or see the gaps where new modes would help most.",
"hub.clear_btn": "✕ Clear",
"hub.no_mode": "external",
"hub.planned": "planned:",
"hub.best_for": "Best for",
"hub.not_for": "Not for",
"hub.tools": "External tools",
"hub.status.loaded": "✅ Loaded {total} pains across {categories} categories — {covered} covered by tafagent modes, {externalLinks} external links curated. Compiled {compiled}.",
"hub.status.fail": "⚠ Could not load Solutions Hub.",
"hub.search.empty": "No matches for '{query}'. Try broader terms (e.g. 'eval', 'rag', 'tokenizer').",
"hub.search.results": "Found {n} match(es) for '{query}'.",
// v0.7.7 — Task tiles (UX restructure: 14 modes grouped by user intent)
"tiles.title": "🎯 What do you want to do?",
"tiles.subtitle": "Pick a task. Each one opens the right tool below. Or scroll down for the full list of 22 modes.",
"tile.diagnose.title": "🔬 Diagnose a model",
"tile.diagnose.desc": "Will this specific model work for my use case?",
"tile.trust.title": "✓ Trust a benchmark score",
"tile.trust.desc": "Should I believe this number? Bug or noise?",
"tile.eval.title": "⚙️ Set up an eval correctly",
"tile.eval.desc": "Get the exact CLI flag for lm-eval / vLLM / transformers.",
"tile.compare.title": "🆚 Compare models",
"tile.compare.desc": "Side-by-side, or browse the empirical model landscape.",
"tile.manual.title": "📋 Manual / free-form",
"tile.manual.desc": "Pick a specific recipe by hand, or ask in plain English.",
"tile.diagnose.tip": "Start here when you have a specific model id and want a full diagnostic: Profile runs all 5 recipes at once. Unmask checks if max_position_embeddings is honest. NIAH→Reason predicts retrieval-vs-reasoning gap. LongScore looks up published RULER + HELMET data and shows real degradation past short context (peer-reviewed metric). Quant predicts whether quantizing will break it. Inspect lets you paste raw config.json for private/in-dev models.",
"tile.trust.tip": "When you see a score and want to know if it's real. Contamination rates 20+ benchmarks for likelihood the model saw them during training. Drift tells you if a gap between two evals is numerical noise or a real bug (chat-template mismatch, KV-cache layout, etc.). Arena CI reconstructs the confidence intervals Chatbot Arena hides — many top-Elo "wins" are statistically tied.",
"tile.eval.tip": "Before you run lm-eval-harness or vLLM serve, get the right CLI flag. Chat-template Sniffer detects the template family (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) and emits the exact --apply_chat_template / --chat-template invocation. Solves issue #1841 in lm-eval-harness (silent ÷2 accuracy). Diagnose CLI generates the Python command to measure γ_obs on your local GPU.",
"tile.compare.tip": "Compare: pick 2-3 candidate models + one recipe, see verdicts in a side-by-side table (e.g. Llama-3-8B vs Mistral-7B at 32k context). Phase diagram: scatter of 23 empirical models on the (log θ, γ) plane, with the Padé curve overlaid. Hover dots for details, click to load that model into the Recipe form.",
"tile.manual.tip": "Recipe: pick a specific X-N recipe (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 KV compression, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) and fill the form by hand for full control. Ask: type a free-form question; an in-browser 0.5B LLM (Qwen2.5) picks the right recipe and runs it. Best for "what would happen if..." exploration.",
"share.import_desc": "Got a JSON file from someone else's TAF analysis? Load it here to see the verdict + chain locally. Same view as if you'd run it yourself.",
"share.import_btn": "📂 Load shared JSON",
"synthesis.system": "You are a precise transformer LLM diagnostic assistant. Given pre-computed TAF formula results, write a clear plain-English summary in 4-6 sentences. Cite the section number (§X.Y) for each number you mention. Always give a concrete recommendation. Do NOT invent numbers.",
// INSPECTOR mode
"inspector.title": "🔍 Architecture Inspector",
"inspector.desc": "Paste the raw config.json contents. The tool extracts the architectural parameters and runs the full 5-recipe Profile.",
"inspector.tip": "Paste any config.json directly. Tool parses it and runs the full Profile. Useful for: private models, in-development configs, models not yet on HuggingFace, or comparing what your custom architecture would do.",
"inspector.quickstart": "💡 Use case: you have a private model not on HF Hub, or a config you're designing. Paste the raw JSON below and get a full TAF profile.",
"inspector.placeholder": "{\n \"model_type\": \"llama\",\n \"rope_theta\": 500000,\n \"max_position_embeddings\": 8192,\n \"num_attention_heads\": 32,\n \"num_key_value_heads\": 8,\n \"hidden_size\": 4096,\n \"num_hidden_layers\": 32\n}",
"inspector.T_eval": "T_eval (your target context):",
"inspector.btn": "🚀 Inspect & profile",
// WHAT-IF slider
"whatif.title": "🎚 What-if: drag T_eval to see γ change live",
"whatif.desc": "Pure JS recompute (no Pyodide call). Shows the geometric γ_Padé and d_horizon as you slide. The full chain re-runs on click.",
"whatif.T_eval": "T_eval",
"whatif.gamma_pade": "γ_Padé",
"whatif.d_horizon": "d_horizon",
"whatif.l_niah": "L_NIAH ceiling",
"whatif.predicted": "Predicted geometric verdict",
"whatif.rerun": "↻ Recompute full chain at this T_eval",
// COMMUNITY feed
"community.title": "🌐 Recent community submissions",
"community.desc": "Live feed from the public registry. Click any submission to view full analysis.",
"community.browse_all": "Browse all →",
"community.loading": "Loading...",
"community.no_repo": "The registry repo isn't created yet. Once it exists with submissions, they'll appear here live.",
"community.no_submissions": "No submissions yet. Be the first — generate a Profile and click 📤 Submit to registry.",
// FALSIFICATION dashboard
"falsification.title": "🔬 Paper predictions — falsification status",
"falsification.desc": "The TAF framework rests on falsifiable predictions (F1-F23). Each is empirically tested. Here's the live status of every prediction in the paper.",
"falsification.summary": "{confirmed} confirmed · {partial} partial · {refuted} refuted · {untested} untested (out of {total} total predictions)",
"falsification.col.id": "ID",
"falsification.col.claim": "Claim",
"falsification.col.status": "Status",
"falsification.col.evidence": "Evidence",
"tafcard.title": "📇 TAF Card — full model profile",
"tafcard.recipes_title": "📋 Recipes — verdict per dimension",
"tafcard.recipes_count_label": "dimensions",
"tafcard.numbers_title": "🔢 Key numbers (paper §26)",
"tafcard.fals_title": "🔬 Falsification status (F1-F23)",
"tafcard.fals_none": "No falsifications applicable.",
"tafcard.diag_title": "🔬 Diagnostics — numbers · γ check · what-if",
"tafcard.verify_title": "✓ Verification — Lean + Sage + falsification",
"tafcard.share_title": "📂 Provenance & share",
"tafcard.whatif_title": "🎚️ What-if explorer",
"verdict.go": "GO",
"verdict.no": "NO",
"verdict.degraded": "DEGRADED",
"compare.title_out": "🆚 Comparison Table",
"status.loading_pyodide": "⏳ Loading Python runtime (~10MB, first time only)...",
"status.loading_taf": "⏳ Loading TAF formulas + recipes...",
"status.ready": "✅ Ready. Pick a model and click Profile to start.",
"status.computing": "🧮 Computing TAF chain...",
"status.done": "✅ Done.",
"profile.hf_placeholder": "e.g. meta-llama/Meta-Llama-3-8B or Qwen/Qwen2.5-7B",
"compare.hf_placeholder": "HF model id (e.g. meta-llama/Meta-Llama-3-8B)",
"compare.slot1_placeholder": "HF model id (e.g. meta-llama/Meta-Llama-3-8B)",
"compare.slot2_placeholder": "HF model id #2",
"compare.slot3_placeholder": "HF model id #3 (optional)",
"compare.preset_default": "— or preset —",
// Form parameters
"param.theta": "θ (rope_theta)",
"param.theta.tip": "RoPE base frequency from config.rope_theta. Higher = more long-range capacity.",
"param.T_train": "T_train",
"param.T_train.tip": "Max training context. From max_position_embeddings. Beyond this is extrapolation.",
"param.T_eval": "T_eval (your target)",
"param.T_eval.tip": "Your target inference context. The whole question is: will the model behave well at THIS length?",
"param.n_attn": "n_attention_heads",
"param.n_attn.tip": "Number of attention heads per layer. From num_attention_heads.",
"param.n_kv": "n_kv_heads",
"param.n_kv.tip": "KV heads. If < n_attention_heads → GQA (Grouped Query Attention). Reduces KV memory but pushes γ toward Hagedorn.",
"param.d_head": "head_dim",
"param.d_head.tip": "Per-head dimension. Typical 64, 96, 128. From head_dim or hidden_size / num_attention_heads.",
"param.n_layers": "n_layers",
"param.n_layers.tip": "Number of transformer blocks. From num_hidden_layers.",
"param.n_params": "n_params (e.g. 8e9)",
"param.n_params.tip": "Total parameter count. Threshold ~400M for induction-head emergence. Affects KV memory and budget recipes.",
"param.has_swa": "Has SWA?",
"param.has_swa.tip": "Sliding Window Attention. true for Mistral, gemma-2, phi-3. v0.5.3 calibration audit disabled the historical δ_SWA correction (n=1 fit).",
"common.yes": "Yes",
"common.no": "No",
// Mode tooltips
"modes.tip": "Fourteen ways to use the tool.
📇 Profile: paste a model id → 5-recipe TAF Card.
🆚 Compare: 2-3 models side-by-side on one recipe.
🔍 Inspect config: paste raw config.json → full Profile.
💬 Ask: free-form question, browser LLM picks the recipe.
📋 Recipe: manual selection with full form control.
🩺 Diagnose CLI: generate Python command for local γ measurement.
📊 Phase diagram: 23-model panel on (log θ, γ) plane.
🪟 Unmask: detect misleading max_position_embeddings (SWA / YaRN / RoPE-scaling).
📜 Chat-template: detect family + give exact CLI flag for lm-eval / vLLM / transformers.
🎯 Arena CI: reconstruct confidence intervals from raw pairwise vote data; detect statistical ties Arena hides.
🧪 Contamination: rate 20+ benchmarks for contamination probability based on training cutoff vs release date.
⚖️ Quant: predict γ-shift and ΔPPL for any (model × quant scheme); recommend safer alternative on cliff.
🔀 Drift: same model, different scores on two setups — bug or noise? Predict numerical-noise band and flag real bugs.
🔍 NIAH→Reason: predict NIAH and multi-hop reasoning pass rates from architecture; find your model's safe reasoning context.",
"profile.tip": "One-click full diagnosis. Paste any HF model id (or pick preset). Tool runs all 5 recipes (long-context, KV-compression, custom-vs-API, budget, hardware) and produces a single TAF Card with verdict per dimension + key numbers + architecture classification.
Use case: \"I'm evaluating Qwen2.5-32B for production — what's its full viability profile?\" → paste id → Profile → done.",
"compare.tip": "Same recipe, multiple models. Pick 2-3 candidate models and one recipe. See verdicts in a single comparison table.
Use case: \"I need long-context retrieval at 16K — which is best: Llama-3-8B, Mistral-7B, or Qwen-7B?\" → pick 3 + X-2 + 16K → see winner.",
// Help modal
"help.title": "📘 TAF Agent — User Manual",
"help.what.title": "What does it do?",
"help.what.body": "Predicts practical viability of any transformer LLM before you spend GPU/$. Answers questions like \"will this model work at L=32K?\" or \"should I train custom or use API?\" using deterministic Python formulas (TAF — Thermodynamic Attention Framework).",
"help.modes.title": "How to use — 7 modes",
"help.modes.profile": "📇 Profile: paste model id → all recipes at once = TAF Card. Best starting point.",
"help.modes.compare": "🆚 Compare: 2-3 models side-by-side on same recipe. Best when choosing between candidates.",
"help.modes.inspector": "🔍 Inspect config: paste raw config.json → tool parses + runs full Profile. For private models, in-development configs, or models not yet on HF Hub.",
"help.modes.ask": "💬 Ask plain English: free-form question, in-browser LLM picks the recipe. Best for casual exploration.",
"help.modes.recipe": "📋 Recipe + form: manual selection, full parameter control. Best when you want exact control.",
"help.modes.diagnose": "🩺 Diagnose CLI: generate Python command to measure γ on your local machine (transformers + numpy). Fast ≈5 min CPU; full ≈20–60 min GPU. Output JSON re-uploadable via Inspect.",
"help.modes.phase": "📊 Phase diagram: scatter plot of 23 panel models on (log θ, γ) plane. Hagedorn line γ=1 separates Phase A from Phase B. Click a dot to load that model into Recipe form.",
"help.recipes.title": "The 8 recipes available",
"help.recipe.x1.title": "X-1 Custom training vs API — compares cost of training your own model vs paying for API access.",
"help.recipe.x1.example": "Try: \"Should I train an 8B custom model or use GPT-4o for 50M tokens/month?\"
Answer types: YES (custom) / NO (API) with break-even months.",
"help.recipe.x2.title": "X-2 Long Context Viability — predicts if a model serves a target context length reliably.",
"help.recipe.x2.example": "Try: \"Will Meta-Llama-3-8B handle 32000 tokens for retrieval?\"
Chains: γ_Padé → decomposition → d_horizon → NIAH ceiling → hallucination → KV memory.
Verdict: YES / DEGRADED / NO with mitigation if needed.",
"help.recipe.x3.title": "X-3 Budget pre-flight — given $ budget, what model is feasible to train?",
"help.recipe.x3.example": "Try: \"I have $5000, what model can I train?\"
Answer: GO / TINY-MODEL / MEMORY-LIMITED with concrete N (params) and D (tokens).",
"help.recipe.x5.title": "X-5 Hardware selection — which GPU should I use to serve at target throughput?",
"help.recipe.x5.example": "Try: \"Cheapest hardware to serve Llama-3-8B at 10M tokens/day\"
Answer: best GPU + $/Mtok + capacity vs target.",
"help.recipe.x19.title": "X-19 KV Compression decision — should I use soft decay, hard cutoff, or literature methods?",
"help.recipe.x21.title": "X-21 Imprint Purity Diagnostic — predicts γ on RANDOM tokens via ν=−1/(2π); how clean is the model's RoPE prediction?",
"help.recipe.x22.title": "X-22 Compute-Context Invariant — does γ × log(N²·D) lie in panel band 51.2 ± 16.8? Detects scaling/training anomalies.",
"help.recipe.x23.title": "X-23 IH-Phase Detector — pre- or post-induction-head? Cheap probe via sign(γ_text − γ_random).",
"help.recipe.x19.example": "Try: \"How to compress KV cache for Qwen2.5-7B at 32K?\"
Answer: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.",
"help.recipe.x21.example": "Try: \"How clean is the RoPE prediction on Llama-3-8B?\"
Answer: predicted γ_random + purity diagnostic (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).",
"help.recipe.x22.example": "Try: \"Does Mistral-7B fit the compute-context invariant?\"
Answer: K = γ·log(N²·D), z-score, IN-BAND or OUTLIER.",
"help.recipe.x23.example": "Try: \"Is Qwen2.5-7B post-induction-head?\"
Answer: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY (with size-vs-Δγ consistency check).",
"help.section.v04": "What's new in v0.4 (sesión 29 findings 2026-04-28): three diagnostic recipes derived from cross-model panel analysis (n=22 LLMs).",
"help.divider.v04_s29": "— v0.4 (sesión 29 findings) —",
"footer.tech_stack": "Computation: Pyodide · Synthesis: WebLLM (Qwen2.5-0.5B local) · Hosting: GitHub Pages · Cost: $0",
"help.v04.imprint": "Learned-imprint slope ν = −1/(2π): RoPE rotation period 2π drives a positional bias on weights, proportional to log(N_params). Even random tokens show this scaling. ν is DERIVED — not fitted (empirical err 0.3%).",
"help.v04.invariant": "Chinchilla-attention invariant K: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Connects compute scaling and attention exponent into a single dimensionless number.",
"help.v04.ih_probe": "Δγ as IH probe: sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Cheaper than running an in-context-learning benchmark.",
"help.v04.constants": "γ-cluster on famous constants (intriguing, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (golden conjugate, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat: could be coincidence.",
"help.param.theta": "θ (rope_theta): RoPE base frequency. Higher = more long-range capacity. Typical: 10000 (early), 500000 (Llama-3), 1000000 (Qwen2.5).",
"help.param.T_train": "T_train: max context the model was trained on. From max_position_embeddings.",
"help.param.T_eval": "T_eval: your target inference context length. The key knob.",
"help.param.gqa": "n_kv_heads < n_attention_heads: model uses GQA (Grouped Query Attention). Reduces KV memory but pushes γ toward Hagedorn.",
"help.param.swa": "has_SWA: model uses Sliding Window Attention (Mistral, gemma-2).",
"help.param.nparams": "n_params: total parameter count. Threshold ~400M for induction-head emergence.",
"help.add_models.title": "Adding new models (3 ways)",
"help.add_models.preset": "Preset list: 11 popular models curated. Just select from dropdown.",
"help.add_models.hf": "HF Hub fetch: paste any model id (e.g. Qwen/Qwen2.5-32B-Instruct), click 📥 Fetch. Browser downloads config.json directly from HuggingFace, fills the form. Works for any public model.",
"help.add_models.manual": "Manual: fill the form fields directly with values from the model card.",
"help.audit.title": "The audit chain",
"help.audit.body": "Every result shows the full Computation Chain — each formula step with its inputs, output, and interpretation. Click any step to expand. Cite section numbers (§26.1, §19.1, etc.) refer to the underlying paper for derivation.",
"help.synthesis.title": "The plain-English answer",
"help.synthesis.body": "After the deterministic chain runs, an in-browser LLM (Qwen2.5-0.5B, ~350MB cached after first load) synthesizes a plain-English summary. The numbers above are always correct (deterministic Python); the synthesis is LLM-generated — verify against the chain if in doubt.",
"help.params.title": "Common parameters explained",
"help.verdicts.title": "What to look for in verdicts",
"help.verdict.yes": "YES / GO — proceed with confidence; numbers support the choice.",
"help.verdict.deg": "DEGRADED / TINY-MODEL — works but with caveats; read the action.",
"help.verdict.no": "NO / MEMORY-LIMITED — don't proceed as-is; mitigation provided.",
"help.privacy.title": "Privacy",
"help.privacy.body": "Everything runs in your browser. No telemetry, no analytics, no data sent anywhere. Even the LLM model runs locally via WebGPU/WebAssembly. Your model_ids and questions never leave this page.",
"help.source.title": "Source & paper",
"help.source.body": "Source code: github.com/karlesmarin/tafagent
Paper: Marin 2026 — Predicting How Transformers Attend (Zenodo; arXiv forthcoming)
Dataset: taf-attention-decay — 58 γ-measurements across 32 models (CC-BY-4.0)",
"footer.text": "© 2026 Carles Marin · Apache-2.0 · independent research · the tool that closes the loop of the paper.",
// §33 v0.4 (session 31, 2026-04-30) — new diagnostic functions
"v04.title": "🆕 v0.4 — New diagnostics (sesion 31)",
"v04.section.intro": "Four new diagnostic functions derived sesion 31 (2026-04-30) from cross-of-crosses formula games + Sócratic interrogation. Available in taf_browser.py §33.",
"v04.arch.label": "Architectural Concentration",
"v04.arch.desc": "γ_text ≈ γ_Padé − 0.012·n_kv. Cross-panel correlational law (R²=0.30). Caveat: not per-model predictor.",
"v04.pdi.label": "PDI — Padé Deviation Index",
"v04.pdi.desc": "PDI = d_horizon_obs/T_eval. Traffic light: green (≈1), orange (>>1), yellow (<<1), red (Phase B negative).",
"v04.4bit.label": "4-bit Shift Predictor",
"v04.4bit.desc": "MHA: R²(bf16)<0.9 → γ rises; R²>0.99 → γ drops. GQA: precision-robust regardless.",
"v04.crit.label": "Critical Exponents Bundle",
"v04.crit.desc": "ν_c, β_c, η_c (=γ−1, CORRECTED), α_C, γ_susc with AM-GM minimum at γ=1−1/√2≈0.293.",
// §34 v0.5 (session 32, 2026-05-01) — Machine-verified framework consistency
"v05.title": "🔬 v0.5 — Machine-verified consistency (sesion 32)",
"v05.section.intro": "Sage Groebner basis + Lean Mathlib4 dual-tool verification of 15 algebraic identities of TAF critical exponents. First transformer-attention framework with formal machine-proof backing.",
"v05.verify.label": "Algebraic Consistency Check",
"v05.verify.desc": "Given measured γ, verifies 12 D-SAGE identities (D-SAGE-1: 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). All passing = framework intact. Failures indicate bf16 outliers / quantization artifacts.",
"v05.dsage1.label": "D-SAGE-1 (★★ core)",
"v05.dsage1.desc": "Quadratic identity 2η² + η·γ_χ + 1 = 0 (Sage Groebner-discovered, Lean-verified). Replaces incorrect 'triple closure' claim. Refutes paper 1's η=2γ algebraically.",
"v05.erratum.label": "Paper 1 erratum — η correction",
"v05.erratum.desc": "Paper 1 originally claimed η = 2γ. Sage Groebner + Lean Mathlib4 proved this fails (residual (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Phase A). Correct value: η = γ−1, satisfying D-SAGE-1.",
"v05.repro.label": "Reproducibility",
"v05.repro.desc": "All 15 theorems machine-proof in Lean Mathlib4 (1973 jobs build success). Sage script: analysis/sage_recursive_sweep_2026-04-30.sage. Lean code: lean_taf/taf/Taf/Identities.lean.",
// v0.5.1 — TAF Card consistency check button
"v05.consistency.title": "🔬 Algebraic consistency check (Sage + Lean v0.5)",
"v05.consistency.desc": "Verifies 12 D-SAGE algebraic identities of TAF critical exponents (machine-proof Sage Groebner basis + Lean Mathlib4). Pass = framework intact. Fail = bf16 outlier / quantization artifact.",
"v05.consistency.btn": "🔬 Verify algebraic consistency",
// v0.5.2 — Anti-Ising universality class badge
"v05.antiising.badge": "🧲 Anti-Ising class (β=γ−1<0, machine-verified)",
// v0.5.2 — Per-identity tooltips (plain English explanations)
"v05.tooltip.D_SAGE_1": "Quadratic algebraic identity connecting anomalous dimension η and susceptibility γ_χ. The CORE identity discovered by Sage Groebner basis (machine-proof). Replaces earlier wrong claim of triple closure.",
"v05.tooltip.D_SAGE_2": "In Phase A, β = γ−1 is negative (anti-Ising). Multiplied by χ = 1/(1−γ) gives exactly −1. Signature of TAF's negative-β regime.",
"v05.tooltip.D_SAGE_4": "The specific heat exponent α and susceptibility χ sum to exactly 2 in TAF. Algebraic consequence of Josephson hyperscaling.",
"v05.tooltip.D_SAGE_5": "Linear sum identity: α + γ_χ = 2(2−γ). Means as γ approaches 1 (Hagedorn), the sum approaches 2; at γ=0 it's 4.",
"v05.tooltip.D_SAGE_6": "Order parameter exponent times susceptibility exponent equals a specific quadratic in γ. Factored algebraic relation.",
"v05.tooltip.Rushbrooke_tautology": "Standard Rushbrooke hyperscaling 2β + γ_χ = ν·d at d=1. In TAF this is a TAUTOLOGY — γ_χ is defined exactly so this holds. Confirmed by Sage Groebner basis.",
"v05.tooltip.Josephson_tautology": "Standard Josephson hyperscaling 2 − α = ν·d at d=1. In TAF this is a TAUTOLOGY — α is defined exactly so this holds.",
"v05.tooltip.Fisher_independent": "Fisher relation γ_χ = (2−η)·ν. In TAF this is INDEPENDENT (does NOT close as identity, contrary to triple-closure claim). Residual is γ(2γ−3)/(1−γ).",
"v05.tooltip.eta_2gamma_REFUTED": "Paper 1 originally claimed η=2γ. This identity refutes it: residual is positive throughout Phase A. Lean Mathlib4 machine-proof refutation.",
"v05.tooltip.D_14_nu_imprint": "The learned imprint slope ν = −1/(2π) times 2π equals −1. Trivial dimensional check from paper 1.",
"v05.tooltip.D_SAGE_7": "The central charge c=3 times |ν_imprint| times 2π equals 3. Dimensional closure connecting CFT and training imprint.",
"v05.tooltip.nu_beta_id": "Correlation length exponent ν times order parameter exponent β equals −1 in Phase A. Variant of D-SAGE-2.",
"v053.calibration.title": "🔬 v0.5.3 — Calibration audit (2026-05-02)",
"v053.calibration.note": "SWA correction disabled — original δ_SWA = -0.21 was fit on n=1 model (insufficient data; group-mean +0.355 with single yes-case). post_IH correction marked exploratory — group-mean ≈ 0 in re-audit (n=22 panel) does not replicate the OLS fit. GQA correction replicates (panel +0.115 vs hardcoded +0.11). D_f formula corrected for Phase B (γ>1) — uses discrete cumulative sum instead of continuum approximation. LLaMA-3, Mistral, Gemma now report correct compression values.",
"v053.release.banner": "🔧 v0.5.3 — Audit-driven fixes: KV compression D_f now uses discrete sum (correct for all γ); δ_SWA disabled (n=1 calibration); paper §5.2 C_V coefficient erratum (1/4 → 1/12).",
// §35 v0.6 — γ predicted-vs-observed diagnostic
"gamma_check.title": "🔍 γ predicted vs observed",
"gamma_check.desc": "Enter your empirically measured γ. Tool detects regime: fraud (θ inflated) / compressed / over-Padé / SWA-random / normal.",
"gamma_check.gobs_label": "γ_observed",
"gamma_check.gobs_tip": "Empirically measured γ from your model's attention scores. Use the Diagnose CLI to obtain this from real weights.",
"gamma_check.random_label": "Random corpus?",
"gamma_check.random_tip": "Tick if γ_observed was measured on random/unstructured tokens. Distinguishes SWA signature (γ_obs > 1) from anomaly.",
"gamma_check.regime": "Regime",
"gamma_check.regime.normal": "Normal",
"gamma_check.regime.fraud": "Fraud (θ inflated)",
"gamma_check.regime.compressed": "Compressed context",
"gamma_check.regime.overpade": "Over-Padé",
"gamma_check.regime.swa": "SWA random-corpus signature",
"gamma_check.regime.unknown": "Unknown",
"gamma_check.regime.normal.desc": "η ∈ [0.85, 1.15]: model uses its full nominal context, no anomaly.",
"gamma_check.regime.fraud.desc": "η < 0.01: nominal θ inflated. Model behaves as if θ ≪ advertised. Likely YaRN/marketing inflation without true context extension.",
"gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5): context is compressed (model attends less far than nominal θ predicts). Common in instruction-tuned / RLHF models.",
"gamma_check.regime.overpade.desc": "η > 1.5: model attends farther than Padé predicts. Possible Lerch-corrected regime or undertrained early-checkpoint.",
"gamma_check.regime.swa.desc": "γ_obs > 1.05 on random corpus = sliding-window attention signature (Mistral / Gemma family).",
"gamma_check.regime.unknown.desc": "Inputs out of range or γ_obs > 1 without random-corpus flag. Verify measurement.",
"gamma_check.validity.title": "⚠ Closed-form γ may not apply to this model",
"gamma_check.validity.body": "The Padé prediction assumes natural training without explicit attention regularization. Your η falls outside the validated band [0.85, 1.15], so the closed-form is not reliable here. Trust empirical γ (from the Phase Diagram or Diagnose CLI) over the prediction. Possible causes: heavy regularization forcing near-uniform attention, fine-tuning collapse, sliding-window architecture, or non-standard training losses. See docs/LIMITATIONS.md.",
"gamma_check.validity.fraud.hint": "Hint: η ≪ 1 typically indicates θ marketing inflation (YaRN-style) without genuine context extension, OR attention forced near-uniform by training.",
"gamma_check.validity.compressed.hint":"Hint: η ∈ [0.01, 0.5) is common in instruction-tuned / RLHF models where post-training has flattened the attention distribution.",
"gamma_check.validity.overpade.hint": "Hint: η > 1.5 may indicate an undertrained early checkpoint, a Lerch-corrected regime, or correction terms beyond the Padé approximation.",
"gamma_check.validity.swa.hint": "Hint: Sliding-window architectures (Mistral, Gemma) violate the full-attention assumption of the closed-form by design.",
"gamma_check.validity.unknown.hint": "Hint: γ_obs out of physical range or measurement noise. Verify your inputs and re-measure.",
"gamma_check.validity.summary_pill": "⚠ Validity gate",
"gamma_check.glossary.title": "ⓘ Glossary — what these variables mean",
"gamma_check.glossary.gamma_pade": "γ_Padé: closed-form prediction (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
"gamma_check.glossary.gamma_obs": "γ_observed: empirically measured from your model's attention scores (run the Diagnose CLI on real weights).",
"gamma_check.glossary.theta_eff_obs":"θ_eff (observed): inverted from γ_obs via T√2 / (1 − γ_obs). Effective θ implied by your measurement.",
"gamma_check.glossary.theta_eff_pade":"θ_eff (Padé): θ + T/√2. Effective θ predicted by closed-form.",
"gamma_check.glossary.efficiency": "η: ratio θ_eff_obs / θ_eff_Padé. ≈1 = normal · <0.01 = fraud · <0.5 = compressed · >1.5 = over-Padé.",
"gamma_check.glossary.delta_h": "ΔH_Cardy: log(θ_eff_obs / θ_nominal). Cardy entropy shift. Negative = compression entropy. ~0 = nominal match.",
"gamma_check.glossary.regime": "Regime: automatic classifier from η + γ_obs + random_corpus flag.",
// §36 v0.6 — Tooltips for inline ⓘ icons (per-variable explanations)
"tooltip.gamma_pade": "γ_Padé(T_eval): closed-form prediction (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
"tooltip.gamma_decomposed": "γ_decomposed: γ from full architectural decomposition. Padé baseline + GQA shift + post-IH shift (calibrated audit-replicated subset).",
"tooltip.d_horizon": "d_horizon: effective attention horizon. Beyond this position, scores fall below noise floor (paper §26).",
"tooltip.L_NIAH": "L_NIAH ceiling: predicted ceiling for needle-in-a-haystack retrieval reliability at current d_horizon.",
"tooltip.chi": "χ susceptibility: χ = 1/(1−γ). Diverges at the Hagedorn line γ=1.",
"tooltip.kv_memory": "KV memory @ T_eval (BF16): per-request KV cache = 2 · n_layers · n_kv_heads · d_head · T_eval bytes.",
"tooltip.theta_eff_obs": "θ_eff (observed): effective θ implied by your γ_observed: T√2 / (1 − γ_obs).",
"tooltip.theta_eff_pade": "θ_eff (Padé): effective θ predicted by closed-form: θ + T/√2.",
"tooltip.efficiency": "η = θ_eff_obs / θ_eff_Padé: efficiency ratio. ≈1 = normal · <0.01 = fraud · <0.5 = compressed · >1.5 = over-Padé.",
"tooltip.delta_h_cardy": "ΔH_Cardy: log(θ_eff_obs / θ_nominal). Cardy entropy shift. Negative = compression entropy. ~0 = nominal match.",
"tooltip.verdict_aggregate": "Verdict: worst-of across all recipes. ✅ GO = all green · ⚠ DEGRADED = ≥1 yellow · ❌ NO = ≥1 red.",
"tooltip.verdict_breakdown": "Per-recipe breakdown: each recipe tests an independent decision axis (long-context · budget · hardware · custom-vs-API · KV-compression). A ❌ on X-1 means \"use the API for your volume\" not \"the model fails\" — open the Recipes section for per-axis context.",
"tooltip.gamma_pill": "γ headline: γ_decomposed (or γ_Padé fallback). Range (0,1) = Phase A (anti-Ising). γ ≥ 1 = Hagedorn / Phase B.",
"tooltip.anti_ising": "Anti-Ising class: Phase A → β = γ−1 < 0. Machine-verified (Sage + Lean Mathlib4). See §35 v0.5.",
// §37 v0.6 — Lean+Mathlib theorem table
"lean.table.title": "📑 Lean+Mathlib theorem table",
"lean.table.desc": "Every entry below is machine-proven against Lean 4 + Mathlib4. Click any L# link to jump to the source line on GitHub. Grouped by topic — click a header to expand.",
"lean.table.theorem": "Theorem",
"lean.table.claim": "Claim",
"lean.table.tactic": "Tactic",
"lean.table.source": "Source",
"lean.table.lean": "Lean",
"lean.findings.title": "🔎 Substantive findings",
"lean.findings.detected_by": "Detected by",
"lean.findings.fixed_by": "Fixed by",
"lean.findings.recommendation":"Recommendation",
"lean.meta.repo": "Repo",
"lean.meta.build": "Build",
"lean.meta.theorems": "Theorems",
"lean.meta.verified": "verified",
"lean.meta.rejected": "rejected",
"lean.meta.sorry": "sorry",
"lean.meta.findings": "substantive findings",
"lean.manifest.loading": "Loading Lean manifest…",
"lean.manifest.error": "Lean manifest unavailable",
// Help modal — v0.6 section
"help.v06.title": "🆕 v0.6 — γ predicted-vs-observed + Cardy ΔH + Lean badges",
"help.v06.intro": "v0.6 (2026-05-06): three new diagnostics live in the TAF Card under 🔬 Diagnostics. All run in your browser; γ_observed comes from the Diagnose CLI on real weights.",
"help.v06.layout.title": "TAF Card layout (new in v0.6)",
"help.v06.layout.body": "After clicking 🚀 Generate full profile the card shows: a hero strip on top (architecture class + meta + 3 pills: aggregate verdict ✅/⚠/❌, γ headline, 🧲 Anti-Ising if Phase A) and four expandable sections: 📋 Recipes (open by default — verdict per dimension), 🔬 Diagnostics (key numbers, γ predicted vs observed, what-if explorer), ✓ Verification (Sage+Lean algebraic consistency, falsification F1-F23), 📂 Provenance & share (calibration audit + JSON download / share link / registry submit). Click any header to expand. Every variable has an inline ⓘ tooltip.",
"help.v06.gamma_check.title": "γ predicted vs observed",
"help.v06.gamma_check.body": "Enter the empirically-measured γ from your model and the tool computes η = θ_eff_obs / θ_eff_Padé and classifies into one of 5 regimes:",
"help.v06.case.normal": "Normal (η ∈ [0.85, 1.15]) — model uses its full nominal context. Use case: validate a new release before adopting it.",
"help.v06.case.fraud": "Fraud (η < 0.01) — nominal θ inflated; model behaves as if θ ≪ advertised. Use case: detect YaRN/marketing inflation (CodeLlama / Mistral-Nemo pattern).",
"help.v06.case.compressed": "Compressed (η < 0.5) — context compressed; model attends shorter than nominal θ. Use case: spot RLHF/instruction-tuning compression (LLaMA-2 pattern).",
"help.v06.case.overpade": "Over-Padé (η > 1.5) — model attends farther than Padé predicts. Use case: identify Lerch-corrected regime or undertrained early checkpoints (pythia-1b pattern).",
"help.v06.case.swa": "SWA random-corpus (γ_obs > 1.05 with random_corpus=Yes) — sliding-window attention signature. Use case: confirm Mistral / Gemma SWA on random tokens.",
"help.v06.gamma_check.validity_gate.title": "Validity gate (v0.8.9+)",
"help.v06.gamma_check.validity_gate.body": "When η falls outside [0.85, 1.15] OR the regime is not normal, the panel shows a warning banner explaining that the closed-form prediction may not apply. Trust the empirical γ in those cases. See docs/LIMITATIONS.md for the full regime-of-validity discussion (closed-form γ assumes natural attention without explicit regularization; ν = -1/(2π) assumes i.i.d. tokens).",
"help.v06.cardy.title": "Cardy ΔH diagnostic",
"help.v06.cardy.body": "ΔH_Cardy = log(θ_eff_obs / θ_nominal). Entropy shift between observed effective θ and nominal θ. Strong negative = compression entropy; near zero = nominal match. Complements η for borderline cases.",
"help.v06.lean.title": "Lean + Mathlib verification badges",
"help.v06.lean.body": "TAF identities are formally machine-proven in Lean Mathlib4: 37 theorems in 7 groups (Padé, RG flow, Cayley, D-SAGE, audit findings, erratum CV, misc) + 1 substantive finding (V-derivative factor-2, theorem V_derivative_ne_RG_beta). Source: github.com/karlesmarin/lean-taf (commit 25c77fd). Re-verify locally: git clone --depth=1 https://github.com/karlesmarin/lean-taf && cd lean-taf && lake exe cache get && lake env lean Taf/Identities.lean. The 🧲 Anti-Ising pill in the hero strip and the Verification accordion link to specific source lines.",
"help.v06.glossary.title": "Variable glossary (also embedded in TAF Card)",
"help.v06.glossary.body": "Every variable in the TAF Card has an inline ⓘ tooltip. The complete list: γ, γ_Padé, γ_decomposed, γ_observed, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, KV memory, regime. Hover any ⓘ for the definition + paper section.",
},
// ────────────────────────────────────────────────────────────────────────
// ES — Español
// ────────────────────────────────────────────────────────────────────────
es: {
// §33 v0.4 (session 31, 2026-04-30) — new diagnostic functions
"v04.title": "🆕 v0.4 — Nuevos diagnósticos (sesion 31)",
"v04.section.intro": "Cuatro nuevas funciones diagnósticas derivadas en sesión 31 (2026-04-30) desde juegos de fórmulas cross-of-crosses + interrogación socrática. Disponibles en taf_browser.py §33.",
"v04.arch.label": "Concentración Arquitectural",
"v04.arch.desc": "γ_text ≈ γ_Padé − 0.012·n_kv. Ley correlacional cross-panel (R²=0.30). Caveat: no es predictor per-model.",
"v04.pdi.label": "PDI — Índice de Desviación de Padé",
"v04.pdi.desc": "PDI = d_horizon_obs/T_eval. Semáforo: verde (≈1), naranja (>>1), amarillo (<<1), rojo (Phase B negativo).",
"v04.4bit.label": "Predictor de Shift 4-bit",
"v04.4bit.desc": "MHA: R²(bf16)<0.9 → γ sube; R²>0.99 → γ baja. GQA: precision-robusto.",
"v04.crit.label": "Bundle de Exponentes Críticos",
"v04.crit.desc": "ν_c, β_c, η_c (=γ−1, CORREGIDO), α_C, γ_susc con mínimo AM-GM en γ=1−1/√2≈0.293.",
// §34 v0.5 (session 32, 2026-05-01) — machine-verified algebraic consistency
"v05.title": "🔬 v0.5 — Consistencia verificada por máquina (sesion 32)",
"v05.section.intro": "Verificación dual con Sage Groebner basis + Lean Mathlib4 de 15 identidades algebraicas de los exponentes críticos TAF. Primer framework transformer-attention con respaldo formal machine-proof.",
"v05.verify.label": "Comprobación de Consistencia Algebraica",
"v05.verify.desc": "Dado γ medido, verifica 12 identidades D-SAGE (D-SAGE-1: 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). Todas pasando = framework intacto. Fallos indican bf16 outliers / artefactos de cuantización.",
"v05.dsage1.label": "D-SAGE-1 (★★ core)",
"v05.dsage1.desc": "Identidad cuadrática 2η² + η·γ_χ + 1 = 0 (descubierta por Sage Groebner, verificada Lean). Reemplaza claim incorrecto de 'cierre triple'. Refuta η=2γ del paper 1 algebraicamente.",
"v05.erratum.label": "Erratum paper 1 — corrección η",
"v05.erratum.desc": "Paper 1 afirmaba η = 2γ. Sage Groebner + Lean Mathlib4 demostraron que falla (residual (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Fase A). Valor correcto: η = γ−1, satisface D-SAGE-1.",
"v05.repro.label": "Reproducibilidad",
"v05.repro.desc": "Los 15 teoremas son machine-proof en Lean Mathlib4 (build exitoso 1973 jobs). Script Sage: analysis/sage_recursive_sweep_2026-04-30.sage. Código Lean: lean_taf/taf/Taf/Identities.lean.",
// v0.5.1 — TAF Card consistency check button
"v05.consistency.title": "🔬 Comprobación de consistencia algebraica (Sage + Lean v0.5)",
"v05.consistency.desc": "Verifica 12 identidades algebraicas D-SAGE de los exponentes críticos TAF (machine-proof Sage Groebner basis + Lean Mathlib4). Pasa = framework intacto. Falla = bf16 outlier / artefacto de cuantización.",
"v05.consistency.btn": "🔬 Verificar consistencia algebraica",
// v0.5.2 — Anti-Ising universality class badge
"v05.antiising.badge": "🧲 Clase Anti-Ising (β=γ−1<0, verificado por máquina)",
// v0.5.2 — Per-identity tooltips (plain-language explanations)
"v05.tooltip.D_SAGE_1": "Identidad algebraica cuadrática que conecta dimensión anómala η con susceptibilidad γ_χ. Identidad CORE descubierta por Sage Groebner basis (machine-proof). Reemplaza claim incorrecto de triple closure.",
"v05.tooltip.D_SAGE_2": "En Fase A, β = γ−1 es negativo (anti-Ising). Multiplicado por χ = 1/(1−γ) da exactamente −1. Signature del régimen negativo-β de TAF.",
"v05.tooltip.D_SAGE_4": "El exponente de calor específico α y la susceptibilidad χ suman exactamente 2 en TAF. Consecuencia algebraica del hiperescalado de Josephson.",
"v05.tooltip.D_SAGE_5": "Identidad lineal: α + γ_χ = 2(2−γ). Significa que cuando γ se acerca a 1 (Hagedorn), la suma se acerca a 2; en γ=0 vale 4.",
"v05.tooltip.D_SAGE_6": "Exponente de parámetro de orden multiplicado por exponente de susceptibilidad da una cuadrática específica en γ. Relación algebraica factorizada.",
"v05.tooltip.Rushbrooke_tautology": "Hiperescalado de Rushbrooke estándar 2β + γ_χ = ν·d en d=1. En TAF es TAUTOLOGÍA — γ_χ se define exactamente para que se cumpla. Confirmado por Sage Groebner basis.",
"v05.tooltip.Josephson_tautology": "Hiperescalado de Josephson estándar 2 − α = ν·d en d=1. En TAF es TAUTOLOGÍA — α se define exactamente para que se cumpla.",
"v05.tooltip.Fisher_independent": "Relación de Fisher γ_χ = (2−η)·ν. En TAF es INDEPENDIENTE (NO cierra como identidad, contrario al claim de triple closure). El residuo es γ(2γ−3)/(1−γ).",
"v05.tooltip.eta_2gamma_REFUTED": "Paper 1 afirmaba η=2γ. Esta identidad lo refuta: el residuo es positivo en toda Fase A. Refutación machine-proof por Lean Mathlib4.",
"v05.tooltip.D_14_nu_imprint": "La pendiente de imprint aprendido ν = −1/(2π) multiplicada por 2π da −1. Verificación dimensional trivial del paper 1.",
"v05.tooltip.D_SAGE_7": "La carga central c=3 multiplicada por |ν_imprint| multiplicada por 2π da 3. Cierre dimensional conectando CFT con imprint de entrenamiento.",
"v05.tooltip.nu_beta_id": "Exponente de longitud de correlación ν multiplicado por exponente de parámetro de orden β da −1 en Fase A. Variante de D-SAGE-2.",
"v053.calibration.title": "🔬 v0.5.3 — Auditoría de calibración (2026-05-02)",
"v053.calibration.note": "Corrección SWA desactivada — δ_SWA = -0.21 original se ajustó con n=1 modelo (datos insuficientes; promedio del único caso +0.355). Corrección post_IH marcada exploratoria — promedio de grupo ≈ 0 en re-auditoría (panel n=22) no replica el ajuste OLS. Corrección GQA replica (panel +0.115 vs hardcoded +0.11). Fórmula D_f corregida para Fase B (γ>1) — usa suma cumulativa discreta en lugar de aproximación continua. LLaMA-3, Mistral, Gemma ahora reportan valores correctos de compresión.",
"v053.release.banner": "🔧 v0.5.3 — Correcciones por audit: D_f de compresión KV ahora usa suma discreta (correcto para todo γ); δ_SWA desactivado (calibración n=1); erratum coeficiente C_V paper §5.2 (1/4 → 1/12).",
// §35 v0.6 — γ predicted-vs-observed diagnostic
"gamma_check.title": "🔍 γ predicho vs observado",
"gamma_check.desc": "Introduce tu γ medido empíricamente. La herramienta detecta el régimen: fraude (θ inflado) / comprimido / sobre-Padé / SWA-aleatorio / normal.",
"gamma_check.gobs_label": "γ_observado",
"gamma_check.gobs_tip": "γ medido empíricamente desde los attention scores de tu modelo. Usa la CLI de Diagnose para obtenerlo desde pesos reales.",
"gamma_check.random_label": "¿Corpus aleatorio?",
"gamma_check.random_tip": "Marca sí si γ_observado se midió sobre tokens aleatorios/no estructurados. Distingue la firma SWA (γ_obs > 1) de una anomalía.",
"gamma_check.regime": "Régimen",
"gamma_check.regime.normal": "Normal",
"gamma_check.regime.fraud": "Fraude (θ inflado)",
"gamma_check.regime.compressed": "Contexto comprimido",
"gamma_check.regime.overpade": "Sobre-Padé",
"gamma_check.regime.swa": "Firma SWA (corpus aleatorio)",
"gamma_check.regime.unknown": "Desconocido",
"gamma_check.regime.normal.desc": "η ∈ [0.85, 1.15]: el modelo usa su contexto nominal completo, sin anomalías.",
"gamma_check.regime.fraud.desc": "η < 0.01: θ nominal inflado. El modelo se comporta como si θ ≪ del anunciado. Probable inflación tipo YaRN/marketing sin extensión real de contexto.",
"gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5): contexto comprimido (el modelo atiende menos lejos de lo que predice θ nominal). Común en modelos instruction-tuned / RLHF.",
"gamma_check.regime.overpade.desc": "η > 1.5: el modelo atiende más lejos de lo que Padé predice. Posible régimen Lerch-corregido o checkpoint temprano sub-entrenado.",
"gamma_check.regime.swa.desc": "γ_obs > 1.05 sobre corpus aleatorio = firma de sliding-window attention (familias Mistral / Gemma).",
"gamma_check.regime.unknown.desc": "Entradas fuera de rango o γ_obs > 1 sin flag de corpus aleatorio. Verifica la medición.",
"gamma_check.validity.title": "⚠ La fórmula cerrada de γ puede no aplicar a este modelo",
"gamma_check.validity.body": "La predicción de Padé asume entrenamiento natural sin regularización explícita de la atención. Tu η cae fuera de la banda validada [0.85, 1.15], por lo que la fórmula cerrada no es fiable aquí. Confía en γ empírico (Phase Diagram o Diagnose CLI) por encima de la predicción. Causas posibles: regularización fuerte que fuerza atención casi uniforme, colapso por fine-tuning, arquitectura sliding-window, o pérdidas no estándar. Ver docs/LIMITATIONS.md.",
"gamma_check.validity.fraud.hint": "Pista: η ≪ 1 suele indicar inflación de θ tipo marketing (YaRN) sin extensión real de contexto, O atención forzada a casi uniforme por el entrenamiento.",
"gamma_check.validity.compressed.hint":"Pista: η ∈ [0.01, 0.5) es habitual en modelos instruction-tuned / RLHF donde el post-entrenamiento ha aplanado la distribución de atención.",
"gamma_check.validity.overpade.hint": "Pista: η > 1.5 puede indicar checkpoint temprano sub-entrenado, régimen Lerch-corregido, o términos de corrección más allá de la aproximación de Padé.",
"gamma_check.validity.swa.hint": "Pista: arquitecturas sliding-window (Mistral, Gemma) violan por diseño la asunción de full-attention de la fórmula cerrada.",
"gamma_check.validity.unknown.hint": "Pista: γ_obs fuera de rango físico o ruido de medición. Verifica entradas y vuelve a medir.",
"gamma_check.validity.summary_pill": "⚠ Gate de validez",
"gamma_check.glossary.title": "ⓘ Glosario — significado de las variables",
"gamma_check.glossary.gamma_pade": "γ_Padé: predicción cerrada (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
"gamma_check.glossary.gamma_obs": "γ_observado: medido empíricamente desde los attention scores (ejecuta Diagnose CLI sobre pesos reales).",
"gamma_check.glossary.theta_eff_obs":"θ_eff (observado): invertido desde γ_obs vía T√2 / (1 − γ_obs). θ efectivo implicado por tu medición.",
"gamma_check.glossary.theta_eff_pade":"θ_eff (Padé): θ + T/√2. θ efectivo predicho por la fórmula cerrada.",
"gamma_check.glossary.efficiency": "η: ratio θ_eff_obs / θ_eff_Padé. ≈1 = normal · <0.01 = fraude · <0.5 = comprimido · >1.5 = sobre-Padé.",
"gamma_check.glossary.delta_h": "ΔH_Cardy: log(θ_eff_obs / θ_nominal). Cambio de entropía de Cardy. Negativo = entropía de compresión. ~0 = coincide con nominal.",
"gamma_check.glossary.regime": "Régimen: clasificador automático a partir de η + γ_obs + flag corpus_aleatorio.",
// §36 v0.6 — Tooltips for inline ⓘ icons
"tooltip.gamma_pade": "γ_Padé(T_eval): predicción cerrada (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
"tooltip.gamma_decomposed": "γ_descompuesto: γ desde descomposición arquitectural completa. Línea base Padé + shift GQA + shift post-IH (subconjunto replicado en audit calibrado).",
"tooltip.d_horizon": "d_horizon: horizonte efectivo de atención. Más allá los scores caen bajo el suelo de ruido (paper §26).",
"tooltip.L_NIAH": "Techo L_NIAH: techo predicho de fiabilidad needle-in-a-haystack al d_horizon actual.",
"tooltip.chi": "χ susceptibilidad: χ = 1/(1−γ). Diverge en la línea Hagedorn γ=1.",
"tooltip.kv_memory": "Memoria KV @ T_eval (BF16): caché KV por petición = 2 · n_layers · n_kv_heads · d_head · T_eval bytes.",
"tooltip.theta_eff_obs": "θ_eff (observado): θ efectivo implicado por tu γ_observado: T√2 / (1 − γ_obs).",
"tooltip.theta_eff_pade": "θ_eff (Padé): θ efectivo predicho por la fórmula cerrada: θ + T/√2.",
"tooltip.efficiency": "η = θ_eff_obs / θ_eff_Padé: ratio de eficiencia. ≈1 = normal · <0.01 = fraude · <0.5 = comprimido · >1.5 = sobre-Padé.",
"tooltip.delta_h_cardy": "ΔH_Cardy: log(θ_eff_obs / θ_nominal). Cambio de entropía de Cardy. Negativo = entropía de compresión. ~0 = coincide con nominal.",
"tooltip.verdict_aggregate": "Veredicto: peor-de entre todas las recipes. ✅ ADELANTE = todo verde · ⚠ DEGRADADO = ≥1 amarillo · ❌ NO = ≥1 rojo.",
"tooltip.verdict_breakdown": "Desglose por recipe: cada recipe evalúa un eje de decisión independiente (contexto-largo · presupuesto · hardware · custom-vs-API · compresión-KV). Un ❌ en X-1 significa «usa la API para tu volumen» no «el modelo falla» — abre la sección Recipes para contexto por eje.",
"tooltip.gamma_pill": "γ titular: γ_descompuesto (o γ_Padé como fallback). Rango (0,1) = Fase A (anti-Ising). γ ≥ 1 = Hagedorn / Fase B.",
"tooltip.anti_ising": "Clase Anti-Ising: Fase A → β = γ−1 < 0. Machine-verified (Sage + Lean Mathlib4). Ver §35 v0.5.",
// §37 v0.6 — Lean+Mathlib theorem table
"lean.table.title": "📑 Tabla de teoremas Lean+Mathlib",
"lean.table.desc": "Cada entrada está machine-proven contra Lean 4 + Mathlib4. Click en cualquier link L# para saltar a la línea fuente en GitHub. Agrupado por tema — click en cabecera para expandir.",
"lean.table.theorem": "Teorema",
"lean.table.claim": "Afirmación",
"lean.table.tactic": "Táctica",
"lean.table.source": "Fuente",
"lean.table.lean": "Lean",
"lean.findings.title": "🔎 Findings sustantivos",
"lean.findings.detected_by": "Detectado por",
"lean.findings.fixed_by": "Arreglado por",
"lean.findings.recommendation":"Recomendación",
"lean.meta.repo": "Repo",
"lean.meta.build": "Build",
"lean.meta.theorems": "Teoremas",
"lean.meta.verified": "verificados",
"lean.meta.rejected": "rechazados",
"lean.meta.sorry": "sorry",
"lean.meta.findings": "findings sustantivos",
"lean.manifest.loading": "Cargando manifest Lean…",
"lean.manifest.error": "Manifest Lean no disponible",
// Help modal — v0.6 section
"help.v06.title": "🆕 v0.6 — γ predicho-vs-observado + Cardy ΔH + badges Lean",
"help.v06.intro": "v0.6 (2026-05-06): tres diagnósticos nuevos viven en el TAF Card bajo 🔬 Diagnósticos. Todo corre en tu navegador; γ_observado lo obtienes con la Diagnose CLI sobre pesos reales.",
"help.v06.layout.title": "Layout del TAF Card (nuevo en v0.6)",
"help.v06.layout.body": "Tras click en 🚀 Generar perfil completo la tarjeta muestra: una tira hero arriba (clase de arquitectura + meta + 3 pills: veredicto agregado ✅/⚠/❌, γ titular, 🧲 Anti-Ising si Fase A) y cuatro secciones plegables: 📋 Recipes (abierto por defecto — veredicto por dimensión), 🔬 Diagnósticos (números clave, γ predicho vs observado, explorador what-if), ✓ Verificación (consistencia algebraica Sage+Lean, falsificación F1-F23), 📂 Procedencia y compartir (auditoría de calibración + descarga JSON / enlace / submit al registro). Click en cualquier cabecera para expandir. Cada variable tiene tooltip ⓘ inline.",
"help.v06.gamma_check.title": "γ predicho vs observado",
"help.v06.gamma_check.body": "Introduces el γ medido empíricamente y la herramienta calcula η = θ_eff_obs / θ_eff_Padé y clasifica en uno de 5 regímenes:",
"help.v06.case.normal": "Normal (η ∈ [0.85, 1.15]) — el modelo usa su contexto nominal completo. Caso de uso: validar un release nuevo antes de adoptarlo.",
"help.v06.case.fraud": "Fraude (η < 0.01) — θ nominal inflado; el modelo se comporta como si θ ≪ del anunciado. Caso de uso: detectar inflación YaRN/marketing (patrón CodeLlama / Mistral-Nemo).",
"help.v06.case.compressed": "Comprimido (η < 0.5) — contexto comprimido; el modelo atiende menos lejos que θ nominal. Caso de uso: detectar compresión por RLHF/instruction-tuning (patrón LLaMA-2).",
"help.v06.case.overpade": "Sobre-Padé (η > 1.5) — el modelo atiende más lejos que Padé predice. Caso de uso: identificar régimen Lerch-corregido o checkpoints tempranos sub-entrenados (patrón pythia-1b).",
"help.v06.case.swa": "SWA corpus aleatorio (γ_obs > 1.05 con corpus_aleatorio=Sí) — firma de sliding-window attention. Caso de uso: confirmar SWA en Mistral / Gemma sobre tokens random.",
"help.v06.gamma_check.validity_gate.title": "Gate de validez (v0.8.9+)",
"help.v06.gamma_check.validity_gate.body": "Cuando η cae fuera de [0.85, 1.15] O el régimen no es normal, el panel muestra un banner de aviso explicando que la predicción cerrada puede no aplicar. Confía en el γ empírico en esos casos. Ver docs/LIMITATIONS.md para la discusión completa del régimen-de-validez (γ cerrado asume atención natural sin regularización explícita; ν = -1/(2π) asume tokens i.i.d.).",
"help.v06.cardy.title": "Diagnóstico Cardy ΔH",
"help.v06.cardy.body": "ΔH_Cardy = log(θ_eff_obs / θ_nominal). Cambio de entropía entre el θ efectivo observado y el θ nominal. Negativo fuerte = entropía de compresión; cerca de cero = coincide con nominal. Complementa a η para casos borderline.",
"help.v06.lean.title": "Badges de verificación Lean + Mathlib",
"help.v06.lean.body": "Las identidades TAF están formalmente machine-proven en Lean Mathlib4: 37 teoremas en 7 grupos (Padé, flujo RG, Cayley, D-SAGE, hallazgos de auditoría, erratum CV, misc) + 1 hallazgo sustantivo (factor 2 en derivada V, teorema V_derivative_ne_RG_beta). Fuente: github.com/karlesmarin/lean-taf (commit 25c77fd). Re-verifica localmente: git clone --depth=1 https://github.com/karlesmarin/lean-taf && cd lean-taf && lake exe cache get && lake env lean Taf/Identities.lean. La pill 🧲 Anti-Ising del hero y la sección Verificación enlazan a líneas específicas del código fuente.",
"help.v06.glossary.title": "Glosario de variables (también embebido en TAF Card)",
"help.v06.glossary.body": "Cada variable del TAF Card tiene un tooltip ⓘ inline. Lista completa: γ, γ_Padé, γ_descompuesto, γ_observado, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, memoria KV, régimen. Pasa el ratón sobre cualquier ⓘ para la definición + sección del paper.",
"hero.title": "🔬 TAF Agent",
"hero.tagline": "Diagnostica cualquier LLM transformer en 30 segundos. Gratis. Sin GPU. Sin registro.",
"hero.subtitle": "Predice si un modelo te servirá para tu caso de uso antes de gastar dinero o tiempo. Todo corre en tu navegador — tus datos nunca salen de esta pestaña.",
"hero.help": "📘 Manual y ejemplos",
"hero.quickstart_btn": "⚡ Inicio rápido",
"hero.inventory_btn": "🧰 Qué te ofrece",
"hero.about": "Construido por un investigador independiente. Código abierto. Sin afiliación con ningún proveedor de modelos.",
"modes.title": "🎯 Modo",
"modes.profile": "📇 Perfilar un modelo",
"modes.compare": "🆚 Comparar modelos",
"modes.inspector": "🔍 Inspeccionar config",
"modes.ask": "💬 Pregunta libre",
"modes.recipe": "📋 Elegir receta",
"modes.diagnose": "🩺 Diagnóstico CLI",
"diagnose.title": "🩺 Generador del comando Diagnose CLI",
"diagnose.tip": "El navegador predice γ desde config; el CLI mide γ_obs sobre los pesos reales. Este generador produce el comando exacto para ejecutar localmente.",
"diagnose.desc": "Elige opciones y copia-pega el comando generado en tu máquina local (Python + transformers + numpy). Modo rápido ≈5 min CPU; completo ≈20–60 min GPU.",
"diagnose.model_label": "ID del modelo HF:",
"diagnose.theta_label": "θ (auto si vacío):",
"diagnose.n_label": "Contexto N:",
"diagnose.options_label": "Opciones:",
"diagnose.opt_fast": "--fast (CPU, ~5 min)",
"diagnose.opt_cpu": "--cpu (forzar CPU)",
"diagnose.opt_4bit": "--load_in_4bit (modelos ≥7B)",
"diagnose.local_label": "--local path (opcional):",
"diagnose.build_btn": "📋 Generar comando",
"diagnose.cmd_title": "Comando generado:",
"diagnose.copy_btn": "📋 Copiar al portapapeles",
"diagnose.next_steps": "Siguientes pasos: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Ejecuta el comando (4) JSON resultado → subir vía modo Inspect para análisis TAF completo.",
"modes.phase": "📊 Diagrama de fase",
"phase.title": "📊 Diagrama de fase (γ × θ)",
"phase.tip": "Cada punto es un modelo del panel empírico del paper. x: log θ; y: γ. La línea Hagedorn γ=1 separa Fase A de Fase B. Hover para detalles, click para cargar en el formulario.",
"phase.desc": "23 modelos en el panel; curva Padé a T=2000.",
"modes.desc": "Inicio rápido: pega cualquier id de modelo HuggingFace (ej. meta-llama/Meta-Llama-3-8B), click Perfilar. Verás las 5 recetas evaluadas en segundos.",
"profile.title": "📇 Perfilar un modelo",
"profile.desc": "Para técnicos: cuando necesitas una foto completa de viabilidad de un modelo candidato. Un click ejecuta las 5 recetas y produce una TAF Card unificada.",
"profile.preset_label": "Preset:",
"profile.preset_default": "— o elige de la lista —",
"profile.hf_label": "ID modelo HF:",
"profile.fetch_btn": "📥 Cargar",
"profile.btn": "🚀 Generar perfil completo",
"profile.quickstart": "💡 Inicio rápido: elige cualquier preset → click Generar. O pega un id desde HF Hub trending → 📥 Cargar → Generar.",
"compare.title": "🆚 Comparar modelos lado a lado",
"compare.desc": "Para técnicos: cuando eliges entre 2-3 modelos candidatos para un escenario de despliegue específico. Misma receta, múltiples modelos, veredictos lado a lado.",
"compare.recipe_label": "Receta:",
"compare.T_eval_label": "T_eval (contexto objetivo):",
"compare.models_title": "Modelos a comparar (hasta 3)",
"compare.btn": "🚀 Comparar",
"compare.example": "💡 Prueba: pega 3 modelos populares de 7-8B (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), receta X-2, T_eval=16000. Mira cuál maneja mejor contexto largo.",
"ask.title": "❓ Tu pregunta",
"ask.placeholder": "ej. ¿Mistral-7B aguanta 16K NIAH retrieval? O: Tengo 5,000$, ¿qué modelo puedo entrenar? O: ¿GPU más barato para servir Llama-70B a 100M tokens/día?",
"ask.btn": "🚀 Analizar",
"ask.example_btn": "💡 Probar ejemplo",
"recipe.title": "📋 Receta",
"recipe.default": "— elige una receta —",
"recipe.input_title": "🎯 Entradas",
"verdict.title": "📊 Veredicto",
"chain.title": "🔍 Cadena de cálculo",
"chain.desc": "Cada número de abajo es Python determinista. Click en un paso para expandir.",
"answer.title": "💬 Respuesta en lenguaje natural",
"share.btn": "🔗 Copiar link",
"share.copied": "✅ ¡Copiado al portapapeles!",
"share.download": "💾 Descargar JSON",
"share.download_md": "📝 Markdown",
"share.download_tex": "📜 LaTeX",
"share.submit": "📤 Enviar al registry",
"share.submit_clip_ok": "↗ GitHub abierto. Cuerpo copiado al portapapeles — pégalo en el cuerpo del issue.",
"share.submit_clip_fail": "↗ GitHub abierto. Portapapeles bloqueado — cuerpo volcado en la consola del navegador (F12).",
"share.import_title": "📂 Importar un resultado TAF compartido",
"a11y.skip": "Saltar al contenido principal",
// v0.6.2 — landing rework: quick start + inventory + architecture tooltips
"qs.title": "⚡ Inicio rápido",
"qs.step1": "Pega un model ID de HuggingFace (ej. meta-llama/Meta-Llama-3-8B)",
"qs.step2": "Click en 📇 Profile a model",
"qs.step3": "Lee tu TAF Card — veredicto por caso de uso + números clave + matemáticas verificadas con Lean+Mathlib",
"qs.cta": "↓ Empezar ahora",
"inv.title": "🧰 Qué te ofrece esta herramienta",
"inv.recipes.title": "🎯 8 recetas — ¿sirve este modelo para tu caso?",
"inv.recipes.x1.title": "Entrenar propio vs API",
"inv.recipes.x1.body": "¿cuál sale más barato para tu tráfico?",
"inv.recipes.x2.title": "Contexto largo",
"inv.recipes.x2.body": "¿aguanta 32k / 128k tokens de forma fiable?",
"inv.recipes.x3.title": "Presupuesto",
"inv.recipes.x3.body": "con $X, ¿qué modelo puedes entrenar desde cero?",
"inv.recipes.x5.title": "Hardware",
"inv.recipes.x5.body": "¿qué GPU para servir N tokens/día?",
"inv.recipes.x19.title": "KV cache",
"inv.recipes.x19.body": "¿cómo comprimir sin romper la calidad?",
"inv.recipes.x21.title": "Pureza de imprint",
"inv.recipes.x21.body": "¿cómo de limpia es la codificación posicional del modelo?",
"inv.recipes.x22.title": "Compute-context",
"inv.recipes.x22.body": "¿el modelo entra en la banda empírica?",
"inv.recipes.x23.title": "Fase IH",
"inv.recipes.x23.body": "¿pre- o post-induction-head?",
"inv.diag.title": "🔬 Diagnósticos",
"inv.diag.gamma": "γ predicho vs observado — auto-clasifica el modelo en 5 regímenes (normal · fraude / contexto inflado · comprimido · over-Padé · sliding-window)",
"inv.diag.cardy": "Cardy ΔH — desplazamiento de entropía entre contexto observado y nominal",
"inv.diag.fals": "Tabla de falsabilidad — comprueba 23 predicciones específicas (F1–F23)",
"inv.diag.alg": "Consistencia algebraica — 8 identidades matemáticas que el modelo debe cumplir",
"inv.verify.title": "✓ Matemáticas formalmente verificadas",
"inv.verify.count": "37 teoremas machine-proven en Lean 4 + Mathlib4",
"inv.verify.click": "Click en cualquier badge → abre la línea fuente en GitHub",
"inv.verify.reverify": "Verifícalo tú: lake build (≈5 s tras cache)",
"inv.export.title": "📤 Exportar y compartir",
"inv.export.formats": "JSON · Markdown · LaTeX (listo para paper)",
"inv.export.share": "Link reproducible (estado codificado en URL)",
"inv.export.registry": "Envía al registro comunitario en GitHub",
"arch.summary": "Arquitecturas soportadas",
"arch.anyhf": "✓ Cualquier modelo público de HuggingFace",
"tooltip.mha": "Multi-Head Attention: cada posición atiende mediante varios heads paralelos a la vez.",
"tooltip.gqa": "Grouped Query Attention: las queries comparten menos keys/values que heads (ahorra memoria pero empuja γ hacia Hagedorn).",
"tooltip.alibi": "Attention with Linear Biases: la info de posición es una pendiente aprendida añadida a los scores, sin rotación.",
"tooltip.abspe": "Absolute Position Embeddings: cada posición tiene un vector fijo aprendido sumado al embedding del token.",
"tooltip.swa": "Sliding Window Attention: cada token solo atiende dentro de una ventana local fija (Mistral, gemma-2 lo usan).",
"tooltip.ssm": "State Space Model: capa de secuencia que mantiene estado interno en lugar de atención (Mamba, Jamba lo usan).",
// v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker
"modes.unmask": "🪟 Desenmascarar",
"unmask.title": "🪟 Desenmascarador de contexto",
"unmask.tip": "Pega un id de modelo HuggingFace (o config.json crudo). La herramienta detecta sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), y GQA — todo lo que hace que max_position_embeddings sea mayor que el contexto efectivo real. Mistral-7B-v0.1 es el ejemplo canónico: declara 32k, atiende dentro de ~4-8k.",
"unmask.desc": "¿Estás a punto de gastar dinero en un modelo que en realidad no atiende tan lejos? Pega un id y descúbrelo en 1 segundo. Sin GPU, sin inferencia — solo aritmética sobre config.json.",
"unmask.id_label": "ID modelo HF:",
"unmask.fetch_btn": "🔍 Desenmascarar",
"unmask.paste_summary": "O pega config.json crudo (modelos privados / en desarrollo)",
"unmask.paste_btn": "🔍 Desenmascarar config pegado",
"unmask.label.declared": "Contexto declarado",
"unmask.label.effective": "Efectivo (estimado)",
"unmask.label.ratio": "Ratio",
"unmask.section.flags": "Banderas de arquitectura",
"unmask.section.warnings": "Avisos",
"unmask.section.reco": "Recomendación",
"unmask.flag.swa": "SWA",
"unmask.flag.rope": "RoPE scaling",
"unmask.flag.gqa": "GQA",
"unmask.flag.layers": "Capas",
"unmask.flag.dhead": "d_head",
"unmask.flag.theta": "RoPE θ",
"unmask.flag.yes": "sí",
"unmask.flag.no": "no",
"unmask.flag.full_mha": "no (MHA completo, {n} heads)",
"unmask.verdict.honest": "✅ HONESTO",
"unmask.verdict.inflated": "⚠ INFLADO",
"unmask.verdict.severely_inflated": "❌ GRAVEMENTE INFLADO",
"unmask.verdict.yarn_extended": "⚠ YARN-EXTENDIDO",
"unmask.verdict.unknown": "❓ DESCONOCIDO",
"unmask.warn.swa_window": "Ventana SWA: {window} tokens — cada capa solo atiende dentro de esta ventana.",
"unmask.warn.multihop": "Estimación multi-hop: ~{multiHop} tokens (conservador: ventana × {factor}).",
"unmask.warn.yarn": "RoPE scaling ({type}) extiende contexto {factor}× desde ~{original} hasta {declared} tokens.",
"unmask.warn.yarn_advice": "Contexto RoPE-extendido — verifica el comportamiento de γ a la longitud declarada con el diagnóstico γ_check.",
"unmask.warn.gqa_small_dhead": "head dim pequeño ({d_head}) + GQA: probable compresión de KV cache a contexto largo (γ empujado hacia Hagedorn).",
"unmask.reco.honest": "Modelo de atención completa estándar. Contexto efectivo coincide con declarado ({declared} tokens).",
"unmask.reco.inflated": "Efectivo ~{effective} tokens vía SWA. Usa γ_check para verificar el comportamiento a tu longitud objetivo.",
"unmask.reco.severely_inflated": "Trátalo como un modelo de ~{effective} tokens en la práctica. El claim de {declared} tokens solo aplica vía cadenas de atención cross-layer, que empíricamente degradan más allá de ~2× la ventana SWA.",
"unmask.reco.yarn_extended": "Contexto RoPE-extendido. Corre un benchmark long-context (NIAH a 8k / 16k / 32k / full) para confirmar que la extensión se sostiene. Usa γ_check con T_eval = {declared}.",
"unmask.reco.unknown": "No se pudo parsear el config. Verifica que la URL sea un modelo HF válido con config.json público.",
"unmask.status.empty_id": "⚠ Introduce un model id (ej. mistralai/Mistral-7B-v0.1).",
"unmask.status.fetching": "⏳ Obteniendo config.json para {modelId}...",
"unmask.status.success": "✅ Analizado {modelId} (veredicto: {verdict})",
"unmask.status.empty_paste": "⚠ Pega un config.json primero.",
"unmask.status.invalid_json": "❌ JSON inválido: {error}",
"unmask.status.success_paste": "✅ Config pegado analizado (veredicto: {verdict})",
"unmask.pasted_label": "(config pegado)",
"mode_desc.ask": "Escribe una pregunta libre. El LLM en el navegador elige la receta correcta y la ejecuta.",
"mode_desc.recipe": "Selecciona una receta directamente y rellena el formulario. Control manual completo.",
"mode_desc.profile": "Inicio más rápido: pega cualquier model id de HuggingFace, click Profile. Mira las 5 recetas en segundos.",
"mode_desc.compare": "Elige 2-3 modelos candidatos + una receta. Ve veredictos lado a lado en tabla.",
"mode_desc.inspector": "Pega un config.json directamente. Útil para modelos privados / en desarrollo no en HF Hub.",
"mode_desc.diagnose": "Construye el comando CLI diagnose_model.py para MEDIR γ_obs en GPU real. El navegador predice; el CLI mide.",
"mode_desc.phase": "Scatter γ × θ del panel empírico del paper. Hover sobre puntos para detalles, click para cargar en Diagnose / Recipe.",
"mode_desc.unmask": "Detecta si max_position_embeddings es engañoso (SWA / YaRN / RoPE-scaling). Pega un model id, obtén un veredicto en 1 línea.",
"profile.preset_loaded": "✅ Preset cargado para {id}. Formulario pre-rellenado. (Click 📥 Fetch para sobreescribir con el último config de HF Hub.)",
// v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer
"modes.template": "📜 Chat-template",
"mode_desc.template": "Detecta qué familia de chat-template usa un modelo (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Da el flag CLI exacto para lm-eval / vLLM / transformers.",
"template.title": "📜 Detector de Chat-template",
"template.tip": "Pega un model id de HF (o tokenizer_config.json crudo). Detecta la familia del chat-template y te da el comando exacto para usarlo bien. lm-eval-harness divide la accuracy entre 2 silenciosamente si te olvidas de aplicarlo (issue #1841).",
"template.desc": "¿Olvidaste --apply_chat_template? La mayoría de evals multi-turn fallan ~50% porque el chat template no se aplicó. Pega un model id, obtén el flag CLI exacto para tu stack.",
"template.id_label": "ID modelo HF:",
"template.fetch_btn": "📜 Detectar",
"template.paste_summary": "O pega tokenizer_config.json crudo (modelos privados)",
"template.paste_btn": "📜 Detectar config pegado",
"template.label.family": "Familia detectada",
"template.label.markers": "Marcadores coincidentes",
"template.label.tpl_len": "Longitud template",
"template.section.warnings": "Avisos",
"template.section.commands": "Comandos por framework",
"template.section.raw": "Template crudo (preview)",
"template.family.custom": "custom (familia desconocida)",
"template.family.none": "(sin chat_template)",
"template.verdict.ok": "✅ TEMPLATE DETECTADO",
"template.verdict.custom": "⚠ TEMPLATE CUSTOM",
"template.verdict.missing": "❌ SIN CHAT TEMPLATE",
"template.verdict.base_model": "ℹ MODELO BASE (sin chat)",
"template.verdict.unknown": "❓ DESCONOCIDO",
"template.warn.no_chat_template": "Sin campo chat_template en tokenizer_config.json. Típico de modelos base / pretrained. Si esperabas un modelo instruct-tuned, puede que el archivo cargado sea incorrecto.",
"template.warn.custom_template": "Template no estándar ({length} chars). La herramienta no lo encajó en familias conocidas. Revisa el preview y verifica que tu framework de eval lo soporta.",
"template.warn.lm_eval_apply": "lm-eval-harness: añade --apply_chat_template o tu accuracy bajará ~50% silenciosamente en evals multi-turn (issue #1841).",
"template.warn.vllm_apply": "vLLM serve: verifica que --chat-template esté puesto (la auto-detección a veces falla en variantes fine-tuned). Sugerido: {name}.",
"template.status.empty_id": "⚠ Introduce un model id (ej. mistralai/Mistral-7B-Instruct-v0.3).",
"template.status.fetching": "⏳ Obteniendo tokenizer_config.json para {modelId}...",
"template.status.success": "✅ Detectado {modelId} (veredicto: {verdict})",
"template.status.empty_paste": "⚠ Pega un tokenizer_config.json primero.",
"template.status.invalid_json":"❌ JSON inválido: {error}",
"template.status.success_paste":"✅ Config pegado detectado (veredicto: {verdict})",
"template.pasted_label": "(tokenizer_config pegado)",
// v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor
"modes.arena": "🎯 Arena CI",
"mode_desc.arena": "Recupera intervalos de confianza desde datos crudos de votos pairwise (MLE Bradley-Terry + bootstrap). Detecta pares estadísticamente empatados que el leaderboard público de Arena oculta.",
"arena.title": "🎯 Reconstructor Arena-Elo CI",
"arena.tip": "Chatbot Arena oculta los intervalos de confianza en el leaderboard público. Una diferencia de 5 Elo puede ser estadísticamente irrelevante. Pega datos crudos de votos (model_a, model_b, winner) — la herramienta calcula MLE Bradley-Terry + bootstrap CIs y lista los empates estadísticos (overlap de CI).",
"arena.desc": "¿GPT-4 es realmente mejor que Claude — o están empatados? Pega CSV de votos pairwise (o click Cargar sample). MLE Bradley-Terry + 200 iteraciones de bootstrap → Elos ranked con CIs 95% y detección de empates estadísticos. Todo en el navegador.",
"arena.sample_btn": "📊 Cargar datos sample",
"arena.run_btn": "🎯 Calcular CIs",
"arena.clear_btn": "🗑️ Limpiar",
"arena.csv_summary": "CSV de votos (header: model_a,model_b,winner; winner ∈ a/b/tie)",
"arena.section.ranked": "Elos ranked con CIs 95%",
"arena.section.ties": "Empates estadísticos (overlap CI)",
"arena.section.summary": "Resumen",
"arena.col.rank": "#",
"arena.col.model": "Modelo",
"arena.col.elo": "Elo",
"arena.col.ci": "CI 95%",
"arena.col.ci_width": "± semi-anchura",
"arena.col.matches": "Partidas",
"arena.col.wins": "V / D / E",
"arena.col.tie_pair": "Par",
"arena.col.tie_diff": "Brecha Elo",
"arena.col.tie_overlap": "Overlap CI",
"arena.no_ties": "Sin empates estadísticos — todos los pares distinguibles al CI 95%.",
"arena.summary.votes": "Votos totales",
"arena.summary.models": "Modelos",
"arena.summary.ties": "Empates estadísticos",
"arena.summary.bootstrap": "Iteraciones bootstrap",
"arena.summary.ci_level": "Nivel CI",
"arena.status.empty": "⚠ Pega un CSV de votos o click en Cargar sample.",
"arena.status.too_few": "⚠ Solo {n} votos válidos — se necesitan al menos 10 para ajustar Bradley-Terry de forma fiable.",
"arena.status.computing": "⏳ Calculando MLE Bradley-Terry + bootstrap sobre {n} votos...",
"arena.status.done": "✅ {n} votos · {models} modelos · {ties} empates estadísticos · {ms} ms",
"arena.status.sample_loaded": "✅ Sample cargado (datos sintéticos Arena de 6 modelos). Click en Calcular CIs.",
// v0.7.3 — anti-bullshit pack #4: Contamination Prior
"modes.contam": "🧪 Contaminación",
"mode_desc.contam": "Prior bayesiano-ish sobre si un score de benchmark está contaminado. Introduce la fecha de cutoff de entrenamiento → puntúa 20+ benchmarks populares (MMLU, GSM8K, HumanEval, MMLU-Pro…).",
"contam.title": "🧪 Prior de Contaminación",
"contam.tip": "Calcula un prior bayesiano-ish sobre si un score de benchmark está contaminado, basado en (fecha de cutoff de entrenamiento) × (fecha de release del benchmark) × (inclusión conocida en corpus + historial de leaks). Open LLM Leaderboard v1 fue cancelado en 2024 tras la contaminación de MMLU/HellaSwag.",
"contam.desc": "¿Deberías confiar en el MMLU de tu modelo? Introduce la fecha cutoff de entrenamiento — la herramienta puntúa 20+ benchmarks populares (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) y te dice qué scores son probablemente contaminados.",
"contam.cutoff_label": "Cutoff entrenamiento:",
"contam.run_btn": "🧪 Puntuar todos los benchmarks",
"contam.section.ranked": "Priors de contaminación por benchmark",
"contam.section.high": "🔴 Benchmarks de alto riesgo (trata los scores como no fiables)",
"contam.section.medium": "🟡 Riesgo medio (verifica con alternativas)",
"contam.section.low": "🟢 Bajo riesgo (probablemente limpios)",
"contam.col.benchmark": "Benchmark",
"contam.col.released": "Release",
"contam.col.gap": "Gap (meses)",
"contam.col.prior": "P(contam)",
"contam.col.level": "Nivel",
"contam.col.corpora": "En corpus",
"contam.col.category": "Categoría",
"contam.label.high": "Alto riesgo",
"contam.label.medium": "Medio",
"contam.label.low": "Bajo",
"contam.no_entries": "(ninguno en esta categoría)",
"contam.advice.high": "Trata estos scores como no fiables. Sustituye por alternativas más recientes / con test privado (MMLU-Pro, GPQA, MUSR, MATH-500).",
"contam.advice.medium": "Toma con cautela. Busca replicación sobre subset held-out o reproducciones comunitarias.",
"contam.advice.low": "Score probablemente no contaminado, pero ausencia de leak no es prueba — verifica también con test alternativo.",
"contam.summary.headline": "Cutoff {cutoff} · {n} benchmarks puntuados",
"contam.status.empty": "⚠ Introduce una fecha cutoff de entrenamiento (ej. 2023-12).",
"contam.status.bad_date": "⚠ Formato de fecha incorrecto. Usa YYYY-MM o YYYY-MM-DD.",
"contam.status.done": "✅ Cutoff {cutoff} · {n} benchmarks puntuados · {high} de alto riesgo",
// v0.7 — Help modal section
"help.v07.title": "🆕 v0.7 — Pack anti-bullshit (4 modos nuevos)",
"help.v07.intro": "v0.7 (2026-05-06): cuatro modos nuevos que resuelven problemas concretos reportados por la comunidad HuggingFace. Cada uno corre en tu navegador sin inferencia — pura metadata + matemáticas.",
"help.v07.unmask.title": "🪟 Desenmascarador de Contexto",
"help.v07.unmask.body": "Detecta cuándo max_position_embeddings es engañoso. Mistral-7B-v0.1 declara 32k pero atiende dentro de ~4-8k vía SWA. Pega un id HF → veredicto en 1 segundo (HONESTO / INFLADO / GRAVEMENTE INFLADO / YARN-EXTENDIDO). Pilla SWA, RoPE-scaling (YaRN/linear/dynamic NTK), d_head pequeño + GQA. Caso de uso: antes de pagar GPU para 32k de contexto, verifica que el modelo realmente atiende tan lejos.",
"help.v07.template.title": "📜 Detector de Chat-template",
"help.v07.template.body": "Detecta qué familia de chat-template usa un modelo (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) y te da el flag CLI exacto para lm-evaluation-harness, vLLM, y transformers. Resuelve el issue #1841 de lm-eval-harness: olvidar --apply_chat_template divide la accuracy multi-turn por 2 silenciosamente. Caso de uso: antes de reportar un score, confirma que aplicaste el template correctamente.",
"help.v07.arena.title": "🎯 Reconstructor Arena-Elo CI",
"help.v07.arena.body": "Chatbot Arena oculta los intervalos de confianza en su leaderboard público — una diferencia de 5 Elo puede ser estadísticamente irrelevante. Pega datos crudos de votos pairwise (model_a, model_b, winner) → MLE Bradley-Terry + bootstrap de 200 iteraciones → Elos ranked con CIs 95% y un panel de \"empates estadísticos\" listando pares cuyos CIs se solapan. Prueba el botón Cargar sample. Caso de uso: antes de afirmar \"modelo A vence a modelo B\", verifica que sus CIs no se solapen.",
"help.v07.contam.title": "🧪 Prior de Contaminación",
"help.v07.contam.body": "Prior bayesiano-ish sobre si un score de benchmark está contaminado. Introduce la fecha cutoff de entrenamiento de tu modelo → la herramienta puntúa 20+ benchmarks populares (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) por P(contaminación) según gap temporal, inclusión en corpus y historial de leaks conocidos. Open LLM Leaderboard v1 fue cancelado en 2024 tras la contaminación de MMLU/HellaSwag. Caso de uso: decide qué scores te puedes creer al comparar dos modelos.",
"help.v07.quant.title": "⚖️ Clasificador de régimen de cuantización",
"help.v07.quant.body": "Predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización: NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8…). Arch-aware: d_head pequeño + GQA agresivo → más sensible; los esquemas calibrados (AWQ) absorben mejor el shift que los no calibrados (NF4). Recomienda alternativas más seguras si detecta cliff. Caso de uso: antes de cuantizar, predice si tu combo arquitectura × esquema mantendrá la PPL aceptable, con sugerencia concreta de switch si no.",
"help.v07.drift.title": "🔀 Cota de drift entre frameworks",
"help.v07.drift.body": "Mismo modelo, scores distintos en setups distintos. La herramienta predice el drift máximo admisible solo por ruido numérico (dtype, framework, batch). Si el gap observado lo excede → bug real, normalmente chat-template mismatch (issue #1841 de lm-eval-harness) o layout de KV-cache. Prueba el botón "Cargar sample" para el bug canónico de chat-template. Caso de uso: antes de reportar una regresión o reclamar reproducibilidad, verifica si el gap entre dos evals es mayor de lo que el ruido numérico puede explicar.",
"inv.v07.drift": "🔀 Drift — ¿bug o ruido? Predice el gap máximo admisible entre dos evals",
"help.v07.niah.title": "🔍 Gap NIAH → Reasoning",
"help.v07.niah.body": "El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH (retrieval de needle) pero fallan reasoning multi-hop al mismo contexto. La herramienta predice ambas tasas de pass desde la arquitectura (γ_Padé + d_horizon + presión arq: d_head pequeño, GQA, SWA), reporta el gap, y encuentra el \"contexto seguro de reasoning\" donde reasoning se mantiene ≥65%. Modo barrido muestra la curva a 1k/4k/16k/64k/T_train. Caso de uso: antes de desplegar al contexto declarado, descubre si el modelo realmente razonará ahí o solo encontrará.",
"inv.v07.niah": "🔍 NIAH→Reason — ¿tu \"128k\" realmente razona ahí, o solo encuentra?",
// v0.7 — Inventory modal, 5th card
"inv.v07.title": "🆕 Pack anti-bullshit v0.7",
"inv.v07.unmask": "🪟 Unmask — ¿config.json declara 32k? Mira si de verdad atiende tan lejos",
"inv.v07.template": "📜 Chat-template — flag CLI exacto para que lm-eval no divida tu accuracy entre 2 silenciosamente",
"inv.v07.arena": "🎯 Arena CI — recupera los intervalos de confianza que Chatbot Arena oculta",
"inv.v07.contam": "🧪 Contaminación — puntúa 20+ benchmarks por probabilidad de contaminación",
"inv.v07.quant": "⚖️ Quant — predice γ-shift + ΔPPL para cualquier combo (modelo × esquema de cuantización)",
// v0.7.3 — anti-bullshit pack #5: Quant-regime classifier
"modes.quant": "⚖️ Quant",
"mode_desc.quant": "Predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización). Arch-aware: d_head pequeño + GQA → más sensible. Recomienda alternativas más seguras si detecta cliff.",
"quant.title": "⚖️ Clasificador de régimen de cuantización",
"quant.tip": "Predice γ-shift (y la ΔPPL resultante) para un par (modelo × esquema). Claims genéricos como 'AWQ ~95% retención' son demasiado vagos — TAF usa d_head, ratio GQA, flag SWA y tamaño del modelo para dar veredicto arquitectura-específico. Resuelve: la comunidad HF reporta cliffs de cuantización impredecibles (NF4 -2 PPL en Phi-3 pero bien en Llama-3-8B).",
"quant.desc": "¿Cuantizar romperá tu modelo? Pega un id HF, elige esquema de cuantización — obtén γ-shift predicho, banda ΔPPL esperada y alternativa recomendada si es un cliff. Solo navegador, sin GPU, sin set de calibración.",
"quant.id_label": "ID modelo HF:",
"quant.fetch_btn": "📥 Fetch config",
"quant.scheme_label": "Esquema cuant:",
"quant.run_btn": "⚖️ Predecir",
"quant.all_btn": "📊 Comparar todos los esquemas",
"quant.regime.safe": "✅ SEGURO",
"quant.regime.mild": "✅ COMPRESIÓN LEVE",
"quant.regime.significant": "⚠ DEGRADACIÓN SIGNIFICATIVA",
"quant.regime.cliff": "❌ CLIFF FUERTE",
"quant.label.gamma_shift": "γ shift",
"quant.label.delta_ppl": "ΔPPL (est.)",
"quant.label.arch_mult": "Multiplicador arch",
"quant.section.breakdown": "Desglose",
"quant.section.reco": "Recomendación",
"quant.section.compare": "Todos los esquemas (ordenados por seguridad)",
"quant.field.scheme": "Esquema",
"quant.field.calibrated": "calibrado",
"quant.field.uncalibrated": "no calibrado",
"quant.field.base_penalty": "Penalización base",
"quant.field.arch_mult_full": "Multiplicador arquitectónico",
"quant.field.gamma_shift": "γ shift predicho",
"quant.field.ppl_band": "Banda ΔPPL (est.)",
"quant.field.params": "Parámetros",
"quant.col.scheme": "Esquema",
"quant.col.bits": "Bits",
"quant.col.gamma_shift": "γ shift",
"quant.col.ppl_band": "Banda ΔPPL",
"quant.col.regime": "Régimen",
"quant.reco.switch_to_awq": "Cambia a {scheme} — el 4-bit calibrado maneja d_head pequeño + GQA mucho mejor que NF4. ΔPPL esperada cae ~2-3×.",
"quant.reco.switch_to_q5_km": "Cambia a {scheme} — Q5 mantiene más dimensiones de head intactas a bajo coste (solo ~25% más grande).",
"quant.reco.switch_to_q4_km": "Cambia a {scheme} — Q3/Q2 son demasiado agresivos para esta arquitectura.",
"quant.reco.consider_awq": "Considera {scheme} — la calibración reduce γ-shift significativamente en esta arquitectura.",
"quant.reco.use_higher_bits": "Usa alternativa de mayor bit — esta arquitectura no absorbe 4-bit limpiamente. Prueba 5 u 8-bit.",
"quant.reco.verify_with_eval": "Verifica con eval real — el shift predicho está en el límite. Corre NIAH a tu contexto objetivo antes de desplegar.",
"quant.reco.no_action": "No requiere acción — la cuantización es segura para esta arquitectura.",
"quant.summary.headline_all": "Todos los esquemas para {modelId}",
"quant.status.empty_id": "⚠ Introduce un model id (ej. meta-llama/Llama-3.2-1B).",
"quant.status.fetching": "⏳ Obteniendo config.json para {modelId}...",
"quant.status.fetched": "✅ Config obtenido para {modelId}. Elige un esquema y click Predecir (o Comparar todos).",
"quant.status.no_scheme": "⚠ Elige un esquema de cuantización del dropdown.",
"quant.status.done": "✅ Régimen predicho: {regime}",
"quant.status.done_all": "✅ Comparados {n} esquemas — ordenados por seguridad.",
// v0.7.4 — autocomplete HF Hub: privacy + rate-limit
"hf_auto.privacy": "🔒 Queries enviadas a huggingface.co/api · caché local 5 min",
"hf_auto.rate_limited": "⚠ Rate limit de HuggingFace — espera un momento, o teclea el id completo manualmente",
"hf_auto.gated_msg": "es gated. Acepta la licencia aquí:",
// v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound
"modes.drift": "🔀 Drift",
"mode_desc.drift": "Predice el drift máximo permitido entre dos scores de benchmark dados (framework, dtype, batch, chat-template). Distingue bugs reales de ruido numérico.",
"drift.title": "🔀 Cota de drift entre frameworks",
"drift.tip": "Mismo modelo, scores distintos en setups distintos. ¿La diferencia es ruido o un bug real? Introduce dos scores con su (framework, dtype, batch, chat-template) — la herramienta predice el drift máximo permitido por ruido numérico solo. Si el gap observado lo excede → bug real, normalmente chat-template mismatch (issue #1841 de lm-eval) o layout de KV-cache.",
"drift.desc": "Tu modelo da 67.2 en lm-eval-hf y 65.1 en vLLM-served. ¿Bug o ruido? Introduce ambos scores con (framework, dtype, batch, ¿chat-template aplicado?). La herramienta predice la banda de ruido y flagea bugs reales. arxiv 2506.09501 documenta esto como problema mayor de reproducibilidad de evals.",
"drift.setup_a": "Setup A",
"drift.setup_b": "Setup B",
"drift.score": "Score",
"drift.framework": "Framework",
"drift.dtype": "Dtype",
"drift.batch": "Batch",
"drift.template": "Chat-template",
"drift.template.applied": "aplicado",
"drift.template.not_applied": "no aplicado",
"drift.template.unknown": "desconocido",
"drift.run_btn": "🔀 Calcular cota de drift",
"drift.sample_btn": "📊 Cargar sample (bug de chat-template)",
"drift.label.observed": "Gap observado",
"drift.label.band": "Banda numérica",
"drift.label.ratio": "Gap / banda",
"drift.section.setups": "Setups",
"drift.section.breakdown": "Contribuyentes al drift (banda numérica)",
"drift.section.verdict": "Veredicto y recomendación",
"drift.contrib.dtype": "Mismatch de dtype",
"drift.contrib.framework": "Framework",
"drift.contrib.batch": "Diferencia de batch",
"drift.contrib.template": "MISMATCH de chat-template",
"drift.dominant_cause": "Causa dominante",
"drift.cause.dtype": "diferencia de precisión dtype",
"drift.cause.framework": "diferencia de framework / kernel",
"drift.cause.batch": "paths de normalización por batch",
"drift.cause.template_mismatch": "chat-template aplicado en un lado pero no en el otro (patrón #1841 de lm-eval-harness — típico -50% en multi-turn)",
"drift.verdict.noise": "✅ RUIDO NUMÉRICO",
"drift.verdict.suspicious": "⚠ SOSPECHOSO — verifica",
"drift.verdict.bug": "❌ BUG REAL — investiga",
"drift.verdict.bug_template": "❌ BUG DE CHAT-TEMPLATE",
"drift.reco.noise": "El gap encaja en la banda esperada de ruido numérico. No requiere acción; la diferencia es consistente con variación de framework/dtype/batch sola.",
"drift.reco.suspicious": "El gap es 1–2× la banda predicha. Borderline — posible bug real. Intenta alinear el contribuyente dominante (ej. iguala framework o dtype) y re-testea.",
"drift.reco.bug": "El gap es > 2× la banda predicha. Es un bug real. Inspecciona el contribuyente dominante — probablemente diferencia de tokenizer / chat-template / layout de KV-cache. Corre lm-eval-harness con --apply_chat_template y confirma.",
"drift.reco.bug_template": "Mismatch de chat-template detectado. Es la causa más común de gaps grandes en evals (issue #1841 de lm-eval-harness). Re-corre el lado "no aplicado" con --apply_chat_template (o pon vLLM --chat-template <name>) y re-testea.",
"drift.status.empty_scores": "⚠ Introduce ambos scores.",
"drift.status.done": "✅ Veredicto: {verdict}",
"drift.status.sample_loaded": "✅ Sample cargado (bug canónico de chat-template). Click en Calcular cota de drift.",
// v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor
"modes.niah": "🔍 NIAH→Reason",
"mode_desc.niah": "Predice tasas de pass de NIAH (retrieval) y reasoning multi-hop a cualquier contexto. Resuelve: modelos long-context pasan NIAH pero fallan reasoning al mismo contexto (paper RULER).",
"modes.saturation": "📈 Saturación",
"mode_desc.saturation": "Te dice si un benchmark sigue discriminando frontier models o ya está saturado (ej. MMLU 88-94% top, AIME 2025 ya 96-100%). Devuelve top-3 + veredicto + reemplazos recomendados.",
"modes.hub": "🧭 Soluciones",
"mode_desc.hub": "Mapa de cada problema documentado de LLM-eval → mode tafagent (si cubierto) + herramientas externas curadas. Encuentra la solución sin reinventarla. 30+ pains, 7 categorías.",
"modes.yarn": "🧵 Planificador YaRN",
"mode_desc.yarn": "Genera la configuración rope_scaling exacta para extender un modelo más allá de su contexto entrenado — más un veredicto TAF sobre si la calidad de atención aguanta realmente a la longitud objetivo.",
"modes.gguf": "🧊 Puente GGUF",
"mode_desc.gguf": "Lee la cabecera de metadata de un archivo GGUF (rope_theta, context_length, quant) en tu navegador y obtén un veredicto de calidad TAF — la pregunta que los calculadores de VRAM ignoran: ¿cabe Y funciona?",
"gguf.title": "🧊 Puente de validez GGUF",
"gguf.tip": "Caber en VRAM ≠ funcionar. Los calculadores GGUF/VRAM leen la metadata de un modelo para decirte si un quant cabe en tu GPU. Esto lee la MISMA metadata (rope_theta, context_length, esquema de quant, geometría de cabezas) directamente de la cabecera .gguf vía HTTP Range — sin descargar GB — y responde lo que ellos no: ¿aguanta de verdad la calidad de atención, y cuánto la erosiona el quant (γ-shift, ΔPPL)?",
"gguf.desc": "Pega un repo GGUF (p.ej. Qwen/Qwen2.5-7B-Instruct-GGUF), elige un archivo de quant, y obtén un veredicto de calidad TAF: el horizonte de atención efectivo del modelo, más cuánto desplaza γ la cuantización elegida para esta arquitectura concreta. Solo lee la cabecera del archivo en tu navegador.",
"gguf.repo_label": "ID del repo GGUF:",
"gguf.list_btn": "📂 Listar archivos quant",
"gguf.file_label": "Archivo quant:",
"gguf.target_label": "Contexto objetivo L (opcional):",
"gguf.analyze_btn": "🧊 Analizar GGUF",
"gguf.all_btn": "📊 Comparar todos los quants",
"gguf.compare_title": "Todos los quants — comparación de calidad",
"gguf.col.verdict": "Veredicto",
"gguf.col.gamma_at_l": "γ @ L (tras quant)",
"gguf.need_repo": "Introduce un id de repo GGUF como 'Qwen/Qwen2.5-7B-Instruct-GGUF'",
"gguf.listing": "Listando archivos .gguf de HF Hub…",
"gguf.no_files": "No se encontraron archivos .gguf en ese repo.",
"gguf.found": "archivos quant encontrados",
"gguf.pick_hint": "elige uno y pulsa Analizar.",
"gguf.reading": "Leyendo cabecera GGUF vía HTTP Range…",
"gguf.read_ok": "Cabecera analizada",
"gguf.verdict.healthy": "SANO — el horizonte efectivo alcanza L con buen γ tras quant",
"gguf.verdict.usable_with_care":"USABLE CON CUIDADO — alcanza L pero γ es modesto tras quant",
"gguf.verdict.degrades": "DEGRADA — la atención colapsa antes de L (o el quant la empuja ahí)",
"gguf.r.arch": "Arquitectura",
"gguf.r.ctx_train": "Contexto entrenado",
"gguf.r.horizon_fp16": "Horizonte de atención (fp16)",
"gguf.r.quant": "Esquema de quant",
"gguf.r.gamma_shift": "γ-shift por quant",
"gguf.r.after_quant": "(tras quant)",
"gguf.r.eff_horizon": "Horizonte efectivo (cuantizado)",
"gguf.r.no_quant_shift": "— precisión completa, sin γ-shift",
"gguf.r.note": "Horizonte desde γ_Padé / d_horizon (arquitectura). γ-shift de quant + ΔPPL desde el modelo quant-regime (calibrado a PPL de llama.cpp + papers AWQ/GPTQ). Ambos son estimaciones — verifica los casos límite con un eval real.",
"gguf.err.not_gguf": "Ese archivo no es un GGUF válido (magic incorrecto).",
"gguf.err.too_large": "La cabecera de metadata supera el límite de descarga — tokenizer inusualmente grande. Prueba otro quant.",
"gguf.err.incomplete": "A la metadata GGUF le falta rope_theta o context_length — no se puede calcular el horizonte.",
"help.v091.gguf.title": "🧊 Puente de validez GGUF",
"help.v091.gguf.body": "La docena de calculadores GGUF/VRAM (NyxKrage, oobabooga, …) leen una cabecera .gguf para decirte si un quant cabe en tu GPU. Esto lee la misma cabecera — vía HTTP Range, sin descargar GB — y responde lo que ellos saltan: ¿cabe Y además funciona? Pega un repo GGUF, elige un archivo de quant; el puente extrae rope_theta, context_length, el esquema de quant (de general.file_type o del nombre del archivo), y la geometría de cabezas, luego corre γ_Padé / d_horizon de TAF más el γ-shift de quant consciente de arquitectura. Salida: horizonte de atención efectivo en el contexto entrenado, cuánto erosiona γ el quant (y ΔPPL) para este modelo, y un veredicto. Caso de uso: 'Q4_K_M cabe en 8GB — ¿pero se vuelve tonto pasado 30K?' → ve el horizonte y la penalización γ de Q4 antes de descargar 6 GB.",
"yarn.title": "🧵 Planificador de extensión de contexto YaRN / RoPE",
"yarn.tip": "Config + veredicto, no solo VRAM. Las calculadoras GGUF/VRAM te dicen si una longitud de contexto cabe en la GPU. Esto te da el bloque rope_scaling exacto para config.json Y si la calidad de atención aguantará realmente a esa longitud — con la maquinaria γ_Padé / d_horizon de TAF, todo en tu navegador.",
"yarn.desc": "¿Quieres usar un modelo más allá de su contexto entrenado? Introduce el modelo (o su θ + contexto entrenado) y tu longitud objetivo L. Obtén el fragmento rope_scaling listo para pegar (transformers ≥4.43), más un veredicto TAF: ¿llega el horizonte de atención efectivo a L, o el modelo alucinará pasado d_horizon?",
"yarn.model_label": "ID del modelo HF (opcional):",
"yarn.fetch_btn": "📥 Obtener config",
"yarn.orig_label": "Contexto entrenado (orig max_position_embeddings):",
"yarn.theta_label": "θ de RoPE (rope_theta):",
"yarn.target_label": "Contexto objetivo L:",
"yarn.type_label": "Método de escalado RoPE:",
"yarn.type_auto": "Auto (recomendado)",
"yarn.plan_btn": "🧵 Planificar extensión",
"yarn.need_id": "Introduce un id de modelo como 'Qwen/Qwen2.5-7B-Instruct'",
"yarn.fetching": "Obteniendo config.json de HF Hub…",
"yarn.loaded_hint": "Ajusta si hace falta, luego pulsa Planificar extensión.",
"yarn.verdict.healthy": "SANO — el horizonte efectivo alcanza L con buen γ",
"yarn.verdict.usable_with_care":"USABLE CON CUIDADO — funciona pero γ es modesto cerca de L",
"yarn.verdict.needs_finetune": "NECESITA FINE-TUNE — factor demasiado grande para solo forma cerrada",
"yarn.verdict.degrades": "DEGRADA — la atención colapsa antes de L",
"yarn.verdict.no_extension_needed":"NO HACE FALTA EXTENDER — L ya está dentro del contexto entrenado",
"yarn.r.factor": "Factor de extensión",
"yarn.r.method": "Método",
"yarn.r.naive": "(sin extensión)",
"yarn.r.eff": "(tras extensión)",
"yarn.r.from": "desde",
"yarn.r.snippet": "fragmento config.json",
"yarn.r.collapsed": "colapsado (pasado el polo de Padé)",
"yarn.copy_btn": "Copiar config",
"yarn.copied": "Copiado",
"yarn.warn.theta_eff_estimate":"θ_eff ≈ θ×factor es una estimación NTK de primer orden; la rampa por banda de YaRN puede diferir ligeramente.",
"yarn.warn.aggressive": "Factor agresivo > 4× — la calidad pasado d_horizon no es fiable sin fine-tuning.",
"yarn.warn.horizon_short": "El horizonte efectivo no cubre L — espera pérdida de coherencia pasado d_horizon.",
"yarn.warn.finetune": "La extensión RoPE aquí es una estimación de forma cerrada; los docs de transformers + el paper de YaRN recomiendan un fine-tune corto para factores más allá de ~2–4×.",
"yarn.err.no_orig": "Introduce el contexto entrenado (orig max_position_embeddings), u obtén un modelo.",
"yarn.err.no_theta": "Introduce θ de RoPE (rope_theta), u obtén un modelo.",
"yarn.err.no_target": "Introduce una longitud de contexto objetivo L.",
"help.v09.title": "🆕 v0.9 — Planificador de extensión de contexto YaRN / RoPE",
"help.v09.intro": "v0.9 (2026-05-23): la pregunta más frecuente de HuggingFace — \"¿cómo configuro rope_scaling para extender el contexto, y funcionará de verdad?\" — respondida con un fragmento de config para pegar Y un veredicto de calidad TAF. Solo navegador, sin inferencia.",
"help.v09.yarn.title": "🧵 Planificador de extensión de contexto YaRN / RoPE",
"help.v09.yarn.body": "La docena de calculadoras GGUF/VRAM en HF (NyxKrage, oobabooga, DavidAU, …) responden todas a lo mismo: ¿cabe la longitud de contexto L en mi GPU? Ninguna responde a la difícil: ¿cabe L Y además funciona? Introduce un id de modelo (o su θ + contexto entrenado) y una longitud objetivo L. El planificador calcula el factor de extensión, emite el bloque rope_scaling exacto para transformers ≥4.43 (yarn / linear / dynamic / llama3, con rampas β por defecto del paper), luego corre la matemática γ_Padé / d_horizon de TAF: γ sin extensión (el problema), γ tras el método elegido (la solución), el horizonte de atención efectivo, y un veredicto — SANO / USABLE-CON-CUIDADO / NECESITA-FINETUNE / DEGRADA. Marca con honestidad la estimación θ_eff≈θ·factor y el requisito de fine-tune para >4×. Caso de uso: 'Quiero Mistral-7B (θ=10k, 8k entrenado) a 32k' → ves γ colapsar con uso ingenuo, YaRN recuperarlo parcialmente, y obtienes la config exacta para pegar.",
"niah.title": "🔍 Gap NIAH → Reasoning",
"niah.tip": "NIAH (Needle in a Haystack) testea retrieval: 'encuentra este hecho en texto largo'. Reasoning multi-hop testea inferencia: 'combina hechos X+Y del principio con hecho Z del final'. El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH pero fallan reasoning al mismo contexto. Esta herramienta predice ambas tasas desde la arquitectura sola.",
"niah.desc": "Tu modelo dice 128k de contexto. ¿Razonará realmente a 64k, o solo encontrará? Pega un model id HF y un contexto objetivo — la herramienta predice tasas de pass NIAH y reasoning multi-hop, el gap, y un 'contexto seguro' donde reasoning se mantiene ≥65%.",
"niah.id_label": "ID modelo HF:",
"niah.fetch_btn": "📥 Fetch config",
"niah.teval_label": "Contexto objetivo (T_eval):",
"niah.run_btn": "🔍 Predecir",
"niah.sweep_btn": "📊 Barrer contextos",
"niah.label.niah": "Tasa pass NIAH",
"niah.label.reasoning": "Tasa pass Reasoning",
"niah.label.gap": "Gap",
"niah.label.safe_ctx": "Contexto seguro de reasoning",
"niah.section.breakdown": "Desglose arquitectónico",
"niah.section.reco": "Recomendación",
"niah.calib.heading": "Calibrado con RULER (datos publicados por NVIDIA)",
"niah.calib.matched": "Coincide {alias} → fila KB {canonical}.",
"niah.calib.aggregate": "Agregado RULER",
"niah.calib.interp": "interpolado entre",
"niah.calib.extrapolated": "extrapolado fuera del rango medido por RULER",
"niah.calib.col.heuristic": "Heurística",
"niah.calib.col.calibrated": "Calibrado RULER",
"niah.calib.col.delta": "Δ",
"niah.calib.factors": "Factores por tarea del paper RULER, Apéndice Tablas 13-16:",
"niah.calib.factors_caveat": "rango honesto: retrieval 0.95-1.10×, reasoning 0.60-0.85×",
"niah.calib.claimed_vs_effective": "Reportado en paper",
"niah.calib.claimed": "claimed",
"niah.calib.effective": "effective",
"niah.calib.source": "Fuente",
"niah.calib.miss": "Calibración RULER no disponible para este modelo — usando solo heurística arquitectónica. Añade a data/ruler_kb.json si tienes números medidos.",
"niah.section.sweep": "Barrido de tasas pass por longitud de contexto",
"niah.field.dhorizon": "d_horizon (efectivo)",
"niah.field.ratio": "T_eval / d_horizon",
"niah.field.arch_pressure": "Presión arq (d_head pequeño + GQA + SWA)",
"niah.field.theta": "RoPE θ",
"niah.field.t_train": "T_train (declarado)",
"niah.col.context": "T_eval",
"niah.col.niah": "NIAH",
"niah.col.reasoning": "Reasoning",
"niah.col.gap": "Gap",
"niah.col.verdict": "Veredicto",
"niah.verdict.robust": "✅ ROBUSTO",
"niah.verdict.marginal": "⚠ MARGINAL",
"niah.verdict.degraded": "⚠ DEGRADADO",
"niah.verdict.retrieval_only": "❌ SOLO RETRIEVAL",
"niah.verdict.broken": "❌ ROTO",
"niah.reco.robust": "Tanto retrieval como reasoning aguantan a este contexto. Seguro para desplegar tareas de lookup e inferencia.",
"niah.reco.marginal": "Borderline. Retrieval funciona pero reasoning está flojo. Úsalo para lookup, no para inferencia multi-paso.",
"niah.reco.degraded": "Caída significativa de reasoning. El modelo encuentra hechos pero le cuesta combinarlos. Evita tareas multi-hop a esta longitud.",
"niah.reco.retrieval_only": "Hallazgo canónico de RULER: el modelo pasa NIAH pero falla reasoning. Útil para setups RAG (donde el LLM solo localiza hechos) pero NO para inferencia encadenada. Reduce tu contexto al valor 'seguro' de abajo.",
"niah.reco.broken": "El modelo falla incluso retrieval básico a este contexto. Trátalo como out-of-distribution — re-testea a contexto más corto.",
"niah.safe_context": "≤ {ctx} tokens (reasoning ≥ 65%)",
"niah.safe_context_none": "No se encontró contexto seguro bajo tu objetivo — el modelo falla reasoning incluso a contextos pequeños.",
"niah.summary.sweep": "{modelId} — tasas pass por contexto",
"niah.status.empty_id": "⚠ Introduce un model id (ej. meta-llama/Llama-3.1-8B-Instruct).",
"niah.status.bad_teval": "⚠ Introduce un contexto objetivo (≥ 512 tokens).",
"niah.status.fetching": "⏳ Obteniendo config.json para {modelId}...",
"niah.status.fetched": "✅ Config obtenido para {modelId}. Pon T_eval y click Predecir (o Barrer contextos).",
"niah.status.done": "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%",
"niah.status.sweep_done": "✅ Barridos {n} largos de contexto.",
"saturation.title": "📈 Detector de saturación de benchmarks",
"saturation.tip": "MMLU está saturado (88-94% en todos los frontier). Reportar '92% en MMLU' ya no significa nada. Esta herramienta te dice qué benchmarks aún discriminan frontier models, cuáles están saturados, y qué usar en su lugar. Datos: DemandSphere AI Frontier Tracker (CC BY-NC 4.0) refrescado 2026-05.",
"saturation.desc": "¿Sigue siendo útil tu benchmark? Elige un benchmark para ver top-3 frontier scores, spread, y un veredicto (saturated / near-saturated / discriminative) + reemplazos recomendados.",
"saturation.select_label": "Benchmark:",
"saturation.select.all": "— mostrar todos los benchmarks —",
"saturation.run_btn": "📈 Clasificar",
"saturation.all_btn": "📊 Mostrar todos",
"saturation.col.spread": "Spread top-3",
"saturation.col.mean": "Media top-3",
"saturation.col.n": "Modelos",
"saturation.col.bench": "Benchmark",
"saturation.col.verdict": "Veredicto",
"saturation.col.reco": "Mejor reco",
"saturation.col.model": "Modelo",
"saturation.col.score": "Score",
"saturation.section.top3": "Top-3 frontier scores",
"saturation.section.recommendations": "Alternativas recomendadas",
"saturation.section.note": "Notas",
"saturation.section.all": "Todos los benchmarks rastreados",
"saturation.verdict.saturated": "🚨 SATURADO",
"saturation.verdict.near_saturated": "⚠ CASI SATURADO",
"saturation.verdict.discriminative": "✅ DISCRIMINATIVO",
"saturation.verdict.sparse_data": "ℹ DATOS ESCASOS",
"saturation.borderline": "Borderline — dentro de ±1pp de un umbral. Trata el veredicto como 'verifica con cuidado'.",
"saturation.unknown": "Benchmark desconocido.",
"saturation.attribution": "Datos: DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (histórico open-weight) · último fetch 2026-05-05.",
"saturation.status.live": "✅ Datos en vivo cargados — {count} modelos.",
"saturation.status.baked": "ℹ Usando snapshot baked (fetch en vivo no disponible).",
"saturation.status.kb_fail": "⚠ No se pudo cargar el KB de saturación.",
"saturation.status.done": "✅ {name} — {verdict}",
"saturation.status.all_done": "✅ Clasificados {n} benchmarks.",
"help.v08.saturation.title": "📈 Detector de saturación de benchmarks",
"help.v08.saturation.body": "MMLU está saturado (top 88-94%), AIME 2025 saturó a los pocos meses de salir, HumanEval near-saturated. Elige cualquier benchmark y la herramienta retorna top-3 frontier scores, spread, media, y un veredicto — saturated / near-saturated / discriminative — más un reemplazo recomendado (ej. MMLU → MMLU-Pro / GPQA / HLE). Fetch en vivo desde DemandSphere AI Frontier Tracker (CC BY-NC 4.0) cuando llega; snapshot baked 2026-05-05 cuando no. Caso de uso: antes de citar '92% en MMLU' o diseñar una eval, verifica si el benchmark aún discrimina algo.",
"inv.v08.saturation": "📈 Saturation — ¿sigue siendo útil tu benchmark, o están todos los frontiers empatados arriba?",
// v0.8.2 — anti-bullshit pack #8: JSON CoT-aware Linter
"modes.cot": "📋 JSON CoT",
"mode_desc.cot": "Lintea un JSON Schema (o ejemplo de respuesta) buscando el anti-patrón respuesta-antes-de-razonamiento. Los motores de constrained decoding emiten campos en el orden del schema — si `answer` va antes que `reasoning`, el CoT se rompe.",
"cot.title": "📋 Linter JSON con consciencia CoT",
"cot.tip": "Los motores de constrained decoding (llguidance, Outlines, gramáticas SGLang) emiten propiedades JSON en el orden del schema. Si tu schema pone `answer` antes de `reasoning`, el modelo se compromete con la respuesta final primero y solo después escribe el razonamiento para justificarla — rompiendo Chain-of-Thought por completo. Pega un JSON Schema (o objeto de ejemplo) y el linter señala el ordenamiento.",
"cot.desc": "Razonamiento antes que respuesta, siempre. Pega un JSON Schema o un objeto de respuesta de ejemplo — el linter dice si los campos de razonamiento van antes que los de respuesta y propone una corrección.",
"cot.input.placeholder": "{ \"type\": \"object\", \"properties\": { \"answer\": {\"type\": \"string\"}, \"reasoning\": {\"type\": \"string\"} } }",
"cot.lint_btn": "🔍 Lintear",
"cot.example_good_btn": "↳ Ejemplo: orden correcto",
"cot.example_bad_btn": "↳ Ejemplo: anti-patrón",
"cot.status.done": "✅ {verdict}",
"cot.col.field": "Campo",
"cot.col.type": "Rol",
"cot.field.reasoning": "razonamiento",
"cot.field.answer": "respuesta",
"cot.field.other": "—",
"cot.field_count": "{n} campos",
"cot.verdict.good_order": "✅ Orden correcto — razonamiento antes que respuesta",
"cot.verdict.anti_pattern": "❌ Anti-patrón — respuesta antes que razonamiento",
"cot.verdict.missing_reasoning": "⚠ Falta campo de razonamiento",
"cot.verdict.missing_answer": "ℹ No se detecta campo tipo respuesta",
"cot.verdict.no_cot_fields": "ℹ Sin campos de razonamiento/respuesta detectados",
"cot.verdict.invalid_json": "❌ JSON inválido",
"cot.verdict.non_object": "ℹ El valor superior no es un objeto",
"cot.verdict.empty_fields": "ℹ Sin campos para analizar",
"cot.explain.good_order": "El constrained decoding emitirá el razonamiento primero, así el modelo puede pensar antes de comprometerse. Chain-of-Thought se mantiene honesto.",
"cot.explain.anti_pattern": "El modelo se ve forzado a emitir el campo de respuesta primero; cualquier razonamiento posterior solo justifica lo ya comprometido. Reordena para que los campos tipo razonamiento vayan antes que los tipo respuesta.",
"cot.explain.missing_reasoning": "Hay un campo de respuesta pero ningún campo de razonamiento. Si quieres CoT, añade un campo `reasoning` (o `chain_of_thought`, `analysis`, …) antes de la respuesta.",
"cot.explain.missing_answer": "Hay un campo de razonamiento pero ningún campo de respuesta evidente. Asegúrate de que el schema realmente exija al modelo comprometer un valor final.",
"cot.explain.no_cot_fields": "El objeto tiene campos pero ninguno se ve como razonamiento o respuesta por su nombre. El linter es conservador — si el schema es intencional, ignóralo. Si no, añade campos explícitos de razonamiento/respuesta.",
"cot.hint.non_object": "El valor de nivel superior debe ser un objeto JSON (`{ … }`) o un JSON Schema con `properties`.",
"cot.hint.empty_fields": "Sin campos detectados. Pega un JSON Schema, una respuesta de ejemplo, o pulsa un botón de ejemplo bajo el textarea.",
"cot.suggested_fix.title": "✓ Corrección sugerida",
"cot.suggested_fix.desc": "Propiedades reordenadas — campos de razonamiento primero, luego cualquier campo de contexto, luego los de respuesta. `required[]` (si existe) se reordena igual.",
"cot.suggested_fix.copy": "📋 Copiar",
"cot.suggested_fix.copied": "✓ Copiado",
"cot.attribution": "Referencias:",
"inv.v082.cot": "📋 JSON CoT — lintea schemas de structured outputs buscando el anti-patrón respuesta-antes-de-razonamiento que silenciosamente rompe Chain-of-Thought.",
"help.v082.cot.title": "📋 Linter JSON con consciencia CoT",
"help.v082.cot.body": "Los motores de constrained decoding (llguidance, Outlines, gramáticas SGLang) emiten propiedades JSON en el orden que declara tu schema. Si escribes { answer, reasoning } el modelo se compromete con answer primero y el CoT se reduce a justificación post-hoc. Pega cualquier schema (o respuesta de ejemplo) — el linter clasifica cada campo como razonamiento, respuesta u otro, señala el ordenamiento, y emite una corrección reordenada para copiar de vuelta. Caso de uso: 'Mi prompt CoT funciona en texto pero degrada en modo JSON' → ejecuta linter, encuentra el orden invertido, corrige.",
// v0.8.3 — anti-bullshit pack #9: PEFT Anti-Pattern Checker
"modes.peft": "🔧 PEFT Lint",
"mode_desc.peft": "Linter estático para scripts de entrenamiento PEFT/LoRA. Detecta carga silenciosa del modelo base (peft #2115), orden de prepare_model_for_kbit_training/get_peft_model en QLoRA, mismatch de target_modules/arch, y conveniones de lora_alpha.",
"peft.title": "🔧 Verificador de anti-patrones PEFT",
"peft.tip": "get_peft_model(base, config) crea un adapter NUEVO — NO carga pesos guardados. Quien quiera reanudar desde un checkpoint debe llamar PeftModel.from_pretrained(base, path). peft #2115 documenta el bug de carga silenciosa del modelo base. Este linter escanea tu script en busca de ese patrón (y otros 3: orden QLoRA, mismatch target_modules/arch, ratio lora_alpha).",
"peft.desc": "No quemes 10 horas de entrenamiento sobre un modelo base. Pega tu código de setup PEFT — el linter señala cargas silenciosas del base, bugs de orden QLoRA, mismatches target_modules/arch, y conveniones lora_alpha.",
"peft.input.placeholder": "from peft import LoraConfig, get_peft_model …",
"peft.lint_btn": "🔍 Lintear",
"peft.example_bug_btn": "↳ Ejemplo: carga silenciosa del base",
"peft.example_qlora_btn": "↳ Ejemplo: bug de orden QLoRA",
"peft.example_clean_btn": "↳ Ejemplo: limpio",
"peft.status.done": "✅ {verdict} — {n} hallazgo(s)",
"peft.line": "línea {n}",
"peft.summary": "{total} hallazgo(s)",
"peft.attribution": "Referencias:",
"peft.detected_at_line": "aparece en la línea",
"peft.suggested_fix": "Sugerencia:",
"peft.detected_arch": "Arch detectada",
"peft.from_model_id": "(desde model id",
"peft.your_modules": "Tus target_modules",
"peft.expected_modules": "Esperados para esta arch",
"peft.match_ratio": "{hits} de {total} coinciden.",
"peft.ratio": "ratio",
"peft.alpha.convention": "la convención es α=2r o α=r",
"peft.qlora_order.detail": "prepare_model_for_kbit_training (línea {prepare_line}) corre DESPUÉS de get_peft_model (línea {get_peft_model_line}). Invierte el orden — llama prepare PRIMERO, luego get_peft_model.",
"peft.no_peft_calls.detail": "No se detectan llamadas a get_peft_model / PeftModel.from_pretrained / LoraConfig. Pega un snippet de setup PEFT/LoRA.",
"peft.verdict.errors_found": "❌ Errores encontrados",
"peft.verdict.warnings_only": "⚠ Avisos",
"peft.verdict.info_only": "ℹ Info",
"peft.verdict.clean": "✅ Limpio — sin issues detectados",
"peft.verdict.no_peft_calls": "ℹ Sin llamadas PEFT detectadas",
"peft.verdict.empty_input": "ℹ Entrada vacía",
"peft.rule.silent_base_load.label": "Carga silenciosa del modelo base (peft #2115)",
"peft.rule.silent_base_load.explain": "get_peft_model(base, config) crea un adapter NUEVO — NO carga pesos guardados. La pista de checkpoint en tu código sugiere que quieres REANUDAR el entrenamiento desde un adapter guardado, pero esta ruta arrancará silenciosamente desde cero y sobrescribirá la corrida.",
"peft.rule.silent_base_load.fix": "Reemplaza get_peft_model(base, config) por PeftModel.from_pretrained(base, path) al reanudar. Verifica con model.get_layer_status() tras cargar.",
"peft.rule.qlora_order.label": "Bug de orden QLoRA",
"peft.rule.qlora_order.explain": "prepare_model_for_kbit_training debe llamarse ANTES de get_peft_model. Invertido, la prep kbit no aplica a las capas LoRA y el cálculo del gradiente se rompe (loss → NaN, o entrenamiento silencioso de nada).",
"peft.rule.qlora_order.fix": "Reordena: base = prepare_model_for_kbit_training(base) luego model = get_peft_model(base, config).",
"peft.rule.target_modules_mismatch.label": "Mismatch target_modules / arch",
"peft.rule.target_modules_mismatch.explain": "Tu lista target_modules no coincide con los nombres convencionales para la arquitectura detectada en tu código. PEFT aplicará LoRA silenciosamente a nada (o a las capas equivocadas).",
"peft.rule.target_modules_mismatch.fix": "Verifica los nombres con print([n for n,_ in model.named_modules()]) sobre el modelo base cargado, o usa la lista específica de la arch mostrada arriba.",
"peft.rule.alpha_not_2r.label": "lora_alpha ≠ 2r (convención)",
"peft.rule.alpha_not_2r.explain": "La mayoría de recetas LoRA publicadas usan o α = 2r (escala efectiva unitaria) o α = r (LR efectivo reducido). Un ratio custom funciona pero merece una verificación.",
"peft.rule.alpha_not_2r.fix": "Verifica el ratio contra tu receta de referencia. Si es intencional, ignora este hallazgo.",
"peft.rule.no_peft_calls.label": "Sin llamadas PEFT detectadas",
"inv.v083.peft": "🔧 PEFT Lint — detecta la carga silenciosa de get_peft_model sobre el base (peft #2115) + orden QLoRA + mismatch target_modules / arch.",
"help.v083.peft.title": "🔧 Verificador de anti-patrones PEFT",
"help.v083.peft.body": "El get_peft_model(base, config) de PEFT crea un adapter NUEVO — no carga pesos guardados desde una ruta. Quien pega código de tutorial e intenta reanudar desde un checkpoint tira silenciosamente su entrenamiento. peft #2115 tiene el bug report canónico. Este linter escanea tu script buscando el patrón + 3 issues relacionados (orden QLoRA, mismatch target_modules/arch, ratio lora_alpha) y reporta hallazgos con números de línea y sugerencias. Caso de uso: antes de lanzar un fine-tune LoRA de 10 horas, pega tu script — atrapa los bugs silenciosos en 200ms.",
// v0.8.4 — anti-bullshit pack #10: Prompt-Cache Diff Predictor
"modes.cache": "🔁 Cache Diff",
"mode_desc.cache": "Predice si una edición del prompt mantuvo viva la prompt cache del proveedor o la invalidó. Hit ratio por proveedor + delta $ vs sin caché.",
"cache.title": "🔁 Predictor de Diff de Prompt-Cache",
"cache.tip": "El cache_control de Anthropic se rompe al primer token diferente del prefijo marcado. OpenAI auto-cachea prefijos ≥1024 tokens pero invalida ante cualquier cambio. La context cache de Gemini requiere ≥32K tokens. Una edición mal puesta silenciosamente 10x tu factura — y la API nunca avisa. Pega prompt viejo + nuevo, ve el hit ratio por proveedor + delta de coste.",
"cache.desc": "No 10x tu factura por un edit de un carácter. Pega tu prompt anterior y el actual — el predictor halla el prefijo común más largo, estima tokens, y muestra hit ratio por proveedor + delta $ vs sin caché.",
"cache.old_label": "Prompt viejo:",
"cache.new_label": "Prompt nuevo:",
"cache.old.placeholder": "Eres un asistente útil. …",
"cache.new.placeholder": "Eres un asistente útil. …",
"cache.profile_label": "Perfil de tokenizer:",
"cache.profile.english": "Inglés (chars/4)",
"cache.profile.code": "Código (chars/3.5)",
"cache.profile.mixed": "CJK / Cirílico (chars/2)",
"cache.output_label": "Tokens de salida estimados:",
"cache.diff_btn": "🔍 Predecir",
"cache.example_good_btn": "↳ Ejemplo: hit 99%",
"cache.example_broken_btn": "↳ Ejemplo: caché rota",
"cache.example_belowmin_btn": "↳ Ejemplo: bajo mínimo OpenAI",
"cache.status.done": "✅ {verdict} — {hit}% hit teórico",
"cache.verdict.identical": "✅ Idénticos — hit completo",
"cache.verdict.divergent_can_cache":"⚠ Hit parcial — varía por proveedor",
"cache.verdict.divergent_below_min":"❌ Por debajo de mínimos — no hay caché posible",
"cache.verdict.fully_divergent": "❌ Totalmente divergentes — caché invalidada",
"cache.verdict.empty_input": "ℹ Entrada vacía",
"cache.summary.tokens": "Prefijo común {common} / {total} tokens ({pct}% hit ratio teórico).",
"cache.summary.diff_at": "Primera diferencia en la línea {line}.",
"cache.col.provider": "Proveedor",
"cache.col.hit": "Hit",
"cache.col.cost": "Base → cached",
"cache.col.savings": "Ahorro",
"cache.note.requires_marker": "(requiere marcador cache_control)",
"cache.note.below_min": "(prefijo < {min} tokens — mínimo del proveedor)",
"cache.write_surcharge": "+ {cost} sobrecargo de cache-write la primera vez (Anthropic)",
"cache.diff.title": "Dónde se rompe la caché",
"cache.diff.legend": "Verde = prefijo compartido (cacheable). Rojo = primera edición (todo desde aquí se re-factura).",
"cache.hint.empty": "Pega dos prompts, luego Predecir.",
"cache.attribution": "Referencias:",
"cache.attribution.snapshot": "Precios snapshot 2026-01; verifica con la doc actual del proveedor antes de actuar sobre $.",
"inv.v084.cache": "🔁 Cache Diff — predice si un edit del prompt invalidó la prompt cache del proveedor. Hit ratio por proveedor + delta $.",
"help.v084.cache.title": "🔁 Predictor de Diff de Prompt-Cache",
"help.v084.cache.body": "Las prompt caches de cada proveedor tienen reglas distintas: el cache_control de Anthropic se rompe al primer token diferente del prefijo marcado; OpenAI auto-cachea prefijos ≥1024 tokens; las context caches de Gemini requieren ≥32K tokens. Una edición mal puesta silenciosamente 10x tu factura — la API no avisa, y el coste solo aparece en la siguiente factura. Pega prompt viejo + nuevo, el predictor halla el prefijo común más largo, estima tokens con tres perfiles de tokenizer (inglés / código / CJK), y muestra hit ratio por proveedor + delta $ vs sin caché para Claude Opus/Sonnet/Haiku, GPT-5/mini, y Gemini 2.5 Pro. Caso de uso: 'Tweaké el system prompt y la factura saltó — ¿qué se rompió?' → pega ambos prompts, ve exactamente qué proveedor dejó de cachear.",
// v0.8.5 — anti-bullshit pack #11: Speculative-Decode Compatibility
"modes.speculative": "🔬 Spec-Decode",
"mode_desc.speculative": "Hace fetch del `tokenizer.json` desde HF Hub para dos model ids y verifica compatibilidad de vocab antes de cablear speculative decoding. Atrapa el bug de mismatch silencioso que desperdicia compute del draft.",
"speculative.title": "🔬 Compatibilidad de Speculative-Decode",
"speculative.tip": "El speculative decoding (vLLM, SGLang, llama.cpp, transformers) requiere que draft y target compartan vocabulario EXACTO. Cualquier desacuerdo de token-id hace que el target rechace cada token del draft — pagas AMBOS computes y obtienes PEOR throughput que baseline. El sistema reporta output nominal (solo más lento), así que el bug es invisible en tests unitarios. Esta tool hace fetch de `tokenizer.json` desde HF Hub para ambos ids y compara.",
"speculative.desc": "No envíes spec-dec con vocabs mismatched. Pega target + draft model ids → tool hace fetch de tokenizers, compara tipo de vocab, tamaño, token-ids muestreados, special tokens, added tokens → veredicto + estimación de speedup.",
"speculative.target_label": "Model id del target (grande):",
"speculative.draft_label": "Model id del draft (pequeño):",
"speculative.target_label_short": "target",
"speculative.draft_label_short": "draft",
"speculative.check_btn": "🔍 Verificar compatibilidad",
"speculative.example_good_btn":"↳ Ejemplo: Llama-3.1 8B/70B (gated → mirror)",
"speculative.example_bad_btn": "↳ Ejemplo: cross-family (malo)",
"speculative.gated_note": "💡 Modelos gated (Llama, Mistral, Gemma) disparan un fallback automático a mirror open (unsloth/...). HF desaconseja oficialmente tokens en browser, así que la tool no puede autenticar — pero los tokenizers de mirrors son típicamente byte-idénticos porque la cuantización toca weights, no el artefacto del tokenizer.",
"speculative.mirror.heading": "Fallback a open-mirror",
"speculative.mirror.target_used": "Target {original} estaba gated; se usó mirror {mirror}.",
"speculative.mirror.draft_used": "Draft {original} estaba gated; se usó mirror {mirror}.",
"speculative.mirror.warn": "Los tokenizers de mirror (ej. unsloth/) suelen ser byte-idénticos al original gated porque la cuantización toca weights, no tokens. Verifica chat-template si necesitas match exacto (unsloth #880 documenta drift ocasional).",
"speculative.status.fetching": "🔄 Haciendo fetch de tokenizer.json desde HF Hub para ambos modelos…",
"speculative.status.done": "✅ {verdict}",
"speculative.status.error": "❌ Error",
"speculative.type_mismatch_note": "tipos de tokenizer difieren; spec-dec imposible",
"speculative.vocab_size": "Tamaño del vocab",
"speculative.size_diff": "difieren — cada id reusado es un mismatch",
"speculative.sampled": "Match de token-id muestreado",
"speculative.first_mismatch": "Primer mismatch",
"speculative.special_diff": "Diferencias de special tokens",
"speculative.added_diff": "Diferencias de added tokens",
"speculative.added_diff_more": "+ más …",
"speculative.speedup.title": "Banda estimada de speedup",
"speculative.speedup.params": "target {target} / draft {draft} (ratio de params {ratio})",
"speculative.speedup.low": "Bajo (α=0.50)",
"speculative.speedup.expected":"Esperado (α=0.70)",
"speculative.speedup.high": "Alto (α=0.85)",
"speculative.speedup.disclaimer": "α = tasa de aceptación del draft. El speedup real depende del dominio del prompt, lookahead K, y overhead del engine. Las bandas asumen verifier batching ideal.",
"speculative.speedup.draft_not_smaller": "El draft no es más pequeño que el target — spec-dec es mal uso aquí.",
"speculative.attribution": "Referencias:",
"speculative.side.target": "Target",
"speculative.side.draft": "Draft",
"speculative.fetch_error.missing_model_id": "falta el model id",
"speculative.fetch_error.gated_or_private": "modelo es gated o privado — no se puede hacer fetch del tokenizer sin auth",
"speculative.fetch_error.not_found": "model id no encontrado en HF Hub",
"speculative.fetch_error.fetch_failed": "fetch falló (error HTTP)",
"speculative.fetch_error.parse_failed": "parse JSON falló (archivo malformado)",
"speculative.fetch_error.timeout": "timeout (>15s, tokenizer grande o conexión lenta)",
"speculative.fetch_error.network": "error de red",
"speculative.fetch_error.hint": "Verifica el spelling del model id. Para modelos gated necesitas ver el tokenizer vía tu cuenta HF — esta tool no puede autenticar.",
"speculative.hint.missing_input": "Ingresa ambos model ids (target y draft), luego Verificar.",
"speculative.hint.identical_models": "Target y draft son el mismo modelo — spec-dec es un no-op (y desperdicio).",
"speculative.verdict.compatible": "✅ Compatible — vocabs coinciden",
"speculative.verdict.compatible_with_caveats": "✅ Compatible — pero special/added tokens difieren (revisar)",
"speculative.verdict.partial_compatible": "⚠ Match parcial (95-99.9% de los ids muestreados)",
"speculative.verdict.type_mismatch": "❌ Tipos de tokenizer difieren — spec-dec imposible",
"speculative.verdict.vocab_size_mismatch": "❌ Tamaños de vocab difieren — espacio de id desalineado",
"speculative.verdict.incompatible": "❌ Incompatibles — demasiados mismatches de id",
"speculative.verdict.fetch_failed": "ℹ No se pudo hacer fetch del tokenizer",
"speculative.verdict.identical_models": "ℹ Modelos idénticos — spec-dec es un no-op",
"speculative.verdict.missing_input": "ℹ Ingresa ambos ids",
"inv.v085.speculative": "🔬 Spec-Decode — verifica compatibilidad de vocab del tokenizer entre target + draft antes de enviar speculative decoding (el bug que da PEOR throughput silenciosamente).",
"help.v085.speculative.title": "🔬 Compatibilidad de Speculative-Decode",
"help.v085.speculative.body": "El speculative decoding solo funciona si target y draft comparten exactamente el mismo vocabulario. Vocabs mismatched hacen que cada token del draft sea rechazado — pagas AMBOS computes y obtienes peor throughput que baseline. Peor: el sistema sigue emitiendo output correcto (solo más lento), así que el bug es invisible en tests unitarios. vLLM #4570 / #16757 / #20409 / #12488 surfacen variantes. Esta tool hace fetch de `tokenizer.json` desde HF Hub para ambos ids, compara tipo de tokenizer, tamaño de vocab, mapa completo token→id, special tokens, y added tokens, luego estima una banda de speedup basada en ratio de params y tasas típicas α=0.5/0.7/0.85 de aceptación. Caso de uso: antes de lanzar un cluster vLLM con spec-dec habilitado, verifica que el par sea compatible.",
// v0.8.7 — anti-bullshit pack #13: Multilingual Tokenizer Tax
"modes.tax": "🌍 Token Tax",
"mode_desc.tax": "BPE real (transformers.js en browser) sobre texto pegado a través de 6 tokenizers de vendor. Surface la asimetría de coste silenciosa entre idiomas.",
"tax.title": "🌍 Impuesto de Tokenizer Multilingüe",
"tax.tip": "Los tokenizers gravan el texto no-inglés de forma asimétrica. El mismo párrafo puede ser 100 tokens en inglés pero 250+ en chino en un tokenizer entrenado en Latin (Llama, Phi). Coste por request Y contexto efectivo degradan silenciosamente. Pega tu texto, ve token counts reales a través de tokenizers de vendor — sin estimación, BPE real vía transformers.js en tu navegador.",
"tax.desc": "No 3× tu factura en soporte chino. Pega cualquier texto → BPE real por-tokenizer a través de Qwen / Phi / Llama / Gemma / GPT-4 / Claude → ve la asimetría de coste vs tu baseline.",
"tax.input_label": "Texto a tokenizar:",
"tax.input.placeholder": "Pega cualquier texto — inglés, chino, árabe, código, …",
"tax.tokenize_btn": "🔬 Tokenizar todos",
"tax.sample_en_btn": "↳ Ejemplo: English",
"tax.sample_zh_btn": "↳ Ejemplo: 中文",
"tax.sample_ar_btn": "↳ Ejemplo: عربى",
"tax.sample_mixed_btn": "↳ Ejemplo: mixto",
"tax.sample_code_btn": "↳ Ejemplo: código",
"tax.status.loading": "⏳ Cargando transformers.js + tokenizers (primera ejecución puede tardar 5-15s)…",
"tax.status.done": "✅ {n}/{total} tokenizers en {ms}ms",
"tax.col.tokenizer": "Tokenizer",
"tax.col.tokens": "Tokens",
"tax.col.cpt": "Chars/tok",
"tax.col.ratio": "Ratio",
"tax.summary.input": "Entrada: {chars} caracteres, {bytes} bytes",
"tax.script_breakdown": "scripts",
"tax.interp.worst": "{label} cuesta {pct}% más tokens que baseline para este texto.",
"tax.interp.uniform": "✓ Todos los tokenizers dentro de ±5% — texto bien manejado entre vendors.",
"tax.hint.empty": "Pega texto y haz click en Tokenizar.",
"tax.all_failed": "Todos los tokenizers fallaron.",
"tax.error.gated": "modelo gated (auth HF requerida — prueba mirror open)",
"tax.error.not_found": "model id no encontrado",
"tax.error.timeout": "timeout (tokenizer grande o conexión lenta)",
"tax.error.network": "error de red",
"tax.error.fetch_failed": "fetch falló",
"tax.error.invalid_input": "entrada inválida",
"tax.attribution": "Tokenizers vía",
"tax.attribution.privacy": "El texto se tokeniza localmente — nunca sale del navegador.",
"tax.firstload_note": "💡 Primera carga: la tool descarga transformers.js (~750 KB) + el vocab de cada tokenizer bajo demanda (~5-15 MB por tokenizer, cacheados después). Ejecuciones siguientes son instantáneas. Todo el procesamiento es local — tu texto nunca sale del navegador.",
"inv.v087.tax": "🌍 Token Tax — BPE real sobre 6 tokenizers de vendor. Surface la asimetría de coste silenciosa entre idiomas (CJK / árabe / mixto).",
"help.v087.tax.title": "🌍 Impuesto de Tokenizer Multilingüe",
"help.v087.tax.body": "Los tokenizers gravan el texto no-inglés de forma asimétrica. El mismo párrafo puede ser 100 tokens en inglés pero 250+ en chino en un tokenizer entrenado en Latin (Llama, Phi). Tanto coste-por-request COMO contexto efectivo degradan silenciosamente. Esta tool carga HuggingFace transformers.js en tu navegador (~750 KB CDN) y tokeniza el texto pegado contra 6 tokenizers preset de vendor (Qwen2.5, Phi-3.5, Llama-3.1, Gemma-2, GPT-4 cl100k, Claude aprox). Output: token count por tokenizer + chars-per-token + ratio vs baseline + interpretación de asimetría. Auto-detecta bloques de script (Latin / CJK / árabe / cirílico / devanagari / tailandés / griego / hebreo / coreano) para que veas por qué un tokenizer es 3× otro. Caso de uso: 'Mi soporte multilingüe añadió 30% a la factura — ¿qué idioma cuesta más?' → pega texto real de producción, ve breakdown exacto por tokenizer.",
// v0.8.8 — anti-bullshit pack #14: LongScore (RULER + HELMET lookup)
"modes.longscore": "🎯 LongScore",
"mode_desc.longscore": "Consulta la degradación relativa de tu modelo más allá del contexto corto. KBs RULER + HELMET (n=93 modelos). Métrica LongScore de 100-LongBench (ACL 2025).",
"longscore.title": "🎯 LongScore",
"longscore.tip": "Cada modelo dice tener ventana de 128K, pero la accuracy degrada mucho antes. LongScore (métrica peer-reviewed de 100-LongBench, ACL 2025) mide la degradación relativa más allá del contexto corto. Separa la base ability de la capacidad real long-ctx — comparas degradación, no scores brutos. Lookup contra KBs RULER + HELMET (n=93 modelos).",
"longscore.desc": "¿Cuánto degrada tu modelo más allá del contexto corto? Pega un id de modelo HF → ve LongScore (degradación relativa) + breakdown por longitud + scores HELMET 7-task cuando estén disponibles. Sin GPU. Sin inferencia. Lookup puro contra benchmarks publicados.",
"longscore.input_label": "Id del modelo:",
"longscore.input.placeholder": "ej. Qwen2.5-72B-Instruct o meta-llama/Llama-3.1-70B-Instruct",
"longscore.lookup_btn": "🔎 Buscar",
"longscore.example_good_btn": "↳ Ejemplo: Jamba-1.5-Large (sin degradación)",
"longscore.example_mid_btn": "↳ Ejemplo: Llama-3.1-70B (moderado)",
"longscore.example_bad_btn": "↳ Ejemplo: dbrx (severo)",
"longscore.formula_note": "💡 LongScore = media sobre l ∈ {16K, 32K, 64K, 128K} de (S_l − Base) / Base, donde Base = media(S_4K, S_8K). Fuente: 100-LongBench, ACL 2025. Datos: NVIDIA RULER (per-length, n=33) + HELMET (agregado a 128K, n=60). 0 = sin degradación; -0.30 = severo.",
"longscore.miss.title": "Modelo no encontrado en KB",
"longscore.miss.body": "Buscado {id}. KB tiene {n} modelos. Prueba un id HF canónico (ej. Qwen2.5-72B-Instruct, Llama-3.1-70B-Instruct, Jamba-1.5-Mini).",
"longscore.miss.suggest": "Comprueba cobertura en",
"longscore.no_ruler": "⚠ Sin datos per-length — LongScore no computable. Mostrando agregado HELMET a 128K.",
"longscore.score_label": "LongScore",
"longscore.helmet_label": "Breakdown HELMET 7-task",
"longscore.col.ctx": "Contexto",
"longscore.col.score": "Score",
"longscore.col.lc": "LC",
"longscore.col.task": "Tarea",
"longscore.source_note": "Fuente",
"longscore.hint.empty": "⚠ Pega un id de modelo primero.",
"longscore.status.lookup": "⏳ Buscando…",
"longscore.status.miss": "ℹ Modelo no en KB",
"longscore.status.ruler_hit": "✅ Datos RULER per-length encontrados",
"longscore.status.helmet_only":"ℹ Solo agregado HELMET (sin datos per-length)",
"longscore.verdict.no_degradation": "✅ Sin degradación más allá del contexto corto",
"longscore.verdict.mild": "🟢 Degradación leve (<10%)",
"longscore.verdict.moderate": "🟠 Degradación moderada (10-20%)",
"longscore.verdict.severe": "🔴 Degradación severa (20-30%)",
"longscore.verdict.extreme": "🚨 Degradación extrema (>30%)",
"inv.v088.longscore": "🎯 LongScore — métrica de degradación peer-reviewed (100-LongBench, ACL 2025). Lookup de cualquier modelo en KBs RULER + HELMET (n=93). Ve cuánto cae tu modelo en realidad más allá del contexto corto.",
"help.v088.longscore.title": "🎯 LongScore",
"help.v088.longscore.body": "Cada LLM long-ctx dice 128K pero degrada mucho antes. El paper 100-LongBench (ACL 2025, arXiv:2505.19293) notó que los scores brutos long-ctx están dominados por base ability — un modelo más smart con peor receta long-ctx puntúa más que uno menos smart con mejor receta, ocultando la degradación real. Proponen LongScore: LC_l = (S_l − Base) / Base con Base = media(S_short), luego promedio sobre longitudes largas. Resultado: número de degradación relativa por modelo que compara apples to apples. Este mode tafagent embebe datos LongScore-ready: agregado RULER per-context (n=33 modelos, 4K-128K) + agregado HELMET a 128K (n=60 modelos, 7 categorías). Lookup es match exacto por id HF (lowercase, dashes, dots normalizados). Para modelos con datos RULER, obtienes el LongScore completo + breakdown per-length + verdict (no/leve/moderado/severo/extremo). Para modelos solo-HELMET, obtienes el agregado 7-categorías a 128K. Caso de uso: '¿quiero usar Llama-3.1-70B-Instruct para resumen de docs 100K-token — cuánta accuracy pierdo realmente?' → pega id, ve -10% LongScore (degradación moderada, sobre todo el cliff a 128K). Decide si usarlo, cambiar a un modelo con long-ctx engineered, o chunkear tu input.",
"inv.v081.hub": "🧭 Solutions Hub — cada pain documentado mapeado a un mode tafagent o herramienta externa curada. No reinventes — encuentra.",
"help.v081.hub.title": "🧭 Solutions Hub",
"help.v081.hub.body": "tafagent como integrador, no silo. 30+ pains en 7 categorías (eval reliability · diagnósticos · setup · training · retrieval · multimodal · observability), cada uno mapeado a (a) el mode tafagent que lo resuelve, si existe, y (b) las herramientas externas best-of-breed que la comunidad ya usa (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Caja de búsqueda matchea pain, scenario, y nombre de herramienta. Caso de uso: 'tengo problema X — ¿lo resuelve tafagent, y si no, quién?'",
"hub.title": "🧭 Solutions Hub",
"hub.tip": "Mapa de cada pain de LLM-eval documentado: qué mode tafagent lo resuelve (si alguno), y las herramientas externas best-of-breed que la comunidad ya usa. Objetivo: cobertura total. Si la herramienta canónica existe en otra parte, enlazamos en vez de rebuildear.",
"hub.desc": "No reinventes — encuentra. 30+ pains mapeados a modes tafagent + herramientas externas curadas. Navega por categoría, busca por keyword, o ve los huecos donde nuevos modes ayudarían más.",
"hub.clear_btn": "✕ Limpiar",
"hub.no_mode": "externo",
"hub.planned": "planeado:",
"hub.best_for": "Mejor para",
"hub.not_for": "No para",
"hub.tools": "Herramientas externas",
"hub.status.loaded": "✅ Cargados {total} pains en {categories} categorías — {covered} cubiertos por modes tafagent, {externalLinks} enlaces externos curados. Compilado {compiled}.",
"hub.status.fail": "⚠ No se pudo cargar Solutions Hub.",
"hub.search.empty": "Sin coincidencias para '{query}'. Prueba términos más amplios (ej. 'eval', 'rag', 'tokenizer').",
"hub.search.results": "Encontradas {n} coincidencia(s) para '{query}'.",
// v0.7.7 — Task tiles (UX restructure: 14 modes grouped by intent)
"tiles.title": "🎯 ¿Qué quieres hacer?",
"tiles.subtitle": "Elige una tarea. Cada una abre la herramienta adecuada debajo. O baja para la lista completa de 22 modos.",
"tile.diagnose.title": "🔬 Diagnosticar un modelo",
"tile.diagnose.desc": "¿Servirá este modelo concreto para mi caso de uso?",
"tile.trust.title": "✓ Confiar en un score de benchmark",
"tile.trust.desc": "¿Me creo este número? ¿Es bug o ruido?",
"tile.eval.title": "⚙️ Configurar bien una eval",
"tile.eval.desc": "Obtén el flag CLI exacto para lm-eval / vLLM / transformers.",
"tile.compare.title": "🆚 Comparar modelos",
"tile.compare.desc": "Lado a lado, o explora el panel empírico de modelos.",
"tile.manual.title": "📋 Manual / libre",
"tile.manual.desc": "Elige una receta concreta a mano, o pregunta en inglés llano.",
"tile.diagnose.tip": "Empieza aquí cuando tengas un id de modelo concreto y quieras diagnóstico completo: Profile corre las 5 recetas a la vez. Unmask comprueba si max_position_embeddings es honesto. NIAH→Reason predice el gap retrieval-vs-reasoning. LongScore consulta datos publicados de RULER + HELMET y muestra la degradación real más allá del contexto corto (métrica peer-reviewed). Quant predice si cuantizar lo romperá. Inspect permite pegar config.json crudo para modelos privados / en desarrollo.",
"tile.trust.tip": "Cuando ves un score y quieres saber si es real. Contamination puntúa 20+ benchmarks por probabilidad de que el modelo los viera en entrenamiento. Drift te dice si el gap entre dos evals es ruido numérico o bug real (chat-template mismatch, layout KV-cache, etc.). Arena CI reconstruye los intervalos de confianza que Chatbot Arena oculta — muchas "victorias" top-Elo están estadísticamente empatadas.",
"tile.eval.tip": "Antes de correr lm-eval-harness o vLLM serve, obtén el flag CLI correcto. Chat-template Sniffer detecta la familia de template (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) y emite la invocación exacta --apply_chat_template / --chat-template. Resuelve el issue #1841 de lm-eval-harness (÷2 accuracy silencioso). Diagnose CLI genera el comando Python para medir γ_obs en tu GPU local.",
"tile.compare.tip": "Compare: elige 2-3 modelos candidatos + una receta, ve veredictos en tabla lado a lado (ej. Llama-3-8B vs Mistral-7B a 32k). Phase diagram: scatter de 23 modelos empíricos en el plano (log θ, γ), con la curva Padé superpuesta. Hover puntos para detalles, click para cargar ese modelo en la Recipe form.",
"tile.manual.tip": "Recipe: elige una receta X-N específica (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 compresión KV, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) y rellena la form a mano para control total. Ask: escribe una pregunta libre; un LLM 0.5B (Qwen2.5) en tu navegador elige la receta correcta y la ejecuta. Ideal para exploración "qué pasaría si...".",
"share.import_desc": "¿Tienes un fichero JSON del análisis TAF de alguien? Cárgalo aquí para ver el veredicto + cadena localmente. La misma vista que si lo hubieras ejecutado tú.",
"share.import_btn": "📂 Cargar JSON compartido",
"synthesis.system": "Eres un asistente de diagnóstico preciso para LLMs transformer. Dados resultados de fórmulas TAF pre-calculados, escribe un resumen claro en español de 4-6 frases. Cita el número de sección (§X.Y) para cada número que menciones. Da siempre una recomendación concreta. NO inventes números.",
// INSPECTOR mode
"inspector.title": "🔍 Inspector de Arquitectura",
"inspector.desc": "Pega el contenido crudo de config.json. La herramienta extrae los parámetros arquitectónicos y ejecuta el Profile completo de 5 recetas.",
"inspector.tip": "Pega cualquier config.json directamente. La herramienta lo parsea y ejecuta el Profile completo. Útil para: modelos privados, configs en desarrollo, modelos aún no en HuggingFace, o comparar qué haría tu arquitectura custom.",
"inspector.quickstart": "💡 Caso de uso: tienes un modelo privado no en HF Hub, o una config que estás diseñando. Pega el JSON crudo abajo y obtén un perfil TAF completo.",
"inspector.placeholder": "{\n \"model_type\": \"llama\",\n \"rope_theta\": 500000,\n \"max_position_embeddings\": 8192,\n \"num_attention_heads\": 32,\n \"num_key_value_heads\": 8,\n \"hidden_size\": 4096,\n \"num_hidden_layers\": 32\n}",
"inspector.T_eval": "T_eval (tu contexto objetivo):",
"inspector.btn": "🚀 Inspeccionar y perfilar",
// WHAT-IF slider
"whatif.title": "🎚 What-if: arrastra T_eval para ver γ cambiar en vivo",
"whatif.desc": "Recálculo puro JS (sin llamada Pyodide). Muestra γ_Padé y d_horizon geométricos mientras deslizas. Click en el botón para re-ejecutar la cadena completa.",
"whatif.T_eval": "T_eval",
"whatif.gamma_pade": "γ_Padé",
"whatif.d_horizon": "d_horizon",
"whatif.l_niah": "Techo L_NIAH",
"whatif.predicted": "Veredicto geométrico predicho",
"whatif.rerun": "↻ Re-calcular cadena completa con este T_eval",
// COMMUNITY feed
"community.title": "🌐 Envíos recientes de la comunidad",
"community.desc": "Feed en vivo del registry público. Click en cualquier envío para ver análisis completo.",
"community.browse_all": "Ver todo →",
"community.loading": "Cargando...",
"community.no_repo": "El repo del registry aún no está creado. Cuando exista con envíos, aparecerán aquí en vivo.",
"community.no_submissions": "Sin envíos aún. Sé el primero — genera un Profile y click 📤 Enviar al registry.",
// FALSIFICATION dashboard
"falsification.title": "🔬 Predicciones del paper — estado de falsificación",
"falsification.desc": "El framework TAF se basa en predicciones falsificables (F1-F23). Cada una está empíricamente testada. Aquí está el estado en vivo de cada predicción del paper.",
"falsification.summary": "{confirmed} confirmadas · {partial} parciales · {refuted} refutadas · {untested} sin testear (de {total} predicciones totales)",
"falsification.col.id": "ID",
"falsification.col.claim": "Claim",
"falsification.col.status": "Estado",
"falsification.col.evidence": "Evidencia",
"tafcard.title": "📇 TAF Card — perfil completo del modelo",
"tafcard.recipes_title": "📋 Recetas — veredicto por dimensión",
"tafcard.recipes_count_label": "dimensiones",
"tafcard.numbers_title": "🔢 Números clave (paper §26)",
"tafcard.fals_title": "🔬 Estado de falsificación (F1-F23)",
"tafcard.fals_none": "Sin falsificaciones aplicables.",
"tafcard.diag_title": "🔬 Diagnósticos — números · γ check · what-if",
"tafcard.verify_title": "✓ Verificación — Lean + Sage + falsificación",
"tafcard.share_title": "📂 Procedencia y compartir",
"tafcard.whatif_title": "🎚️ Explorador what-if",
"verdict.go": "ADELANTE",
"verdict.no": "NO",
"verdict.degraded": "DEGRADADO",
"compare.title_out": "🆚 Tabla comparativa",
"status.loading_pyodide": "⏳ Cargando runtime Python (~10MB, solo primera vez)...",
"status.loading_taf": "⏳ Cargando fórmulas TAF + recetas...",
"status.ready": "✅ Listo. Elige un modelo y click Perfilar para empezar.",
"status.computing": "🧮 Calculando cadena TAF...",
"status.done": "✅ Hecho.",
"profile.hf_placeholder": "ej. meta-llama/Meta-Llama-3-8B o Qwen/Qwen2.5-7B",
"compare.hf_placeholder": "ID modelo HF (ej. meta-llama/Meta-Llama-3-8B)",
"compare.slot1_placeholder": "ID modelo HF (ej. meta-llama/Meta-Llama-3-8B)",
"compare.slot2_placeholder": "ID modelo HF #2",
"compare.slot3_placeholder": "ID modelo HF #3 (opcional)",
"compare.preset_default": "— o preset —",
// Form parameters
"param.theta": "θ (rope_theta)",
"param.theta.tip": "Frecuencia base RoPE de config.rope_theta. Mayor = más capacidad de largo alcance.",
"param.T_train": "T_train",
"param.T_train.tip": "Contexto máximo de entrenamiento. De max_position_embeddings. Más allá es extrapolación.",
"param.T_eval": "T_eval (tu objetivo)",
"param.T_eval.tip": "Tu contexto de inferencia objetivo. La pregunta clave: ¿se comportará bien el modelo a ESTA longitud?",
"param.n_attn": "n_attention_heads",
"param.n_attn.tip": "Número de attention heads por capa. De num_attention_heads.",
"param.n_kv": "n_kv_heads",
"param.n_kv.tip": "KV heads. Si < n_attention_heads → GQA (Grouped Query Attention). Reduce memoria KV pero empuja γ hacia Hagedorn.",
"param.d_head": "head_dim",
"param.d_head.tip": "Dimensión por head. Típico 64, 96, 128. De head_dim o hidden_size / num_attention_heads.",
"param.n_layers": "n_layers",
"param.n_layers.tip": "Número de bloques transformer. De num_hidden_layers.",
"param.n_params": "n_params (ej. 8e9)",
"param.n_params.tip": "Número total de parámetros. Umbral ~400M para emergencia de induction heads. Afecta memoria KV y recipes de presupuesto.",
"param.has_swa": "¿Tiene SWA?",
"param.has_swa.tip": "Sliding Window Attention. true para Mistral, gemma-2, phi-3. El audit de calibración v0.5.3 desactivó la corrección histórica δ_SWA (ajuste n=1).",
"common.yes": "Sí",
"common.no": "No",
// Tooltips de modos
"modes.tip": "Catorce formas de usar la herramienta.
📇 Perfil: pega un id → TAF Card de 5 recetas.
🆚 Comparar: 2-3 modelos lado a lado en una receta.
🔍 Inspeccionar config: pega config.json crudo → Perfil completo.
💬 Pregunta: pregunta libre, el LLM del navegador elige la receta.
📋 Receta: selección manual con control total del formulario.
🩺 Diagnóstico CLI: genera comando Python para medir γ localmente.
📊 Diagrama de fase: panel de 23 modelos en plano (log θ, γ).
🪟 Desenmascarar: detecta max_position_embeddings engañoso (SWA / YaRN / RoPE-scaling).
📜 Chat-template: detecta familia + da el flag CLI exacto para lm-eval / vLLM / transformers.
🎯 Arena CI: reconstruye intervalos de confianza desde votos pairwise crudos; detecta empates estadísticos que Arena oculta.
🧪 Contaminación: puntúa 20+ benchmarks por probabilidad de contaminación según cutoff de entrenamiento vs fecha de release.
⚖️ Quant: predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización); recomienda alternativa segura si hay cliff.
🔀 Drift: mismo modelo, scores distintos en dos setups — ¿bug o ruido? Predice banda de ruido numérico y flagea bugs reales.
🔍 NIAH→Reason: predice tasas pass NIAH y reasoning multi-hop desde arquitectura; encuentra el contexto seguro de reasoning.",
"profile.tip": "Diagnóstico completo en un click. Pega cualquier id de modelo HF (o elige preset). La herramienta ejecuta las 5 recetas (contexto largo, compresión KV, custom vs API, presupuesto, hardware) y produce una única TAF Card con veredicto por dimensión + números clave + clasificación arquitectónica.
Caso de uso: \"Estoy evaluando Qwen2.5-32B para producción — ¿cuál es su perfil completo de viabilidad?\" → pega id → Perfilar → listo.",
"compare.tip": "Misma receta, múltiples modelos. Elige 2-3 modelos candidatos y una receta. Ve los veredictos en una única tabla comparativa.
Caso de uso: \"Necesito recuperación de contexto largo a 16K — ¿cuál es mejor: Llama-3-8B, Mistral-7B o Qwen-7B?\" → elige 3 + X-2 + 16K → ve el ganador.",
// Help modal
"help.title": "📘 TAF Agent — Manual de Usuario",
"help.what.title": "¿Qué hace?",
"help.what.body": "Predice la viabilidad práctica de cualquier LLM transformer antes de gastar GPU/€. Responde preguntas como \"¿funcionará este modelo a L=32K?\" o \"¿debería entrenar custom o usar API?\" usando fórmulas Python deterministas (TAF — Thermodynamic Attention Framework).",
"help.modes.title": "Cómo usar — 7 modos",
"help.modes.profile": "📇 Perfilar: pega id de modelo → todas las recetas a la vez = TAF Card. Mejor punto de inicio.",
"help.modes.compare": "🆚 Comparar: 2-3 modelos lado a lado en la misma receta. Mejor al elegir entre candidatos.",
"help.modes.inspector": "🔍 Inspeccionar config: pega config.json crudo → la herramienta lo parsea y ejecuta el Perfil completo. Para modelos privados, configs en desarrollo, o modelos aún no en HF Hub.",
"help.modes.ask": "💬 Pregunta libre: pregunta en lenguaje natural, el LLM del navegador elige la receta. Mejor para exploración casual.",
"help.modes.recipe": "📋 Receta + formulario: selección manual, control total de parámetros. Mejor cuando quieres control exacto.",
"help.modes.diagnose": "🩺 Diagnóstico CLI: genera comando Python para medir γ en tu máquina local (transformers + numpy). Rápido ≈5 min CPU; completo ≈20–60 min GPU. JSON resultado re-subible por Inspect.",
"help.modes.phase": "📊 Diagrama de fase: scatter de 23 modelos del panel en plano (log θ, γ). Línea Hagedorn γ=1 separa Fase A de Fase B. Click en un punto para cargar ese modelo en el formulario de Receta.",
"help.recipes.title": "Las 8 recetas disponibles",
"help.recipe.x1.title": "X-1 Entrenamiento custom vs API — compara coste de entrenar tu propio modelo vs pagar API.",
"help.recipe.x1.example": "Prueba: \"¿Entrenar 8B custom o usar GPT-4o para 50M tokens/mes?\"
Respuestas: SÍ (custom) / NO (API) con meses para break-even.",
"help.recipe.x2.title": "X-2 Viabilidad contexto largo — predice si un modelo sirve longitud objetivo de manera fiable.",
"help.recipe.x2.example": "Prueba: \"¿Meta-Llama-3-8B maneja 32000 tokens para retrieval?\"
Cadena: γ_Padé → descomposición → d_horizon → techo NIAH → alucinación → memoria KV.
Veredicto: SÍ / DEGRADADO / NO con mitigación si hace falta.",
"help.recipe.x3.title": "X-3 Pre-flight presupuesto — dado un presupuesto $, ¿qué modelo es viable entrenar?",
"help.recipe.x3.example": "Prueba: \"Tengo $5000, ¿qué modelo puedo entrenar?\"
Respuesta: GO / TINY-MODEL / MEMORY-LIMITED con N (params) y D (tokens) concretos.",
"help.recipe.x5.title": "X-5 Selección hardware — ¿qué GPU usar para servir al throughput objetivo?",
"help.recipe.x5.example": "Prueba: \"Hardware más barato para servir Llama-3-8B a 10M tokens/día\"
Respuesta: mejor GPU + $/Mtok + capacidad vs objetivo.",
"help.recipe.x19.title": "X-19 Decisión compresión KV — ¿usar soft decay, hard cutoff, o métodos de literatura?",
"help.recipe.x21.title": "X-21 Diagnóstico Pureza Imprint — predice γ sobre tokens RANDOM via ν=−1/(2π); ¿cuán limpia es la predicción RoPE del modelo?",
"help.recipe.x22.title": "X-22 Invariante Compute-Context — ¿γ × log(N²·D) está en banda 51.2 ± 16.8? Detecta anomalías de scaling/training.",
"help.recipe.x23.title": "X-23 Detector Fase IH — ¿pre- o post-induction-head? Probe barato via sign(γ_text − γ_random).",
"help.recipe.x21.example": "Prueba: «¿Cuán limpia es la predicción RoPE en Llama-3-8B?»
Respuesta: γ_random predicho + diagnóstico (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).",
"help.recipe.x22.example": "Prueba: «¿Mistral-7B entra en el invariante compute-context?»
Respuesta: K = γ·log(N²·D), z-score, IN-BAND u OUTLIER.",
"help.recipe.x23.example": "Prueba: «¿Qwen2.5-7B es post-induction-head?»
Respuesta: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY (chequeo consistencia tamaño vs Δγ).",
"help.section.v04": "Novedades v0.4 (hallazgos sesión 29 del 2026-04-28): tres recipes diagnósticas derivadas del análisis panel cross-model (n=22 LLMs).",
"help.divider.v04_s29": "— v0.4 (hallazgos sesión 29) —",
"footer.tech_stack": "Cómputo: Pyodide · Síntesis: WebLLM (Qwen2.5-0.5B local) · Hosting: GitHub Pages · Coste: $0",
"help.v04.imprint": "Slope imprint aprendido ν = −1/(2π): el periodo de rotación RoPE 2π provoca un sesgo posicional en los pesos, proporcional a log(N_params). Incluso tokens random muestran este scaling. ν es DERIVADO — no ajustado (err empírico 0.3%).",
"help.v04.invariant": "Invariante Chinchilla-atención K: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Conecta compute scaling y exponente de atención en un solo número adimensional.",
"help.v04.ih_probe": "Δγ como probe IH: sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Más barato que correr un benchmark in-context-learning.",
"help.v04.constants": "γ-cluster en constantes famosas (intrigante, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (conjugado áureo, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat: podría ser coincidencia.",
"help.recipe.x19.example": "Prueba: \"¿Cómo comprimir caché KV para Qwen2.5-7B a 32K?\"
Respuesta: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.",
"help.param.theta": "θ (rope_theta): frecuencia base RoPE. Mayor = más capacidad de largo alcance. Típico: 10000 (modelos antiguos), 500000 (Llama-3), 1000000 (Qwen2.5).",
"help.param.T_train": "T_train: contexto máximo que vio el modelo durante entrenamiento. De max_position_embeddings.",
"help.param.T_eval": "T_eval: tu longitud de contexto objetivo en inferencia. La perilla clave.",
"help.param.gqa": "n_kv_heads < n_attention_heads: el modelo usa GQA (Grouped Query Attention). Reduce memoria KV pero empuja γ hacia Hagedorn.",
"help.param.swa": "has_SWA: el modelo usa Sliding Window Attention (Mistral, gemma-2).",
"help.param.nparams": "n_params: número total de parámetros. Umbral ~400M para emergencia de induction heads.",
"help.add_models.title": "Añadir nuevos modelos (3 maneras)",
"help.add_models.preset": "Lista de presets: 11 modelos populares curados. Selecciona del dropdown.",
"help.add_models.hf": "HF Hub fetch: pega cualquier id (ej. Qwen/Qwen2.5-32B-Instruct), click 📥 Cargar. El navegador descarga config.json directamente de HuggingFace, llena el formulario. Funciona con cualquier modelo público.",
"help.add_models.manual": "Manual: rellena los campos directamente con valores de la model card.",
"help.audit.title": "La cadena auditable",
"help.audit.body": "Cada resultado muestra la Cadena de Cálculo completa — cada paso de fórmula con sus entradas, salida e interpretación. Click en cualquier paso para expandir. Las referencias de sección (§26.1, §19.1, etc.) apuntan al paper para la derivación.",
"help.synthesis.title": "La respuesta en lenguaje natural",
"help.synthesis.body": "Tras ejecutar la cadena determinista, un LLM en el navegador (Qwen2.5-0.5B, ~350MB cacheado tras primera carga) sintetiza un resumen en lenguaje natural. Los números arriba son siempre correctos (Python determinista); la síntesis la genera el LLM — verifica contra la cadena si dudas.",
"help.params.title": "Parámetros comunes explicados",
"help.verdicts.title": "Qué mirar en los veredictos",
"help.verdict.yes": "SÍ / GO — procede con confianza; los números apoyan la elección.",
"help.verdict.deg": "DEGRADADO / TINY-MODEL — funciona con caveats; lee la acción.",
"help.verdict.no": "NO / MEMORY-LIMITED — no procedas tal cual; se da mitigación.",
"help.privacy.title": "Privacidad",
"help.privacy.body": "Todo corre en tu navegador. Sin telemetría, sin analytics, sin datos enviados a ningún sitio. Incluso el modelo LLM corre localmente vía WebGPU/WebAssembly. Tus model_ids y preguntas nunca abandonan esta página.",
"help.source.title": "Código fuente y paper",
"help.source.body": "Código: github.com/karlesmarin/tafagent
Paper: Marin 2026 — Predicting How Transformers Attend (Zenodo; arXiv próximamente)
Dataset: taf-attention-decay — 58 mediciones γ sobre 32 modelos (CC-BY-4.0)",
"footer.text": "© 2026 Carles Marin · Apache-2.0 · investigación independiente · la herramienta que cierra el círculo del paper.",
},
// ────────────────────────────────────────────────────────────────────────
// FR — Français
// ────────────────────────────────────────────────────────────────────────
fr: {
// §33 v0.4 (session 31, 2026-04-30) — nouvelles fonctions de diagnostic
"v04.title": "🆕 v0.4 — Nouveaux diagnostics (sesion 31)",
"v04.section.intro": "Quatre nouvelles fonctions diagnostiques dérivées en session 31 (2026-04-30) depuis jeux de formules cross-of-crosses + interrogation socratique. Disponibles dans taf_browser.py §33.",
"v04.arch.label": "Concentration Architecturale",
"v04.arch.desc": "γ_text ≈ γ_Padé − 0.012·n_kv. Loi corrélationnelle cross-panel (R²=0.30). Caveat : pas un prédicteur par-modèle.",
"v04.pdi.label": "PDI — Indice de Déviation de Padé",
"v04.pdi.desc": "PDI = d_horizon_obs/T_eval. Feu : vert (≈1), orange (>>1), jaune (<<1), rouge (Phase B négatif).",
"v04.4bit.label": "Prédicteur de Décalage 4-bit",
"v04.4bit.desc": "MHA : R²(bf16)<0.9 → γ monte ; R²>0.99 → γ descend. GQA : précision-robuste.",
"v04.crit.label": "Ensemble d'Exposants Critiques",
"v04.crit.desc": "ν_c, β_c, η_c (=γ−1, CORRIGÉ), α_C, γ_susc avec minimum AM-GM à γ=1−1/√2≈0.293.",
// §34 v0.5 (session 32, 2026-05-01) — Cohérence algébrique vérifiée par machine
"v05.title": "🔬 v0.5 — Cohérence vérifiée par machine (session 32)",
"v05.section.intro": "Vérification duale par Sage Groebner basis + Lean Mathlib4 de 15 identités algébriques des exposants critiques TAF. Premier framework transformer-attention avec preuve formelle machine.",
"v05.verify.label": "Vérification de Cohérence Algébrique",
"v05.verify.desc": "Étant donné γ mesuré, vérifie 12 identités D-SAGE (D-SAGE-1 : 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). Toutes passantes = framework intact. Échecs = outliers bf16 / artefacts de quantification.",
"v05.dsage1.label": "D-SAGE-1 (★★ core)",
"v05.dsage1.desc": "Identité quadratique 2η² + η·γ_χ + 1 = 0 (découverte par Sage Groebner, vérifiée Lean). Remplace l'affirmation incorrecte de 'fermeture triple'. Réfute η=2γ du paper 1 algébriquement.",
"v05.erratum.label": "Erratum paper 1 — correction η",
"v05.erratum.desc": "Paper 1 affirmait η = 2γ. Sage Groebner + Lean Mathlib4 ont prouvé l'échec (résidu (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Phase A). Valeur correcte : η = γ−1, satisfaisant D-SAGE-1.",
"v05.repro.label": "Reproductibilité",
"v05.repro.desc": "Les 15 théorèmes sont machine-proof en Lean Mathlib4 (build réussi 1973 jobs). Script Sage : analysis/sage_recursive_sweep_2026-04-30.sage. Code Lean : lean_taf/taf/Taf/Identities.lean.",
// v0.5.1 — TAF Card consistency check button
"v05.consistency.title": "🔬 Vérification de cohérence algébrique (Sage + Lean v0.5)",
"v05.consistency.desc": "Vérifie 12 identités algébriques D-SAGE des exposants critiques TAF (machine-proof Sage Groebner basis + Lean Mathlib4). Passe = framework intact. Échec = outlier bf16 / artefact de quantification.",
"v05.consistency.btn": "🔬 Vérifier cohérence algébrique",
// v0.5.2 — Anti-Ising universality class badge
"v05.antiising.badge": "🧲 Classe Anti-Ising (β=γ−1<0, vérifié par machine)",
// v0.5.2 — Tooltips par identité (explications en langage clair)
"v05.tooltip.D_SAGE_1": "Identité algébrique quadratique reliant la dimension anormale η et la susceptibilité γ_χ. Identité CENTRALE découverte par Sage Groebner basis (machine-proof). Remplace l'ancienne affirmation incorrecte de triple closure.",
"v05.tooltip.D_SAGE_2": "En Phase A, β = γ−1 est négatif (anti-Ising). Multiplié par χ = 1/(1−γ) donne exactement −1. Signature du régime négatif-β de TAF.",
"v05.tooltip.D_SAGE_4": "L'exposant de chaleur spécifique α et la susceptibilité χ se somment exactement à 2 en TAF. Conséquence algébrique de l'hyperscaling de Josephson.",
"v05.tooltip.D_SAGE_5": "Identité linéaire : α + γ_χ = 2(2−γ). Signifie que quand γ s'approche de 1 (Hagedorn), la somme s'approche de 2 ; à γ=0 elle vaut 4.",
"v05.tooltip.D_SAGE_6": "Exposant de paramètre d'ordre multiplié par exposant de susceptibilité donne une quadratique spécifique en γ. Relation algébrique factorisée.",
"v05.tooltip.Rushbrooke_tautology": "Hyperscaling de Rushbrooke standard 2β + γ_χ = ν·d à d=1. En TAF c'est une TAUTOLOGIE — γ_χ est défini exactement pour que cela soit vrai. Confirmé par Sage Groebner basis.",
"v05.tooltip.Josephson_tautology": "Hyperscaling de Josephson standard 2 − α = ν·d à d=1. En TAF c'est une TAUTOLOGIE — α est défini exactement pour que cela soit vrai.",
"v05.tooltip.Fisher_independent": "Relation de Fisher γ_χ = (2−η)·ν. En TAF est INDÉPENDANTE (ne ferme PAS comme identité, contrairement à l'affirmation de triple closure). Le résidu est γ(2γ−3)/(1−γ).",
"v05.tooltip.eta_2gamma_REFUTED": "Paper 1 affirmait η=2γ. Cette identité le réfute : le résidu est positif dans toute la Phase A. Réfutation machine-proof par Lean Mathlib4.",
"v05.tooltip.D_14_nu_imprint": "La pente d'empreinte apprise ν = −1/(2π) multipliée par 2π donne −1. Vérification dimensionnelle triviale du paper 1.",
"v05.tooltip.D_SAGE_7": "La charge centrale c=3 multipliée par |ν_imprint| multipliée par 2π donne 3. Fermeture dimensionnelle reliant CFT à l'empreinte d'entraînement.",
"v05.tooltip.nu_beta_id": "Exposant de longueur de corrélation ν multiplié par exposant de paramètre d'ordre β donne −1 en Phase A. Variante de D-SAGE-2.",
"v053.calibration.title": "🔬 v0.5.3 — Audit de calibrage (2026-05-02)",
"v053.calibration.note": "Correction SWA désactivée — δ_SWA = -0.21 d'origine était calibrée sur n=1 modèle (données insuffisantes ; moyenne du cas unique +0.355). Correction post_IH marquée exploratoire — moyenne de groupe ≈ 0 en ré-audit (panel n=22) ne réplique pas l'ajustement OLS. Correction GQA réplique (panel +0.115 vs hardcoded +0.11). Formule D_f corrigée pour Phase B (γ>1) — utilise une somme cumulative discrète au lieu d'une approximation continue. LLaMA-3, Mistral, Gemma rapportent maintenant des valeurs de compression correctes.",
"v053.release.banner": "🔧 v0.5.3 — Corrections issues d'audit : D_f de compression KV utilise maintenant la somme discrète (correct pour tout γ) ; δ_SWA désactivé (calibrage n=1) ; erratum du coefficient C_V paper §5.2 (1/4 → 1/12).",
// §35 v0.6 — Diagnostic γ prédit vs observé
"gamma_check.title": "🔍 γ prédit vs observé",
"gamma_check.desc": "Saisissez votre γ mesuré empiriquement. L'outil détecte le régime : fraude (θ gonflé) / comprimé / sur-Padé / SWA-aléatoire / normal.",
"gamma_check.gobs_label": "γ_observé",
"gamma_check.gobs_tip": "γ mesuré empiriquement à partir des attention scores de votre modèle. Utilisez la CLI Diagnose pour l'obtenir depuis les poids réels.",
"gamma_check.random_label": "Corpus aléatoire ?",
"gamma_check.random_tip": "Cochez si γ_observé a été mesuré sur des tokens aléatoires/non structurés. Distingue la signature SWA (γ_obs > 1) d'une anomalie.",
"gamma_check.regime": "Régime",
"gamma_check.regime.normal": "Normal",
"gamma_check.regime.fraud": "Fraude (θ gonflé)",
"gamma_check.regime.compressed": "Contexte comprimé",
"gamma_check.regime.overpade": "Sur-Padé",
"gamma_check.regime.swa": "Signature SWA (corpus aléatoire)",
"gamma_check.regime.unknown": "Inconnu",
"gamma_check.regime.normal.desc": "η ∈ [0.85, 1.15] : le modèle utilise son contexte nominal complet, sans anomalie.",
"gamma_check.regime.fraud.desc": "η < 0.01 : θ nominal gonflé. Le modèle se comporte comme si θ ≪ annoncé. Probable inflation YaRN/marketing sans vraie extension de contexte.",
"gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5) : contexte comprimé (le modèle attend moins loin que ne le prédit θ nominal). Fréquent en instruction-tuned / RLHF.",
"gamma_check.regime.overpade.desc": "η > 1.5 : le modèle attend plus loin que Padé ne le prédit. Régime Lerch-corrigé possible ou checkpoint précoce sous-entraîné.",
"gamma_check.regime.swa.desc": "γ_obs > 1.05 sur corpus aléatoire = signature de sliding-window attention (familles Mistral / Gemma).",
"gamma_check.regime.unknown.desc": "Entrées hors plage ou γ_obs > 1 sans flag corpus_aléatoire. Vérifiez la mesure.",
"gamma_check.validity.title": "⚠ La forme close de γ peut ne pas s'appliquer à ce modèle",
"gamma_check.validity.body": "La prédiction de Padé suppose un entraînement naturel sans régularisation explicite de l'attention. Votre η sort de la bande validée [0.85, 1.15], donc la forme close n'est pas fiable ici. Faites confiance au γ empirique (Phase Diagram ou Diagnose CLI) plutôt qu'à la prédiction. Causes possibles : régularisation forte forçant une attention quasi uniforme, effondrement au fine-tuning, architecture sliding-window, ou pertes non standard. Voir docs/LIMITATIONS.md.",
"gamma_check.validity.fraud.hint": "Indice : η ≪ 1 indique typiquement une inflation marketing de θ (style YaRN) sans vraie extension de contexte, OU une attention forcée quasi uniforme par l'entraînement.",
"gamma_check.validity.compressed.hint":"Indice : η ∈ [0.01, 0.5) est courant dans les modèles instruction-tuned / RLHF où le post-entraînement a aplati la distribution d'attention.",
"gamma_check.validity.overpade.hint": "Indice : η > 1.5 peut indiquer un checkpoint précoce sous-entraîné, un régime Lerch-corrigé, ou des termes de correction au-delà de l'approximation de Padé.",
"gamma_check.validity.swa.hint": "Indice : les architectures sliding-window (Mistral, Gemma) violent par conception l'hypothèse de full-attention de la forme close.",
"gamma_check.validity.unknown.hint": "Indice : γ_obs hors plage physique ou bruit de mesure. Vérifiez vos entrées et remesurez.",
"gamma_check.validity.summary_pill": "⚠ Garde de validité",
"gamma_check.glossary.title": "ⓘ Glossaire — signification des variables",
"gamma_check.glossary.gamma_pade": "γ_Padé : prédiction fermée (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
"gamma_check.glossary.gamma_obs": "γ_observé : mesuré empiriquement à partir des attention scores (exécutez Diagnose CLI sur poids réels).",
"gamma_check.glossary.theta_eff_obs":"θ_eff (observé) : inversé depuis γ_obs via T√2 / (1 − γ_obs). θ effectif impliqué par votre mesure.",
"gamma_check.glossary.theta_eff_pade":"θ_eff (Padé) : θ + T/√2. θ effectif prédit par la formule fermée.",
"gamma_check.glossary.efficiency": "η : rapport θ_eff_obs / θ_eff_Padé. ≈1 = normal · <0.01 = fraude · <0.5 = comprimé · >1.5 = sur-Padé.",
"gamma_check.glossary.delta_h": "ΔH_Cardy : log(θ_eff_obs / θ_nominal). Variation d'entropie de Cardy. Négatif = entropie de compression. ~0 = correspondance nominale.",
"gamma_check.glossary.regime": "Régime : classifieur automatique à partir de η + γ_obs + flag corpus_aléatoire.",
// §36 v0.6 — Tooltips pour icônes ⓘ inline
"tooltip.gamma_pade": "γ_Padé(T_eval) : prédiction fermée (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
"tooltip.gamma_decomposed": "γ_décomposé : γ depuis la décomposition architecturale complète. Ligne de base Padé + shift GQA + shift post-IH (sous-ensemble répliqué dans audit calibré).",
"tooltip.d_horizon": "d_horizon : horizon d'attention effectif. Au-delà, les scores tombent sous le plancher de bruit (paper §26).",
"tooltip.L_NIAH": "Plafond L_NIAH : plafond prédit de fiabilité needle-in-a-haystack au d_horizon courant.",
"tooltip.chi": "χ susceptibilité : χ = 1/(1−γ). Diverge à la ligne Hagedorn γ=1.",
"tooltip.kv_memory": "Mémoire KV @ T_eval (BF16) : cache KV par requête = 2 · n_layers · n_kv_heads · d_head · T_eval octets.",
"tooltip.theta_eff_obs": "θ_eff (observé) : θ effectif impliqué par votre γ_observé : T√2 / (1 − γ_obs).",
"tooltip.theta_eff_pade": "θ_eff (Padé) : θ effectif prédit par la formule fermée : θ + T/√2.",
"tooltip.efficiency": "η = θ_eff_obs / θ_eff_Padé : ratio d'efficacité. ≈1 = normal · <0.01 = fraude · <0.5 = comprimé · >1.5 = sur-Padé.",
"tooltip.delta_h_cardy": "ΔH_Cardy : log(θ_eff_obs / θ_nominal). Variation d'entropie de Cardy. Négatif = entropie de compression. ~0 = correspondance nominale.",
"tooltip.verdict_aggregate": "Verdict : pire-de toutes les recettes. ✅ GO = tout vert · ⚠ DÉGRADÉ = ≥1 jaune · ❌ NON = ≥1 rouge.",
"tooltip.verdict_breakdown": "Décomposition par recette : chaque recette teste un axe de décision indépendant (contexte-long · budget · matériel · custom-vs-API · compression-KV). Un ❌ en X-1 signifie « utilisez l'API pour votre volume » et non « le modèle échoue » — ouvrez la section Recettes pour le contexte par axe.",
"tooltip.gamma_pill": "γ vedette : γ_décomposé (ou γ_Padé en fallback). Plage (0,1) = Phase A (anti-Ising). γ ≥ 1 = Hagedorn / Phase B.",
"tooltip.anti_ising": "Classe Anti-Ising : Phase A → β = γ−1 < 0. Machine-verified (Sage + Lean Mathlib4). Voir §35 v0.5.",
// §37 v0.6 — Table des théorèmes Lean+Mathlib
"lean.table.title": "📑 Table des théorèmes Lean+Mathlib",
"lean.table.desc": "Chaque entrée ci-dessous est machine-proven contre Lean 4 + Mathlib4. Cliquez sur un lien L# pour aller à la ligne source sur GitHub. Groupé par thème — cliquez sur un en-tête pour déplier.",
"lean.table.theorem": "Théorème",
"lean.table.claim": "Énoncé",
"lean.table.tactic": "Tactique",
"lean.table.source": "Source",
"lean.table.lean": "Lean",
"lean.findings.title": "🔎 Findings substantiels",
"lean.findings.detected_by": "Détecté par",
"lean.findings.fixed_by": "Corrigé par",
"lean.findings.recommendation":"Recommandation",
"lean.meta.repo": "Repo",
"lean.meta.build": "Build",
"lean.meta.theorems": "Théorèmes",
"lean.meta.verified": "vérifiés",
"lean.meta.rejected": "rejetés",
"lean.meta.sorry": "sorry",
"lean.meta.findings": "findings substantiels",
"lean.manifest.loading": "Chargement du manifeste Lean…",
"lean.manifest.error": "Manifeste Lean indisponible",
// Help modal — section v0.6
"help.v06.title": "🆕 v0.6 — γ prédit-vs-observé + Cardy ΔH + badges Lean",
"help.v06.intro": "v0.6 (2026-05-06) : trois nouveaux diagnostics vivent dans la TAF Card sous 🔬 Diagnostics. Tout tourne dans votre navigateur ; γ_observé provient de la Diagnose CLI sur poids réels.",
"help.v06.layout.title": "Disposition de la TAF Card (nouveau en v0.6)",
"help.v06.layout.body": "Après avoir cliqué 🚀 Générer profil complet, la carte affiche : une bande hero en haut (classe d'architecture + méta + 3 pills : verdict agrégé ✅/⚠/❌, γ vedette, 🧲 Anti-Ising si Phase A) et quatre sections pliables : 📋 Recettes (ouverte par défaut — verdict par dimension), 🔬 Diagnostics (nombres clés, γ prédit vs observé, explorateur what-if), ✓ Vérification (cohérence algébrique Sage+Lean, falsification F1-F23), 📂 Provenance & partage (audit de calibration + téléchargement JSON / lien / soumission registre). Cliquez sur n'importe quel en-tête pour déplier. Chaque variable a un tooltip ⓘ inline.",
"help.v06.gamma_check.title": "γ prédit vs observé",
"help.v06.gamma_check.body": "Saisissez le γ mesuré empiriquement et l'outil calcule η = θ_eff_obs / θ_eff_Padé et classe en l'un de 5 régimes :",
"help.v06.case.normal": "Normal (η ∈ [0.85, 1.15]) — le modèle utilise son contexte nominal complet. Cas d'usage : valider une nouvelle release avant adoption.",
"help.v06.case.fraud": "Fraude (η < 0.01) — θ nominal gonflé ; le modèle se comporte comme si θ ≪ annoncé. Cas d'usage : détecter inflation YaRN/marketing (motif CodeLlama / Mistral-Nemo).",
"help.v06.case.compressed": "Comprimé (η < 0.5) — contexte comprimé ; le modèle attend moins loin que θ nominal. Cas d'usage : repérer compression par RLHF/instruction-tuning (motif LLaMA-2).",
"help.v06.case.overpade": "Sur-Padé (η > 1.5) — le modèle attend plus loin que Padé ne le prédit. Cas d'usage : identifier régime Lerch-corrigé ou checkpoints précoces sous-entraînés (motif pythia-1b).",
"help.v06.case.swa": "SWA corpus aléatoire (γ_obs > 1.05 avec corpus_aléatoire=Oui) — signature de sliding-window attention. Cas d'usage : confirmer SWA Mistral / Gemma sur tokens aléatoires.",
"help.v06.gamma_check.validity_gate.title": "Garde de validité (v0.8.9+)",
"help.v06.gamma_check.validity_gate.body": "Lorsque η sort de [0.85, 1.15] OU que le régime n'est pas normal, le panneau affiche une bannière d'avertissement expliquant que la prédiction en forme close peut ne pas s'appliquer. Faites confiance au γ empirique dans ces cas. Voir docs/LIMITATIONS.md pour la discussion complète du régime de validité (γ en forme close suppose une attention naturelle sans régularisation explicite ; ν = -1/(2π) suppose des tokens i.i.d.).",
"help.v06.cardy.title": "Diagnostic Cardy ΔH",
"help.v06.cardy.body": "ΔH_Cardy = log(θ_eff_obs / θ_nominal). Variation d'entropie entre le θ effectif observé et le θ nominal. Fortement négatif = entropie de compression ; proche de zéro = correspondance nominale. Complète η pour les cas borderline.",
"help.v06.lean.title": "Badges de vérification Lean + Mathlib",
"help.v06.lean.body": "Les identités TAF sont formellement machine-proven en Lean Mathlib4 : 37 théorèmes en 7 groupes (Padé, flot RG, Cayley, D-SAGE, résultats d'audit, erratum CV, divers) + 1 résultat substantiel (facteur 2 dans la dérivée V, théorème V_derivative_ne_RG_beta). Source : github.com/karlesmarin/lean-taf (commit 25c77fd). Re-vérifier localement : git clone --depth=1 https://github.com/karlesmarin/lean-taf && cd lean-taf && lake exe cache get && lake env lean Taf/Identities.lean. La pill 🧲 Anti-Ising du hero et la section Vérification renvoient à des lignes sources spécifiques.",
"help.v06.glossary.title": "Glossaire des variables (également intégré dans la TAF Card)",
"help.v06.glossary.body": "Chaque variable de la TAF Card a un tooltip ⓘ inline. Liste complète : γ, γ_Padé, γ_décomposé, γ_observé, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, mémoire KV, régime. Survolez n'importe quel ⓘ pour la définition + section du paper.",
"hero.title": "🔬 TAF Agent",
"hero.tagline": "Diagnostiquez n'importe quel LLM transformer en 30 secondes. Gratuit. Sans GPU. Sans inscription.",
"hero.subtitle": "Prédit si un modèle conviendra à votre cas d'usage avant que vous ne dépensiez argent ou temps. Tout tourne dans votre navigateur — vos données ne quittent jamais cet onglet.",
"hero.help": "📘 Manuel et exemples",
"hero.quickstart_btn": "⚡ Démarrage rapide",
"hero.inventory_btn": "🧰 Ce que ça offre",
"hero.about": "Conçu par un chercheur indépendant. Open source. Non affilié à un fournisseur de modèles.",
"modes.title": "🎯 Mode",
"modes.profile": "📇 Profiler un modèle",
"modes.compare": "🆚 Comparer des modèles",
"modes.inspector": "🔍 Inspecter config",
"modes.ask": "💬 Question libre",
"modes.recipe": "📋 Choisir une recette",
"modes.diagnose": "🩺 Diagnose CLI",
"diagnose.title": "🩺 Générateur de commande Diagnose CLI",
"diagnose.tip": "Le navigateur prédit γ à partir de la config; le CLI mesure γ_obs sur les poids réels. Ce générateur produit la commande exacte à exécuter localement.",
"diagnose.desc": "Choisis les options et copie-colle la commande générée sur ta machine locale (Python + transformers + numpy). Mode rapide ≈5 min CPU; complet ≈20–60 min GPU.",
"diagnose.model_label": "ID du modèle HF:",
"diagnose.theta_label": "θ (auto si vide):",
"diagnose.n_label": "Contexte N:",
"diagnose.options_label": "Options:",
"diagnose.opt_fast": "--fast (CPU, ~5 min)",
"diagnose.opt_cpu": "--cpu (forcer CPU)",
"diagnose.opt_4bit": "--load_in_4bit (modèles ≥7B)",
"diagnose.local_label": "--local path (optionnel):",
"diagnose.build_btn": "📋 Générer la commande",
"diagnose.cmd_title": "Commande générée :",
"diagnose.copy_btn": "📋 Copier dans le presse-papiers",
"diagnose.next_steps": "Prochaines étapes: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Exécute la commande (4) JSON résultat → upload via mode Inspect pour analyse TAF complète.",
"modes.phase": "📊 Diagramme de phase",
"phase.title": "📊 Diagramme de phase (γ × θ)",
"phase.tip": "Chaque point est un modèle du panel empirique du paper. x: log θ; y: γ. La ligne Hagedorn γ=1 sépare Phase A de Phase B. Hover pour détails, click pour charger dans le formulaire.",
"phase.desc": "23 modèles dans le panel; courbe Padé à T=2000.",
"modes.desc": "Démarrage rapide: collez n'importe quel id de modèle HuggingFace (ex. meta-llama/Meta-Llama-3-8B), cliquez Profiler. Voyez les 5 recettes évaluées en quelques secondes.",
"profile.title": "📇 Profiler un modèle",
"profile.desc": "Pour techniciens: quand vous avez besoin d'un instantané complet de viabilité d'un modèle candidat. Un clic exécute les 5 recettes et produit une TAF Card unifiée.",
"profile.preset_label": "Préréglage:",
"profile.preset_default": "— ou choisir dans la liste —",
"profile.hf_label": "ID modèle HF:",
"profile.fetch_btn": "📥 Charger",
"profile.btn": "🚀 Générer profil complet",
"profile.quickstart": "💡 Démarrage rapide: choisissez un préréglage → cliquez Générer. Ou collez un id depuis HF Hub tendances → 📥 Charger → Générer.",
"compare.title": "🆚 Comparer côte à côte",
"compare.desc": "Pour techniciens: quand vous choisissez entre 2-3 modèles candidats pour un scénario de déploiement spécifique. Même recette, plusieurs modèles, verdicts côte à côte.",
"compare.recipe_label": "Recette:",
"compare.T_eval_label": "T_eval (contexte cible):",
"compare.models_title": "Modèles à comparer (jusqu'à 3)",
"compare.btn": "🚀 Comparer",
"compare.example": "💡 Essayez: collez 3 modèles populaires de 7-8B (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), recette X-2, T_eval=16000. Voyez lequel gère le mieux le contexte long.",
"ask.title": "❓ Votre question",
"ask.placeholder": "ex. Mistral-7B gérera-t-il 16K NIAH? Ou: J'ai 5,000$, quel modèle puis-je entraîner? Ou: GPU le moins cher pour servir Llama-70B à 100M tokens/jour?",
"ask.btn": "🚀 Analyser",
"ask.example_btn": "💡 Essayer un exemple",
"recipe.title": "📋 Recette",
"recipe.default": "— choisir une recette —",
"recipe.input_title": "🎯 Entrées",
"verdict.title": "📊 Verdict",
"chain.title": "🔍 Chaîne de calcul",
"chain.desc": "Chaque nombre ci-dessous est du Python déterministe. Cliquez sur une étape pour développer.",
"answer.title": "💬 Réponse en langage naturel",
"share.btn": "🔗 Copier le lien",
"share.copied": "✅ Copié dans le presse-papiers!",
"share.download": "💾 Télécharger JSON",
"share.download_md": "📝 Markdown",
"share.download_tex": "📜 LaTeX",
"share.submit": "📤 Soumettre au registry",
"share.submit_clip_ok": "↗ GitHub ouvert. Corps copié dans le presse-papiers — collez-le dans le corps de l'issue.",
"share.submit_clip_fail": "↗ GitHub ouvert. Presse-papiers bloqué — corps dans la console du navigateur (F12).",
"share.import_title": "📂 Importer un résultat TAF partagé",
"a11y.skip": "Aller au contenu principal",
// v0.6.2 — refonte de la landing : démarrage rapide + inventaire + tooltips d'architecture
"qs.title": "⚡ Démarrage rapide",
"qs.step1": "Collez un model ID HuggingFace (ex. meta-llama/Meta-Llama-3-8B)",
"qs.step2": "Cliquez sur 📇 Profile a model",
"qs.step3": "Lisez votre TAF Card — verdict par cas d'usage + chiffres clés + maths vérifiées par Lean+Mathlib",
"qs.cta": "↓ Commencer",
"inv.title": "🧰 Ce que cet outil vous offre",
"inv.recipes.title": "🎯 8 recettes — ce modèle convient-il à votre usage ?",
"inv.recipes.x1.title": "Entraînement propre vs API",
"inv.recipes.x1.body": "lequel coûte moins cher pour votre trafic ?",
"inv.recipes.x2.title": "Contexte long",
"inv.recipes.x2.body": "tient-il 32k / 128k tokens de manière fiable ?",
"inv.recipes.x3.title": "Budget",
"inv.recipes.x3.body": "avec $X, quel modèle pouvez-vous entraîner ?",
"inv.recipes.x5.title": "Matériel",
"inv.recipes.x5.body": "quel GPU pour servir N tokens/jour ?",
"inv.recipes.x19.title": "KV cache",
"inv.recipes.x19.body": "comment compresser sans casser la qualité ?",
"inv.recipes.x21.title": "Pureté d'imprint",
"inv.recipes.x21.body": "à quel point l'encodage positionnel est-il propre ?",
"inv.recipes.x22.title": "Compute-contexte",
"inv.recipes.x22.body": "le modèle entre-t-il dans la bande empirique ?",
"inv.recipes.x23.title": "Phase IH",
"inv.recipes.x23.body": "pré- ou post-induction-head ?",
"inv.diag.title": "🔬 Diagnostics",
"inv.diag.gamma": "γ prédit vs observé — auto-classe le modèle en 5 régimes (normal · fraude / contexte gonflé · compressé · over-Padé · sliding-window)",
"inv.diag.cardy": "Cardy ΔH — décalage d'entropie entre contexte observé et nominal",
"inv.diag.fals": "Tableau de falsifiabilité — vérifie 23 prédictions spécifiques (F1–F23)",
"inv.diag.alg": "Cohérence algébrique — 8 identités mathématiques que le modèle doit satisfaire",
"inv.verify.title": "✓ Maths formellement vérifiées",
"inv.verify.count": "37 théorèmes machine-proven en Lean 4 + Mathlib4",
"inv.verify.click": "Cliquez sur un badge → ouvre la ligne source sur GitHub",
"inv.verify.reverify": "Vérifiez vous-même : lake build (≈5 s après cache)",
"inv.export.title": "📤 Export et partage",
"inv.export.formats": "JSON · Markdown · LaTeX (prêt pour papier)",
"inv.export.share": "Lien reproductible (état encodé dans l'URL)",
"inv.export.registry": "Soumettre au registre communautaire sur GitHub",
"arch.summary": "Architectures prises en charge",
"arch.anyhf": "✓ Tout modèle public HuggingFace",
"tooltip.mha": "Multi-Head Attention : chaque position attend via plusieurs têtes parallèles à la fois.",
"tooltip.gqa": "Grouped Query Attention : les queries partagent moins de keys/values que de heads (économise mémoire mais pousse γ vers Hagedorn).",
"tooltip.alibi": "Attention with Linear Biases : l'info de position est une pente apprise ajoutée aux scores, sans rotation.",
"tooltip.abspe": "Absolute Position Embeddings : chaque position a un vecteur fixe appris ajouté au token.",
"tooltip.swa": "Sliding Window Attention : chaque token n'attend que dans une fenêtre locale fixe (Mistral, gemma-2 l'utilisent).",
"tooltip.ssm": "State Space Model : couche de séquence qui maintient un état interne au lieu d'attention (Mamba, Jamba l'utilisent).",
// v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker
"modes.unmask": "🪟 Démasquer",
"unmask.title": "🪟 Démasqueur de contexte",
"unmask.tip": "Collez un id de modèle HuggingFace (ou config.json brut). L'outil détecte sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), et GQA — tout ce qui rend max_position_embeddings plus grand que le contexte effectif réel. Mistral-7B-v0.1 est l'exemple canonique : déclare 32k, attend dans ~4-8k.",
"unmask.desc": "Êtes-vous sur le point de dépenser de l'argent sur un modèle qui n'attend pas vraiment aussi loin ? Collez un id et découvrez-le en 1 seconde. Sans GPU, sans inférence — juste de l'arithmétique sur config.json.",
"unmask.id_label": "ID modèle HF :",
"unmask.fetch_btn": "🔍 Démasquer",
"unmask.paste_summary": "Ou collez config.json brut (modèles privés / en dev)",
"unmask.paste_btn": "🔍 Démasquer config collé",
"unmask.label.declared": "Contexte déclaré",
"unmask.label.effective": "Effectif (estimé)",
"unmask.label.ratio": "Ratio",
"unmask.section.flags": "Drapeaux d'architecture",
"unmask.section.warnings": "Avertissements",
"unmask.section.reco": "Recommandation",
"unmask.flag.swa": "SWA",
"unmask.flag.rope": "RoPE scaling",
"unmask.flag.gqa": "GQA",
"unmask.flag.layers": "Couches",
"unmask.flag.dhead": "d_head",
"unmask.flag.theta": "RoPE θ",
"unmask.flag.yes": "oui",
"unmask.flag.no": "non",
"unmask.flag.full_mha": "non (MHA complet, {n} heads)",
"unmask.verdict.honest": "✅ HONNÊTE",
"unmask.verdict.inflated": "⚠ GONFLÉ",
"unmask.verdict.severely_inflated": "❌ GRAVEMENT GONFLÉ",
"unmask.verdict.yarn_extended": "⚠ YARN-ÉTENDU",
"unmask.verdict.unknown": "❓ INCONNU",
"unmask.warn.swa_window": "Fenêtre SWA : {window} tokens — chaque couche n'attend que dans cette fenêtre.",
"unmask.warn.multihop": "Estimation multi-hop : ~{multiHop} tokens (conservateur : fenêtre × {factor}).",
"unmask.warn.yarn": "RoPE scaling ({type}) étend le contexte {factor}× de ~{original} à {declared} tokens.",
"unmask.warn.yarn_advice": "Contexte RoPE-étendu — vérifiez le comportement de γ à la longueur déclarée avec le diagnostic γ_check.",
"unmask.warn.gqa_small_dhead": "Petite head dim ({d_head}) + GQA : compression de KV cache probable en contexte long (γ poussé vers Hagedorn).",
"unmask.reco.honest": "Modèle d'attention complète standard. Contexte effectif correspond au déclaré ({declared} tokens).",
"unmask.reco.inflated": "Effectif ~{effective} tokens via SWA. Utilisez γ_check pour vérifier le comportement à votre longueur cible.",
"unmask.reco.severely_inflated": "Traitez-le comme un modèle de ~{effective} tokens en pratique. Le claim de {declared} tokens ne s'applique que via des chaînes d'attention cross-layer, qui dégradent empiriquement au-delà de ~2× la fenêtre SWA.",
"unmask.reco.yarn_extended": "Contexte RoPE-étendu. Lancez un benchmark long-context (NIAH à 8k / 16k / 32k / full) pour confirmer que l'extension tient. Utilisez γ_check avec T_eval = {declared}.",
"unmask.reco.unknown": "Impossible de parser le config. Vérifiez que l'URL est un modèle HF valide avec config.json public.",
"unmask.status.empty_id": "⚠ Saisissez un model id (ex. mistralai/Mistral-7B-v0.1).",
"unmask.status.fetching": "⏳ Récupération config.json pour {modelId}...",
"unmask.status.success": "✅ {modelId} analysé (verdict : {verdict})",
"unmask.status.empty_paste": "⚠ Collez d'abord un config.json.",
"unmask.status.invalid_json": "❌ JSON invalide : {error}",
"unmask.status.success_paste": "✅ Config collé analysé (verdict : {verdict})",
"unmask.pasted_label": "(config collé)",
"mode_desc.ask": "Tapez une question libre. Le LLM dans le navigateur choisit la recette et l'exécute.",
"mode_desc.recipe": "Sélectionnez une recette directement et remplissez le formulaire. Contrôle manuel complet.",
"mode_desc.profile": "Démarrage le plus rapide : collez n'importe quel model id HuggingFace, cliquez Profile. Voyez les 5 recettes en quelques secondes.",
"mode_desc.compare": "Choisissez 2-3 modèles candidats + une recette. Verdicts côte à côte dans un tableau.",
"mode_desc.inspector": "Collez un config.json directement. Utile pour modèles privés / en dev non publiés sur HF Hub.",
"mode_desc.diagnose": "Construit la commande CLI diagnose_model.py pour MESURER γ_obs sur GPU réel. Le navigateur prédit ; le CLI mesure.",
"mode_desc.phase": "Scatter γ × θ du panel empirique du papier. Survolez les points pour détails, cliquez pour charger dans Diagnose / Recipe.",
"mode_desc.unmask": "Détecte si max_position_embeddings est trompeur (SWA / YaRN / RoPE-scaling). Collez un model id, obtenez un verdict en 1 ligne.",
"profile.preset_loaded": "✅ Préréglage chargé pour {id}. Formulaire pré-rempli. (Cliquez 📥 Fetch pour écraser avec le dernier config depuis HF Hub.)",
// v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer
"modes.template": "📜 Chat-template",
"mode_desc.template": "Détecte la famille de chat-template d'un modèle (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Donne le flag CLI exact pour lm-eval / vLLM / transformers.",
"template.title": "📜 Détecteur de Chat-template",
"template.tip": "Collez un model id HF (ou tokenizer_config.json brut). Détecte la famille du chat-template et donne le commande exacte pour l'utiliser correctement. lm-eval-harness divise l'accuracy par 2 silencieusement si vous oubliez de l'appliquer (issue #1841).",
"template.desc": "Avez-vous oublié --apply_chat_template ? La plupart des évals multi-tours échouent à ~50% parce que le chat template n'a pas été appliqué. Collez un model id, obtenez le flag CLI exact pour votre stack.",
"template.id_label": "ID modèle HF :",
"template.fetch_btn": "📜 Détecter",
"template.paste_summary": "Ou collez tokenizer_config.json brut (modèles privés)",
"template.paste_btn": "📜 Détecter config collé",
"template.label.family": "Famille détectée",
"template.label.markers": "Marqueurs correspondants",
"template.label.tpl_len": "Longueur du template",
"template.section.warnings": "Avertissements",
"template.section.commands": "Commandes par framework",
"template.section.raw": "Template brut (preview)",
"template.family.custom": "custom (famille inconnue)",
"template.family.none": "(pas de chat_template)",
"template.verdict.ok": "✅ TEMPLATE DÉTECTÉ",
"template.verdict.custom": "⚠ TEMPLATE CUSTOM",
"template.verdict.missing": "❌ PAS DE CHAT TEMPLATE",
"template.verdict.base_model": "ℹ MODÈLE DE BASE (sans chat)",
"template.verdict.unknown": "❓ INCONNU",
"template.warn.no_chat_template": "Pas de champ chat_template dans tokenizer_config.json. Typique des modèles base / pré-entraînés. Si vous attendiez un modèle instruct-tuned, le mauvais fichier peut être chargé.",
"template.warn.custom_template": "Template non standard ({length} chars). L'outil n'a pas pu le faire correspondre aux familles connues. Inspectez le preview et vérifiez que votre framework d'éval le supporte.",
"template.warn.lm_eval_apply": "lm-eval-harness : ajoutez --apply_chat_template ou votre accuracy chutera silencieusement de ~50% sur les évals multi-tours (issue #1841).",
"template.warn.vllm_apply": "vLLM serve : vérifiez que --chat-template est défini (l'auto-détection échoue parfois sur les variantes fine-tunées). Suggéré : {name}.",
"template.status.empty_id": "⚠ Saisissez un model id (ex. mistralai/Mistral-7B-Instruct-v0.3).",
"template.status.fetching": "⏳ Récupération tokenizer_config.json pour {modelId}...",
"template.status.success": "✅ {modelId} détecté (verdict : {verdict})",
"template.status.empty_paste": "⚠ Collez d'abord un tokenizer_config.json.",
"template.status.invalid_json":"❌ JSON invalide : {error}",
"template.status.success_paste":"✅ Config collé détecté (verdict : {verdict})",
"template.pasted_label": "(tokenizer_config collé)",
// v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor
"modes.arena": "🎯 Arena CI",
"mode_desc.arena": "Récupère les intervalles de confiance à partir des données brutes de votes pairwise (MLE Bradley-Terry + bootstrap). Détecte les paires statistiquement à égalité que le leaderboard public d'Arena cache.",
"arena.title": "🎯 Reconstructeur Arena-Elo CI",
"arena.tip": "Chatbot Arena masque les intervalles de confiance dans le leaderboard public. Un écart de 5 Elo peut être statistiquement insignifiant. Collez les données brutes de votes (model_a, model_b, winner) — l'outil calcule le MLE Bradley-Terry + bootstrap CIs et liste les égalités statistiques (overlap CI).",
"arena.desc": "GPT-4 est-il vraiment meilleur que Claude — ou sont-ils à égalité ? Collez le CSV de votes pairwise (ou cliquez Charger un échantillon). MLE Bradley-Terry + 200 itérations de bootstrap → Elos classés avec CIs 95% et détection d'égalités statistiques. Tout dans le navigateur.",
"arena.sample_btn": "📊 Charger échantillon",
"arena.run_btn": "🎯 Calculer CIs",
"arena.clear_btn": "🗑️ Effacer",
"arena.csv_summary": "CSV de votes (header : model_a,model_b,winner ; winner ∈ a/b/tie)",
"arena.section.ranked": "Elos classés avec CIs 95%",
"arena.section.ties": "Égalités statistiques (overlap CI)",
"arena.section.summary": "Résumé",
"arena.col.rank": "#",
"arena.col.model": "Modèle",
"arena.col.elo": "Elo",
"arena.col.ci": "CI 95%",
"arena.col.ci_width": "± demi-largeur",
"arena.col.matches": "Matchs",
"arena.col.wins": "V / D / E",
"arena.col.tie_pair": "Paire",
"arena.col.tie_diff": "Écart Elo",
"arena.col.tie_overlap": "Overlap CI",
"arena.no_ties": "Aucune égalité statistique — toutes les paires sont distinguables à 95% CI.",
"arena.summary.votes": "Total des votes",
"arena.summary.models": "Modèles",
"arena.summary.ties": "Égalités statistiques",
"arena.summary.bootstrap": "Itérations bootstrap",
"arena.summary.ci_level": "Niveau CI",
"arena.status.empty": "⚠ Collez un CSV de votes ou cliquez sur Charger échantillon.",
"arena.status.too_few": "⚠ Seulement {n} votes valides — il en faut au moins 10 pour ajuster Bradley-Terry de manière fiable.",
"arena.status.computing": "⏳ Calcul MLE Bradley-Terry + bootstrap sur {n} votes...",
"arena.status.done": "✅ {n} votes · {models} modèles · {ties} égalités statistiques · {ms} ms",
"arena.status.sample_loaded": "✅ Échantillon chargé (données Arena synthétiques 6 modèles). Cliquez sur Calculer CIs.",
// v0.7.3 — anti-bullshit pack #4: Contamination Prior
"modes.contam": "🧪 Contamination",
"mode_desc.contam": "Prior bayésien-ish sur la contamination d'un score de benchmark. Saisissez le cutoff d'entraînement → note 20+ benchmarks populaires (MMLU, GSM8K, HumanEval, MMLU-Pro…).",
"contam.title": "🧪 Prior de Contamination",
"contam.tip": "Calcule un prior bayésien-ish indiquant si un score de benchmark est contaminé, basé sur (date de cutoff d'entraînement) × (date de sortie du benchmark) × (inclusion connue dans corpus + historique de leaks). Open LLM Leaderboard v1 a été tué en 2024 après la contamination de MMLU/HellaSwag.",
"contam.desc": "Devez-vous faire confiance au score MMLU de votre modèle ? Saisissez la date de cutoff d'entraînement — l'outil note 20+ benchmarks populaires (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) et vous dit quels scores sont probablement contaminés.",
"contam.cutoff_label": "Cutoff entraînement :",
"contam.run_btn": "🧪 Noter tous les benchmarks",
"contam.section.ranked": "Priors de contamination par benchmark",
"contam.section.high": "🔴 Benchmarks à haut risque (traitez les scores comme non fiables)",
"contam.section.medium": "🟡 Risque moyen (vérifiez avec des alternatives)",
"contam.section.low": "🟢 Faible risque (probablement propres)",
"contam.col.benchmark": "Benchmark",
"contam.col.released": "Sorti",
"contam.col.gap": "Écart (mois)",
"contam.col.prior": "P(contam)",
"contam.col.level": "Niveau",
"contam.col.corpora": "Dans corpus",
"contam.col.category": "Catégorie",
"contam.label.high": "Haut risque",
"contam.label.medium": "Moyen",
"contam.label.low": "Faible",
"contam.no_entries": "(aucun dans cette catégorie)",
"contam.advice.high": "Traitez ces scores comme non fiables. Remplacez par des alternatives plus récentes / à test privé (MMLU-Pro, GPQA, MUSR, MATH-500).",
"contam.advice.medium": "À prendre avec précaution. Cherchez une réplication sur un subset held-out ou des reproductions communautaires.",
"contam.advice.low": "Score probablement non contaminé, mais absence de leak n'est pas une preuve — vérifiez avec un test alternatif.",
"contam.summary.headline": "Cutoff {cutoff} · {n} benchmarks notés",
"contam.status.empty": "⚠ Saisissez une date de cutoff d'entraînement (ex. 2023-12).",
"contam.status.bad_date": "⚠ Format de date incorrect. Utilisez YYYY-MM ou YYYY-MM-DD.",
"contam.status.done": "✅ Cutoff {cutoff} · {n} benchmarks notés · {high} à haut risque",
// v0.7 — Help modal section
"help.v07.title": "🆕 v0.7 — Pack anti-bullshit (4 nouveaux modes)",
"help.v07.intro": "v0.7 (2026-05-06) : quatre nouveaux modes qui résolvent des problèmes concrets remontés par la communauté HuggingFace. Chacun tourne dans votre navigateur sans inférence — pure métadonnée + maths.",
"help.v07.unmask.title": "🪟 Démasqueur de Contexte",
"help.v07.unmask.body": "Détecte quand max_position_embeddings est trompeur. Mistral-7B-v0.1 déclare 32k mais attend dans ~4-8k via SWA. Collez un id HF → verdict en 1 seconde (HONNÊTE / GONFLÉ / GRAVEMENT GONFLÉ / YARN-ÉTENDU). Détecte SWA, RoPE-scaling (YaRN/linear/dynamic NTK), petit d_head + GQA. Cas d'usage : avant de payer un GPU pour 32k de contexte, vérifiez que le modèle attend vraiment aussi loin.",
"help.v07.template.title": "📜 Détecteur de Chat-template",
"help.v07.template.body": "Détecte la famille de chat-template d'un modèle (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) et donne le flag CLI exact pour lm-evaluation-harness, vLLM, et transformers. Résout l'issue #1841 de lm-eval-harness : oublier --apply_chat_template divise l'accuracy multi-tours par 2 silencieusement. Cas d'usage : avant de reporter un score, confirmez avoir appliqué le template correctement.",
"help.v07.arena.title": "🎯 Reconstructeur Arena-Elo CI",
"help.v07.arena.body": "Chatbot Arena masque les intervalles de confiance de son leaderboard public — un écart de 5 Elo peut être statistiquement insignifiant. Collez des données brutes de votes pairwise (model_a, model_b, winner) → MLE Bradley-Terry + bootstrap 200 itérations → Elos classés avec CIs 95% et un panneau \"égalités statistiques\" listant les paires dont les CIs se chevauchent. Essayez le bouton Charger échantillon. Cas d'usage : avant de déclarer \"modèle A bat modèle B\", vérifiez que leurs CIs ne se chevauchent pas.",
"help.v07.contam.title": "🧪 Prior de Contamination",
"help.v07.contam.body": "Prior bayésien-ish sur la contamination d'un score de benchmark. Saisissez la date de cutoff d'entraînement de votre modèle → l'outil note 20+ benchmarks populaires (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) par P(contamination) selon l'écart temporel, l'inclusion dans corpus et l'historique de leaks connus. Open LLM Leaderboard v1 a été tué en 2024 après la contamination de MMLU/HellaSwag. Cas d'usage : décidez quels scores croire en comparant deux modèles.",
"help.v07.quant.title": "⚖️ Classificateur de régime de quantification",
"help.v07.quant.body": "Prédit le γ-shift et ΔPPL pour tout (modèle × schéma de quantification : NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8…). Arch-aware : petit d_head + GQA agressif → plus sensible ; les schémas calibrés (AWQ) absorbent mieux le shift que les non calibrés (NF4). Recommande des alternatives plus sûres si un cliff est détecté. Cas d'usage : avant de quantifier, prédisez si votre combo architecture × schéma maintiendra la PPL acceptable, avec une suggestion concrète de switch sinon.",
"help.v07.drift.title": "🔀 Borne de drift inter-frameworks",
"help.v07.drift.body": "Même modèle, scores différents sur setups différents. L'outil prédit le drift max admissible dû au seul bruit numérique (dtype, framework, batch). Si l'écart observé le dépasse → vrai bug, généralement chat-template mismatch (issue #1841 lm-eval-harness) ou layout KV-cache. Essayez le bouton "Charger échantillon" pour le bug chat-template canonique. Cas d'usage : avant de reporter une régression ou de revendiquer la reproductibilité, vérifiez si l'écart entre deux évals est plus grand que ce que le bruit numérique peut expliquer.",
"inv.v07.drift": "🔀 Drift — bug ou bruit ? Prédit l'écart max admissible entre deux évals",
"help.v07.niah.title": "🔍 Gap NIAH → Reasoning",
"help.v07.niah.body": "Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH (retrieval de needle) mais échouent au reasoning multi-hop au même contexte. L'outil prédit les deux taux de réussite à partir de l'architecture (γ_Padé + d_horizon + pression arch : petit d_head, GQA, SWA), reporte le gap, et trouve le \"contexte sûr pour reasoning\" où le reasoning reste ≥65%. Mode balayage montre la courbe à 1k/4k/16k/64k/T_train. Cas d'usage : avant de déployer au contexte revendiqué, découvrez si le modèle va vraiment raisonner là ou seulement retrouver.",
"inv.v07.niah": "🔍 NIAH→Reason — votre \"128k\" raisonne-t-il vraiment là, ou seulement retrouve ?",
// v0.7 — Inventory modal, 5th card
"inv.v07.title": "🆕 Pack anti-bullshit v0.7",
"inv.v07.unmask": "🪟 Unmask — config.json annonce 32k ? Voyez s'il attend vraiment aussi loin",
"inv.v07.template": "📜 Chat-template — flag CLI exact pour que lm-eval ne divise pas votre accuracy par 2 en silence",
"inv.v07.arena": "🎯 Arena CI — récupère les intervalles de confiance que Chatbot Arena cache",
"inv.v07.contam": "🧪 Contamination — note 20+ benchmarks par probabilité de contamination",
"inv.v07.quant": "⚖️ Quant — prédit le γ-shift + ΔPPL pour tout combo (modèle × schéma de quantification)",
// v0.7.3 — anti-bullshit pack #5: Quant-regime classifier
"modes.quant": "⚖️ Quant",
"mode_desc.quant": "Prédit le γ-shift et ΔPPL pour tout (modèle × schéma de quantification). Arch-aware : petit d_head + GQA → plus sensible. Recommande des alternatives plus sûres si un cliff est détecté.",
"quant.title": "⚖️ Classificateur de régime de quantification",
"quant.tip": "Prédit le γ-shift (et la ΔPPL résultante) pour une paire (modèle × schéma). Les claims génériques comme 'AWQ ~95% retention' sont trop vagues — TAF utilise d_head, ratio GQA, flag SWA et taille du modèle pour donner un verdict arch-spécifique. Résout : la communauté HF rapporte des cliffs de quantification imprédictibles (NF4 -2 PPL sur Phi-3 mais OK sur Llama-3-8B).",
"quant.desc": "La quantification cassera-t-elle votre modèle ? Collez un id HF, choisissez un schéma — obtenez le γ-shift prédit, la bande ΔPPL attendue et une alternative recommandée si c'est un cliff. Navigateur uniquement, sans GPU, sans set de calibration.",
"quant.id_label": "ID modèle HF :",
"quant.fetch_btn": "📥 Récupérer config",
"quant.scheme_label": "Schéma quant :",
"quant.run_btn": "⚖️ Prédire",
"quant.all_btn": "📊 Comparer tous les schémas",
"quant.regime.safe": "✅ SÛR",
"quant.regime.mild": "✅ COMPRESSION LÉGÈRE",
"quant.regime.significant": "⚠ DÉGRADATION SIGNIFICATIVE",
"quant.regime.cliff": "❌ CLIFF SÉVÈRE",
"quant.label.gamma_shift": "γ shift",
"quant.label.delta_ppl": "ΔPPL (est.)",
"quant.label.arch_mult": "Multiplicateur arch",
"quant.section.breakdown": "Détail",
"quant.section.reco": "Recommandation",
"quant.section.compare": "Tous les schémas (triés par sécurité)",
"quant.field.scheme": "Schéma",
"quant.field.calibrated": "calibré",
"quant.field.uncalibrated": "non calibré",
"quant.field.base_penalty": "Pénalité de base",
"quant.field.arch_mult_full": "Multiplicateur architectural",
"quant.field.gamma_shift": "γ shift prédit",
"quant.field.ppl_band": "Bande ΔPPL (est.)",
"quant.field.params": "Paramètres",
"quant.col.scheme": "Schéma",
"quant.col.bits": "Bits",
"quant.col.gamma_shift": "γ shift",
"quant.col.ppl_band": "Bande ΔPPL",
"quant.col.regime": "Régime",
"quant.reco.switch_to_awq": "Passez à {scheme} — le 4-bit calibré gère bien mieux les petits d_head + GQA que NF4. ΔPPL attendue chute ~2-3×.",
"quant.reco.switch_to_q5_km": "Passez à {scheme} — Q5 garde plus de dimensions de head intactes à faible coût (~25% plus grand seulement).",
"quant.reco.switch_to_q4_km": "Passez à {scheme} — Q3/Q2 sont trop agressifs pour cette architecture.",
"quant.reco.consider_awq": "Considérez {scheme} — la calibration réduit significativement le γ-shift sur cette architecture.",
"quant.reco.use_higher_bits": "Utilisez une alternative à plus de bits — cette architecture n'absorbe pas le 4-bit proprement. Essayez 5 ou 8-bit.",
"quant.reco.verify_with_eval": "Vérifiez avec une vraie éval — le shift prédit est borderline. Lancez NIAH à votre contexte cible avant de déployer.",
"quant.reco.no_action": "Pas d'action requise — la quantification est sûre pour cette architecture.",
"quant.summary.headline_all": "Tous les schémas pour {modelId}",
"quant.status.empty_id": "⚠ Saisissez un model id (ex. meta-llama/Llama-3.2-1B).",
"quant.status.fetching": "⏳ Récupération config.json pour {modelId}...",
"quant.status.fetched": "✅ Config récupéré pour {modelId}. Choisissez un schéma et cliquez Prédire (ou Comparer tous).",
"quant.status.no_scheme": "⚠ Choisissez un schéma de quantification dans le dropdown.",
"quant.status.done": "✅ Régime prédit : {regime}",
"quant.status.done_all": "✅ Comparé {n} schémas — triés par sécurité.",
// v0.7.4 — HF Hub autocomplete: privacy + rate-limit
"hf_auto.privacy": "🔒 Requêtes envoyées à huggingface.co/api · cache local 5 min",
"hf_auto.rate_limited": "⚠ Rate limit HuggingFace — réessayez dans un moment, ou tapez l'id complet manuellement",
"hf_auto.gated_msg": "est gated. Acceptez la licence ici :",
// v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound
"modes.drift": "🔀 Drift",
"mode_desc.drift": "Prédit le drift max admissible entre deux scores de benchmark donnés (framework, dtype, batch, chat-template). Distingue les vrais bugs du bruit numérique.",
"drift.title": "🔀 Borne de drift inter-frameworks",
"drift.tip": "Même modèle, scores différents sur des setups différents. L'écart est-il du bruit ou un vrai bug ? Saisissez deux scores avec leur (framework, dtype, batch, chat-template) — l'outil prédit le drift max admissible dû au seul bruit numérique. Si l'écart observé le dépasse → vrai bug, généralement chat-template mismatch (issue #1841 lm-eval) ou layout KV-cache.",
"drift.desc": "Votre modèle donne 67.2 sur lm-eval-hf et 65.1 sur vLLM-served. Bug ou bruit ? Saisissez les deux scores avec (framework, dtype, batch, chat-template appliqué ?). L'outil prédit la bande de bruit et signale les vrais bugs. arxiv 2506.09501 documente cela comme un problème majeur de reproductibilité d'évals.",
"drift.setup_a": "Setup A",
"drift.setup_b": "Setup B",
"drift.score": "Score",
"drift.framework": "Framework",
"drift.dtype": "Dtype",
"drift.batch": "Batch",
"drift.template": "Chat-template",
"drift.template.applied": "appliqué",
"drift.template.not_applied": "non appliqué",
"drift.template.unknown": "inconnu",
"drift.run_btn": "🔀 Calculer la borne de drift",
"drift.sample_btn": "📊 Charger échantillon (bug chat-template)",
"drift.label.observed": "Écart observé",
"drift.label.band": "Bande numérique",
"drift.label.ratio": "Écart / bande",
"drift.section.setups": "Setups",
"drift.section.breakdown": "Contributeurs au drift (bande numérique)",
"drift.section.verdict": "Verdict et recommandation",
"drift.contrib.dtype": "Mismatch de dtype",
"drift.contrib.framework": "Framework",
"drift.contrib.batch": "Différence de batch",
"drift.contrib.template": "MISMATCH de chat-template",
"drift.dominant_cause": "Cause dominante",
"drift.cause.dtype": "différence de précision dtype",
"drift.cause.framework": "différence de framework / kernel",
"drift.cause.batch": "chemins de normalisation par batch",
"drift.cause.template_mismatch": "chat-template appliqué d'un côté mais pas de l'autre (motif #1841 lm-eval-harness — typiquement -50% sur multi-tours)",
"drift.verdict.noise": "✅ BRUIT NUMÉRIQUE",
"drift.verdict.suspicious": "⚠ SUSPECT — vérifiez",
"drift.verdict.bug": "❌ VRAI BUG — investiguez",
"drift.verdict.bug_template": "❌ BUG DE CHAT-TEMPLATE",
"drift.reco.noise": "L'écart entre dans la bande de bruit numérique attendue. Pas d'action requise ; la différence est cohérente avec la seule variation framework/dtype/batch.",
"drift.reco.suspicious": "L'écart est 1–2× la bande prédite. Borderline — possible vrai bug. Essayez d'aligner le contributeur dominant (ex. égalisez framework ou dtype) et re-testez.",
"drift.reco.bug": "L'écart est > 2× la bande prédite. C'est un vrai bug. Inspectez le contributeur dominant — probablement une différence de tokenizer / chat-template / layout KV-cache. Lancez lm-eval-harness avec --apply_chat_template et confirmez.",
"drift.reco.bug_template": "Mismatch de chat-template détecté. C'est la cause la plus commune des grands écarts d'évals (issue #1841 lm-eval-harness). Relancez le côté "non appliqué" avec --apply_chat_template (ou réglez vLLM --chat-template <name>) et re-testez.",
"drift.status.empty_scores": "⚠ Saisissez les deux scores.",
"drift.status.done": "✅ Verdict : {verdict}",
"drift.status.sample_loaded": "✅ Échantillon chargé (bug chat-template canonique). Cliquez sur Calculer la borne de drift.",
// v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor
"modes.niah": "🔍 NIAH→Reason",
"mode_desc.niah": "Prédit les taux de réussite NIAH (retrieval) et reasoning multi-hop à n'importe quel contexte. Résout : les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte (paper RULER).",
"modes.saturation": "📈 Saturation",
"mode_desc.saturation": "Indique si un benchmark discrimine encore les frontier models ou s'il est saturé (ex. MMLU 88-94% top, AIME 2025 déjà 96-100%). Retourne top-3 + verdict + remplacements recommandés.",
"modes.hub": "🧭 Solutions",
"mode_desc.hub": "Carte de chaque problème documenté de LLM-eval → mode tafagent (si couvert) + outils externes curés. Trouvez la solution sans la réinventer. 30+ pains, 7 catégories.",
"modes.yarn": "🧵 Planificateur YaRN",
"mode_desc.yarn": "Génère la configuration rope_scaling exacte pour étendre un modèle au-delà de son contexte d'entraînement — plus un verdict TAF sur la tenue réelle de la qualité d'attention à la longueur cible.",
"modes.gguf": "🧊 Pont GGUF",
"mode_desc.gguf": "Lit l'en-tête de métadonnées d'un fichier GGUF (rope_theta, context_length, quant) dans votre navigateur et donne un verdict de qualité TAF — la question que les calculateurs de VRAM ignorent : tient ET fonctionne ?",
"gguf.title": "🧊 Pont de validité GGUF",
"gguf.tip": "Tenir dans la VRAM ≠ fonctionner. Les calculateurs GGUF/VRAM lisent les métadonnées d'un modèle pour dire si un quant tient dans le GPU. Ceci lit les MÊMES métadonnées (rope_theta, context_length, schéma de quant, géométrie des têtes) directement depuis l'en-tête .gguf via HTTP Range — sans télécharger des Go — et répond à ce qu'ils n'abordent pas : la qualité d'attention tient-elle vraiment, et de combien le quant l'érode-t-il (γ-shift, ΔPPL) ?",
"gguf.desc": "Collez un dépôt GGUF (ex. Qwen/Qwen2.5-7B-Instruct-GGUF), choisissez un fichier de quant, et obtenez un verdict de qualité TAF : l'horizon d'attention effectif du modèle, plus de combien la quantification choisie décale γ pour cette architecture précise. Ne lit que l'en-tête du fichier dans votre navigateur.",
"gguf.repo_label": "ID du dépôt GGUF :",
"gguf.list_btn": "📂 Lister les fichiers quant",
"gguf.file_label": "Fichier quant :",
"gguf.target_label": "Contexte cible L (optionnel) :",
"gguf.analyze_btn": "🧊 Analyser le GGUF",
"gguf.all_btn": "📊 Comparer tous les quants",
"gguf.compare_title": "Tous les quants — comparaison de qualité",
"gguf.col.verdict": "Verdict",
"gguf.col.gamma_at_l": "γ @ L (après quant)",
"gguf.need_repo": "Saisissez un id de dépôt GGUF comme 'Qwen/Qwen2.5-7B-Instruct-GGUF'",
"gguf.listing": "Listage des fichiers .gguf depuis HF Hub…",
"gguf.no_files": "Aucun fichier .gguf trouvé dans ce dépôt.",
"gguf.found": "fichiers quant trouvés",
"gguf.pick_hint": "choisissez-en un et cliquez Analyser.",
"gguf.reading": "Lecture de l'en-tête GGUF via HTTP Range…",
"gguf.read_ok": "En-tête analysé",
"gguf.verdict.healthy": "SAIN — l'horizon effectif atteint L avec un bon γ après quant",
"gguf.verdict.usable_with_care":"UTILISABLE AVEC PRUDENCE — atteint L mais γ est modeste après quant",
"gguf.verdict.degrades": "DÉGRADE — l'attention s'effondre avant L (ou le quant l'y pousse)",
"gguf.r.arch": "Architecture",
"gguf.r.ctx_train": "Contexte d'entraînement",
"gguf.r.horizon_fp16": "Horizon d'attention (fp16)",
"gguf.r.quant": "Schéma de quant",
"gguf.r.gamma_shift": "γ-shift dû au quant",
"gguf.r.after_quant": "(après quant)",
"gguf.r.eff_horizon": "Horizon effectif (quantifié)",
"gguf.r.no_quant_shift": "— pleine précision, pas de γ-shift",
"gguf.r.note": "Horizon depuis γ_Padé / d_horizon (architecture). γ-shift de quant + ΔPPL depuis le modèle quant-regime (calibré sur la PPL de llama.cpp + papiers AWQ/GPTQ). Les deux sont des estimations — vérifiez les cas limites avec un éval réel.",
"gguf.err.not_gguf": "Ce fichier n'est pas un GGUF valide (mauvais magic).",
"gguf.err.too_large": "L'en-tête de métadonnées dépasse la limite de téléchargement — tokenizer inhabituellement grand. Essayez un autre quant.",
"gguf.err.incomplete": "Il manque rope_theta ou context_length dans les métadonnées GGUF — impossible de calculer l'horizon.",
"help.v091.gguf.title": "🧊 Pont de validité GGUF",
"help.v091.gguf.body": "La douzaine de calculateurs GGUF/VRAM (NyxKrage, oobabooga, …) lisent un en-tête .gguf pour dire si un quant tient dans le GPU. Ceci lit le même en-tête — via HTTP Range, sans télécharger des Go — et répond à ce qu'ils sautent : tient-il ET fonctionne-t-il encore ? Collez un dépôt GGUF, choisissez un fichier de quant ; le pont extrait rope_theta, context_length, le schéma de quant (depuis general.file_type ou le nom de fichier) et la géométrie des têtes, puis exécute γ_Padé / d_horizon de TAF plus le γ-shift de quant conscient de l'architecture. Sortie : horizon d'attention effectif au contexte d'entraînement, de combien le quant érode γ (et ΔPPL) pour ce modèle, et un verdict. Cas d'usage : 'Q4_K_M tient dans 8 Go — mais est-il abruti au-delà de 30K ?' → voyez l'horizon et la pénalité γ de Q4 avant de télécharger 6 Go.",
"yarn.title": "🧵 Planificateur d'extension de contexte YaRN / RoPE",
"yarn.tip": "Config + verdict, pas seulement la VRAM. Les calculateurs GGUF/VRAM disent si une longueur de contexte tient dans le GPU. Ceci donne le bloc rope_scaling exact pour config.json ET si la qualité d'attention tiendra réellement à cette longueur — avec la machinerie γ_Padé / d_horizon de TAF, entièrement dans votre navigateur.",
"yarn.desc": "Vous voulez utiliser un modèle au-delà de son contexte d'entraînement ? Saisissez le modèle (ou son θ + contexte d'entraînement) et votre longueur cible L. Obtenez le fragment rope_scaling prêt à coller (transformers ≥4.43), plus un verdict TAF : l'horizon d'attention effectif atteint-il L, ou le modèle va-t-il halluciner au-delà de d_horizon ?",
"yarn.model_label": "ID du modèle HF (optionnel) :",
"yarn.fetch_btn": "📥 Récupérer config",
"yarn.orig_label": "Contexte d'entraînement (orig max_position_embeddings) :",
"yarn.theta_label": "θ de RoPE (rope_theta) :",
"yarn.target_label": "Contexte cible L :",
"yarn.type_label": "Méthode de scaling RoPE :",
"yarn.type_auto": "Auto (recommandé)",
"yarn.plan_btn": "🧵 Planifier l'extension",
"yarn.need_id": "Saisissez un id de modèle comme 'Qwen/Qwen2.5-7B-Instruct'",
"yarn.fetching": "Récupération de config.json depuis HF Hub…",
"yarn.loaded_hint": "Ajustez si besoin, puis cliquez sur Planifier l'extension.",
"yarn.verdict.healthy": "SAIN — l'horizon effectif atteint L avec un bon γ",
"yarn.verdict.usable_with_care":"UTILISABLE AVEC PRUDENCE — fonctionne mais γ est modeste près de L",
"yarn.verdict.needs_finetune": "NÉCESSITE UN FINE-TUNE — facteur trop grand pour la seule forme close",
"yarn.verdict.degrades": "DÉGRADE — l'attention s'effondre avant L",
"yarn.verdict.no_extension_needed":"AUCUNE EXTENSION NÉCESSAIRE — L déjà dans le contexte d'entraînement",
"yarn.r.factor": "Facteur d'extension",
"yarn.r.method": "Méthode",
"yarn.r.naive": "(sans extension)",
"yarn.r.eff": "(après extension)",
"yarn.r.from": "depuis",
"yarn.r.snippet": "fragment config.json",
"yarn.r.collapsed": "effondré (au-delà du pôle de Padé)",
"yarn.copy_btn": "Copier la config",
"yarn.copied": "Copié",
"yarn.warn.theta_eff_estimate":"θ_eff ≈ θ×facteur est une estimation NTK au premier ordre ; la rampe par bande de YaRN peut légèrement différer.",
"yarn.warn.aggressive": "Facteur agressif > 4× — la qualité au-delà de d_horizon n'est pas fiable sans fine-tuning.",
"yarn.warn.horizon_short": "L'horizon effectif ne couvre pas L — attendez-vous à une perte de cohérence au-delà de d_horizon.",
"yarn.warn.finetune": "L'extension RoPE ici est une estimation en forme close ; la doc de transformers + le papier YaRN recommandent un court fine-tune pour des facteurs au-delà de ~2–4×.",
"yarn.err.no_orig": "Saisissez le contexte d'entraînement (orig max_position_embeddings), ou récupérez un modèle.",
"yarn.err.no_theta": "Saisissez θ de RoPE (rope_theta), ou récupérez un modèle.",
"yarn.err.no_target": "Saisissez une longueur de contexte cible L.",
"help.v09.title": "🆕 v0.9 — Planificateur d'extension de contexte YaRN / RoPE",
"help.v09.intro": "v0.9 (2026-05-23) : la question la plus posée sur HuggingFace — \"comment régler rope_scaling pour étendre le contexte, et est-ce que ça marchera vraiment ?\" — résolue avec un fragment de config à coller ET un verdict de qualité TAF. Navigateur uniquement, sans inférence.",
"help.v09.yarn.title": "🧵 Planificateur d'extension de contexte YaRN / RoPE",
"help.v09.yarn.body": "La douzaine de calculateurs GGUF/VRAM sur HF (NyxKrage, oobabooga, DavidAU, …) répondent tous à la même question : la longueur de contexte L tient-elle dans mon GPU ? Aucun ne répond à la plus difficile : L tient-il ET fonctionne-t-il encore ? Saisissez un id de modèle (ou son θ + contexte d'entraînement) et une longueur cible L. Le planificateur calcule le facteur d'extension, émet le bloc rope_scaling exact pour transformers ≥4.43 (yarn / linear / dynamic / llama3, avec rampes β par défaut du papier), puis exécute la math γ_Padé / d_horizon de TAF : γ sans extension (le problème), γ après la méthode choisie (la solution), l'horizon d'attention effectif, et un verdict — SAIN / UTILISABLE-AVEC-PRUDENCE / NÉCESSITE-FINETUNE / DÉGRADE. Il signale honnêtement l'estimation θ_eff≈θ·facteur et l'exigence de fine-tune au-delà de 4×. Cas d'usage : 'Je veux Mistral-7B (θ=10k, 8k entraîné) à 32k' → voyez γ s'effondrer en usage naïf, YaRN le récupérer partiellement, et obtenez la config exacte à coller.",
"niah.title": "🔍 Gap NIAH → Reasoning",
"niah.tip": "NIAH (Needle in a Haystack) teste le retrieval : 'trouve ce fait dans un long texte'. Le reasoning multi-hop teste l'inférence : 'combine les faits X+Y au début avec le fait Z à la fin'. Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte. Cet outil prédit les deux taux à partir de la seule architecture.",
"niah.desc": "Votre modèle revendique 128k de contexte. Va-t-il vraiment raisonner à 64k, ou seulement retrouver ? Collez un model id HF et un contexte cible — l'outil prédit les taux de réussite NIAH et reasoning multi-hop, le gap, et un 'contexte sûr' où le reasoning reste ≥65%.",
"niah.id_label": "ID modèle HF :",
"niah.fetch_btn": "📥 Récupérer config",
"niah.teval_label": "Contexte cible (T_eval) :",
"niah.run_btn": "🔍 Prédire",
"niah.sweep_btn": "📊 Balayer les contextes",
"niah.label.niah": "Taux NIAH",
"niah.label.reasoning": "Taux Reasoning",
"niah.label.gap": "Gap",
"niah.label.safe_ctx": "Contexte sûr pour reasoning",
"niah.section.breakdown": "Détail architectural",
"niah.section.reco": "Recommandation",
"niah.calib.heading": "Calibré avec RULER (données publiées par NVIDIA)",
"niah.calib.matched": "Correspond {alias} → ligne KB {canonical}.",
"niah.calib.aggregate": "Agrégat RULER",
"niah.calib.interp": "interpolé entre",
"niah.calib.extrapolated": "extrapolé hors de la plage mesurée par RULER",
"niah.calib.col.heuristic": "Heuristique",
"niah.calib.col.calibrated": "Calibré RULER",
"niah.calib.col.delta": "Δ",
"niah.calib.factors": "Facteurs par tâche du paper RULER, Appendice Tables 13-16 :",
"niah.calib.factors_caveat": "plage honnête : retrieval 0.95-1.10×, reasoning 0.60-0.85×",
"niah.calib.claimed_vs_effective": "Rapporté dans le paper",
"niah.calib.claimed": "claimed",
"niah.calib.effective": "effective",
"niah.calib.source": "Source",
"niah.calib.miss": "Calibration RULER indisponible pour ce modèle — utilisation de l'heuristique architecturale seule. Ajoutez à data/ruler_kb.json si vous avez des chiffres mesurés.",
"niah.section.sweep": "Balayage des taux par longueur de contexte",
"niah.field.dhorizon": "d_horizon (effectif)",
"niah.field.ratio": "T_eval / d_horizon",
"niah.field.arch_pressure": "Pression arch (petit d_head + GQA + SWA)",
"niah.field.theta": "RoPE θ",
"niah.field.t_train": "T_train (revendiqué)",
"niah.col.context": "T_eval",
"niah.col.niah": "NIAH",
"niah.col.reasoning": "Reasoning",
"niah.col.gap": "Gap",
"niah.col.verdict": "Verdict",
"niah.verdict.robust": "✅ ROBUSTE",
"niah.verdict.marginal": "⚠ MARGINAL",
"niah.verdict.degraded": "⚠ DÉGRADÉ",
"niah.verdict.retrieval_only": "❌ RETRIEVAL UNIQUEMENT",
"niah.verdict.broken": "❌ CASSÉ",
"niah.reco.robust": "Retrieval et reasoning tiennent tous deux à ce contexte. Sûr de déployer pour les tâches de lookup et d'inférence.",
"niah.reco.marginal": "Borderline. Le retrieval fonctionne mais le reasoning est fragile. À utiliser pour le lookup, pas pour l'inférence multi-étapes.",
"niah.reco.degraded": "Chute significative du reasoning. Le modèle trouve des faits mais peine à les combiner. Évitez les tâches multi-hop à cette longueur.",
"niah.reco.retrieval_only": "Constat canonique de RULER : le modèle passe NIAH mais échoue au reasoning. Utile pour les setups RAG (où le LLM ne fait que localiser les faits) mais PAS pour l'inférence chaînée. Réduisez votre contexte à la valeur 'sûre' ci-dessous.",
"niah.reco.broken": "Le modèle échoue même au retrieval basique à ce contexte. Traitez-le comme hors-distribution — re-testez à un contexte plus court.",
"niah.safe_context": "≤ {ctx} tokens (reasoning ≥ 65%)",
"niah.safe_context_none": "Aucun contexte sûr trouvé sous votre cible — le modèle échoue au reasoning même à de petits contextes.",
"niah.summary.sweep": "{modelId} — taux par contexte",
"niah.status.empty_id": "⚠ Saisissez un model id (ex. meta-llama/Llama-3.1-8B-Instruct).",
"niah.status.bad_teval": "⚠ Saisissez un contexte cible (≥ 512 tokens).",
"niah.status.fetching": "⏳ Récupération config.json pour {modelId}...",
"niah.status.fetched": "✅ Config récupéré pour {modelId}. Réglez T_eval et cliquez Prédire (ou Balayer les contextes).",
"niah.status.done": "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%",
"niah.status.sweep_done": "✅ Balayé {n} longueurs de contexte.",
"saturation.title": "📈 Détecteur de saturation des benchmarks",
"saturation.tip": "MMLU est saturé (88-94% sur tous les frontier models). Annoncer '92% sur MMLU' n'a plus de sens. Cet outil vous dit quels benchmarks discriminent encore les frontier models, lesquels sont saturés, et quoi utiliser à la place. Données : DemandSphere AI Frontier Tracker (CC BY-NC 4.0) rafraîchi 2026-05.",
"saturation.desc": "Votre benchmark est-il encore utile ? Choisissez un benchmark pour voir top-3 frontier scores, spread, et un verdict (saturated / near-saturated / discriminative) + remplacements recommandés.",
"saturation.select_label": "Benchmark :",
"saturation.select.all": "— afficher tous les benchmarks —",
"saturation.run_btn": "📈 Classer",
"saturation.all_btn": "📊 Afficher tout",
"saturation.col.spread": "Écart top-3",
"saturation.col.mean": "Moyenne top-3",
"saturation.col.n": "Modèles",
"saturation.col.bench": "Benchmark",
"saturation.col.verdict": "Verdict",
"saturation.col.reco": "Reco principale",
"saturation.col.model": "Modèle",
"saturation.col.score": "Score",
"saturation.section.top3": "Top-3 frontier scores",
"saturation.section.recommendations": "Alternatives recommandées",
"saturation.section.note": "Notes",
"saturation.section.all": "Tous les benchmarks suivis",
"saturation.verdict.saturated": "🚨 SATURÉ",
"saturation.verdict.near_saturated": "⚠ PRESQUE SATURÉ",
"saturation.verdict.discriminative": "✅ DISCRIMINATIF",
"saturation.verdict.sparse_data": "ℹ DONNÉES RARES",
"saturation.borderline": "Borderline — à ±1pp d'un seuil de coupure. Traitez le verdict comme 'à vérifier soigneusement'.",
"saturation.unknown": "Benchmark inconnu.",
"saturation.attribution": "Données : DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (historique open-weight) · dernier fetch 2026-05-05.",
"saturation.status.live": "✅ Données en direct chargées — {count} modèles.",
"saturation.status.baked": "ℹ Utilisation du snapshot baked (fetch en direct indisponible).",
"saturation.status.kb_fail": "⚠ Impossible de charger le KB de saturation.",
"saturation.status.done": "✅ {name} — {verdict}",
"saturation.status.all_done": "✅ {n} benchmarks classés.",
"help.v08.saturation.title": "📈 Détecteur de saturation des benchmarks",
"help.v08.saturation.body": "MMLU est saturé (top 88-94%), AIME 2025 saturé en quelques mois après sa sortie, HumanEval presque saturé. Choisissez un benchmark et l'outil retourne top-3 frontier scores, spread, moyenne, et un verdict — saturated / near-saturated / discriminative — plus un remplacement recommandé (ex. MMLU → MMLU-Pro / GPQA / HLE). Fetch en direct depuis DemandSphere AI Frontier Tracker (CC BY-NC 4.0) si accessible ; snapshot baked 2026-05-05 sinon. Cas d'usage : avant de citer '92% sur MMLU' ou de concevoir une eval, vérifiez si le benchmark discrimine encore quelque chose.",
"inv.v08.saturation": "📈 Saturation — votre benchmark est-il encore utile, ou tous les frontiers sont-ils à égalité au sommet ?",
// v0.8.2 — anti-bullshit pack #8: JSON CoT-aware Linter
"modes.cot": "📋 JSON CoT",
"mode_desc.cot": "Linte un JSON Schema (ou un objet de réponse exemple) à la recherche de l'anti-pattern réponse-avant-raisonnement. Les moteurs de décodage contraint émettent les champs dans l'ordre du schema — si `answer` précède `reasoning`, la chaîne de pensée est cassée.",
"cot.title": "📋 Linter JSON conscient de CoT",
"cot.tip": "Les moteurs de décodage contraint (llguidance, Outlines, grammaires SGLang) émettent les propriétés JSON dans l'ordre du schema. Si votre schema place `answer` avant `reasoning`, le modèle s'engage sur la réponse finale en premier et n'écrit la justification qu'ensuite — détruisant la Chaîne de Pensée. Collez un JSON Schema (ou un objet exemple) et le linter signale l'ordre.",
"cot.desc": "Le raisonnement avant la réponse, toujours. Collez un JSON Schema ou un objet de réponse exemple — le linter rapporte si les champs de raisonnement viennent avant ceux de réponse et propose une correction.",
"cot.input.placeholder": "{ \"type\": \"object\", \"properties\": { \"answer\": {\"type\": \"string\"}, \"reasoning\": {\"type\": \"string\"} } }",
"cot.lint_btn": "🔍 Linter",
"cot.example_good_btn": "↳ Exemple : ordre correct",
"cot.example_bad_btn": "↳ Exemple : anti-pattern",
"cot.status.done": "✅ {verdict}",
"cot.col.field": "Champ",
"cot.col.type": "Rôle",
"cot.field.reasoning": "raisonnement",
"cot.field.answer": "réponse",
"cot.field.other": "—",
"cot.field_count": "{n} champs",
"cot.verdict.good_order": "✅ Bon ordre — raisonnement avant réponse",
"cot.verdict.anti_pattern": "❌ Anti-pattern — réponse avant raisonnement",
"cot.verdict.missing_reasoning": "⚠ Champ de raisonnement manquant",
"cot.verdict.missing_answer": "ℹ Aucun champ type réponse détecté",
"cot.verdict.no_cot_fields": "ℹ Aucun champ raisonnement/réponse détecté",
"cot.verdict.invalid_json": "❌ JSON invalide",
"cot.verdict.non_object": "ℹ La valeur de premier niveau n'est pas un objet",
"cot.verdict.empty_fields": "ℹ Aucun champ à analyser",
"cot.explain.good_order": "Le décodage contraint émettra le raisonnement en premier, le modèle peut donc réfléchir avant de s'engager. La Chaîne de Pensée reste honnête.",
"cot.explain.anti_pattern": "Le modèle est forcé d'émettre le champ de réponse en premier ; tout raisonnement qui suit ne fait que justifier ce qui est déjà engagé. Réordonnez pour que les champs raisonnement viennent avant les champs réponse.",
"cot.explain.missing_reasoning": "Un champ de réponse est présent mais aucun champ de raisonnement. Si vous voulez du CoT, ajoutez un champ `reasoning` (ou `chain_of_thought`, `analysis`, …) avant la réponse.",
"cot.explain.missing_answer": "Un champ de raisonnement est présent mais aucun champ de réponse évident. Vérifiez que le schema force réellement le modèle à s'engager sur une valeur finale.",
"cot.explain.no_cot_fields": "L'objet a des champs, mais aucun ne ressemble à du raisonnement ou de la réponse par son nom. Le linter est conservateur — si le schema est intentionnel, ignorez. Sinon ajoutez des champs explicites raisonnement/réponse.",
"cot.hint.non_object": "La valeur de premier niveau doit être un objet JSON (`{ … }`) ou un JSON Schema avec `properties`.",
"cot.hint.empty_fields": "Aucun champ détecté. Collez un JSON Schema, une réponse exemple, ou cliquez un bouton d'exemple sous le textarea.",
"cot.suggested_fix.title": "✓ Correction suggérée",
"cot.suggested_fix.desc": "Propriétés réordonnées — champs raisonnement d'abord, puis tout champ de contexte, puis les champs réponse. `required[]` (s'il existe) est réordonné en correspondance.",
"cot.suggested_fix.copy": "📋 Copier",
"cot.suggested_fix.copied": "✓ Copié",
"cot.attribution": "Réfs :",
"inv.v082.cot": "📋 JSON CoT — linte les schemas de structured outputs à la recherche de l'anti-pattern réponse-avant-raisonnement qui casse silencieusement la Chaîne de Pensée.",
"help.v082.cot.title": "📋 Linter JSON conscient de CoT",
"help.v082.cot.body": "Les moteurs de décodage contraint (llguidance, Outlines, grammaires SGLang) émettent les propriétés JSON dans l'ordre que votre schema déclare. Si vous écrivez { answer, reasoning } le modèle s'engage sur answer en premier et le CoT se réduit à une justification a posteriori. Collez n'importe quel schema (ou réponse exemple) — le linter classe chaque champ comme raisonnement, réponse ou autre, signale l'ordre, et émet une correction réordonnée à copier. Cas d'usage : 'Mon prompt CoT marche en texte brut mais dégrade en mode JSON' → lancez le linter, trouvez l'ordre inversé, corrigez.",
// v0.8.3 — anti-bullshit pack #9: PEFT Anti-Pattern Checker
"modes.peft": "🔧 PEFT Lint",
"mode_desc.peft": "Linter statique pour les scripts d'entraînement PEFT/LoRA. Attrape le chargement silencieux du modèle de base (peft #2115), l'ordre prepare/get_peft_model en QLoRA, le mismatch target_modules/arch, et les conventions de lora_alpha.",
"peft.title": "🔧 Vérificateur d'anti-patterns PEFT",
"peft.tip": "get_peft_model(base, config) crée un nouvel adaptateur — il ne CHARGE PAS les poids sauvegardés. Pour reprendre depuis un checkpoint il faut appeler PeftModel.from_pretrained(base, path). peft #2115 documente le bug du chargement silencieux. Ce linter scanne votre script à la recherche de ce pattern (et 3 autres : ordre QLoRA, mismatch target_modules/arch, ratio lora_alpha).",
"peft.desc": "Ne brûlez pas 10 heures d'entraînement sur un modèle de base. Collez votre code de setup PEFT — le linter signale les chargements silencieux du base, les bugs d'ordre QLoRA, les mismatches target_modules/arch, et les conventions lora_alpha.",
"peft.input.placeholder": "from peft import LoraConfig, get_peft_model …",
"peft.lint_btn": "🔍 Linter",
"peft.example_bug_btn": "↳ Exemple : chargement silencieux du base",
"peft.example_qlora_btn": "↳ Exemple : bug d'ordre QLoRA",
"peft.example_clean_btn": "↳ Exemple : propre",
"peft.status.done": "✅ {verdict} — {n} découverte(s)",
"peft.line": "ligne {n}",
"peft.summary": "{total} découverte(s)",
"peft.attribution": "Réfs :",
"peft.detected_at_line": "apparaît à la ligne",
"peft.suggested_fix": "Suggéré :",
"peft.detected_arch": "Arch détectée",
"peft.from_model_id": "(depuis model id",
"peft.your_modules": "Vos target_modules",
"peft.expected_modules": "Attendus pour cette arch",
"peft.match_ratio": "{hits} sur {total} correspondent.",
"peft.ratio": "ratio",
"peft.alpha.convention": "la convention est α=2r ou α=r",
"peft.qlora_order.detail": "prepare_model_for_kbit_training (ligne {prepare_line}) s'exécute APRÈS get_peft_model (ligne {get_peft_model_line}). Inversez l'ordre — appelez prepare D'ABORD, puis get_peft_model.",
"peft.no_peft_calls.detail": "Aucun appel à get_peft_model / PeftModel.from_pretrained / LoraConfig détecté. Collez un snippet de setup PEFT/LoRA.",
"peft.verdict.errors_found": "❌ Erreurs trouvées",
"peft.verdict.warnings_only": "⚠ Avertissements",
"peft.verdict.info_only": "ℹ Info",
"peft.verdict.clean": "✅ Propre — aucun problème détecté",
"peft.verdict.no_peft_calls": "ℹ Aucun appel PEFT détecté",
"peft.verdict.empty_input": "ℹ Entrée vide",
"peft.rule.silent_base_load.label": "Chargement silencieux du modèle de base (peft #2115)",
"peft.rule.silent_base_load.explain": "get_peft_model(base, config) crée un NOUVEL adaptateur — il NE charge PAS les poids sauvegardés. L'indice de checkpoint dans votre code suggère que vous voulez REPRENDRE l'entraînement depuis un adaptateur sauvegardé, mais ce chemin de code redémarrera silencieusement à zéro et écrasera la run.",
"peft.rule.silent_base_load.fix": "Remplacez get_peft_model(base, config) par PeftModel.from_pretrained(base, path) à la reprise. Vérifiez avec model.get_layer_status() après le chargement.",
"peft.rule.qlora_order.label": "Bug d'ordre QLoRA",
"peft.rule.qlora_order.explain": "prepare_model_for_kbit_training doit être appelé AVANT get_peft_model. Inversé, la préparation kbit ne s'applique pas aux couches LoRA et le calcul du gradient casse (loss → NaN, ou entraînement silencieux de rien).",
"peft.rule.qlora_order.fix": "Réordonnez : base = prepare_model_for_kbit_training(base) puis model = get_peft_model(base, config).",
"peft.rule.target_modules_mismatch.label": "Mismatch target_modules / arch",
"peft.rule.target_modules_mismatch.explain": "Votre liste target_modules ne correspond pas aux noms de modules conventionnels pour l'architecture détectée dans votre code. PEFT appliquera LoRA silencieusement à rien (ou aux mauvaises couches).",
"peft.rule.target_modules_mismatch.fix": "Vérifiez les noms avec print([n for n,_ in model.named_modules()]) sur le modèle de base chargé, ou utilisez la liste spécifique à l'arch montrée ci-dessus.",
"peft.rule.alpha_not_2r.label": "lora_alpha ≠ 2r (convention)",
"peft.rule.alpha_not_2r.explain": "La plupart des recettes LoRA publiées utilisent soit α = 2r (échelle effective unitaire) soit α = r (LR effectif réduit). Un ratio custom marche mais mérite une vérification.",
"peft.rule.alpha_not_2r.fix": "Vérifiez le ratio contre votre recette de référence. Si intentionnel, ignorez cette découverte.",
"peft.rule.no_peft_calls.label": "Aucun appel PEFT détecté",
"inv.v083.peft": "🔧 PEFT Lint — attrape le chargement silencieux de get_peft_model sur le base (peft #2115) + ordre QLoRA + mismatch target_modules / arch.",
"help.v083.peft.title": "🔧 Vérificateur d'anti-patterns PEFT",
"help.v083.peft.body": "Le get_peft_model(base, config) de PEFT crée un NOUVEL adaptateur — il ne charge pas les poids sauvegardés depuis un chemin. Quiconque colle du code de tuto et essaie de reprendre depuis un checkpoint jette silencieusement son entraînement. peft #2115 contient le bug report canonique. Ce linter scanne votre script à la recherche du pattern + 3 problèmes liés (ordre QLoRA, mismatch target_modules/arch, ratio lora_alpha) et rapporte les découvertes avec numéros de ligne et corrections suggérées. Cas d'usage : avant de lancer un fine-tune LoRA de 10 heures, collez votre script — attrapez les bugs silencieux en 200ms.",
// v0.8.4 — anti-bullshit pack #10: Prompt-Cache Diff Predictor
"modes.cache": "🔁 Cache Diff",
"mode_desc.cache": "Prédit si une édition du prompt a gardé le cache prompt du fournisseur vivant ou l'a invalidé. Taux de hit par fournisseur + delta $ vs sans cache.",
"cache.title": "🔁 Prédicteur de Diff Prompt-Cache",
"cache.tip": "Le cache_control d'Anthropic casse au premier token différent du préfixe marqué. OpenAI auto-cache les préfixes ≥1024 tokens mais invalide à tout changement. Le context cache Gemini requiert ≥32K tokens. Une édition mal placée 10x silencieusement votre facture — et l'API ne prévient jamais. Collez ancien + nouveau prompt, voyez le taux de hit par fournisseur + delta de coût.",
"cache.desc": "Ne 10x pas votre facture sur une édition d'un caractère. Collez votre prompt précédent et actuel — le prédicteur trouve le plus long préfixe commun, estime les tokens, et montre le taux de hit par fournisseur + delta $ vs sans cache.",
"cache.old_label": "Ancien prompt :",
"cache.new_label": "Nouveau prompt :",
"cache.old.placeholder": "Vous êtes un assistant utile. …",
"cache.new.placeholder": "Vous êtes un assistant utile. …",
"cache.profile_label": "Profil de tokenizer :",
"cache.profile.english": "Anglais (chars/4)",
"cache.profile.code": "Code (chars/3.5)",
"cache.profile.mixed": "CJK / Cyrillique (chars/2)",
"cache.output_label": "Tokens de sortie estimés :",
"cache.diff_btn": "🔍 Prédire",
"cache.example_good_btn": "↳ Exemple : 99% hit",
"cache.example_broken_btn": "↳ Exemple : cache cassé",
"cache.example_belowmin_btn": "↳ Exemple : sous le minimum OpenAI",
"cache.status.done": "✅ {verdict} — {hit}% hit théorique",
"cache.verdict.identical": "✅ Identiques — hit complet",
"cache.verdict.divergent_can_cache":"⚠ Hit partiel — varie selon fournisseur",
"cache.verdict.divergent_below_min":"❌ En dessous des minimums — pas de cache possible",
"cache.verdict.fully_divergent": "❌ Totalement divergents — cache invalidé",
"cache.verdict.empty_input": "ℹ Entrée vide",
"cache.summary.tokens": "Préfixe commun {common} / {total} tokens (taux de hit théorique {pct}%).",
"cache.summary.diff_at": "Première différence à la ligne {line}.",
"cache.col.provider": "Fournisseur",
"cache.col.hit": "Hit",
"cache.col.cost": "Base → cached",
"cache.col.savings": "Économies",
"cache.note.requires_marker": "(nécessite le marqueur cache_control)",
"cache.note.below_min": "(préfixe < {min} tokens — min du fournisseur)",
"cache.write_surcharge": "+ {cost} surcharge cache-write la première fois (Anthropic)",
"cache.diff.title": "Où le cache casse",
"cache.diff.legend": "Vert = préfixe partagé (cacheable). Rouge = première édition (tout à partir d'ici est re-facturé).",
"cache.hint.empty": "Collez deux prompts, puis Prédire.",
"cache.attribution": "Réfs :",
"cache.attribution.snapshot": "Prix snapshot 2026-01 ; vérifiez avec la doc actuelle du fournisseur avant d'agir sur $.",
"inv.v084.cache": "🔁 Cache Diff — prédit si une édition du prompt a invalidé le cache prompt du fournisseur. Taux de hit par fournisseur + delta $.",
"help.v084.cache.title": "🔁 Prédicteur de Diff Prompt-Cache",
"help.v084.cache.body": "Les caches prompt de chaque fournisseur ont des règles différentes : le cache_control d'Anthropic casse au premier token différent du préfixe marqué ; OpenAI auto-cache les préfixes ≥1024 tokens ; les context caches Gemini requièrent ≥32K tokens. Une édition mal placée 10x silencieusement votre facture — l'API ne prévient pas, et le coût n'apparaît qu'à la facture suivante. Collez ancien + nouveau prompt, le prédicteur trouve le plus long préfixe commun, estime les tokens avec trois profils de tokenizer (anglais / code / CJK), et montre le taux de hit par fournisseur + delta $ vs sans cache pour Claude Opus/Sonnet/Haiku, GPT-5/mini, et Gemini 2.5 Pro. Cas d'usage : 'J'ai modifié le system prompt et la facture a sauté — qu'est-ce qui a cassé ?' → collez les deux prompts, voyez exactement quel fournisseur a arrêté de cacher.",
// v0.8.5 — anti-bullshit pack #11: Speculative-Decode Compatibility
"modes.speculative": "🔬 Spec-Decode",
"mode_desc.speculative": "Récupère `tokenizer.json` depuis HF Hub pour deux model ids et vérifie la compatibilité du vocab avant de configurer le speculative decoding. Attrape le bug de mismatch silencieux qui gaspille le compute du draft.",
"speculative.title": "🔬 Compatibilité Speculative-Decode",
"speculative.tip": "Le speculative decoding (vLLM, SGLang, llama.cpp, transformers) requiert que draft et target partagent un vocabulaire EXACT. Tout désaccord de token-id fait que le target rejette chaque token du draft — vous payez LES DEUX coûts de compute et obtenez UN PIRE débit que la baseline. Le système rapporte une sortie nominale (juste plus lente), donc le bug est invisible aux tests unitaires. Cet outil récupère `tokenizer.json` depuis HF Hub pour les deux ids et compare.",
"speculative.desc": "Ne déployez pas spec-dec avec des vocabs mismatched. Collez target + draft model ids → l'outil fetch les tokenizers, compare type de vocab, taille, token-ids échantillonnés, special tokens, added tokens → verdict + estimation de speedup.",
"speculative.target_label": "Model id du target (gros) :",
"speculative.draft_label": "Model id du draft (petit) :",
"speculative.target_label_short": "target",
"speculative.draft_label_short": "draft",
"speculative.check_btn": "🔍 Vérifier compatibilité",
"speculative.example_good_btn":"↳ Exemple : Llama-3.1 8B/70B (gated → mirror)",
"speculative.example_bad_btn": "↳ Exemple : cross-family (mauvais)",
"speculative.gated_note": "💡 Modèles gated (Llama, Mistral, Gemma) déclenchent un fallback automatique vers un open-mirror (unsloth/...). HF déconseille officiellement les tokens côté navigateur, donc l'outil ne peut pas auth — mais les tokenizers des mirrors sont typiquement byte-identiques car la quantification touche les poids, pas l'artefact du tokenizer.",
"speculative.mirror.heading": "Fallback open-mirror",
"speculative.mirror.target_used": "Target {original} était gated ; utilisation du mirror {mirror}.",
"speculative.mirror.draft_used": "Draft {original} était gated ; utilisation du mirror {mirror}.",
"speculative.mirror.warn": "Les tokenizers mirror (ex. unsloth/) sont habituellement byte-identiques au gated original car la quantification touche les poids, pas les tokens. Vérifiez le chat-template si un match exact est requis (unsloth #880 documente une dérive occasionnelle).",
"speculative.status.fetching": "🔄 Récupération de tokenizer.json depuis HF Hub pour les deux modèles…",
"speculative.status.done": "✅ {verdict}",
"speculative.status.error": "❌ Erreur",
"speculative.type_mismatch_note": "types de tokenizer diffèrent ; spec-dec impossible",
"speculative.vocab_size": "Taille du vocab",
"speculative.size_diff": "diffèrent — chaque id réutilisé est un mismatch",
"speculative.sampled": "Match de token-id échantillonné",
"speculative.first_mismatch": "Premier mismatch",
"speculative.special_diff": "Différences de special tokens",
"speculative.added_diff": "Différences de added tokens",
"speculative.added_diff_more": "+ plus …",
"speculative.speedup.title": "Bande de speedup estimée",
"speculative.speedup.params": "target {target} / draft {draft} (ratio de params {ratio})",
"speculative.speedup.low": "Bas (α=0.50)",
"speculative.speedup.expected":"Attendu (α=0.70)",
"speculative.speedup.high": "Haut (α=0.85)",
"speculative.speedup.disclaimer": "α = taux d'acceptation du draft. Le speedup réel dépend du domaine du prompt, lookahead K, et overhead du moteur. Les bandes supposent un verifier batching idéal.",
"speculative.speedup.draft_not_smaller": "Le draft n'est pas plus petit que le target — spec-dec est un mauvais usage ici.",
"speculative.attribution": "Réfs :",
"speculative.side.target": "Target",
"speculative.side.draft": "Draft",
"speculative.fetch_error.missing_model_id": "model id manquant",
"speculative.fetch_error.gated_or_private": "modèle gated ou privé — impossible de récupérer le tokenizer sans auth",
"speculative.fetch_error.not_found": "model id non trouvé sur HF Hub",
"speculative.fetch_error.fetch_failed": "fetch échoué (erreur HTTP)",
"speculative.fetch_error.parse_failed": "parse JSON échoué (fichier malformé)",
"speculative.fetch_error.timeout": "timeout (>15s, gros tokenizer ou connexion lente)",
"speculative.fetch_error.network": "erreur réseau",
"speculative.fetch_error.hint": "Vérifiez l'orthographe du model id. Pour les modèles gated, consultez le tokenizer via votre compte HF — cet outil ne peut pas auth.",
"speculative.hint.missing_input": "Entrez les deux model ids (target et draft), puis Vérifier.",
"speculative.hint.identical_models": "Target et draft sont le même modèle — spec-dec est un no-op (et un gaspillage).",
"speculative.verdict.compatible": "✅ Compatible — vocabs correspondent",
"speculative.verdict.compatible_with_caveats": "✅ Compatible — mais special/added tokens diffèrent (à revoir)",
"speculative.verdict.partial_compatible": "⚠ Match partiel (95-99.9% des ids échantillonnés)",
"speculative.verdict.type_mismatch": "❌ Types de tokenizer diffèrent — spec-dec impossible",
"speculative.verdict.vocab_size_mismatch": "❌ Tailles de vocab diffèrent — espace d'id désaligné",
"speculative.verdict.incompatible": "❌ Incompatibles — trop de mismatches d'id",
"speculative.verdict.fetch_failed": "ℹ Récupération du tokenizer impossible",
"speculative.verdict.identical_models": "ℹ Modèles identiques — spec-dec est un no-op",
"speculative.verdict.missing_input": "ℹ Entrez les deux ids",
"inv.v085.speculative": "🔬 Spec-Decode — vérifie la compatibilité du vocab du tokenizer entre target + draft avant de déployer le speculative decoding (le bug qui donne UN PIRE débit silencieusement).",
"help.v085.speculative.title": "🔬 Compatibilité Speculative-Decode",
"help.v085.speculative.body": "Le speculative decoding ne marche que si target et draft partagent exactement le même vocabulaire. Des vocabs mismatched font que chaque token du draft est rejeté — vous payez LES DEUX coûts de compute et obtenez un pire débit que la baseline. Pire : le système émet toujours une sortie correcte (juste plus lente), donc le bug est invisible aux tests unitaires. vLLM #4570 / #16757 / #20409 / #12488 surfent les variantes. Cet outil récupère `tokenizer.json` depuis HF Hub pour les deux model ids, compare le type de tokenizer, la taille du vocab, la map complète token→id, les special tokens, et les added tokens, puis estime une bande de speedup basée sur le ratio de params et les taux α=0.5/0.7/0.85 d'acceptation typiques. Cas d'usage : avant de lancer un cluster vLLM avec spec-dec activé, vérifiez que la paire est compatible.",
// v0.8.7 — anti-bullshit pack #13: Multilingual Tokenizer Tax
"modes.tax": "🌍 Token Tax",
"mode_desc.tax": "Encodage BPE réel (côté navigateur via transformers.js) du texte collé sur 6 tokenizers de fournisseurs. Révèle l'asymétrie de coût silencieuse entre langues.",
"tax.title": "🌍 Taxe Tokenizer Multilingue",
"tax.tip": "Les tokenizers taxent le texte non-anglais de façon asymétrique. Le même paragraphe peut faire 100 tokens en anglais mais 250+ en chinois sur un tokenizer entraîné en Latin (Llama, Phi). Coût par requête ET contexte effectif dégradent silencieusement. Collez votre texte, voyez les vrais token counts à travers les tokenizers fournisseurs — pas d'estimation, BPE réel via transformers.js dans votre navigateur.",
"tax.desc": "Ne 3× pas votre facture sur le support chinois. Collez n'importe quel texte → encodage BPE réel par tokenizer (Qwen / Phi / Llama / Gemma / GPT-4 / Claude) → voyez l'asymétrie de coût vs votre baseline.",
"tax.input_label": "Texte à tokenizer :",
"tax.input.placeholder": "Collez n'importe quel texte — anglais, chinois, arabe, code, …",
"tax.tokenize_btn": "🔬 Tokenizer tous",
"tax.sample_en_btn": "↳ Exemple : English",
"tax.sample_zh_btn": "↳ Exemple : 中文",
"tax.sample_ar_btn": "↳ Exemple : عربى",
"tax.sample_mixed_btn": "↳ Exemple : mixte",
"tax.sample_code_btn": "↳ Exemple : code",
"tax.status.loading": "⏳ Chargement transformers.js + tokenizers (la première exécution peut prendre 5-15s)…",
"tax.status.done": "✅ {n}/{total} tokenizers en {ms}ms",
"tax.col.tokenizer": "Tokenizer",
"tax.col.tokens": "Tokens",
"tax.col.cpt": "Chars/tok",
"tax.col.ratio": "Ratio",
"tax.summary.input": "Entrée : {chars} caractères, {bytes} octets",
"tax.script_breakdown": "scripts",
"tax.interp.worst": "{label} coûte {pct}% de tokens en plus que la baseline pour ce texte.",
"tax.interp.uniform": "✓ Tous les tokenizers à ±5% — texte bien géré par les fournisseurs.",
"tax.hint.empty": "Collez du texte puis Tokenizer.",
"tax.all_failed": "Tous les tokenizers ont échoué.",
"tax.error.gated": "modèle gated (auth HF requise — essayez le mirror open)",
"tax.error.not_found": "model id introuvable",
"tax.error.timeout": "timeout (gros tokenizer ou connexion lente)",
"tax.error.network": "erreur réseau",
"tax.error.fetch_failed": "fetch échoué",
"tax.error.invalid_input": "entrée invalide",
"tax.attribution": "Tokenizers via",
"tax.attribution.privacy": "Le texte est tokenizé localement — ne quitte jamais le navigateur.",
"tax.firstload_note": "💡 Premier chargement : l'outil récupère transformers.js (~750 KB) + le vocab de chaque tokenizer à la demande (~5-15 MB par tokenizer, mis en cache après). Les exécutions suivantes sont instantanées. Tout le traitement est local — votre texte ne quitte jamais le navigateur.",
"inv.v087.tax": "🌍 Token Tax — encodage BPE réel sur 6 tokenizers fournisseurs. Révèle l'asymétrie de coût silencieuse entre langues (CJK / arabe / mixte).",
"help.v087.tax.title": "🌍 Taxe Tokenizer Multilingue",
"help.v087.tax.body": "Les tokenizers taxent le texte non-anglais de façon asymétrique. Le même paragraphe peut faire 100 tokens en anglais mais 250+ en chinois sur un tokenizer entraîné en Latin (Llama, Phi). Coût-par-requête ET contexte effectif dégradent silencieusement. Cet outil charge HuggingFace transformers.js dans votre navigateur (~750 KB CDN) et tokenize le texte collé contre 6 tokenizers preset de fournisseurs (Qwen2.5, Phi-3.5, Llama-3.1, Gemma-2, GPT-4 cl100k, Claude approx). Sortie : token count par tokenizer + chars-per-token + ratio vs baseline + interprétation d'asymétrie. Auto-détecte les blocs de script (Latin / CJK / arabe / cyrillique / devanagari / thaï / grec / hébreu / coréen) pour voir pourquoi un tokenizer est 3× un autre. Cas d'usage : 'Mon support multilingue a ajouté 30% à la facture — quelle langue coûte le plus ?' → collez du texte de production réel, voyez le breakdown exact par tokenizer.",
// v0.8.8 — anti-bullshit pack #14 : LongScore (RULER + HELMET lookup)
"modes.longscore": "🎯 LongScore",
"mode_desc.longscore": "Recherchez la dégradation relative de votre modèle au-delà du contexte court. KBs RULER + HELMET (n=93 modèles). Métrique LongScore de 100-LongBench (ACL 2025).",
"longscore.title": "🎯 LongScore",
"longscore.tip": "Chaque modèle prétend une fenêtre 128K, mais la précision dégrade bien avant. LongScore (métrique peer-reviewed de 100-LongBench, ACL 2025) mesure la dégradation relative au-delà du contexte court. Sépare la base ability de la vraie capacité long-ctx — vous comparez la dégradation, pas les scores bruts. Lookup contre KBs RULER + HELMET (n=93 modèles).",
"longscore.desc": "Combien votre modèle dégrade-t-il au-delà du contexte court ? Collez un id modèle HF → voyez LongScore (dégradation relative) + breakdown par longueur + scores HELMET 7-task quand disponibles. Pas de GPU. Pas d'inférence. Lookup pur contre des benchmarks publiés.",
"longscore.input_label": "Id du modèle :",
"longscore.input.placeholder": "ex. Qwen2.5-72B-Instruct ou meta-llama/Llama-3.1-70B-Instruct",
"longscore.lookup_btn": "🔎 Rechercher",
"longscore.example_good_btn": "↳ Exemple : Jamba-1.5-Large (sans dégradation)",
"longscore.example_mid_btn": "↳ Exemple : Llama-3.1-70B (modéré)",
"longscore.example_bad_btn": "↳ Exemple : dbrx (sévère)",
"longscore.formula_note": "💡 LongScore = moyenne sur l ∈ {16K, 32K, 64K, 128K} de (S_l − Base) / Base, où Base = moyenne(S_4K, S_8K). Source : 100-LongBench, ACL 2025. Données : NVIDIA RULER (per-length, n=33) + HELMET (agrégat à 128K, n=60). 0 = pas de dégradation ; -0.30 = sévère.",
"longscore.miss.title": "Modèle non trouvé en KB",
"longscore.miss.body": "Recherché {id}. KB contient {n} modèles. Essayez un id HF canonique (ex. Qwen2.5-72B-Instruct, Llama-3.1-70B-Instruct, Jamba-1.5-Mini).",
"longscore.miss.suggest": "Vérifiez la couverture sur",
"longscore.no_ruler": "⚠ Pas de données per-length — LongScore non calculable. Affichage agrégat HELMET à 128K.",
"longscore.score_label": "LongScore",
"longscore.helmet_label": "Breakdown HELMET 7-task",
"longscore.col.ctx": "Contexte",
"longscore.col.score": "Score",
"longscore.col.lc": "LC",
"longscore.col.task": "Tâche",
"longscore.source_note": "Source",
"longscore.hint.empty": "⚠ Collez un id modèle d'abord.",
"longscore.status.lookup": "⏳ Recherche…",
"longscore.status.miss": "ℹ Modèle pas en KB",
"longscore.status.ruler_hit": "✅ Données RULER per-length trouvées",
"longscore.status.helmet_only":"ℹ Agrégat HELMET seulement (pas de données per-length)",
"longscore.verdict.no_degradation": "✅ Pas de dégradation au-delà du contexte court",
"longscore.verdict.mild": "🟢 Dégradation légère (<10%)",
"longscore.verdict.moderate": "🟠 Dégradation modérée (10-20%)",
"longscore.verdict.severe": "🔴 Dégradation sévère (20-30%)",
"longscore.verdict.extreme": "🚨 Dégradation extrême (>30%)",
"inv.v088.longscore": "🎯 LongScore — métrique de dégradation peer-reviewed (100-LongBench, ACL 2025). Lookup de tout modèle dans KBs RULER + HELMET (n=93). Voyez combien votre modèle chute réellement au-delà du contexte court.",
"help.v088.longscore.title": "🎯 LongScore",
"help.v088.longscore.body": "Chaque LLM long-ctx prétend 128K mais dégrade bien avant. Le paper 100-LongBench (ACL 2025, arXiv:2505.19293) a remarqué que les scores long-ctx bruts sont dominés par la base ability — un modèle plus smart avec une moins bonne recette long-ctx score plus qu'un moins smart avec une meilleure recette, masquant la vraie dégradation. Ils proposent LongScore : LC_l = (S_l − Base) / Base avec Base = moyenne(S_short), puis moyenne sur les longueurs longues. Résultat : un nombre de dégradation relative par modèle qui compare apples to apples. Ce mode tafagent embarque les données LongScore-ready : agrégat RULER per-context (n=33 modèles, 4K-128K) + agrégat HELMET à 128K (n=60 modèles, 7 catégories). Lookup est match exact par id HF (lowercase, dashes, dots normalisés). Pour les modèles avec données RULER, vous obtenez le LongScore complet + breakdown per-length + verdict (pas/légère/modérée/sévère/extrême). Pour les modèles HELMET-only, vous obtenez l'agrégat 7-catégories à 128K. Cas d'usage : 'je veux utiliser Llama-3.1-70B-Instruct pour résumé de docs 100K-token — combien de précision je perds vraiment ?' → collez l'id, voyez -10% LongScore (dégradation modérée, surtout le cliff à 128K). Décidez de l'utiliser, passer à un modèle avec long-ctx engineered, ou chunker votre input.",
"inv.v081.hub": "🧭 Solutions Hub — chaque pain documenté mappé à un mode tafagent ou outil externe curé. Ne réinventez pas — trouvez.",
"help.v081.hub.title": "🧭 Solutions Hub",
"help.v081.hub.body": "tafagent comme intégrateur, pas silo. 30+ pains à travers 7 catégories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), chacun mappé à (a) le mode tafagent qui le résout, s'il existe, et (b) les outils externes best-of-breed que la communauté utilise déjà (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). La barre de recherche matche pain, scénario, et nom d'outil. Cas d'usage : 'j'ai le problème X — tafagent le résout-il, et sinon, qui ?'",
"hub.title": "🧭 Solutions Hub",
"hub.tip": "Carte de chaque pain de LLM-eval documenté : quel mode tafagent l'adresse (si applicable), et les outils externes best-of-breed que la communauté utilise déjà. Objectif : couverture totale. Si l'outil canonique existe ailleurs, nous lions plutôt que de reconstruire.",
"hub.desc": "Ne réinventez pas — trouvez. 30+ pains mappés à des modes tafagent + outils externes curés. Naviguez par catégorie, recherchez par mot-clé, ou voyez les lacunes où de nouveaux modes aideraient le plus.",
"hub.clear_btn": "✕ Effacer",
"hub.no_mode": "externe",
"hub.planned": "prévu :",
"hub.best_for": "Idéal pour",
"hub.not_for": "Pas pour",
"hub.tools": "Outils externes",
"hub.status.loaded": "✅ Chargés {total} pains dans {categories} catégories — {covered} couverts par des modes tafagent, {externalLinks} liens externes curés. Compilé {compiled}.",
"hub.status.fail": "⚠ Impossible de charger Solutions Hub.",
"hub.search.empty": "Aucune correspondance pour '{query}'. Essayez des termes plus larges (ex. 'eval', 'rag', 'tokenizer').",
"hub.search.results": "{n} correspondance(s) trouvée(s) pour '{query}'.",
// v0.7.7 — Task tiles (UX redesign: 14 modes grouped by intent)
"tiles.title": "🎯 Que voulez-vous faire ?",
"tiles.subtitle": "Choisissez une tâche. Chacune ouvre l'outil adéquat ci-dessous. Ou faites défiler pour la liste complète des 22 modes.",
"tile.diagnose.title": "🔬 Diagnostiquer un modèle",
"tile.diagnose.desc": "Ce modèle conviendra-t-il à mon cas d'usage ?",
"tile.trust.title": "✓ Faire confiance à un score",
"tile.trust.desc": "Dois-je croire ce nombre ? Bug ou bruit ?",
"tile.eval.title": "⚙️ Configurer une éval correctement",
"tile.eval.desc": "Obtenez le flag CLI exact pour lm-eval / vLLM / transformers.",
"tile.compare.title": "🆚 Comparer des modèles",
"tile.compare.desc": "Côte à côte, ou explorez le panel empirique de modèles.",
"tile.manual.title": "📋 Manuel / libre",
"tile.manual.desc": "Choisissez une recette à la main, ou demandez en langage naturel.",
"tile.diagnose.tip": "Commencez ici quand vous avez un id de modèle spécifique et voulez un diagnostic complet : Profile lance les 5 recettes d'un coup. Unmask vérifie si max_position_embeddings est honnête. NIAH→Reason prédit le gap retrieval-vs-reasoning. LongScore recherche les données publiées RULER + HELMET et montre la dégradation réelle au-delà du contexte court (métrique peer-reviewed). Quant prédit si quantifier va le casser. Inspect permet de coller un config.json brut pour modèles privés / en dev.",
"tile.trust.tip": "Quand vous voyez un score et voulez savoir s'il est réel. Contamination note 20+ benchmarks selon la probabilité que le modèle les ait vus en entraînement. Drift vous dit si l'écart entre deux évals est du bruit numérique ou un vrai bug (chat-template mismatch, layout KV-cache, etc.). Arena CI reconstruit les intervalles de confiance que Chatbot Arena cache — beaucoup de "victoires" top-Elo sont statistiquement à égalité.",
"tile.eval.tip": "Avant de lancer lm-eval-harness ou vLLM serve, obtenez le bon flag CLI. Chat-template Sniffer détecte la famille de template (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) et émet l'invocation exacte --apply_chat_template / --chat-template. Résout l'issue #1841 de lm-eval-harness (÷2 accuracy silencieux). Diagnose CLI génère la commande Python pour mesurer γ_obs sur votre GPU local.",
"tile.compare.tip": "Compare : choisissez 2-3 modèles candidats + une recette, voyez les verdicts dans un tableau côte à côte (ex. Llama-3-8B vs Mistral-7B à 32k). Phase diagram : nuage de 23 modèles empiriques dans le plan (log θ, γ), avec la courbe Padé superposée. Survolez les points pour détails, cliquez pour charger ce modèle dans le formulaire Recipe.",
"tile.manual.tip": "Recipe : choisissez une recette X-N spécifique (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 compression KV, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) et remplissez le formulaire à la main pour contrôle total. Ask : tapez une question libre ; un LLM 0.5B (Qwen2.5) dans votre navigateur choisit la bonne recette et la lance. Idéal pour explorer "que se passerait-il si...".",
"share.import_desc": "Vous avez un fichier JSON de l'analyse TAF de quelqu'un ? Chargez-le ici pour voir le verdict + la chaîne localement. La même vue que si vous l'aviez exécuté vous-même.",
"share.import_btn": "📂 Charger JSON partagé",
"synthesis.system": "Vous êtes un assistant de diagnostic précis pour LLMs transformer. Étant donné des résultats de formules TAF pré-calculés, écrivez un résumé clair en français de 4-6 phrases. Citez le numéro de section (§X.Y) pour chaque nombre mentionné. Donnez toujours une recommandation concrète. N'INVENTEZ PAS de nombres.",
// INSPECTOR mode
"inspector.title": "🔍 Inspecteur d'Architecture",
"inspector.desc": "Collez le contenu brut de config.json. L'outil extrait les paramètres architecturaux et exécute le Profil complet à 5 recettes.",
"inspector.tip": "Collez n'importe quel config.json directement. L'outil le parse et exécute le Profil complet. Utile pour : modèles privés, configs en développement, modèles pas encore sur HuggingFace, ou comparer ce que ferait votre architecture custom.",
"inspector.quickstart": "💡 Cas d'usage : vous avez un modèle privé pas sur HF Hub, ou une config que vous concevez. Collez le JSON brut ci-dessous et obtenez un profil TAF complet.",
"inspector.placeholder": "{\n \"model_type\": \"llama\",\n \"rope_theta\": 500000,\n \"max_position_embeddings\": 8192,\n \"num_attention_heads\": 32,\n \"num_key_value_heads\": 8,\n \"hidden_size\": 4096,\n \"num_hidden_layers\": 32\n}",
"inspector.T_eval": "T_eval (votre contexte cible) :",
"inspector.btn": "🚀 Inspecter et profiler",
// WHAT-IF slider
"whatif.title": "🎚 What-if : faites glisser T_eval pour voir γ changer en direct",
"whatif.desc": "Recalcul pur JS (sans appel Pyodide). Montre γ_Padé et d_horizon géométriques pendant que vous glissez. Cliquez pour ré-exécuter la chaîne complète.",
"whatif.T_eval": "T_eval",
"whatif.gamma_pade": "γ_Padé",
"whatif.d_horizon": "d_horizon",
"whatif.l_niah": "Plafond L_NIAH",
"whatif.predicted": "Verdict géométrique prédit",
"whatif.rerun": "↻ Recalculer la chaîne complète à ce T_eval",
// COMMUNITY feed
"community.title": "🌐 Soumissions récentes de la communauté",
"community.desc": "Flux en direct du registre public. Cliquez sur n'importe quelle soumission pour voir l'analyse complète.",
"community.browse_all": "Voir tout →",
"community.loading": "Chargement...",
"community.no_repo": "Le repo du registre n'est pas encore créé. Une fois qu'il existe avec des soumissions, elles apparaîtront ici en direct.",
"community.no_submissions": "Aucune soumission. Soyez le premier — générez un Profil et cliquez 📤 Soumettre au registry.",
// FALSIFICATION dashboard
"falsification.title": "🔬 Prédictions du paper — statut de falsification",
"falsification.desc": "Le framework TAF repose sur des prédictions falsifiables (F1-F23). Chacune est empiriquement testée. Voici le statut en direct de chaque prédiction du paper.",
"falsification.summary": "{confirmed} confirmées · {partial} partielles · {refuted} réfutées · {untested} non testées (sur {total} prédictions au total)",
"falsification.col.id": "ID",
"falsification.col.claim": "Claim",
"falsification.col.status": "Statut",
"falsification.col.evidence": "Preuve",
"tafcard.title": "📇 TAF Card — profil complet du modèle",
"tafcard.recipes_title": "📋 Recettes — verdict par dimension",
"tafcard.recipes_count_label": "dimensions",
"tafcard.numbers_title": "🔢 Nombres clés (paper §26)",
"tafcard.fals_title": "🔬 État de falsification (F1-F23)",
"tafcard.fals_none": "Aucune falsification applicable.",
"tafcard.diag_title": "🔬 Diagnostics — nombres · contrôle γ · what-if",
"tafcard.verify_title": "✓ Vérification — Lean + Sage + falsification",
"tafcard.share_title": "📂 Provenance & partage",
"tafcard.whatif_title": "🎚️ Explorateur what-if",
"verdict.go": "GO",
"verdict.no": "NON",
"verdict.degraded": "DÉGRADÉ",
"compare.title_out": "🆚 Tableau comparatif",
"status.loading_pyodide": "⏳ Chargement du runtime Python (~10MB, première fois)...",
"status.loading_taf": "⏳ Chargement des formules TAF + recettes...",
"status.ready": "✅ Prêt. Choisissez un modèle et cliquez Profiler pour commencer.",
"status.computing": "🧮 Calcul de la chaîne TAF...",
"status.done": "✅ Terminé.",
"profile.hf_placeholder": "ex. meta-llama/Meta-Llama-3-8B ou Qwen/Qwen2.5-7B",
"compare.hf_placeholder": "ID modèle HF (ex. meta-llama/Meta-Llama-3-8B)",
"compare.slot1_placeholder": "ID modèle HF (ex. meta-llama/Meta-Llama-3-8B)",
"compare.slot2_placeholder": "ID modèle HF #2",
"compare.slot3_placeholder": "ID modèle HF #3 (optionnel)",
"compare.preset_default": "— ou préréglage —",
// Paramètres du formulaire
"param.theta": "θ (rope_theta)",
"param.theta.tip": "Fréquence de base RoPE de config.rope_theta. Plus haut = plus de capacité longue portée.",
"param.T_train": "T_train",
"param.T_train.tip": "Contexte max d'entraînement. De max_position_embeddings. Au-delà c'est de l'extrapolation.",
"param.T_eval": "T_eval (votre cible)",
"param.T_eval.tip": "Votre contexte d'inférence cible. La question clé : le modèle se comportera-t-il bien à CETTE longueur ?",
"param.n_attn": "n_attention_heads",
"param.n_attn.tip": "Nombre d'attention heads par couche. De num_attention_heads.",
"param.n_kv": "n_kv_heads",
"param.n_kv.tip": "KV heads. Si < n_attention_heads → GQA (Grouped Query Attention). Réduit la mémoire KV mais pousse γ vers Hagedorn.",
"param.d_head": "head_dim",
"param.d_head.tip": "Dimension par head. Typique 64, 96, 128. De head_dim ou hidden_size / num_attention_heads.",
"param.n_layers": "n_layers",
"param.n_layers.tip": "Nombre de blocs transformer. De num_hidden_layers.",
"param.n_params": "n_params (ex. 8e9)",
"param.n_params.tip": "Nombre total de paramètres. Seuil ~400M pour l'émergence d'induction heads. Affecte la mémoire KV et les recettes de budget.",
"param.has_swa": "A SWA ?",
"param.has_swa.tip": "Sliding Window Attention. true pour Mistral, gemma-2, phi-3. L'audit de calibration v0.5.3 a désactivé la correction historique δ_SWA (ajustement n=1).",
"common.yes": "Oui",
"common.no": "Non",
// Tooltips des modes
"modes.tip": "Quatorze façons d'utiliser l'outil.
📇 Profil: collez un id → TAF Card avec 5 recettes.
🆚 Comparer: 2-3 modèles côte à côte sur une recette.
🔍 Inspecter config: collez config.json brut → Profil complet.
💬 Question: question libre, le LLM du navigateur choisit la recette.
📋 Recette: sélection manuelle avec contrôle total du formulaire.
🩺 Diagnostic CLI: génère commande Python pour mesurer γ localement.
📊 Diagramme de phase: panel de 23 modèles dans le plan (log θ, γ).
🪟 Démasquer: détecte un max_position_embeddings trompeur (SWA / YaRN / RoPE-scaling).
📜 Chat-template: détecte la famille + donne le flag CLI exact pour lm-eval / vLLM / transformers.
🎯 Arena CI: reconstruit les intervalles de confiance depuis les votes pairwise bruts ; détecte les égalités statistiques qu'Arena cache.
🧪 Contamination: note 20+ benchmarks pour leur probabilité de contamination selon le cutoff d'entraînement vs la date de sortie.
⚖️ Quant: prédit γ-shift et ΔPPL pour tout (modèle × schéma de quantification) ; recommande une alternative sûre en cas de cliff.
🔀 Drift: même modèle, scores différents sur deux setups — bug ou bruit ? Prédit la bande de bruit numérique et signale les vrais bugs.
🔍 NIAH→Reason: prédit les taux NIAH et reasoning multi-hop depuis l'architecture ; trouve le contexte sûr pour reasoning.",
"profile.tip": "Diagnostic complet en un clic. Collez n'importe quel id de modèle HF (ou choisissez préréglage). L'outil exécute les 5 recettes (contexte long, compression KV, custom vs API, budget, hardware) et produit une TAF Card unique avec verdict par dimension + nombres clés + classification architecturale.
Cas d'usage: « J'évalue Qwen2.5-32B pour la production — quel est son profil complet de viabilité ? » → collez id → Profiler → fait.",
"compare.tip": "Même recette, plusieurs modèles. Choisissez 2-3 modèles candidats et une recette. Voyez les verdicts dans un seul tableau comparatif.
Cas d'usage: « J'ai besoin de récupération longue contexte à 16K — quel est le meilleur : Llama-3-8B, Mistral-7B ou Qwen-7B ? » → choisissez 3 + X-2 + 16K → voyez le gagnant.",
// Modal d'aide
"help.title": "📘 TAF Agent — Manuel d'utilisation",
"help.what.title": "Que fait-il ?",
"help.what.body": "Prédit la viabilité pratique de tout LLM transformer avant de dépenser du GPU/€. Répond à des questions comme « ce modèle fonctionnera-t-il à L=32K ? » ou « dois-je entraîner sur mesure ou utiliser une API ? » via des formules Python déterministes (TAF — Thermodynamic Attention Framework).",
"help.modes.title": "Comment l'utiliser — 7 modes",
"help.modes.profile": "📇 Profiler: collez id de modèle → toutes les recettes à la fois = TAF Card. Meilleur point de départ.",
"help.modes.compare": "🆚 Comparer: 2-3 modèles côte à côte sur la même recette. Mieux pour choisir entre candidats.",
"help.modes.inspector": "🔍 Inspecter config: collez config.json brut → l'outil le parse et lance le Profil complet. Pour modèles privés, configs en développement, ou modèles pas encore sur HF Hub.",
"help.modes.ask": "💬 Question libre: question en langage naturel, le LLM du navigateur choisit la recette. Mieux pour exploration casuelle.",
"help.modes.recipe": "📋 Recette + formulaire: sélection manuelle, contrôle total des paramètres. Mieux quand vous voulez un contrôle exact.",
"help.modes.diagnose": "🩺 Diagnostic CLI: génère commande Python pour mesurer γ sur votre machine locale (transformers + numpy). Rapide ≈5 min CPU; complet ≈20–60 min GPU. JSON résultat ré-uploadable via Inspect.",
"help.modes.phase": "📊 Diagramme de phase: nuage de 23 modèles du panel dans le plan (log θ, γ). Ligne Hagedorn γ=1 sépare Phase A de Phase B. Cliquer un point pour charger ce modèle dans le formulaire Recette.",
"help.recipes.title": "Les 8 recettes disponibles",
"help.recipe.x1.title": "X-1 Entraînement custom vs API — compare le coût d'entraîner votre propre modèle vs payer l'accès API.",
"help.recipe.x1.example": "Essayez: « Dois-je entraîner un 8B custom ou utiliser GPT-4o pour 50M tokens/mois ? »
Réponses: OUI (custom) / NON (API) avec mois pour break-even.",
"help.recipe.x2.title": "X-2 Viabilité contexte long — prédit si un modèle sert une longueur cible de manière fiable.",
"help.recipe.x2.example": "Essayez: « Meta-Llama-3-8B gérera-t-il 32000 tokens pour récupération ? »
Chaîne: γ_Padé → décomposition → d_horizon → plafond NIAH → hallucination → mémoire KV.
Verdict: OUI / DÉGRADÉ / NON avec mitigation si nécessaire.",
"help.recipe.x3.title": "X-3 Pre-flight budget — étant donné un budget $, quel modèle est faisable à entraîner ?",
"help.recipe.x3.example": "Essayez: « J'ai $5000, quel modèle puis-je entraîner ? »
Réponse: GO / TINY-MODEL / MEMORY-LIMITED avec N (params) et D (tokens) concrets.",
"help.recipe.x5.title": "X-5 Sélection hardware — quel GPU utiliser pour servir au throughput cible ?",
"help.recipe.x5.example": "Essayez: « Hardware le moins cher pour servir Llama-3-8B à 10M tokens/jour »
Réponse: meilleur GPU + $/Mtok + capacité vs cible.",
"help.recipe.x19.title": "X-19 Décision compression KV — utiliser soft decay, hard cutoff, ou méthodes de littérature ?",
"help.recipe.x21.title": "X-21 Diagnostic Pureté Imprint — prédit γ sur tokens RANDOM via ν=−1/(2π); à quel point la prédiction RoPE du modèle est-elle propre ?",
"help.recipe.x22.title": "X-22 Invariant Compute-Context — γ × log(N²·D) est-il dans la bande 51.2 ± 16.8 ? Détecte anomalies de scaling/training.",
"help.recipe.x23.title": "X-23 Détecteur Phase IH — pré- ou post-induction-head ? Probe peu coûteux via sign(γ_text − γ_random).",
"help.recipe.x19.example": "Essayez: « Comment compresser le cache KV pour Qwen2.5-7B à 32K ? »
Réponse: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.",
"help.recipe.x21.example": "Essayez: « Quelle est la pureté de la prédiction RoPE sur Llama-3-8B ? »
Réponse: γ_random prédit + diagnostic (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).",
"help.recipe.x22.example": "Essayez: « Mistral-7B entre-t-il dans l'invariant compute-context ? »
Réponse: K = γ·log(N²·D), z-score, IN-BAND ou OUTLIER.",
"help.recipe.x23.example": "Essayez: « Qwen2.5-7B est-il post-induction-head ? »
Réponse: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY.",
"help.section.v04": "Nouveautés v0.4 (résultats session 29, 2026-04-28) : trois recettes de diagnostic dérivées de l'analyse panel cross-model (n=22 LLMs).",
"help.divider.v04_s29": "— v0.4 (résultats session 29) —",
"footer.tech_stack": "Calcul : Pyodide · Synthèse : WebLLM (Qwen2.5-0.5B local) · Hébergement : GitHub Pages · Coût : 0 $",
"help.v04.imprint": "Pente d'imprint apprise ν = −1/(2π) : la période de rotation RoPE 2π entraîne un biais positionnel dans les poids, proportionnel à log(N_params). Même les tokens aléatoires montrent ce scaling. ν est DÉRIVÉ — non ajusté (erreur empirique 0,3 %).",
"help.v04.invariant": "Invariant Chinchilla-attention K : γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Connecte le scaling de compute et l'exposant d'attention en un seul nombre sans dimension.",
"help.v04.ih_probe": "Δγ comme probe IH : sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Moins coûteux que de lancer un benchmark in-context-learning.",
"help.v04.constants": "γ-cluster sur constantes célèbres (intriguant, n=4) : CodeLlama-13b γ=0.382 ≈ 1−1/φ (conjugué doré, err 0,0003) ; pythia-1.4b γ=0.705 ≈ 1/√2 ; Llama-2-7b γ=0.287 ≈ 1−1/√2 ; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat : peut être coïncidence.",
"help.param.theta": "θ (rope_theta): fréquence de base RoPE. Plus haut = plus de capacité longue portée. Typique: 10000 (anciens), 500000 (Llama-3), 1000000 (Qwen2.5).",
"help.param.T_train": "T_train: contexte max vu par le modèle pendant l'entraînement. De max_position_embeddings.",
"help.param.T_eval": "T_eval: votre longueur de contexte cible en inférence. Le bouton clé.",
"help.param.gqa": "n_kv_heads < n_attention_heads: le modèle utilise GQA (Grouped Query Attention). Réduit la mémoire KV mais pousse γ vers Hagedorn.",
"help.param.swa": "has_SWA: le modèle utilise Sliding Window Attention (Mistral, gemma-2).",
"help.param.nparams": "n_params: nombre total de paramètres. Seuil ~400M pour l'émergence des induction heads.",
"help.add_models.title": "Ajouter de nouveaux modèles (3 façons)",
"help.add_models.preset": "Liste de préréglages: 11 modèles populaires curés. Sélectionnez dans le dropdown.",
"help.add_models.hf": "HF Hub fetch: collez n'importe quel id (ex. Qwen/Qwen2.5-32B-Instruct), cliquez 📥 Charger. Le navigateur télécharge config.json directement de HuggingFace, remplit le formulaire. Fonctionne avec tout modèle public.",
"help.add_models.manual": "Manuel: remplissez les champs directement avec les valeurs de la model card.",
"help.audit.title": "La chaîne auditable",
"help.audit.body": "Chaque résultat montre la Chaîne de Calcul complète — chaque étape de formule avec ses entrées, sortie et interprétation. Cliquez sur n'importe quelle étape pour développer. Les références de section (§26.1, §19.1, etc.) renvoient au paper pour la dérivation.",
"help.synthesis.title": "La réponse en langage naturel",
"help.synthesis.body": "Après exécution de la chaîne déterministe, un LLM dans le navigateur (Qwen2.5-0.5B, ~350MB cachés après premier chargement) synthétise un résumé en langage naturel. Les nombres ci-dessus sont toujours corrects (Python déterministe) ; la synthèse est générée par LLM — vérifiez contre la chaîne en cas de doute.",
"help.params.title": "Paramètres communs expliqués",
"help.verdicts.title": "Quoi regarder dans les verdicts",
"help.verdict.yes": "OUI / GO — procédez avec confiance ; les nombres soutiennent le choix.",
"help.verdict.deg": "DÉGRADÉ / TINY-MODEL — fonctionne avec caveats ; lisez l'action.",
"help.verdict.no": "NON / MEMORY-LIMITED — ne procédez pas tel quel ; mitigation fournie.",
"help.privacy.title": "Confidentialité",
"help.privacy.body": "Tout s'exécute dans votre navigateur. Pas de télémétrie, pas d'analytique, pas de données envoyées ailleurs. Même le modèle LLM s'exécute localement via WebGPU/WebAssembly. Vos model_ids et questions ne quittent jamais cette page.",
"help.source.title": "Code source et paper",
"help.source.body": "Code : github.com/karlesmarin/tafagent
Paper : Marin 2026 — Predicting How Transformers Attend (Zenodo ; arXiv à venir)
Dataset : taf-attention-decay — 58 mesures γ sur 32 modèles (CC-BY-4.0)",
"footer.text": "© 2026 Carles Marin · Apache-2.0 · recherche indépendante · l'outil qui ferme la boucle du paper.",
},
// ────────────────────────────────────────────────────────────────────────
// ZH — 中文
// ────────────────────────────────────────────────────────────────────────
zh: {
// §33 v0.4 (session 31, 2026-04-30) — new diagnostic functions
"v04.title": "🆕 v0.4 — 新诊断 (会话 31)",
"v04.section.intro": "会话 31 (2026-04-30) 从公式 cross-of-crosses 游戏 + 苏格拉底质询中得出的四个新诊断函数。在 taf_browser.py §33 中可用。",
"v04.arch.label": "架构集中度",
"v04.arch.desc": "γ_text ≈ γ_Padé − 0.012·n_kv。跨面板相关性定律(R²=0.30)。警告:不是逐模型预测器。",
"v04.pdi.label": "PDI — Padé 偏差指数",
"v04.pdi.desc": "PDI = d_horizon_obs/T_eval。交通灯:绿色(≈1)、橙色(>>1)、黄色(<<1)、红色(B 阶段负值)。",
"v04.4bit.label": "4 位精度移位预测器",
"v04.4bit.desc": "MHA: R²(bf16)<0.9 → γ 上升;R²>0.99 → γ 下降。GQA: 精度稳健。",
"v04.crit.label": "临界指数捆绑",
"v04.crit.desc": "ν_c、β_c、η_c (=γ−1, 已修正)、α_C、γ_susc,AM-GM 最小值在 γ=1−1/√2≈0.293。",
// §34 v0.5 (会话 32, 2026-05-01) — 机器验证的代数一致性
"v05.title": "🔬 v0.5 — 机器验证一致性 (会话 32)",
"v05.section.intro": "Sage Groebner basis + Lean Mathlib4 双工具验证 TAF 临界指数的15 个代数恒等式。首个具有形式化机器证明支持的 transformer-attention 框架。",
"v05.verify.label": "代数一致性检查",
"v05.verify.desc": "给定测得的 γ,验证 12 个 D-SAGE 恒等式(D-SAGE-1:2η²+η·γ_χ+1=0、β·χ=−1、α+χ=2 等)。全部通过 = 框架完整。失败表明 bf16 异常值 / 量化伪影。",
"v05.dsage1.label": "D-SAGE-1 (★★ 核心)",
"v05.dsage1.desc": "二次恒等式 2η² + η·γ_χ + 1 = 0(Sage Groebner 发现, Lean 验证)。取代错误的 '三重闭合' 主张。从代数上反驳 paper 1 的 η=2γ。",
"v05.erratum.label": "Paper 1 勘误 — η 修正",
"v05.erratum.desc": "Paper 1 原本声明 η = 2γ。Sage Groebner + Lean Mathlib4 证明此为失败(残差 (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ A 相)。正确值:η = γ−1,满足 D-SAGE-1。",
"v05.repro.label": "可重现性",
"v05.repro.desc": "全部 15 个定理在 Lean Mathlib4 中机器证明(build 成功 1973 jobs)。Sage 脚本:analysis/sage_recursive_sweep_2026-04-30.sage。Lean 代码:lean_taf/taf/Taf/Identities.lean。",
// v0.5.1 — TAF Card consistency check button
"v05.consistency.title": "🔬 代数一致性检查 (Sage + Lean v0.5)",
"v05.consistency.desc": "验证 TAF 临界指数的 12 个 D-SAGE 代数恒等式(Sage Groebner basis + Lean Mathlib4 机器证明)。通过 = 框架完整。失败 = bf16 异常值 / 量化伪影。",
"v05.consistency.btn": "🔬 验证代数一致性",
// v0.5.2 — Anti-Ising universality class badge
"v05.antiising.badge": "🧲 反 Ising 类 (β=γ−1<0,机器验证)",
// v0.5.2 — 每个恒等式的工具提示(通俗解释)
"v05.tooltip.D_SAGE_1": "二次代数恒等式,连接异常维度 η 和磁化率 γ_χ。Sage Groebner basis 发现的核心恒等式(机器证明)。取代了之前关于三重闭合的错误声明。",
"v05.tooltip.D_SAGE_2": "在 A 相中,β = γ−1 为负(反 Ising)。乘以 χ = 1/(1−γ) 恰好等于 −1。TAF 负 β 体制的标志。",
"v05.tooltip.D_SAGE_4": "比热指数 α 和磁化率 χ 在 TAF 中精确加和为 2。Josephson 超标度的代数推论。",
"v05.tooltip.D_SAGE_5": "线性恒等式:α + γ_χ = 2(2−γ)。意味着当 γ 接近 1(Hagedorn)时,总和接近 2;在 γ=0 时为 4。",
"v05.tooltip.D_SAGE_6": "序参量指数乘以磁化率指数等于 γ 的特定二次式。因式分解的代数关系。",
"v05.tooltip.Rushbrooke_tautology": "标准 Rushbrooke 超标度 2β + γ_χ = ν·d 在 d=1。在 TAF 中这是一个重言式 — γ_χ 的定义就是为了使其成立。Sage Groebner basis 确认。",
"v05.tooltip.Josephson_tautology": "标准 Josephson 超标度 2 − α = ν·d 在 d=1。在 TAF 中这是一个重言式 — α 的定义就是为了使其成立。",
"v05.tooltip.Fisher_independent": "Fisher 关系 γ_χ = (2−η)·ν。在 TAF 中是独立的(不作为恒等式闭合,与三重闭合声明相反)。残差为 γ(2γ−3)/(1−γ)。",
"v05.tooltip.eta_2gamma_REFUTED": "Paper 1 声称 η=2γ。这个恒等式驳斥了它:残差在整个 A 相中为正。Lean Mathlib4 的机器证明驳斥。",
"v05.tooltip.D_14_nu_imprint": "学习到的印记斜率 ν = −1/(2π) 乘以 2π 得 −1。来自 paper 1 的简单维度检查。",
"v05.tooltip.D_SAGE_7": "中心电荷 c=3 乘以 |ν_imprint| 乘以 2π 得 3。连接 CFT 和训练印记的维度闭合。",
"v05.tooltip.nu_beta_id": "关联长度指数 ν 乘以序参量指数 β 在 A 相中得 −1。D-SAGE-2 的变体。",
"v053.calibration.title": "🔬 v0.5.3 — 校准审计 (2026-05-02)",
"v053.calibration.note": "SWA 修正已禁用 — 原 δ_SWA = -0.21 基于 n=1 模型拟合(数据不足;唯一案例的均值为 +0.355)。post_IH 修正标记为探索性 — 重审中组均值 ≈ 0(n=22 面板)未能复现 OLS 拟合。GQA 修正可复现(面板 +0.115 vs 硬编码 +0.11)。D_f 公式修正 Phase B (γ>1) — 使用离散累积和代替连续近似。LLaMA-3、Mistral、Gemma 现在报告正确的压缩值。",
"v053.release.banner": "🔧 v0.5.3 — 审计驱动的修复:KV 压缩 D_f 现使用离散和(适用于所有 γ);δ_SWA 禁用(n=1 校准);论文 §5.2 C_V 系数勘误 (1/4 → 1/12)。",
// §35 v0.6 — γ 预测 vs 观测 诊断
"gamma_check.title": "🔍 γ 预测 vs 观测",
"gamma_check.desc": "输入你经验测量的 γ。工具自动检测体制:欺诈 (θ 虚高) / 压缩 / 超 Padé / SWA-随机 / 正常。",
"gamma_check.gobs_label": "γ_观测",
"gamma_check.gobs_tip": "从模型注意力分数经验测量的 γ。使用 Diagnose CLI 从真实权重获取。",
"gamma_check.random_label": "随机语料?",
"gamma_check.random_tip": "若 γ_观测在随机/无结构 token 上测得请勾选。区分 SWA 签名 (γ_obs > 1) 与异常。",
"gamma_check.regime": "体制",
"gamma_check.regime.normal": "正常",
"gamma_check.regime.fraud": "欺诈 (θ 虚高)",
"gamma_check.regime.compressed": "上下文压缩",
"gamma_check.regime.overpade": "超 Padé",
"gamma_check.regime.swa": "SWA 签名 (随机语料)",
"gamma_check.regime.unknown": "未知",
"gamma_check.regime.normal.desc": "η ∈ [0.85, 1.15]:模型完全利用名义上下文,无异常。",
"gamma_check.regime.fraud.desc": "η < 0.01:名义 θ 虚高。模型表现如同 θ 远小于宣称值。可能是 YaRN/营销虚标,无真实上下文扩展。",
"gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5):上下文压缩 (模型注意距离比名义 θ 预测更短)。常见于 instruction-tuned / RLHF 模型。",
"gamma_check.regime.overpade.desc": "η > 1.5:模型注意距离超过 Padé 预测。可能是 Lerch 修正体制或欠训练早期 checkpoint。",
"gamma_check.regime.swa.desc": "随机语料上 γ_obs > 1.05 = 滑动窗口注意力签名 (Mistral / Gemma 系列)。",
"gamma_check.regime.unknown.desc": "输入超范围或 γ_obs > 1 但未标记随机语料。请核验测量。",
"gamma_check.validity.title": "⚠ 闭式 γ 可能不适用于此模型",
"gamma_check.validity.body": "Padé 预测假设没有显式注意力正则化的自然训练。你的 η 落在已验证带 [0.85, 1.15] 之外,因此闭式公式在此处不可靠。优先信任经验 γ (Phase Diagram 或 Diagnose CLI) 而非预测值。可能原因:强正则化迫使注意力近乎均匀、微调导致崩溃、滑动窗口架构、或非标准训练损失。详见 docs/LIMITATIONS.md。",
"gamma_check.validity.fraud.hint": "提示:η ≪ 1 通常表示 θ 营销虚标 (YaRN 风格) 而非真实上下文扩展,或训练强制注意力近乎均匀。",
"gamma_check.validity.compressed.hint":"提示:η ∈ [0.01, 0.5) 在 instruction-tuned / RLHF 模型中常见,后训练已使注意力分布扁平化。",
"gamma_check.validity.overpade.hint": "提示:η > 1.5 可能表示欠训练早期 checkpoint、Lerch 修正体制、或超出 Padé 近似的修正项。",
"gamma_check.validity.swa.hint": "提示:滑动窗口架构 (Mistral, Gemma) 在设计上违反闭式公式的 full-attention 假设。",
"gamma_check.validity.unknown.hint": "提示:γ_obs 超出物理范围或测量噪声。请核验输入并重新测量。",
"gamma_check.validity.summary_pill": "⚠ 有效性门",
"gamma_check.glossary.title": "ⓘ 词汇表 — 变量含义",
"gamma_check.glossary.gamma_pade": "γ_Padé:闭式预测 (2−z)/(2+z), z = T√2/θ。论文 §sec:gamma_decomposition。",
"gamma_check.glossary.gamma_obs": "γ_观测:从注意力分数经验测得 (在真实权重上运行 Diagnose CLI)。",
"gamma_check.glossary.theta_eff_obs":"θ_eff (观测):由 γ_obs 反演 T√2 / (1 − γ_obs)。测量隐含的有效 θ。",
"gamma_check.glossary.theta_eff_pade":"θ_eff (Padé):θ + T/√2。闭式公式预测的有效 θ。",
"gamma_check.glossary.efficiency": "η:θ_eff_obs / θ_eff_Padé 比值。≈1 = 正常 · <0.01 = 欺诈 · <0.5 = 压缩 · >1.5 = 超 Padé。",
"gamma_check.glossary.delta_h": "ΔH_Cardy:log(θ_eff_obs / θ_nominal)。Cardy 熵变。负值 = 压缩熵。~0 = 与名义匹配。",
"gamma_check.glossary.regime": "体制:基于 η + γ_obs + 随机语料标志的自动分类器。",
// §36 v0.6 — 内联 ⓘ 图标提示
"tooltip.gamma_pade": "γ_Padé(T_eval):闭式预测 (2−z)/(2+z), z = T√2/θ。论文 §sec:gamma_decomposition。",
"tooltip.gamma_decomposed": "γ_分解:基于完整架构分解的 γ。Padé 基线 + GQA 偏移 + post-IH 偏移 (校准审计已复制子集)。",
"tooltip.d_horizon": "d_horizon:有效注意力视野。超过此位置分数低于噪声底 (论文 §26)。",
"tooltip.L_NIAH": "L_NIAH 上限:当前 d_horizon 下针-在-干草堆检索可靠性的预测上限。",
"tooltip.chi": "χ 易感性:χ = 1/(1−γ)。在 Hagedorn 线 γ=1 处发散。",
"tooltip.kv_memory": "KV 内存 @ T_eval (BF16):每请求 KV 缓存 = 2 · n_layers · n_kv_heads · d_head · T_eval 字节。",
"tooltip.theta_eff_obs": "θ_eff (观测):由 γ_观测 隐含的有效 θ:T√2 / (1 − γ_obs)。",
"tooltip.theta_eff_pade": "θ_eff (Padé):闭式公式预测的有效 θ:θ + T/√2。",
"tooltip.efficiency": "η = θ_eff_obs / θ_eff_Padé:效率比。≈1 = 正常 · <0.01 = 欺诈 · <0.5 = 压缩 · >1.5 = 超 Padé。",
"tooltip.delta_h_cardy": "ΔH_Cardy:log(θ_eff_obs / θ_nominal)。Cardy 熵变。负值 = 压缩熵。~0 = 与名义匹配。",
"tooltip.verdict_aggregate": "判定:所有配方中最差。✅ 通过 = 全绿 · ⚠ 降级 = ≥1 黄 · ❌ 否 = ≥1 红。",
"tooltip.verdict_breakdown": "各配方分解:每个配方测试一个独立的决策轴 (长上下文 · 预算 · 硬件 · 自训 vs API · KV 压缩)。X-1 上的 ❌ 表示「按你的量级用 API」而非「模型失败」——展开 Recipes 节查看各轴上下文。",
"tooltip.gamma_pill": "γ 头条:γ_分解 (或 γ_Padé 回退)。范围 (0,1) = 相位 A (反伊辛)。γ ≥ 1 = Hagedorn / 相位 B。",
"tooltip.anti_ising": "反伊辛类:相位 A → β = γ−1 < 0。机器证明 (Sage + Lean Mathlib4)。见 §35 v0.5。",
// §37 v0.6 — Lean+Mathlib 定理表
"lean.table.title": "📑 Lean+Mathlib 定理表",
"lean.table.desc": "下方每一项都已机器证明对 Lean 4 + Mathlib4。点击任意 L# 链接跳转到 GitHub 源码行。按主题分组——点击标题展开。",
"lean.table.theorem": "定理",
"lean.table.claim": "陈述",
"lean.table.tactic": "策略",
"lean.table.source": "出处",
"lean.table.lean": "Lean",
"lean.findings.title": "🔎 实质性发现",
"lean.findings.detected_by": "检测于",
"lean.findings.fixed_by": "修正于",
"lean.findings.recommendation":"建议",
"lean.meta.repo": "仓库",
"lean.meta.build": "构建",
"lean.meta.theorems": "定理",
"lean.meta.verified": "已验证",
"lean.meta.rejected": "已拒绝",
"lean.meta.sorry": "sorry",
"lean.meta.findings": "项实质性发现",
"lean.manifest.loading": "正在加载 Lean 清单…",
"lean.manifest.error": "Lean 清单不可用",
// 帮助弹窗 — v0.6 节
"help.v06.title": "🆕 v0.6 — γ 预测-vs-观测 + Cardy ΔH + Lean 徽章",
"help.v06.intro": "v0.6 (2026-05-06):三个新诊断位于 TAF 卡的 🔬 诊断 下。全部在浏览器运行;γ_观测来自在真实权重上运行 Diagnose CLI。",
"help.v06.layout.title": "TAF 卡布局 (v0.6 新增)",
"help.v06.layout.body": "点击 🚀 生成完整画像 后,卡片展示:顶部一条 hero 条 (架构类 + 元信息 + 3 个 pill:聚合判定 ✅/⚠/❌、γ 头条、🧲 反伊辛若处于相位 A) 和四个 可展开节:📋 配方 (默认展开 — 各维度判定)、🔬 诊断 (关键数字、γ 预测 vs 观测、what-if 浏览器)、✓ 验证 (Sage+Lean 代数一致性、可证伪 F1-F23)、📂 来源与分享 (校准审计 + JSON 下载 / 链接 / 注册表提交)。点击任意标题展开。每个变量都有内联 ⓘ 提示。",
"help.v06.gamma_check.title": "γ 预测 vs 观测",
"help.v06.gamma_check.body": "输入经验测量的 γ,工具计算 η = θ_eff_obs / θ_eff_Padé 并分类到 5 种体制之一:",
"help.v06.case.normal": "正常 (η ∈ [0.85, 1.15]) — 模型完整使用名义上下文。用例:在采用前验证新发布。",
"help.v06.case.fraud": "欺诈 (η < 0.01) — 名义 θ 虚高;模型表现如同 θ ≪ 宣称值。用例:检测 YaRN/营销虚标 (CodeLlama / Mistral-Nemo 模式)。",
"help.v06.case.compressed": "压缩 (η < 0.5) — 上下文压缩;模型注意距离比名义 θ 短。用例:识别 RLHF/指令调优引起的压缩 (LLaMA-2 模式)。",
"help.v06.case.overpade": "超 Padé (η > 1.5) — 模型注意距离超过 Padé 预测。用例:识别 Lerch 修正体制或欠训练早期 checkpoint (pythia-1b 模式)。",
"help.v06.case.swa": "SWA 随机语料 (γ_obs > 1.05 且 随机语料=是) — 滑动窗口注意力签名。用例:在随机 token 上确认 Mistral / Gemma SWA。",
"help.v06.gamma_check.validity_gate.title": "有效性门 (v0.8.9+)",
"help.v06.gamma_check.validity_gate.body": "当 η 落在 [0.85, 1.15] 之外或体制非正常时,面板显示警告横幅,说明闭式预测可能不适用。在这些情况下信任经验 γ。完整的有效性体制讨论见 docs/LIMITATIONS.md(闭式 γ 假设无显式正则化的自然注意力;ν = -1/(2π) 假设 i.i.d. tokens)。",
"help.v06.cardy.title": "Cardy ΔH 诊断",
"help.v06.cardy.body": "ΔH_Cardy = log(θ_eff_obs / θ_nominal)。观测有效 θ 与名义 θ 之间的熵变。强负值 = 压缩熵;接近零 = 与名义匹配。在边界情况下补充 η。",
"help.v06.lean.title": "Lean + Mathlib 验证徽章",
"help.v06.lean.body": "TAF 恒等式在 Lean Mathlib4 中形式化机器证明:37 个定理分布于 7 组(Padé、RG 流、Cayley、D-SAGE、审计发现、CV 勘误、杂项)+ 1 项实质性发现(V 导数 2 倍因子,定理 V_derivative_ne_RG_beta)。源:github.com/karlesmarin/lean-taf(commit 25c77fd)。本地重新验证:git clone --depth=1 https://github.com/karlesmarin/lean-taf && cd lean-taf && lake exe cache get && lake env lean Taf/Identities.lean。Hero 中的 🧲 反伊辛 pill 与验证手风琴链接到具体源码行。",
"help.v06.glossary.title": "变量词汇表 (亦嵌入 TAF 卡)",
"help.v06.glossary.body": "TAF 卡中每个变量都有内联 ⓘ 提示。完整列表:γ、γ_Padé、γ_分解、γ_观测、θ、θ_eff_obs、θ_eff_Padé、η、ΔH_Cardy、χ、d_horizon、L_NIAH、KV 内存、体制。鼠标悬停任意 ⓘ 查看定义 + 论文章节。",
"hero.title": "🔬 TAF Agent",
"hero.tagline": "30 秒诊断任意 transformer LLM。免费。无需 GPU。无需注册。",
"hero.subtitle": "在你花钱或花时间之前,预测某个模型是否适合你的用例。所有计算在浏览器本地运行 — 你的输入永远不会离开此标签页。",
"hero.help": "📘 手册与示例",
"hero.quickstart_btn": "⚡ 快速开始",
"hero.inventory_btn": "🧰 它能给你什么",
"hero.about": "由独立研究员构建。开源。不隶属于任何模型供应商。",
"modes.title": "🎯 模式",
"modes.profile": "📇 模型画像",
"modes.compare": "🆚 比较模型",
"modes.inspector": "🔍 检查 config",
"modes.ask": "💬 自由提问",
"modes.recipe": "📋 选择配方",
"modes.diagnose": "🩺 诊断 CLI",
"diagnose.title": "🩺 诊断 CLI 命令生成器",
"diagnose.tip": "浏览器从 config 预测 γ;CLI 在真实权重上测量 γ_obs。此生成器产生在本地运行的精确命令。",
"diagnose.desc": "选择选项并将生成的命令复制粘贴到本地机器(Python + transformers + numpy)。快速模式 ≈5 分钟 CPU;完整 ≈20–60 分钟 GPU。",
"diagnose.model_label": "HF 模型 id:",
"diagnose.theta_label": "θ(留空自动):",
"diagnose.n_label": "上下文 N:",
"diagnose.options_label": "选项:",
"diagnose.opt_fast": "--fast(CPU,≈5 分钟)",
"diagnose.opt_cpu": "--cpu(强制 CPU)",
"diagnose.opt_4bit": "--load_in_4bit(≥7B 模型)",
"diagnose.local_label": "--local 路径(可选):",
"diagnose.build_btn": "📋 生成命令",
"diagnose.cmd_title": "生成的命令:",
"diagnose.copy_btn": "📋 复制到剪贴板",
"diagnose.next_steps": "下一步: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) 运行命令 (4) JSON 结果 → 通过 Inspect 模式上传以进行完整 TAF 分析。",
"modes.phase": "📊 相图",
"phase.title": "📊 相图(γ × θ)",
"phase.tip": "每个点是论文经验数据集中的一个模型。x 轴: log θ; y 轴: γ。Hagedorn 线 γ=1 分隔 A 相和 B 相。悬停查看详情,点击加载到表单。",
"phase.desc": "数据集中 23 个模型;Padé 曲线在 T=2000。",
"modes.desc": "最快开始: 粘贴任意 HuggingFace 模型 id (例如 meta-llama/Meta-Llama-3-8B),点击 画像。秒内看到所有 5 个配方的评分。",
"profile.title": "📇 模型画像",
"profile.desc": "面向技术人员: 当您需要候选模型的完整可行性快照时。一键运行所有 5 个配方,生成统一的 TAF 卡。",
"profile.preset_label": "预设:",
"profile.preset_default": "— 或从列表中选择 —",
"profile.hf_label": "HF 模型 id:",
"profile.fetch_btn": "📥 获取",
"profile.btn": "🚀 生成完整画像",
"profile.quickstart": "💡 快速开始: 选择任意预设 → 点击生成。或从 HF Hub 热门 粘贴一个 id → 📥 获取 → 生成。",
"compare.title": "🆚 模型并排比较",
"compare.desc": "面向技术人员: 当为特定部署场景在 2-3 个候选模型之间选择时。同一配方,多个模型,并排判定。",
"compare.recipe_label": "配方:",
"compare.T_eval_label": "T_eval (目标上下文):",
"compare.models_title": "要比较的模型(最多 3 个)",
"compare.btn": "🚀 比较",
"compare.example": "💡 尝试: 粘贴 3 个流行的 7-8B 模型 (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B),配方 X-2, T_eval=16000。查看哪个最适合长上下文。",
"ask.title": "❓ 您的问题",
"ask.placeholder": "例如: Mistral-7B 能处理 16K NIAH 检索吗?或: 我有 5,000 美元,可以训练什么模型?或: 以每天 1 亿 tokens 提供 Llama-70B 的最便宜 GPU?",
"ask.btn": "🚀 分析",
"ask.example_btn": "💡 尝试示例",
"recipe.title": "📋 配方",
"recipe.default": "— 选择一个配方 —",
"recipe.input_title": "🎯 输入",
"verdict.title": "📊 判定",
"chain.title": "🔍 计算链",
"chain.desc": "下面每个数字都是确定性 Python。点击步骤展开。",
"answer.title": "💬 自然语言回答",
"share.btn": "🔗 复制分享链接",
"share.copied": "✅ 已复制到剪贴板!",
"share.download": "💾 下载 JSON",
"share.download_md": "📝 Markdown",
"share.download_tex": "📜 LaTeX",
"share.submit": "📤 提交到 registry",
"share.submit_clip_ok": "↗ 已打开 GitHub。正文已复制到剪贴板——粘贴到 issue 正文。",
"share.submit_clip_fail": "↗ 已打开 GitHub。剪贴板被阻止——正文已写入浏览器控制台 (F12)。",
"share.import_title": "📂 导入共享的 TAF 结果",
"a11y.skip": "跳到主要内容",
// v0.6.2 — landing 重构:快速开始 + 功能清单 + 架构提示
"qs.title": "⚡ 快速开始",
"qs.step1": "粘贴 HuggingFace 模型 ID(例如 meta-llama/Meta-Llama-3-8B)",
"qs.step2": "点击 📇 Profile a model",
"qs.step3": "查看你的 TAF Card — 各用例的判定 + 关键数值 + 经 Lean+Mathlib 验证的数学",
"qs.cta": "↓ 立即开始",
"inv.title": "🧰 这个工具能给你什么",
"inv.recipes.title": "🎯 8 个 recipe — 这个模型符合你的用例吗?",
"inv.recipes.x1.title": "自训练 vs API",
"inv.recipes.x1.body": "对你的流量哪个更便宜?",
"inv.recipes.x2.title": "长上下文",
"inv.recipes.x2.body": "能可靠处理 32k / 128k tokens 吗?",
"inv.recipes.x3.title": "预算",
"inv.recipes.x3.body": "用 $X,你能从零训练什么模型?",
"inv.recipes.x5.title": "硬件",
"inv.recipes.x5.body": "用什么 GPU 服务 N tokens/天?",
"inv.recipes.x19.title": "KV 缓存",
"inv.recipes.x19.body": "如何压缩而不破坏质量?",
"inv.recipes.x21.title": "Imprint 纯度",
"inv.recipes.x21.body": "模型的位置编码有多干净?",
"inv.recipes.x22.title": "Compute-context",
"inv.recipes.x22.body": "模型是否落入经验带?",
"inv.recipes.x23.title": "IH 相位",
"inv.recipes.x23.body": "induction-head 之前还是之后?",
"inv.diag.title": "🔬 诊断",
"inv.diag.gamma": "γ 预测 vs 观测 — 自动分入 5 种状态(正常 · 欺诈/夸大上下文 · 压缩 · over-Padé · sliding-window)",
"inv.diag.cardy": "Cardy ΔH — 观测上下文与名义上下文之间的熵偏移",
"inv.diag.fals": "可证伪面板 — 检查 23 个具体预测(F1–F23)",
"inv.diag.alg": "代数一致性 — 模型必须满足的 8 条数学恒等式",
"inv.verify.title": "✓ 形式化验证的数学",
"inv.verify.count": "37 个定理已在 Lean 4 + Mathlib4 机器证明",
"inv.verify.click": "点击任意徽章 → 在 GitHub 打开源码行",
"inv.verify.reverify": "自行验证:lake build(缓存后 ≈5 秒)",
"inv.export.title": "📤 导出与分享",
"inv.export.formats": "JSON · Markdown · LaTeX(论文级)",
"inv.export.share": "可复现的分享链接(状态编入 URL)",
"inv.export.registry": "提交到 GitHub 上的社区登记",
"arch.summary": "支持的架构",
"arch.anyhf": "✓ 任意 HuggingFace 公开模型",
"tooltip.mha": "Multi-Head Attention:每个 token 位置同时通过多个并行 head 进行注意力计算。",
"tooltip.gqa": "Grouped Query Attention:queries 共享比 heads 更少的 keys/values(节省内存但把 γ 推向 Hagedorn)。",
"tooltip.alibi": "Attention with Linear Biases:位置信息以学习斜率加到注意力分数,无旋转。",
"tooltip.abspe": "Absolute Position Embeddings:每个位置有一个固定的学习向量加到 token embedding。",
"tooltip.swa": "Sliding Window Attention:每个 token 仅在固定局部窗口内做注意力(Mistral、gemma-2 使用此机制)。",
"tooltip.ssm": "State Space Model:维护内部状态的序列层(取代注意力,Mamba、Jamba 使用此机制)。",
// v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker
"modes.unmask": "🪟 揭示",
"unmask.title": "🪟 上下文揭示器",
"unmask.tip": "粘贴 HuggingFace 模型 id(或原始 config.json)。工具检测 sliding-window attention、RoPE 缩放(YaRN/linear/dynamic NTK)和 GQA — 所有使 max_position_embeddings 大于实际有效上下文的因素。Mistral-7B-v0.1 是经典例子:声称 32k,实际只在 ~4-8k 范围内做注意力。",
"unmask.desc": "你即将为一个实际上注意力不到那么远的模型花钱吗? 粘贴 id,1 秒内得知。无需 GPU,无需推理 — 只是对 config.json 做算术。",
"unmask.id_label": "HF 模型 id:",
"unmask.fetch_btn": "🔍 揭示",
"unmask.paste_summary": "或粘贴原始 config.json(私有 / 在研模型)",
"unmask.paste_btn": "🔍 揭示已粘贴的 config",
"unmask.label.declared": "声明上下文",
"unmask.label.effective": "有效(估计)",
"unmask.label.ratio": "比率",
"unmask.section.flags": "架构标志",
"unmask.section.warnings": "警告",
"unmask.section.reco": "建议",
"unmask.flag.swa": "SWA",
"unmask.flag.rope": "RoPE 缩放",
"unmask.flag.gqa": "GQA",
"unmask.flag.layers": "层数",
"unmask.flag.dhead": "d_head",
"unmask.flag.theta": "RoPE θ",
"unmask.flag.yes": "是",
"unmask.flag.no": "否",
"unmask.flag.full_mha": "否(完整 MHA,{n} heads)",
"unmask.verdict.honest": "✅ 诚实",
"unmask.verdict.inflated": "⚠ 夸大",
"unmask.verdict.severely_inflated": "❌ 严重夸大",
"unmask.verdict.yarn_extended": "⚠ YARN 扩展",
"unmask.verdict.unknown": "❓ 未知",
"unmask.warn.swa_window": "SWA 窗口:{window} tokens — 每层仅在此窗口内做注意力。",
"unmask.warn.multihop": "多跳估计:~{multiHop} tokens(保守:窗口 × {factor})。",
"unmask.warn.yarn": "RoPE 缩放({type})将上下文从 ~{original} 扩展 {factor}× 到 {declared} tokens。",
"unmask.warn.yarn_advice": "RoPE 扩展的上下文 — 用 γ_check 诊断在声称的全长度验证 γ 行为。",
"unmask.warn.gqa_small_dhead": "小 head dim({d_head})+ GQA:长上下文下 KV 缓存压缩很可能(γ 推向 Hagedorn)。",
"unmask.reco.honest": "标准全注意力模型。有效上下文与声明一致({declared} tokens)。",
"unmask.reco.inflated": "通过 SWA 有效 ~{effective} tokens。用 γ_check 验证你目标长度的行为。",
"unmask.reco.severely_inflated": "实际把它当作 ~{effective} tokens 上下文模型。{declared} tokens 的声明仅通过跨层注意力链生效,经验上超过 ~2× SWA 窗口后会退化。",
"unmask.reco.yarn_extended": "RoPE 扩展上下文。运行长上下文 benchmark(NIAH 在 8k / 16k / 32k / 全长度)以确认扩展是否成立。用 γ_check 设 T_eval = {declared}。",
"unmask.reco.unknown": "无法解析 config。验证 URL 是带公开 config.json 的有效 HF 模型。",
"unmask.status.empty_id": "⚠ 输入一个 model id(例如 mistralai/Mistral-7B-v0.1)。",
"unmask.status.fetching": "⏳ 正在获取 {modelId} 的 config.json...",
"unmask.status.success": "✅ 已分析 {modelId}(判定:{verdict})",
"unmask.status.empty_paste": "⚠ 请先粘贴 config.json。",
"unmask.status.invalid_json": "❌ JSON 无效:{error}",
"unmask.status.success_paste": "✅ 已分析粘贴的 config(判定:{verdict})",
"unmask.pasted_label": "(已粘贴 config)",
"mode_desc.ask": "输入自由问题。浏览器内的 LLM 选择正确的 recipe 并运行。",
"mode_desc.recipe": "直接选择一个 recipe 并填表。完整手动控制。",
"mode_desc.profile": "最快开始:粘贴任意 HuggingFace model id,点击 Profile。几秒内看到 5 个 recipe。",
"mode_desc.compare": "选择 2-3 个候选模型 + 一个 recipe。在表格中并排查看判定。",
"mode_desc.inspector": "直接粘贴 config.json。适用于未发布 HF Hub 的私有 / 在研模型。",
"mode_desc.diagnose": "构建 diagnose_model.py 的 CLI 命令,在真实 GPU 上测量 γ_obs。浏览器预测;CLI 测量。",
"mode_desc.phase": "论文经验面板的 γ × θ 散点图。悬停点查看详情,点击加载到 Diagnose / Recipe 表单。",
"mode_desc.unmask": "检测 max_position_embeddings 是否误导(SWA / YaRN / RoPE 缩放)。粘贴 model id,1 行判定。",
"profile.preset_loaded": "✅ 已为 {id} 加载预设。表单已预填。(点击 📥 Fetch 用 HF Hub 最新 config 覆盖。)",
// v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer
"modes.template": "📜 Chat-template",
"mode_desc.template": "检测模型使用的 chat-template 系列(Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek)。给出 lm-eval / vLLM / transformers 的精确 CLI flag。",
"template.title": "📜 Chat-template 检测器",
"template.tip": "粘贴 HF 模型 id(或原始 tokenizer_config.json)。检测 chat-template 系列并给出正确使用的精确框架命令。如果忘记应用,lm-eval-harness 会让 accuracy 静默对半(issue #1841)。",
"template.desc": "忘了 --apply_chat_template 吗? 大多数 multi-turn eval 因为 chat template 未应用而失败 ~50%。粘贴 model id,获取你 stack 的精确 CLI flag。",
"template.id_label": "HF 模型 id:",
"template.fetch_btn": "📜 检测",
"template.paste_summary": "或粘贴原始 tokenizer_config.json(私有模型)",
"template.paste_btn": "📜 检测已粘贴 config",
"template.label.family": "检测到的系列",
"template.label.markers": "匹配的标记",
"template.label.tpl_len": "Template 长度",
"template.section.warnings": "警告",
"template.section.commands": "各框架命令",
"template.section.raw": "原始 template(预览)",
"template.family.custom": "自定义(未知系列)",
"template.family.none": "(无 chat_template)",
"template.verdict.ok": "✅ 已检测到 TEMPLATE",
"template.verdict.custom": "⚠ 自定义 TEMPLATE",
"template.verdict.missing": "❌ 无 CHAT TEMPLATE",
"template.verdict.base_model": "ℹ 基础模型(无 chat)",
"template.verdict.unknown": "❓ 未知",
"template.warn.no_chat_template": "tokenizer_config.json 中无 chat_template 字段。基础 / 仅预训练模型的典型情况。如果你期待 instruct-tuned 模型,可能加载了错误的文件。",
"template.warn.custom_template": "非标准 template({length} 字符)。工具无法将其匹配到已知系列。检查下方预览并验证你的 eval 框架是否支持。",
"template.warn.lm_eval_apply": "lm-eval-harness:添加 --apply_chat_template,否则 multi-turn eval 上 accuracy 会静默下降 ~50%(issue #1841)。",
"template.warn.vllm_apply": "vLLM serve:验证 --chat-template 已设置(fine-tuned 变体的自动检测有时失败)。建议:{name}。",
"template.status.empty_id": "⚠ 输入 model id(例如 mistralai/Mistral-7B-Instruct-v0.3)。",
"template.status.fetching": "⏳ 正在获取 {modelId} 的 tokenizer_config.json...",
"template.status.success": "✅ 已检测 {modelId}(判定:{verdict})",
"template.status.empty_paste": "⚠ 请先粘贴 tokenizer_config.json。",
"template.status.invalid_json":"❌ JSON 无效:{error}",
"template.status.success_paste":"✅ 已检测粘贴的 config(判定:{verdict})",
"template.pasted_label": "(已粘贴 tokenizer_config)",
// v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor
"modes.arena": "🎯 Arena CI",
"mode_desc.arena": "从原始 pairwise 投票数据中恢复置信区间(Bradley-Terry MLE + bootstrap)。检测公开 Arena 排行榜隐藏的统计上并列对。",
"arena.title": "🎯 Arena-Elo CI 重建器",
"arena.tip": "Chatbot Arena 在公开排行榜中删除了置信区间。5 Elo 的差距在统计上可能毫无意义。粘贴原始投票数据(model_a, model_b, winner) — 工具计算 Bradley-Terry MLE + bootstrap CI 并列出统计上的并列(CI 重叠)。",
"arena.desc": "GPT-4 真的比 Claude 强吗 — 还是它们打平? 粘贴 pairwise 投票 CSV(或点击 加载样本)。Bradley-Terry MLE + 200 次 bootstrap → 排序 Elo + 95% CI + 统计并列检测。全部在浏览器中。",
"arena.sample_btn": "📊 加载样本数据",
"arena.run_btn": "🎯 计算 CIs",
"arena.clear_btn": "🗑️ 清空",
"arena.csv_summary": "投票 CSV(header:model_a,model_b,winner;winner ∈ a/b/tie)",
"arena.section.ranked": "排序 Elo 与 95% CI",
"arena.section.ties": "统计并列(CI 重叠)",
"arena.section.summary": "摘要",
"arena.col.rank": "#",
"arena.col.model": "模型",
"arena.col.elo": "Elo",
"arena.col.ci": "95% CI",
"arena.col.ci_width": "± 半宽",
"arena.col.matches": "对局",
"arena.col.wins": "胜 / 负 / 平",
"arena.col.tie_pair": "配对",
"arena.col.tie_diff": "Elo 差距",
"arena.col.tie_overlap": "CI 重叠",
"arena.no_ties": "无统计并列 — 所有配对在 95% CI 下可区分。",
"arena.summary.votes": "总投票数",
"arena.summary.models": "模型数",
"arena.summary.ties": "统计并列",
"arena.summary.bootstrap": "Bootstrap 迭代",
"arena.summary.ci_level": "CI 水平",
"arena.status.empty": "⚠ 粘贴投票 CSV 或点击加载样本。",
"arena.status.too_few": "⚠ 仅 {n} 个有效投票 — 需要至少 10 个才能可靠拟合 Bradley-Terry。",
"arena.status.computing": "⏳ 在 {n} 个投票上计算 Bradley-Terry MLE + bootstrap...",
"arena.status.done": "✅ {n} 投票 · {models} 模型 · {ties} 统计并列 · {ms} ms",
"arena.status.sample_loaded": "✅ 样本已加载(合成 6 模型 Arena 数据)。点击计算 CIs。",
// v0.7.3 — anti-bullshit pack #4: Contamination Prior
"modes.contam": "🧪 污染",
"mode_desc.contam": "对 benchmark 分数是否被污染做贝叶斯式的先验估计。输入模型训练 cutoff → 评估 20+ 主流 benchmark(MMLU、GSM8K、HumanEval、MMLU-Pro…)。",
"contam.title": "🧪 污染先验",
"contam.tip": "基于 (模型训练 cutoff 日期) × (benchmark 发布日期) × (已知语料库纳入 + 泄漏历史),对 benchmark 分数是否被污染做贝叶斯式的先验估计。Open LLM Leaderboard v1 在 2024 年因 MMLU/HellaSwag 分数被污染而停用。",
"contam.desc": "你应该相信你模型的 MMLU 分数吗? 输入模型训练 cutoff 日期 — 工具评估 20+ 主流 benchmark(MMLU、HellaSwag、GSM8K、HumanEval、IFEval、MMLU-Pro、GPQA…)并告诉你哪些分数可能被污染。",
"contam.cutoff_label": "训练 cutoff:",
"contam.run_btn": "🧪 评估所有 benchmark",
"contam.section.ranked": "Benchmark 污染先验",
"contam.section.high": "🔴 高风险 benchmark(视分数为不可信)",
"contam.section.medium": "🟡 中等风险(用替代品验证)",
"contam.section.low": "🟢 低风险(可能干净)",
"contam.col.benchmark": "Benchmark",
"contam.col.released": "发布",
"contam.col.gap": "差距(月)",
"contam.col.prior": "P(污染)",
"contam.col.level": "等级",
"contam.col.corpora": "在语料库",
"contam.col.category": "类别",
"contam.label.high": "高风险",
"contam.label.medium": "中",
"contam.label.low": "低",
"contam.no_entries": "(此类别中无)",
"contam.advice.high": "视这些分数为不可信。用更新 / 私有测试的替代品替换(MMLU-Pro、GPQA、MUSR、MATH-500)。",
"contam.advice.medium": "谨慎对待。在 held-out 子集或社区复现上寻找复制。",
"contam.advice.low": "分数可能未被污染,但没有泄漏不等于证明 — 仍要用替代测试交叉验证。",
"contam.summary.headline": "Cutoff {cutoff} · {n} 个 benchmark 已评估",
"contam.status.empty": "⚠ 输入模型训练 cutoff 日期(例如 2023-12)。",
"contam.status.bad_date": "⚠ 日期格式错误。使用 YYYY-MM 或 YYYY-MM-DD。",
"contam.status.done": "✅ Cutoff {cutoff} · {n} benchmarks 已评估 · {high} 个高风险",
// v0.7 — Help modal section
"help.v07.title": "🆕 v0.7 — Anti-bullshit 套件(4 个新模式)",
"help.v07.intro": "v0.7(2026-05-06):四个新模式,解决 HuggingFace 社区报告的具体痛点。每个都在浏览器中运行,无推理 — 纯元数据 + 数学。",
"help.v07.unmask.title": "🪟 上下文揭示器",
"help.v07.unmask.body": "检测 max_position_embeddings 何时具有误导性。Mistral-7B-v0.1 声称 32k 但通过 SWA 实际只在 ~4-8k 内做注意力。粘贴 HF 模型 id → 1 秒判定(诚实 / 夸大 / 严重夸大 / YARN 扩展)。捕获 SWA、RoPE-scaling(YaRN/linear/dynamic NTK)、小 d_head + GQA。用例:在为 32k 上下文付 GPU 钱之前,验证模型是否真的注意那么远。",
"help.v07.template.title": "📜 Chat-template 检测器",
"help.v07.template.body": "检测模型使用的 chat-template 系列(Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / 自定义 / 无)并给出 lm-evaluation-harness、vLLM、transformers 的精确 CLI flag。解决 lm-eval-harness 的 issue #1841:忘记 --apply_chat_template 会让 multi-turn accuracy 静默对半。用例:报告 benchmark 分数前,确认你正确应用了 template。",
"help.v07.arena.title": "🎯 Arena-Elo CI 重建器",
"help.v07.arena.body": "Chatbot Arena 在公开排行榜中删除了置信区间 — 5 Elo 的差距在统计上可能毫无意义。粘贴原始 pairwise 投票数据(model_a, model_b, winner)→ Bradley-Terry MLE + 200 次 bootstrap → 排序 Elo + 95% CI + \"统计并列\" 面板,列出 CI 重叠的配对。尝试加载样本按钮。用例:宣称 \"模型 A 胜过模型 B\" 之前,验证它们的 CI 不重叠。",
"help.v07.contam.title": "🧪 污染先验",
"help.v07.contam.body": "对 benchmark 分数是否被污染做贝叶斯式的先验估计。输入模型训练 cutoff 日期 → 工具按 P(污染) 评估 20+ 主流 benchmark(MMLU、HellaSwag、GSM8K、HumanEval、IFEval、MMLU-Pro、GPQA、AIME、MATH-500、BBH、MUSR…),基于时间差距、语料库纳入和已知泄漏历史。Open LLM Leaderboard v1 在 2024 年因 MMLU/HellaSwag 分数被污染而停用。用例:比较两个模型时决定相信哪些分数。",
"help.v07.quant.title": "⚖️ 量化机制分类器",
"help.v07.quant.body": "预测任意(模型 × 量化方案:NF4、AWQ、GPTQ、GGUF Q4_K_M / Q5_K_M / Q8_0、int8、FP8…)的 γ-shift 与 ΔPPL。架构感知:小 d_head + 激进 GQA → 更敏感;校准方案(AWQ)比未校准方案(NF4)更好地吸收偏移。检测到 cliff 时推荐更安全的替代方案。用例:量化之前,预测你的特定架构 × 方案组合是否能保持 PPL 可接受,否则给出具体的切换建议。",
"help.v07.drift.title": "🔀 跨框架 Drift 界",
"help.v07.drift.body": "同一模型,不同 setup 下分数不同。工具预测仅由数值噪声(dtype、framework、batch)允许的最大 drift。若观测差距超过它 → 真实 bug,通常是 chat-template mismatch(lm-eval-harness issue #1841)或 KV-cache 布局。试试 "加载样本" 按钮看典型的 chat-template bug。用例:在报告回归或声称可复现性之前,验证两个评估之间的差距是否大于数值噪声能解释的范围。",
"inv.v07.drift": "🔀 Drift — bug 还是噪声?预测两个评估间的最大可允许差距",
"help.v07.niah.title": "🔍 NIAH → Reasoning Gap",
"help.v07.niah.body": "RULER 论文(NVIDIA 2024)显示长上下文模型经常通过 NIAH(needle 检索)但在相同上下文上多跳 reasoning 失败。工具仅根据架构(γ_Padé + d_horizon + 架构压力:小 d_head、GQA、SWA)预测两种通过率,报告 gap,并找到模型 reasoning 保持 ≥65% 的\"安全 reasoning 上下文\"。扫描模式显示在 1k/4k/16k/64k/T_train 的曲线。用例:在声称的上下文部署之前,搞清楚模型是真的能在那里 reasoning 还是只能检索。",
"inv.v07.niah": "🔍 NIAH→Reason — 你的\"128k 上下文\"真的能在那里 reasoning,还是只能检索?",
// v0.7 — Inventory modal, card 5
"inv.v07.title": "🆕 v0.7 anti-bullshit 套件",
"inv.v07.unmask": "🪟 Unmask — config.json 声称 32k?看它是否真的注意那么远",
"inv.v07.template": "📜 Chat-template — 精确 CLI flag,让 lm-eval 不会静默对半你的 accuracy",
"inv.v07.arena": "🎯 Arena CI — 恢复 Chatbot Arena 隐藏的置信区间",
"inv.v07.contam": "🧪 污染 — 按污染概率对 20+ benchmark 评级",
"inv.v07.quant": "⚖️ Quant — 预测任意(模型 × 量化方案)组合的 γ-shift + ΔPPL",
// v0.7.3 — anti-bullshit pack #5: Quant-regime classifier
"modes.quant": "⚖️ Quant",
"mode_desc.quant": "预测任意(模型 × 量化方案)的 γ-shift 与 ΔPPL。架构感知:小 d_head + GQA → 更敏感。检测到 cliff 时推荐更安全的替代方案。",
"quant.title": "⚖️ 量化机制分类器",
"quant.tip": "预测给定(模型 × 量化方案)的 γ-shift(及由此产生的 ΔPPL)。\"AWQ 保留 ~95%\" 这类通用说法太模糊 — TAF 利用 d_head、GQA 比、SWA 标志和模型大小给出特定于架构的判定。解决:HF 社区普遍报告不可预测的量化 cliff(NF4 在 Phi-3 上 -2 PPL,但在 Llama-3-8B 上没问题)。",
"quant.desc": "量化会破坏你的模型吗?粘贴 HF 模型 id,选择量化方案 — 获取预测的 γ-shift、预期 ΔPPL 区间,以及在 cliff 情况下的推荐替代方案。仅浏览器,无 GPU,无需校准集。",
"quant.id_label": "HF 模型 id:",
"quant.fetch_btn": "📥 获取 config",
"quant.scheme_label": "量化方案:",
"quant.run_btn": "⚖️ 预测",
"quant.all_btn": "📊 比较所有方案",
"quant.regime.safe": "✅ 安全",
"quant.regime.mild": "✅ 轻度压缩",
"quant.regime.significant": "⚠ 显著退化",
"quant.regime.cliff": "❌ 重大 CLIFF",
"quant.label.gamma_shift": "γ 偏移",
"quant.label.delta_ppl": "ΔPPL(估)",
"quant.label.arch_mult": "架构乘数",
"quant.section.breakdown": "细节分解",
"quant.section.reco": "建议",
"quant.section.compare": "所有方案(按安全性排序)",
"quant.field.scheme": "方案",
"quant.field.calibrated": "已校准",
"quant.field.uncalibrated": "未校准",
"quant.field.base_penalty": "基础惩罚",
"quant.field.arch_mult_full": "架构乘数",
"quant.field.gamma_shift": "预测 γ 偏移",
"quant.field.ppl_band": "ΔPPL 区间(估)",
"quant.field.params": "参数量",
"quant.col.scheme": "方案",
"quant.col.bits": "比特",
"quant.col.gamma_shift": "γ 偏移",
"quant.col.ppl_band": "ΔPPL 区间",
"quant.col.regime": "机制",
"quant.reco.switch_to_awq": "切换到 {scheme} — 校准的 4-bit 处理小 d_head + GQA 比 NF4 好得多。预期 ΔPPL 下降 ~2-3 倍。",
"quant.reco.switch_to_q5_km": "切换到 {scheme} — Q5 以低成本保留更多 head 维度(仅大约 25% 文件更大)。",
"quant.reco.switch_to_q4_km": "切换到 {scheme} — Q3/Q2 对此架构过于激进。",
"quant.reco.consider_awq": "考虑 {scheme} — 在此架构上校准能显著降低 γ-shift。",
"quant.reco.use_higher_bits": "使用更高比特的替代 — 此架构无法干净吸收 4-bit。尝试 5 或 8-bit。",
"quant.reco.verify_with_eval": "用真实 eval 验证 — 预测偏移在边缘。部署前在目标上下文运行 NIAH。",
"quant.reco.no_action": "无需操作 — 此架构下量化是安全的。",
"quant.summary.headline_all": "{modelId} 的所有方案",
"quant.status.empty_id": "⚠ 输入 model id(例如 meta-llama/Llama-3.2-1B)。",
"quant.status.fetching": "⏳ 正在获取 {modelId} 的 config.json...",
"quant.status.fetched": "✅ 已获取 {modelId} 的 config。选择方案并点击预测(或比较所有)。",
"quant.status.no_scheme": "⚠ 从下拉中选择一个量化方案。",
"quant.status.done": "✅ 预测机制:{regime}",
"quant.status.done_all": "✅ 已比较 {n} 个方案 — 按安全性排序。",
// v0.7.4 — HF Hub autocomplete: privacy + rate-limit
"hf_auto.privacy": "🔒 查询发送到 huggingface.co/api · 本地缓存 5 分钟",
"hf_auto.rate_limited": "⚠ HuggingFace 速率限制 — 稍后再试,或手动键入完整 model id",
"hf_auto.gated_msg": "是 gated 模型。在此接受许可证:",
// v0.7.5 — anti-bullshit pack #6: cross-framework drift bound
"modes.drift": "🔀 Drift",
"mode_desc.drift": "在给定(framework、dtype、batch、chat-template)下预测两个 benchmark 分数之间的最大允许 drift。区分真实 bug 与数值噪声。",
"drift.title": "🔀 跨框架 Drift 界",
"drift.tip": "同一模型,不同 setup 下分数不同。差距是噪声还是真实 bug?输入两个分数及其(framework、dtype、batch、chat-template)— 工具预测仅由数值噪声允许的最大 drift。若观测差距超过它 → 真实 bug,通常是 chat-template mismatch(lm-eval issue #1841)或 KV-cache 布局。",
"drift.desc": "你的模型在 lm-eval-hf 给 67.2,在 vLLM-served 给 65.1。Bug 还是噪声? 输入两个分数及(framework、dtype、batch、是否应用 chat-template)。工具预测噪声区间并标记真实 bug。arxiv 2506.09501 将此记录为评估再现性的主要问题。",
"drift.setup_a": "Setup A",
"drift.setup_b": "Setup B",
"drift.score": "分数",
"drift.framework": "框架",
"drift.dtype": "Dtype",
"drift.batch": "Batch",
"drift.template": "Chat-template",
"drift.template.applied": "已应用",
"drift.template.not_applied": "未应用",
"drift.template.unknown": "未知",
"drift.run_btn": "🔀 计算 drift 界",
"drift.sample_btn": "📊 加载样本(chat-template bug)",
"drift.label.observed": "观测差距",
"drift.label.band": "数值区间",
"drift.label.ratio": "差距 / 区间",
"drift.section.setups": "Setups",
"drift.section.breakdown": "Drift 贡献者(数值区间)",
"drift.section.verdict": "判定与建议",
"drift.contrib.dtype": "Dtype 不匹配",
"drift.contrib.framework": "框架",
"drift.contrib.batch": "Batch 差异",
"drift.contrib.template": "Chat-template 不匹配",
"drift.dominant_cause": "主导原因",
"drift.cause.dtype": "dtype 精度差异",
"drift.cause.framework": "框架 / 内核差异",
"drift.cause.batch": "按 batch 的归一化路径",
"drift.cause.template_mismatch": "一侧应用了 chat-template 而另一侧没有(lm-eval-harness #1841 模式 — 多轮通常 -50%)",
"drift.verdict.noise": "✅ 数值噪声",
"drift.verdict.suspicious": "⚠ 可疑 — 验证",
"drift.verdict.bug": "❌ 真实 BUG — 调查",
"drift.verdict.bug_template": "❌ CHAT-TEMPLATE BUG",
"drift.reco.noise": "差距落在预期的数值噪声区间内。无需操作;差异与单独的 framework/dtype/batch 变化一致。",
"drift.reco.suspicious": "差距是预测区间的 1–2×。边缘——可能是真实 bug。尝试对齐主导贡献者(例如匹配框架或 dtype)并重新测试。",
"drift.reco.bug": "差距 > 预测区间的 2×。这是真实 bug。检查主导贡献者 — 很可能是 tokenizer / chat-template / KV-cache 布局差异。用 --apply_chat_template 运行 lm-eval-harness 并确认。",
"drift.reco.bug_template": "检测到 chat-template 不匹配。这是评估差距大的最常见原因(lm-eval-harness issue #1841)。用 --apply_chat_template 重跑 "未应用" 一侧(或设置 vLLM --chat-template <name>)并重测。",
"drift.status.empty_scores": "⚠ 输入两个分数。",
"drift.status.done": "✅ 判定:{verdict}",
"drift.status.sample_loaded": "✅ 样本已加载(典型 chat-template bug)。点击计算 drift 界。",
// v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor
"modes.niah": "🔍 NIAH→Reason",
"mode_desc.niah": "在任意上下文下预测 NIAH(检索)与多跳 reasoning 通过率。解决:长上下文模型常常通过 NIAH 但在同一上下文上 reasoning 失败(RULER 论文)。",
"modes.saturation": "📈 饱和度",
"mode_desc.saturation": "告诉你某个 benchmark 是否仍能区分 frontier 模型,或者已经饱和(例如 MMLU 88-94% 顶部,AIME 2025 已经 96-100%)。返回 top-3 + 判定 + 推荐替代品。",
"modes.hub": "🧭 方案",
"mode_desc.hub": "每个 LLM-eval 问题的地图 → tafagent 模式(若覆盖)+ 精选外部工具。找到方案而非重新发明。30+ 问题,7 类别。",
"modes.yarn": "🧵 YaRN 规划器",
"mode_desc.yarn": "生成精确的 rope_scaling 配置以将模型扩展到训练上下文之外 —— 外加 TAF 裁决:在目标长度下注意力质量是否真的撑得住。",
"modes.gguf": "🧊 GGUF 桥",
"mode_desc.gguf": "在浏览器内读取 GGUF 文件的元数据头(rope_theta、context_length、量化),给出 TAF 质量裁决 —— 显存计算器跳过的那个问题:塞得进且还能用吗?",
"gguf.title": "🧊 GGUF 有效性桥",
"gguf.tip": "塞进显存 ≠ 能用。GGUF/显存计算器读取模型元数据来告诉你某量化是否塞得进 GPU。本工具通过 HTTP Range 直接从 .gguf 头读取同样的元数据(rope_theta、context_length、量化方案、注意力头几何)—— 无需下载数 GB —— 并回答它们不答的:注意力质量是否真的撑得住,量化又侵蚀了多少(γ-shift、ΔPPL)?",
"gguf.desc": "粘贴一个 GGUF 仓库(如 Qwen/Qwen2.5-7B-Instruct-GGUF),选择一个量化文件,获得 TAF 质量裁决:模型的有效注意力视界,以及所选量化对这个具体架构的 γ 位移有多大。只在浏览器内读取文件头。",
"gguf.repo_label": "GGUF 仓库 id:",
"gguf.list_btn": "📂 列出量化文件",
"gguf.file_label": "量化文件:",
"gguf.target_label": "目标上下文 L(可选):",
"gguf.analyze_btn": "🧊 分析 GGUF",
"gguf.all_btn": "📊 比较所有量化",
"gguf.compare_title": "所有量化 —— 质量对比",
"gguf.col.verdict": "裁决",
"gguf.col.gamma_at_l": "γ @ L(量化后)",
"gguf.need_repo": "输入 GGUF 仓库 id,如 'Qwen/Qwen2.5-7B-Instruct-GGUF'",
"gguf.listing": "正在从 HF Hub 列出 .gguf 文件…",
"gguf.no_files": "该仓库中未找到 .gguf 文件。",
"gguf.found": "个量化文件已找到",
"gguf.pick_hint": "选一个并点击分析。",
"gguf.reading": "正在通过 HTTP Range 读取 GGUF 头…",
"gguf.read_ok": "头已解析",
"gguf.verdict.healthy": "健康 —— 量化后有效视界以良好的 γ 到达 L",
"gguf.verdict.usable_with_care":"可用但需谨慎 —— 到达 L,但量化后 γ 偏低",
"gguf.verdict.degrades": "退化 —— 注意力在 L 之前崩溃(或被量化推到那里)",
"gguf.r.arch": "架构",
"gguf.r.ctx_train": "训练上下文",
"gguf.r.horizon_fp16": "注意力视界(fp16)",
"gguf.r.quant": "量化方案",
"gguf.r.gamma_shift": "量化导致的 γ 位移",
"gguf.r.after_quant": "(量化后)",
"gguf.r.eff_horizon": "有效视界(量化后)",
"gguf.r.no_quant_shift": "—— 全精度,无 γ 位移",
"gguf.r.note": "视界来自 γ_Padé / d_horizon(架构)。量化 γ 位移 + ΔPPL 来自 quant-regime 模型(以 llama.cpp PPL + AWQ/GPTQ 论文校准)。两者皆为估计 —— 边界情况请用真实评测核实。",
"gguf.err.not_gguf": "该文件不是有效的 GGUF(magic 错误)。",
"gguf.err.too_large": "元数据头超出获取上限 —— tokenizer 异常大。请换一个量化。",
"gguf.err.incomplete": "GGUF 元数据缺少 rope_theta 或 context_length —— 无法计算视界。",
"help.v091.gguf.title": "🧊 GGUF 有效性桥",
"help.v091.gguf.body": "那一打 GGUF/显存计算器(NyxKrage、oobabooga……)读取 .gguf 头来告诉你某量化是否塞得进 GPU。本工具读取同样的头 —— 通过 HTTP Range,无需下载数 GB —— 并回答它们跳过的:塞得进且还能用吗? 粘贴一个 GGUF 仓库,选择一个量化文件;桥会提取 rope_theta、context_length、量化方案(来自 general.file_type 或文件名)和头几何,然后运行 TAF 的 γ_Padé / d_horizon 加上架构感知的 quant-regime γ 位移。输出:训练上下文处的有效注意力视界、量化对该模型侵蚀 γ(及 ΔPPL)的程度,以及裁决。用例:'Q4_K_M 塞得进 8GB —— 但超过 30K 会变傻吗?' → 在下载 6 GB 之前先看视界和 Q4 的 γ 惩罚。",
"yarn.title": "🧵 YaRN / RoPE 上下文扩展规划器",
"yarn.tip": "配置 + 裁决,不只是显存。GGUF/显存计算器告诉你某上下文长度是否塞得进 GPU。本工具给出要放入 config.json 的精确 rope_scaling 块,并判断该长度下注意力质量是否真的撑得住 —— 使用 TAF 的 γ_Padé / d_horizon 机制,全在浏览器内运行。",
"yarn.desc": "想让模型超出其训练上下文运行?输入模型(或其 θ + 训练上下文)和你的目标长度 L。获得可复制粘贴的 rope_scaling 片段(transformers ≥4.43),外加 TAF 裁决:有效注意力视界能否到达 L,还是模型在 d_horizon 之外就开始幻觉?",
"yarn.model_label": "HF 模型 id(可选):",
"yarn.fetch_btn": "📥 获取配置",
"yarn.orig_label": "训练上下文(orig max_position_embeddings):",
"yarn.theta_label": "RoPE θ(rope_theta):",
"yarn.target_label": "目标上下文 L:",
"yarn.type_label": "RoPE 缩放方法:",
"yarn.type_auto": "自动(推荐)",
"yarn.plan_btn": "🧵 规划扩展",
"yarn.need_id": "输入模型 id,如 'Qwen/Qwen2.5-7B-Instruct'",
"yarn.fetching": "正在从 HF Hub 获取 config.json…",
"yarn.loaded_hint": "如需调整请修改,然后点击规划扩展。",
"yarn.verdict.healthy": "健康 —— 有效视界以良好的 γ 到达 L",
"yarn.verdict.usable_with_care":"可用但需谨慎 —— 能用,但在 L 附近 γ 偏低",
"yarn.verdict.needs_finetune": "需要微调 —— 因子过大,仅凭闭式不够",
"yarn.verdict.degrades": "退化 —— 注意力在 L 之前崩溃",
"yarn.verdict.no_extension_needed":"无需扩展 —— L 已在训练上下文之内",
"yarn.r.factor": "扩展因子",
"yarn.r.method": "方法",
"yarn.r.naive": "(无扩展)",
"yarn.r.eff": "(扩展后)",
"yarn.r.from": "自",
"yarn.r.snippet": "config.json 片段",
"yarn.r.collapsed": "已崩溃(越过 Padé 极点)",
"yarn.copy_btn": "复制配置",
"yarn.copied": "已复制",
"yarn.warn.theta_eff_estimate":"θ_eff ≈ θ×因子 是一阶 NTK 估计;YaRN 的逐频带斜坡可能略有差异。",
"yarn.warn.aggressive": "激进因子 > 4× —— 不微调时 d_horizon 之外的质量不可靠。",
"yarn.warn.horizon_short": "有效视界未覆盖 L —— 预期在 d_horizon 之外失去连贯性。",
"yarn.warn.finetune": "此处的 RoPE 扩展是闭式估计;transformers 文档 + YaRN 论文建议对超过约 2–4× 的因子做一次短微调。",
"yarn.err.no_orig": "请输入训练上下文(orig max_position_embeddings),或获取一个模型。",
"yarn.err.no_theta": "请输入 RoPE θ(rope_theta),或获取一个模型。",
"yarn.err.no_target": "请输入目标上下文长度 L。",
"help.v09.title": "🆕 v0.9 —— YaRN / RoPE 上下文扩展规划器",
"help.v09.intro": "v0.9(2026-05-23):HuggingFace 上最常被问到的问题 —— \"我该怎么设置 rope_scaling 来扩展上下文,它真的能用吗?\" —— 用可复制粘贴的配置片段 + TAF 质量裁决来回答。仅浏览器,无推理。",
"help.v09.yarn.title": "🧵 YaRN / RoPE 上下文扩展规划器",
"help.v09.yarn.body": "HF 上那一打 GGUF/显存计算器(NyxKrage、oobabooga、DavidAU……)都回答同一个问题:上下文长度 L 塞得进我的 GPU 吗? 没有一个回答更难的:L 既塞得进、又还能用吗? 输入模型 id(或其 θ + 训练上下文)和目标长度 L。规划器计算扩展因子,输出 transformers ≥4.43 的精确 rope_scaling 块(yarn / linear / dynamic / llama3,带论文默认 β 斜坡),然后运行 TAF 的 γ_Padé / d_horizon 数学:无扩展的 γ(问题)、所选方法后的 γ(解法)、有效注意力视界,以及裁决 —— 健康 / 可用但需谨慎 / 需要微调 / 退化。它如实标注 θ_eff≈θ·因子 估计和 >4× 的微调要求。用例:'我想让 Mistral-7B(θ=10k,训练 8k)跑到 32k' → 看到朴素使用下 γ 崩溃、YaRN 部分恢复,并得到可粘贴的精确配置。",
"niah.title": "🔍 NIAH → Reasoning Gap",
"niah.tip": "NIAH(Needle in a Haystack)测试检索:\"在长文本中找到这个事实\"。多跳 reasoning 测试推理:\"把开头的事实 X+Y 与结尾的事实 Z 结合\"。RULER 论文(NVIDIA 2024)显示长上下文模型经常通过 NIAH 但在相同上下文上 reasoning 失败。本工具仅根据架构预测两种通过率。",
"niah.desc": "你的模型声称 128k 上下文。它在 64k 是真的能 reasoning,还是只能检索?粘贴 HF 模型 id 和目标 eval 上下文 — 工具预测 NIAH 与多跳 reasoning 通过率、gap,以及 reasoning 保持 ≥65% 的 \"安全上下文\"。",
"niah.id_label": "HF 模型 id:",
"niah.fetch_btn": "📥 获取 config",
"niah.teval_label": "目标上下文 (T_eval):",
"niah.run_btn": "🔍 预测",
"niah.sweep_btn": "📊 扫描上下文",
"niah.label.niah": "NIAH 通过率",
"niah.label.reasoning": "Reasoning 通过率",
"niah.label.gap": "Gap",
"niah.label.safe_ctx": "Reasoning 安全上下文",
"niah.section.breakdown": "架构细节",
"niah.section.reco": "建议",
"niah.calib.heading": "RULER 校准(NVIDIA 已发布数据)",
"niah.calib.matched": "匹配 {alias} → KB 行 {canonical}。",
"niah.calib.aggregate": "RULER 聚合分",
"niah.calib.interp": "在以下之间插值",
"niah.calib.extrapolated": "外推到 RULER 已测范围之外",
"niah.calib.col.heuristic": "启发式",
"niah.calib.col.calibrated": "RULER 校准",
"niah.calib.col.delta": "Δ",
"niah.calib.factors": "来自 RULER 论文附录表 13-16 的每任务因子:",
"niah.calib.factors_caveat": "诚实范围:retrieval 0.95-1.10×,reasoning 0.60-0.85×",
"niah.calib.claimed_vs_effective": "论文报告",
"niah.calib.claimed": "claimed",
"niah.calib.effective": "effective",
"niah.calib.source": "来源",
"niah.calib.miss": "此模型暂无 RULER 校准——仅使用架构启发式。如有实测数字,请添加到 data/ruler_kb.json。",
"niah.section.sweep": "按上下文长度扫描通过率",
"niah.field.dhorizon": "d_horizon(有效)",
"niah.field.ratio": "T_eval / d_horizon",
"niah.field.arch_pressure": "架构压力(小 d_head + GQA + SWA)",
"niah.field.theta": "RoPE θ",
"niah.field.t_train": "T_train(声称)",
"niah.col.context": "T_eval",
"niah.col.niah": "NIAH",
"niah.col.reasoning": "Reasoning",
"niah.col.gap": "Gap",
"niah.col.verdict": "判定",
"niah.verdict.robust": "✅ 稳健",
"niah.verdict.marginal": "⚠ 边缘",
"niah.verdict.degraded": "⚠ 退化",
"niah.verdict.retrieval_only": "❌ 仅检索",
"niah.verdict.broken": "❌ 失效",
"niah.reco.robust": "在此上下文下检索与 reasoning 都稳定。可安全部署用于查询和推理任务。",
"niah.reco.marginal": "边缘。检索可用但 reasoning 不稳。用于事实查询,不要用于多步推理。",
"niah.reco.degraded": "Reasoning 显著下降。模型能找到事实但难以组合它们。在此长度下避免多跳任务。",
"niah.reco.retrieval_only": "RULER 的典型发现:模型通过 NIAH 但 reasoning 失败。适用于 RAG 设置(LLM 仅定位事实),不适用于链式推理。把上下文降到下方的 \"安全\" 值。",
"niah.reco.broken": "在此上下文下模型连基本检索都失败。视为 out-of-distribution — 在更短上下文重测。",
"niah.safe_context": "≤ {ctx} tokens(reasoning ≥ 65%)",
"niah.safe_context_none": "在你的目标以下没找到安全上下文 — 模型即使在小上下文也 reasoning 失败。",
"niah.summary.sweep": "{modelId} — 按上下文的通过率",
"niah.status.empty_id": "⚠ 输入 model id(例如 meta-llama/Llama-3.1-8B-Instruct)。",
"niah.status.bad_teval": "⚠ 输入目标上下文(≥ 512 tokens)。",
"niah.status.fetching": "⏳ 正在获取 {modelId} 的 config.json...",
"niah.status.fetched": "✅ 已获取 {modelId} 的 config。设置 T_eval 并点击预测(或扫描上下文)。",
"niah.status.done": "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%",
"niah.status.sweep_done": "✅ 已扫描 {n} 个上下文长度。",
"saturation.title": "📈 Benchmark 饱和度检测器",
"saturation.tip": "MMLU 已饱和(所有 frontier 模型 88-94%)。报告\"92% on MMLU\"现在毫无意义。本工具告诉你哪些 benchmark 仍能区分 frontier 模型,哪些已饱和,以及替代方案。数据:DemandSphere AI Frontier Tracker(CC BY-NC 4.0),2026-05 刷新。",
"saturation.desc": "你的 benchmark 还有用吗?选一个 benchmark 查看 top-3 frontier 分数、spread 与判定(saturated / near-saturated / discriminative),并给出推荐替代品。",
"saturation.select_label": "Benchmark:",
"saturation.select.all": "— 显示所有 benchmark —",
"saturation.run_btn": "📈 分类",
"saturation.all_btn": "📊 显示全部",
"saturation.col.spread": "Top-3 spread",
"saturation.col.mean": "Top-3 平均",
"saturation.col.n": "模型数",
"saturation.col.bench": "Benchmark",
"saturation.col.verdict": "判定",
"saturation.col.reco": "首选替代",
"saturation.col.model": "模型",
"saturation.col.score": "分数",
"saturation.section.top3": "Top-3 frontier 分数",
"saturation.section.recommendations": "推荐替代品",
"saturation.section.note": "备注",
"saturation.section.all": "所有跟踪的 benchmark",
"saturation.verdict.saturated": "🚨 已饱和",
"saturation.verdict.near_saturated": "⚠ 接近饱和",
"saturation.verdict.discriminative": "✅ 仍可区分",
"saturation.verdict.sparse_data": "ℹ 数据稀疏",
"saturation.borderline": "边缘 — 在阈值切点的 ±1pp 内。判定视为\"需仔细核对\"。",
"saturation.unknown": "未知 benchmark。",
"saturation.attribution": "数据:DemandSphere AI Frontier Model Tracker(CC BY-NC 4.0)· HF Open LLM Leaderboard v3(开源权重历史)· 最近一次 fetch 2026-05-05。",
"saturation.status.live": "✅ 实时数据已加载 — {count} 个模型。",
"saturation.status.baked": "ℹ 使用 baked 快照(实时 fetch 不可用)。",
"saturation.status.kb_fail": "⚠ 无法加载饱和度 KB。",
"saturation.status.done": "✅ {name} — {verdict}",
"saturation.status.all_done": "✅ 已分类 {n} 个 benchmark。",
"help.v08.saturation.title": "📈 Benchmark 饱和度检测器",
"help.v08.saturation.body": "MMLU 已饱和(top 88-94%),AIME 2025 上线几个月就饱和,HumanEval 接近饱和。选任何 benchmark,工具返回 top-3 frontier 分数、spread、平均,以及判定 — saturated / near-saturated / discriminative — 加上推荐替代品(例如 MMLU → MMLU-Pro / GPQA / HLE)。可达时从 DemandSphere AI Frontier Tracker(CC BY-NC 4.0)实时 fetch;不可达时使用 2026-05-05 的 baked 快照。用例:在引用\"92% on MMLU\"或设计 eval 之前,检查 benchmark 是否仍能区分任何东西。",
"inv.v08.saturation": "📈 Saturation — 你的 benchmark 还有用吗,还是所有 frontier 都在顶部并列?",
// v0.8.2 — anti-bullshit pack #8: JSON CoT-aware Linter
"modes.cot": "📋 JSON CoT",
"mode_desc.cot": "对 JSON Schema(或示例响应对象)进行 linting,查找『答案在推理之前』的反模式。约束解码引擎按 schema 顺序输出字段——如果 `answer` 在 `reasoning` 之前,CoT 就被破坏了。",
"cot.title": "📋 JSON CoT 感知 Linter",
"cot.tip": "约束解码引擎(llguidance、Outlines、SGLang 语法)按 schema 顺序输出 JSON 属性。如果 schema 把 `answer` 放在 `reasoning` 之前,模型会先承诺最终答案,然后才写理由来证明它——彻底破坏 Chain-of-Thought。粘贴 JSON Schema(或示例对象),linter 会标记顺序问题。",
"cot.desc": "推理永远先于答案。 粘贴 JSON Schema 或示例响应对象——linter 报告推理字段是否在答案字段之前,并提出修复建议。",
"cot.input.placeholder": "{ \"type\": \"object\", \"properties\": { \"answer\": {\"type\": \"string\"}, \"reasoning\": {\"type\": \"string\"} } }",
"cot.lint_btn": "🔍 Lint",
"cot.example_good_btn": "↳ 示例:正确顺序",
"cot.example_bad_btn": "↳ 示例:反模式",
"cot.status.done": "✅ {verdict}",
"cot.col.field": "字段",
"cot.col.type": "角色",
"cot.field.reasoning": "推理",
"cot.field.answer": "答案",
"cot.field.other": "—",
"cot.field_count": "{n} 个字段",
"cot.verdict.good_order": "✅ 顺序正确——推理在答案之前",
"cot.verdict.anti_pattern": "❌ 反模式——答案在推理之前",
"cot.verdict.missing_reasoning": "⚠ 缺少推理字段",
"cot.verdict.missing_answer": "ℹ 未检测到答案类字段",
"cot.verdict.no_cot_fields": "ℹ 未检测到推理/答案字段",
"cot.verdict.invalid_json": "❌ JSON 无效",
"cot.verdict.non_object": "ℹ 顶层值不是对象",
"cot.verdict.empty_fields": "ℹ 没有可分析的字段",
"cot.explain.good_order": "约束解码会先输出推理,所以模型可以在承诺之前思考。Chain-of-Thought 保持诚实。",
"cot.explain.anti_pattern": "模型被迫先输出答案字段;之后的任何推理只能为已承诺的内容辩护。重新排序,使推理类字段在答案类字段之前。",
"cot.explain.missing_reasoning": "存在答案字段但没有推理字段。如果你想要 CoT,在答案之前添加 `reasoning`(或 `chain_of_thought`、`analysis`…)字段。",
"cot.explain.missing_answer": "存在推理字段但没有明显的答案字段。确保 schema 实际上要求模型承诺一个最终值。",
"cot.explain.no_cot_fields": "对象有字段但都不像推理或答案(按名称)。Linter 保守——如果 schema 是有意的,可以忽略。否则添加显式的推理/答案字段。",
"cot.hint.non_object": "顶层值必须是 JSON 对象(`{ … }`)或带 `properties` 的 JSON Schema。",
"cot.hint.empty_fields": "未检测到字段。粘贴 JSON Schema、示例响应,或点击 textarea 下方的示例按钮。",
"cot.suggested_fix.title": "✓ 建议修复",
"cot.suggested_fix.desc": "属性已重新排序——推理字段优先,然后是任何上下文字段,最后是答案字段。`required[]`(如果存在)也镜像同步。",
"cot.suggested_fix.copy": "📋 复制",
"cot.suggested_fix.copied": "✓ 已复制",
"cot.attribution": "参考:",
"inv.v082.cot": "📋 JSON CoT — 对 structured outputs schema 进行 linting,查找悄悄破坏 Chain-of-Thought 的『答案在推理之前』反模式。",
"help.v082.cot.title": "📋 JSON CoT 感知 Linter",
"help.v082.cot.body": "约束解码引擎(llguidance、Outlines、SGLang 语法)按 schema 声明的顺序输出 JSON 属性。如果你写 { answer, reasoning },模型先承诺 answer,CoT 就退化为事后辩护。粘贴任意 schema(或示例响应)——linter 把每个字段分类为推理、答案或其他,标记顺序,并输出可复制回去的重排修复。用例:『我的 CoT 提示在纯文本中正常但在 JSON 模式下退化』→ 运行 linter,找到颠倒的顺序,修复。",
// v0.8.3 — anti-bullshit pack #9: PEFT Anti-Pattern Checker
"modes.peft": "🔧 PEFT Lint",
"mode_desc.peft": "PEFT/LoRA 训练脚本的静态 linter。捕获基础模型的静默加载(peft #2115)、QLoRA 中 prepare/get_peft_model 顺序、target_modules/架构不匹配、以及 lora_alpha 约定。",
"peft.title": "🔧 PEFT 反模式检查器",
"peft.tip": "get_peft_model(base, config) 创建一个全新的 adapter——它不加载已保存的权重。想从 checkpoint 恢复必须调用 PeftModel.from_pretrained(base, path)。peft #2115 记录了基础模型静默加载的 bug。这个 linter 扫描你的训练脚本查找此模式(以及另外 3 个:QLoRA 顺序、target_modules/架构不匹配、lora_alpha 比率)。",
"peft.desc": "不要在基础模型上烧掉 10 小时的训练。 粘贴你的 PEFT 设置代码——linter 会标记基础模型的静默加载、QLoRA 顺序 bug、target_modules/架构不匹配,以及 lora_alpha 约定。",
"peft.input.placeholder": "from peft import LoraConfig, get_peft_model …",
"peft.lint_btn": "🔍 Lint",
"peft.example_bug_btn": "↳ 示例:基础模型静默加载",
"peft.example_qlora_btn": "↳ 示例:QLoRA 顺序 bug",
"peft.example_clean_btn": "↳ 示例:干净",
"peft.status.done": "✅ {verdict} — {n} 项发现",
"peft.line": "第 {n} 行",
"peft.summary": "{total} 项发现",
"peft.attribution": "参考:",
"peft.detected_at_line": "出现在第",
"peft.suggested_fix": "建议:",
"peft.detected_arch": "检测到的架构",
"peft.from_model_id": "(来自 model id",
"peft.your_modules": "你的 target_modules",
"peft.expected_modules": "此架构预期",
"peft.match_ratio": "{hits} / {total} 匹配。",
"peft.ratio": "比率",
"peft.alpha.convention": "约定为 α=2r 或 α=r",
"peft.qlora_order.detail": "prepare_model_for_kbit_training(第 {prepare_line} 行)在 get_peft_model(第 {get_peft_model_line} 行)之后运行。请反转顺序——先调用 prepare,然后 get_peft_model。",
"peft.no_peft_calls.detail": "未检测到 get_peft_model / PeftModel.from_pretrained / LoraConfig 调用。粘贴 PEFT/LoRA 设置代码片段。",
"peft.verdict.errors_found": "❌ 发现错误",
"peft.verdict.warnings_only": "⚠ 警告",
"peft.verdict.info_only": "ℹ 信息",
"peft.verdict.clean": "✅ 干净——未检测到问题",
"peft.verdict.no_peft_calls": "ℹ 未检测到 PEFT 调用",
"peft.verdict.empty_input": "ℹ 空输入",
"peft.rule.silent_base_load.label": "基础模型静默加载(peft #2115)",
"peft.rule.silent_base_load.explain": "get_peft_model(base, config) 创建一个新的 adapter——它不加载已保存的权重。你代码中的 checkpoint 提示表明你想从已保存的 adapter 恢复训练,但这个代码路径会悄悄从头开始并覆盖该次运行。",
"peft.rule.silent_base_load.fix": "恢复时请用 PeftModel.from_pretrained(base, path) 替换 get_peft_model(base, config)。加载后用 model.get_layer_status() 验证。",
"peft.rule.qlora_order.label": "QLoRA 顺序 bug",
"peft.rule.qlora_order.explain": "prepare_model_for_kbit_training 必须在 get_peft_model 之前调用。反转后,kbit 准备不会应用到 LoRA 层,梯度计算会破裂(loss → NaN,或静默训练空内容)。",
"peft.rule.qlora_order.fix": "重新排序:base = prepare_model_for_kbit_training(base) 然后 model = get_peft_model(base, config)。",
"peft.rule.target_modules_mismatch.label": "target_modules / 架构不匹配",
"peft.rule.target_modules_mismatch.explain": "你的 target_modules 列表与代码中检测到的架构的常规模块名不匹配。PEFT 会静默地把 LoRA 应用到无(或错误的层)。",
"peft.rule.target_modules_mismatch.fix": "在加载的基础模型上用 print([n for n,_ in model.named_modules()]) 验证模块名,或使用上面显示的特定架构列表。",
"peft.rule.alpha_not_2r.label": "lora_alpha ≠ 2r(约定)",
"peft.rule.alpha_not_2r.explain": "大多数已发表的 LoRA 配方使用 α = 2r(有效单位尺度)或 α = r(降低有效 LR)。自定义比率可行但值得检查。",
"peft.rule.alpha_not_2r.fix": "对照参考配方核对比率。如果是有意的,忽略此发现。",
"peft.rule.no_peft_calls.label": "未检测到 PEFT 调用",
"inv.v083.peft": "🔧 PEFT Lint — 捕获 get_peft_model 在基础模型上的静默加载(peft #2115)+ QLoRA 顺序 + target_modules / 架构不匹配。",
"help.v083.peft.title": "🔧 PEFT 反模式检查器",
"help.v083.peft.body": "PEFT 的 get_peft_model(base, config) 创建一个新的 adapter——它不从路径加载已保存的权重。粘贴教程代码并尝试从 checkpoint 恢复的人会静默地丢掉训练。peft #2115 是规范的 bug 报告。这个 linter 扫描你的脚本查找该模式 + 3 个相关问题(QLoRA 顺序、target_modules/架构不匹配、lora_alpha 比率),并报告带行号和建议修复的发现。用例:在启动 10 小时的 LoRA fine-tune 之前,粘贴你的脚本——在 200ms 内捕获静默 bug。",
// v0.8.4 — anti-bullshit pack #10: Prompt-Cache Diff Predictor
"modes.cache": "🔁 缓存差异",
"mode_desc.cache": "预测 prompt 编辑是否保留了提供商的 prompt cache 还是使其失效。每个提供商的命中率 + 与无缓存的 $ 差额。",
"cache.title": "🔁 Prompt-Cache 差异预测器",
"cache.tip": "Anthropic 的 cache_control 缓存在标记前缀的第一个 token 差异处中断。OpenAI 自动缓存 ≥1024 token 的前缀,但任何更改都会使其失效。Gemini context cache 需要 ≥32K token。位置不当的编辑会悄悄使你的账单 10 倍——API 永远不会警告。粘贴新旧 prompt,查看每个提供商的命中率 + 成本差额。",
"cache.desc": "不要因一个字符的编辑使账单 10 倍。 粘贴你之前和当前的 prompt——预测器找到最长公共前缀,估算 token,并显示每个提供商的命中率 + 与无缓存的 $ 差额。",
"cache.old_label": "旧 prompt:",
"cache.new_label": "新 prompt:",
"cache.old.placeholder": "你是一个有帮助的助手。…",
"cache.new.placeholder": "你是一个有帮助的助手。…",
"cache.profile_label": "Tokenizer 配置:",
"cache.profile.english": "英语(chars/4)",
"cache.profile.code": "代码(chars/3.5)",
"cache.profile.mixed": "中日韩 / 西里尔(chars/2)",
"cache.output_label": "估计输出 token:",
"cache.diff_btn": "🔍 预测",
"cache.example_good_btn": "↳ 示例:99% 命中",
"cache.example_broken_btn": "↳ 示例:缓存失效",
"cache.example_belowmin_btn": "↳ 示例:低于 OpenAI 最小值",
"cache.status.done": "✅ {verdict} — {hit}% 理论命中",
"cache.verdict.identical": "✅ 完全相同——完整命中",
"cache.verdict.divergent_can_cache":"⚠ 部分命中——按提供商不同",
"cache.verdict.divergent_below_min":"❌ 低于所有提供商最小值——无法缓存",
"cache.verdict.fully_divergent": "❌ 完全不同——缓存失效",
"cache.verdict.empty_input": "ℹ 空输入",
"cache.summary.tokens": "公共前缀 {common} / {total} token({pct}% 理论命中率)。",
"cache.summary.diff_at": "第一个差异在第 {line} 行。",
"cache.col.provider": "提供商",
"cache.col.hit": "命中",
"cache.col.cost": "基础 → 缓存",
"cache.col.savings": "节省",
"cache.note.requires_marker": "(需要 cache_control 标记)",
"cache.note.below_min": "(前缀 < {min} token——提供商最小值)",
"cache.write_surcharge": "+ {cost} 首次缓存写入附加费(Anthropic)",
"cache.diff.title": "缓存在哪里中断",
"cache.diff.legend": "绿色 = 共享前缀(可缓存)。红色 = 首次编辑(从这里开始全部重新计费)。",
"cache.hint.empty": "粘贴两个 prompt,然后预测。",
"cache.attribution": "参考:",
"cache.attribution.snapshot": "价格快照 2026-01;在按 $ 行动前请用提供商当前文档验证。",
"inv.v084.cache": "🔁 缓存差异 — 预测 prompt 编辑是否使提供商的 prompt cache 失效。每个提供商的命中率 + $ 差额。",
"help.v084.cache.title": "🔁 Prompt-Cache 差异预测器",
"help.v084.cache.body": "每个提供商的 prompt cache 有不同规则:Anthropic 的 cache_control 在标记前缀的第一个 token 差异处中断;OpenAI 自动缓存 ≥1024 token 的前缀;Gemini context cache 需要 ≥32K token。位置不当的编辑会悄悄使你的账单 10 倍——API 不会警告,成本只在下张账单上出现。粘贴新旧 prompt,预测器找到最长公共前缀,用三种 tokenizer 配置(英语/代码/CJK)估算 token,并显示每个提供商的命中率 + 与无缓存的 $ 差额,包括 Claude Opus/Sonnet/Haiku、GPT-5/mini 和 Gemini 2.5 Pro。用例:『我调整了 system prompt 后账单暴涨——什么坏了?』→ 粘贴两个 prompt,看到底哪个提供商停止缓存。",
// v0.8.5 — anti-bullshit pack #11: Speculative-Decode Compatibility
"modes.speculative": "🔬 Spec-Decode",
"mode_desc.speculative": "从 HF Hub 获取两个 model id 的 `tokenizer.json` 并在配置 speculative decoding 之前验证 vocab 兼容性。捕获浪费 draft 计算的静默不匹配 bug。",
"speculative.title": "🔬 Speculative-Decode 兼容性",
"speculative.tip": "Speculative decoding(vLLM、SGLang、llama.cpp、transformers)要求 draft 和 target 共享完全相同的词汇表。任何 token-id 不一致都会使 target 拒绝每个 draft token——你支付双倍计算成本且吞吐量比 baseline 更差。系统报告名义输出(只是更慢),所以 bug 在单元测试中不可见。这个工具从 HF Hub 获取两个 id 的 `tokenizer.json` 并比较。",
"speculative.desc": "不要发布 vocab 不匹配的 spec-dec。 粘贴 target + draft model id → 工具获取 tokenizer,比较 vocab 类型、大小、采样的 token-id、special token、added token → 判定 + speedup 估算。",
"speculative.target_label": "Target(大)model id:",
"speculative.draft_label": "Draft(小)model id:",
"speculative.target_label_short": "target",
"speculative.draft_label_short": "draft",
"speculative.check_btn": "🔍 检查兼容性",
"speculative.example_good_btn":"↳ 示例:Llama-3.1 8B/70B(受限 → mirror)",
"speculative.example_bad_btn": "↳ 示例:跨 family(坏)",
"speculative.gated_note": "💡 受限模型(Llama、Mistral、Gemma)会触发自动 open-mirror 回退(unsloth/...)。HF 官方不推荐浏览器端 token,所以工具无法 auth——但 mirror 的 tokenizer 通常字节级等同,因为量化只影响权重,不影响 tokenizer 工件。",
"speculative.mirror.heading": "Open-mirror 回退",
"speculative.mirror.target_used": "Target {original} 受限;使用 mirror {mirror}。",
"speculative.mirror.draft_used": "Draft {original} 受限;使用 mirror {mirror}。",
"speculative.mirror.warn": "Mirror tokenizer(例如 unsloth/)通常与受限原版字节级等同,因为量化只影响权重而非 token。如需精确匹配,请验证 chat-template(unsloth #880 记录了偶发的漂移)。",
"speculative.status.fetching": "🔄 从 HF Hub 获取两个模型的 tokenizer.json…",
"speculative.status.done": "✅ {verdict}",
"speculative.status.error": "❌ 错误",
"speculative.type_mismatch_note": "tokenizer 类型不同;spec-dec 不可能",
"speculative.vocab_size": "Vocab 大小",
"speculative.size_diff": "不同——每个重用的 id 都是一个不对齐",
"speculative.sampled": "Token-id 采样匹配",
"speculative.first_mismatch": "首次不匹配",
"speculative.special_diff": "Special token 差异",
"speculative.added_diff": "Added token 差异",
"speculative.added_diff_more": "+ 更多 …",
"speculative.speedup.title": "估算的 speedup 范围",
"speculative.speedup.params": "target {target} / draft {draft}(参数比 {ratio})",
"speculative.speedup.low": "低(α=0.50)",
"speculative.speedup.expected":"预期(α=0.70)",
"speculative.speedup.high": "高(α=0.85)",
"speculative.speedup.disclaimer": "α = draft 接受率。实际 speedup 取决于 prompt 域、lookahead K 和引擎开销。范围假设理想的 verifier batching。",
"speculative.speedup.draft_not_smaller": "Draft 不比 target 小——这里 spec-dec 是误用。",
"speculative.attribution": "参考:",
"speculative.side.target": "Target",
"speculative.side.draft": "Draft",
"speculative.fetch_error.missing_model_id": "缺少 model id",
"speculative.fetch_error.gated_or_private": "模型受限或私有——没有 auth 无法获取 tokenizer",
"speculative.fetch_error.not_found": "在 HF Hub 上找不到 model id",
"speculative.fetch_error.fetch_failed": "获取失败(HTTP 错误)",
"speculative.fetch_error.parse_failed": "JSON 解析失败(文件格式不正确)",
"speculative.fetch_error.timeout": "超时(>15 秒,大 tokenizer 或慢速连接)",
"speculative.fetch_error.network": "网络错误",
"speculative.fetch_error.hint": "检查 model id 拼写。受限模型需要通过你的 HF 账户查看 tokenizer 文件——这个工具无法 auth。",
"speculative.hint.missing_input": "输入两个 model id(target 和 draft),然后检查。",
"speculative.hint.identical_models": "Target 和 draft 是同一个模型——spec-dec 是 no-op(且浪费)。",
"speculative.verdict.compatible": "✅ 兼容——vocab 匹配",
"speculative.verdict.compatible_with_caveats": "✅ 兼容——但 special/added token 不同(请审查)",
"speculative.verdict.partial_compatible": "⚠ 部分匹配(采样 id 的 95-99.9%)",
"speculative.verdict.type_mismatch": "❌ Tokenizer 类型不同——spec-dec 不可能",
"speculative.verdict.vocab_size_mismatch": "❌ Vocab 大小不同——id 空间不对齐",
"speculative.verdict.incompatible": "❌ 不兼容——太多 id 不匹配",
"speculative.verdict.fetch_failed": "ℹ 无法获取 tokenizer",
"speculative.verdict.identical_models": "ℹ 模型相同——spec-dec 是 no-op",
"speculative.verdict.missing_input": "ℹ 输入两个 id",
"inv.v085.speculative": "🔬 Spec-Decode — 在发布 speculative decoding 前验证 target + draft 之间的 tokenizer vocab 兼容性(静默给出更差吞吐量的 bug)。",
"help.v085.speculative.title": "🔬 Speculative-Decode 兼容性",
"help.v085.speculative.body": "Speculative decoding 仅当 target 和 draft 共享完全相同的词汇表时才能工作。Vocab 不匹配导致每个 draft token 被拒绝——你支付双倍计算成本且吞吐量比 baseline 更差。更糟:系统仍输出正确(只是更慢),所以 bug 在单元测试中不可见。vLLM #4570 / #16757 / #20409 / #12488 都显示了变种。这个工具从 HF Hub 获取两个 model id 的 `tokenizer.json`,比较 tokenizer 类型、vocab 大小、完整 token→id 映射、special token 和 added token,然后基于参数比和典型 α=0.5/0.7/0.85 接受率估算 speedup 范围。用例:在启动启用了 spec-dec 的 vLLM 集群之前,验证这对模型是否真的兼容。",
// v0.8.7 — anti-bullshit pack #13: Multilingual Tokenizer Tax
"modes.tax": "🌍 Token Tax",
"mode_desc.tax": "通过浏览器端 transformers.js 对粘贴文本进行 6 个供应商 tokenizer 的真实 BPE 编码。揭示语言间的静默成本不对称。",
"tax.title": "🌍 多语言 Tokenizer 税",
"tax.tip": "Tokenizer 对非英语文本的征税不对称。同一段落在英语中可能是 100 个 token,但在拉丁字母训练的 tokenizer(Llama、Phi)上的中文可能是 250+ 个 token。每次请求成本和有效上下文都会静默降级。粘贴你的文本,通过供应商 tokenizer 查看实际 token 数——没有估算,通过 transformers.js 在浏览器中真实 BPE 编码。",
"tax.desc": "不要因中文支持让账单 3 倍。 粘贴任意文本 → 通过 Qwen / Phi / Llama / Gemma / GPT-4 / Claude 的真实 BPE 编码 → 查看相对于 baseline 的成本不对称。",
"tax.input_label": "要 tokenize 的文本:",
"tax.input.placeholder": "粘贴任何文本——英语、中文、阿拉伯语、代码……",
"tax.tokenize_btn": "🔬 Tokenize 全部",
"tax.sample_en_btn": "↳ 示例:English",
"tax.sample_zh_btn": "↳ 示例:中文",
"tax.sample_ar_btn": "↳ 示例:عربى",
"tax.sample_mixed_btn": "↳ 示例:混合",
"tax.sample_code_btn": "↳ 示例:代码",
"tax.status.loading": "⏳ 加载 transformers.js + tokenizer(首次运行可能需要 5-15 秒)…",
"tax.status.done": "✅ {n}/{total} 个 tokenizer,用时 {ms}ms",
"tax.col.tokenizer": "Tokenizer",
"tax.col.tokens": "Token 数",
"tax.col.cpt": "字符/token",
"tax.col.ratio": "比率",
"tax.summary.input": "输入:{chars} 字符,{bytes} 字节",
"tax.script_breakdown": "脚本",
"tax.interp.worst": "{label} 对此文本的 token 数比 baseline 多 {pct}%。",
"tax.interp.uniform": "✓ 所有 tokenizer 在 ±5% 范围内——文本在各供应商间处理良好。",
"tax.hint.empty": "粘贴文本然后点击 Tokenize。",
"tax.all_failed": "所有 tokenizer 都失败了。",
"tax.error.gated": "模型受限(需要 HF auth——尝试 open mirror)",
"tax.error.not_found": "找不到 model id",
"tax.error.timeout": "超时(大 tokenizer 或慢速连接)",
"tax.error.network": "网络错误",
"tax.error.fetch_failed": "获取失败",
"tax.error.invalid_input": "无效输入",
"tax.attribution": "Tokenizer 通过",
"tax.attribution.privacy": "文本在本地 tokenize——永远不会离开浏览器。",
"tax.firstload_note": "💡 首次加载:工具按需获取 transformers.js(~750 KB)+ 每个 tokenizer 的词汇表(每个 ~5-15 MB,加载后缓存)。后续运行即时。所有处理都是本地的——你的文本永远不会离开浏览器。",
"inv.v087.tax": "🌍 Token Tax — 6 个供应商 tokenizer 的真实 BPE 编码。揭示语言间(CJK / 阿拉伯语 / 混合)的静默成本不对称。",
"help.v087.tax.title": "🌍 多语言 Tokenizer 税",
"help.v087.tax.body": "Tokenizer 对非英语文本的征税不对称。同一段落在英语中可能是 100 个 token,但在拉丁字母训练的 tokenizer(Llama、Phi)上的中文可能是 250+ 个 token。每次请求成本和有效上下文都会静默降级。这个工具在你的浏览器中加载 HuggingFace transformers.js(~750 KB CDN),并对粘贴的文本运行 6 个预设供应商 tokenizer(Qwen2.5、Phi-3.5、Llama-3.1、Gemma-2、GPT-4 cl100k、Claude 近似)的 tokenize。输出:每个 tokenizer 的 token 数 + 字符/token + 相对于 baseline 的比率 + 成本不对称解读。自动检测脚本块(拉丁/CJK/阿拉伯/西里尔/天城/泰/希腊/希伯来/韩文)让你看到为什么一个 tokenizer 是另一个的 3 倍。用例:『我的多语言支持给账单加了 30%——哪种语言成本最高?』→ 粘贴真实生产文本,查看每个 tokenizer 的精确分解。",
// v0.8.8 — anti-bullshit pack #14:LongScore (RULER + HELMET 查询)
"modes.longscore": "🎯 LongScore",
"mode_desc.longscore": "查询你的模型在短上下文之外的相对降级。RULER + HELMET KB(n=93 模型)。LongScore 指标来自 100-LongBench (ACL 2025)。",
"longscore.title": "🎯 LongScore",
"longscore.tip": "每个模型都声称 128K 上下文窗口,但准确率早就开始降级。LongScore(来自 100-LongBench, ACL 2025 的 peer-reviewed 指标)测量相对于短上下文的降级。将基础能力与真正的长上下文能力解耦——你比较的是降级,而不是原始分数。在 RULER + HELMET KB 中查询(n=93 模型)。",
"longscore.desc": "你的模型在短上下文之外降级多少? 粘贴 HF 模型 id → 查看 LongScore(相对降级)+ 每长度分解 + HELMET 7-task 分数(如有)。无 GPU。无推理。纯查询已发布的 benchmark。",
"longscore.input_label": "模型 id:",
"longscore.input.placeholder": "例如 Qwen2.5-72B-Instruct 或 meta-llama/Llama-3.1-70B-Instruct",
"longscore.lookup_btn": "🔎 查询",
"longscore.example_good_btn": "↳ 示例:Jamba-1.5-Large(无降级)",
"longscore.example_mid_btn": "↳ 示例:Llama-3.1-70B(中等)",
"longscore.example_bad_btn": "↳ 示例:dbrx(严重)",
"longscore.formula_note": "💡 LongScore = 在 l ∈ {16K, 32K, 64K, 128K} 上的 (S_l − Base) / Base 平均值,其中 Base = mean(S_4K, S_8K)。来源:100-LongBench, ACL 2025。数据:NVIDIA RULER(每长度,n=33)+ HELMET(128K 聚合,n=60)。0 = 无降级;-0.30 = 严重。",
"longscore.miss.title": "KB 中未找到模型",
"longscore.miss.body": "查询了 {id}。KB 包含 {n} 个模型。请尝试规范 HF id(例如 Qwen2.5-72B-Instruct、Llama-3.1-70B-Instruct、Jamba-1.5-Mini)。",
"longscore.miss.suggest": "在以下位置检查覆盖范围",
"longscore.no_ruler": "⚠ 无每长度数据 — LongScore 无法计算。改为显示 128K 处的 HELMET 聚合。",
"longscore.score_label": "LongScore",
"longscore.helmet_label": "HELMET 7-task 分解",
"longscore.col.ctx": "上下文",
"longscore.col.score": "分数",
"longscore.col.lc": "LC",
"longscore.col.task": "任务",
"longscore.source_note": "数据源",
"longscore.hint.empty": "⚠ 请先粘贴模型 id。",
"longscore.status.lookup": "⏳ 查询中…",
"longscore.status.miss": "ℹ 模型不在 KB 中",
"longscore.status.ruler_hit": "✅ 找到 RULER 每长度数据",
"longscore.status.helmet_only":"ℹ 仅 HELMET 聚合(无每长度数据)",
"longscore.verdict.no_degradation": "✅ 短上下文之外无降级",
"longscore.verdict.mild": "🟢 轻度降级(<10%)",
"longscore.verdict.moderate": "🟠 中度降级(10-20%)",
"longscore.verdict.severe": "🔴 严重降级(20-30%)",
"longscore.verdict.extreme": "🚨 极端降级(>30%)",
"inv.v088.longscore": "🎯 LongScore — peer-reviewed 降级指标(100-LongBench, ACL 2025)。在 RULER + HELMET KB(n=93)中查询任意模型。看你的模型在短上下文之外实际下降多少。",
"help.v088.longscore.title": "🎯 LongScore",
"help.v088.longscore.body": "每个长上下文 LLM 都声称 128K,但早就开始降级。100-LongBench 论文 (ACL 2025, arXiv:2505.19293) 注意到原始长上下文分数被基础能力主导——一个更聪明但长上下文配方更差的模型,仍然得分高于一个不那么聪明但配方更好的模型,掩盖了真正的长上下文降级。他们提出 LongScore:LC_l = (S_l − Base) / Base,其中 Base = mean(S_short),然后对长长度取平均。结果:每个模型一个相对降级数字,可以同等比较。这个 tafagent 模式嵌入了 LongScore-ready 数据:RULER 每上下文聚合(n=33 模型,4K-128K)+ HELMET 128K 聚合(n=60 模型,7 类别)。查询是按 HF 模型 id 精确匹配(小写、连字符、点号已规范化)。对于有 RULER 数据的模型,你得到完整的 LongScore + 每长度分解 + 判定(无/轻/中/严重/极端降级)。对于仅 HELMET 模型,你得到 128K 处的 7-类别聚合。用例:『我想用 Llama-3.1-70B-Instruct 做 100K-token 文档摘要——实际上我损失多少准确率?』→ 粘贴 id,看到 -10% LongScore(中度降级,主要是 128K 处的 cliff)。决定是否使用、改用 long-ctx engineered 的模型,或者分块输入。",
"inv.v081.hub": "🧭 Solutions Hub — 每个文档化的问题都映射到一个 tafagent 模式或精选外部工具。别重复发明 — 去找。",
"help.v081.hub.title": "🧭 Solutions Hub",
"help.v081.hub.body": "tafagent 作为集成者而非孤岛。30+ 问题跨 7 类别(评估可靠性 · 诊断 · 设置 · 训练 · 检索 · 多模态 · 可观测性),每个映射到(a)解决它的 tafagent 模式(若存在),以及(b)社区已信任的最佳外部工具(RAGAS、MTEB、HELM、MCP Schema Validator、llm-stats、llguidance、GlitchMiner 等)。搜索框匹配 pain、场景和工具名称。用例:'我有问题 X — tafagent 解决它吗,如果不,谁解决?'",
"hub.title": "🧭 Solutions Hub",
"hub.tip": "我们已知的每个 LLM-eval 问题的地图:哪个 tafagent 模式能解决它(若有),以及社区已信任的最佳外部工具。目标:全覆盖。如果规范工具已在别处,我们链接而非重建。",
"hub.desc": "别重新发明 — 去找。30+ 问题映射到 tafagent 模式 + 精选外部工具。按类别浏览、按关键字搜索,或查看新模式最有帮助的空缺。",
"hub.clear_btn": "✕ 清空",
"hub.no_mode": "外部",
"hub.planned": "计划:",
"hub.best_for": "适合",
"hub.not_for": "不适合",
"hub.tools": "外部工具",
"hub.status.loaded": "✅ 已加载 {total} 个问题,跨 {categories} 类别 — {covered} 个由 tafagent 模式覆盖,精选 {externalLinks} 个外部链接。编译于 {compiled}。",
"hub.status.fail": "⚠ 无法加载 Solutions Hub。",
"hub.search.empty": "无 '{query}' 的匹配。尝试更宽泛的词(如 'eval'、'rag'、'tokenizer')。",
"hub.search.results": "为 '{query}' 找到 {n} 个匹配。",
// v0.7.7 — 任务卡片(UX 重构:按用户意图分组的 14 个模式)
"tiles.title": "🎯 你想做什么?",
"tiles.subtitle": "选择一项任务。每一项会打开下方对应的工具。或往下滚动查看完整的 22 个模式列表。",
"tile.diagnose.title": "🔬 诊断一个模型",
"tile.diagnose.desc": "这个具体模型符合我的用例吗?",
"tile.trust.title": "✓ 相信 benchmark 分数",
"tile.trust.desc": "我该相信这个数字吗?是 bug 还是噪声?",
"tile.eval.title": "⚙️ 正确设置 eval",
"tile.eval.desc": "获取 lm-eval / vLLM / transformers 的精确 CLI flag。",
"tile.compare.title": "🆚 比较模型",
"tile.compare.desc": "并排,或浏览经验模型面板。",
"tile.manual.title": "📋 手动 / 自由",
"tile.manual.desc": "手动挑一个具体 recipe,或用自然语言提问。",
"tile.diagnose.tip": "当你有具体的 model id 并想要完整诊断时从这里开始:Profile 一次运行所有 5 个 recipe。Unmask 检查 max_position_embeddings 是否诚实。NIAH→Reason 预测 retrieval-vs-reasoning 的 gap。LongScore 查询已发布的 RULER + HELMET 数据,显示模型在短上下文之外的真实降级(peer-reviewed 指标)。Quant 预测量化是否会破坏它。Inspect 允许粘贴原始 config.json,适用于私有 / 在研模型。",
"tile.trust.tip": "当你看到一个分数想知道它是否可靠。Contamination 按模型在训练时看到 benchmark 的可能性给 20+ 个 benchmark 评级。Drift 告诉你两个 eval 之间的 gap 是数值噪声还是真实 bug(chat-template 不匹配、KV-cache 布局等)。Arena CI 重建 Chatbot Arena 隐藏的置信区间——很多 top-Elo 的 "胜利" 在统计上是并列。",
"tile.eval.tip": "在运行 lm-eval-harness 或 vLLM serve 之前,获取正确的 CLI flag。Chat-template Sniffer 检测 template 系列(Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none)并输出精确的 --apply_chat_template / --chat-template 调用。解决 lm-eval-harness 的 issue #1841(accuracy 静默对半)。Diagnose CLI 生成 Python 命令在你的本地 GPU 上测量 γ_obs。",
"tile.compare.tip": "Compare:选择 2-3 个候选模型 + 一个 recipe,在并排表格中看判定(例如 Llama-3-8B vs Mistral-7B 在 32k 上下文)。Phase diagram:23 个经验模型在 (log θ, γ) 平面上的散点图,叠加 Padé 曲线。悬停点查看详情,点击将该模型加载到 Recipe 表单。",
"tile.manual.tip": "Recipe:挑选具体的 X-N recipe(X-1 自训 vs API、X-2 长上下文、X-3 预算、X-5 硬件、X-19 KV 压缩、X-21 imprint、X-22 compute-context 不变量、X-23 IH 相位)并手动填表,完全控制。Ask:输入自由问题;浏览器内的 0.5B LLM(Qwen2.5)选择合适的 recipe 并运行。最适合 "如果……会怎样" 的探索。",
"share.import_desc": "有他人 TAF 分析的 JSON 文件? 在这里加载以本地查看判定 + 链。与您自己运行的视图相同。",
"share.import_btn": "📂 加载共享的 JSON",
"synthesis.system": "您是 transformer LLM 的精确诊断助手。给定预先计算的 TAF 公式结果,用 4-6 句中文写出清晰的摘要。为每个提到的数字引用章节号 (§X.Y)。始终给出具体建议。不要编造数字。",
// INSPECTOR 模式
"inspector.title": "🔍 架构检查器",
"inspector.desc": "粘贴 config.json 的原始内容。工具提取架构参数并运行完整的 5 配方 Profile。",
"inspector.tip": "直接粘贴任意 config.json。工具解析它并运行完整 Profile。适用于:私有模型、开发中的 configs、尚未在 HuggingFace 的模型,或比较自定义架构的行为。",
"inspector.quickstart": "💡 用例:您有未在 HF Hub 上的私有模型,或正在设计的 config。粘贴下面的原始 JSON,获取完整 TAF 画像。",
"inspector.placeholder": "{\n \"model_type\": \"llama\",\n \"rope_theta\": 500000,\n \"max_position_embeddings\": 8192,\n \"num_attention_heads\": 32,\n \"num_key_value_heads\": 8,\n \"hidden_size\": 4096,\n \"num_hidden_layers\": 32\n}",
"inspector.T_eval": "T_eval (您的目标上下文):",
"inspector.btn": "🚀 检查并画像",
// WHAT-IF 滑块
"whatif.title": "🎚 What-if: 拖动 T_eval 实时查看 γ 变化",
"whatif.desc": "纯 JS 重新计算 (不调用 Pyodide)。滑动时显示几何 γ_Padé 和 d_horizon。点击按钮重新运行完整链。",
"whatif.T_eval": "T_eval",
"whatif.gamma_pade": "γ_Padé",
"whatif.d_horizon": "d_horizon",
"whatif.l_niah": "L_NIAH 上限",
"whatif.predicted": "预测几何判定",
"whatif.rerun": "↻ 在此 T_eval 重新计算完整链",
// COMMUNITY 反馈
"community.title": "🌐 社区最近提交",
"community.desc": "公共 registry 的实时反馈。点击任意提交查看完整分析。",
"community.browse_all": "浏览全部 →",
"community.loading": "加载中...",
"community.no_repo": "Registry 仓库尚未创建。一旦它存在并有提交,它们将在此处实时显示。",
"community.no_submissions": "暂无提交。成为第一个 — 生成一个 Profile 并点击 📤 提交到 registry。",
// FALSIFICATION 仪表板
"falsification.title": "🔬 论文预测 — 可证伪状态",
"falsification.desc": "TAF 框架基于可证伪的预测 (F1-F23)。每一个都经过经验测试。这是论文中每个预测的实时状态。",
"falsification.summary": "{confirmed} 已确认 · {partial} 部分 · {refuted} 已反驳 · {untested} 未测试 (共 {total} 个预测)",
"falsification.col.id": "ID",
"falsification.col.claim": "Claim",
"falsification.col.status": "状态",
"falsification.col.evidence": "证据",
"tafcard.title": "📇 TAF 卡 — 完整模型画像",
"tafcard.recipes_title": "📋 配方 — 各维度判定",
"tafcard.recipes_count_label": "维度",
"tafcard.numbers_title": "🔢 关键数字 (paper §26)",
"tafcard.fals_title": "🔬 可证伪状态 (F1-F23)",
"tafcard.fals_none": "无适用的可证伪。",
"tafcard.diag_title": "🔬 诊断 — 数字 · γ 检验 · what-if",
"tafcard.verify_title": "✓ 验证 — Lean + Sage + 可证伪",
"tafcard.share_title": "📂 来源与分享",
"tafcard.whatif_title": "🎚️ What-if 浏览器",
"verdict.go": "通过",
"verdict.no": "否",
"verdict.degraded": "降级",
"compare.title_out": "🆚 比较表",
"status.loading_pyodide": "⏳ 加载 Python 运行时 (~10MB,首次加载)...",
"status.loading_taf": "⏳ 加载 TAF 公式 + 配方...",
"status.ready": "✅ 就绪。选择一个模型并点击画像开始。",
"status.computing": "🧮 计算 TAF 链...",
"status.done": "✅ 完成。",
"profile.hf_placeholder": "例如: meta-llama/Meta-Llama-3-8B 或 Qwen/Qwen2.5-7B",
"compare.hf_placeholder": "HF 模型 id (例如: meta-llama/Meta-Llama-3-8B)",
"compare.slot1_placeholder": "HF 模型 id (例如: meta-llama/Meta-Llama-3-8B)",
"compare.slot2_placeholder": "HF 模型 id #2",
"compare.slot3_placeholder": "HF 模型 id #3 (可选)",
"compare.preset_default": "— 或预设 —",
// 表单参数
"param.theta": "θ (rope_theta)",
"param.theta.tip": "RoPE 基础频率 来自 config.rope_theta。越高 = 长程能力越强。",
"param.T_train": "T_train",
"param.T_train.tip": "训练最大上下文。来自 max_position_embeddings。超出此范围属于外推。",
"param.T_eval": "T_eval (您的目标)",
"param.T_eval.tip": "您的目标推理上下文。关键问题: 模型在 这个 长度下表现是否良好?",
"param.n_attn": "n_attention_heads",
"param.n_attn.tip": "每层 attention heads 数。来自 num_attention_heads。",
"param.n_kv": "n_kv_heads",
"param.n_kv.tip": "KV heads。若 < n_attention_heads → GQA (Grouped Query Attention)。降低 KV 内存但将 γ 推向 Hagedorn。",
"param.d_head": "head_dim",
"param.d_head.tip": "每 head 维度。典型 64、96、128。来自 head_dim 或 hidden_size / num_attention_heads。",
"param.n_layers": "n_layers",
"param.n_layers.tip": "Transformer 块数。来自 num_hidden_layers。",
"param.n_params": "n_params (例如 8e9)",
"param.n_params.tip": "总参数量。约 400M 阈值出现 induction heads。影响 KV 内存和预算配方。",
"param.has_swa": "有 SWA 吗?",
"param.has_swa.tip": "Sliding Window Attention。Mistral、gemma-2、phi-3 为 true。v0.5.3 校准审计禁用了历史 δ_SWA 校正 (n=1 拟合)。",
"common.yes": "是",
"common.no": "否",
// 模式提示
"modes.tip": "十四种使用方式。
📇 画像: 粘贴模型 id → 5 个配方的 TAF 卡。
🆚 比较: 2-3 个模型在一个配方上并排比较。
🔍 检查 config: 粘贴原始 config.json → 完整画像。
💬 提问: 自由形式问题,浏览器 LLM 选择配方。
📋 配方: 手动选择,完全控制表单。
🩺 CLI 诊断: 生成 Python 命令在本地测量 γ。
📊 相图: 23 个面板模型在 (log θ, γ) 平面上。
🪟 揭示: 检测误导的 max_position_embeddings(SWA / YaRN / RoPE 缩放)。
📜 Chat-template: 检测系列 + 给出 lm-eval / vLLM / transformers 的精确 CLI flag。
🎯 Arena CI: 从原始 pairwise 投票数据重建置信区间;检测 Arena 隐藏的统计并列。
🧪 污染: 根据训练 cutoff 与发布日期,对 20+ benchmark 进行污染概率评估。
⚖️ Quant: 预测任意(模型 × 量化方案)的 γ-shift 与 ΔPPL;cliff 时推荐更安全替代方案。
🔀 Drift: 同一模型,两 setup 下分数不同 — bug 还是噪声?预测数值噪声区间并标记真实 bug。
🔍 NIAH→Reason: 从架构预测 NIAH 与多跳 reasoning 通过率;找到模型的安全 reasoning 上下文。",
"profile.tip": "一键完整诊断。粘贴任意 HF 模型 id (或选择预设)。工具运行所有 5 个配方 (长上下文、KV 压缩、自定义 vs API、预算、硬件),生成单个 TAF 卡,显示每个维度的判定 + 关键数字 + 架构分类。
用例: \"我正在为生产评估 Qwen2.5-32B — 它的完整可行性概况是什么?\" → 粘贴 id → 画像 → 完成。",
"compare.tip": "同一配方,多个模型。选择 2-3 个候选模型和一个配方。在单个比较表中查看判定。
用例: \"我需要在 16K 进行长上下文检索 — 哪个最好: Llama-3-8B、Mistral-7B 或 Qwen-7B?\" → 选择 3 个 + X-2 + 16K → 看赢家。",
// 帮助模态框
"help.title": "📘 TAF Agent — 用户手册",
"help.what.title": "它做什么?",
"help.what.body": "在花费 GPU/$ 之前,预测任意 transformer LLM 的实际可行性。回答诸如 \"这个模型能在 L=32K 工作吗?\" 或 \"我应该自定义训练还是使用 API?\" 等问题,使用确定性 Python 公式 (TAF — Thermodynamic Attention Framework)。",
"help.modes.title": "如何使用 — 7 种模式",
"help.modes.profile": "📇 画像: 粘贴模型 id → 同时运行所有配方 = TAF 卡。最佳起点。",
"help.modes.compare": "🆚 比较: 2-3 个模型在同一配方上并排。最适合在候选者之间选择。",
"help.modes.inspector": "🔍 检查 config: 粘贴原始 config.json → 工具解析并运行完整画像。适用于私有模型、开发中的配置、或尚未在 HF Hub 上的模型。",
"help.modes.ask": "💬 自由提问: 自然语言问题,浏览器 LLM 选择配方。最适合随意探索。",
"help.modes.recipe": "📋 配方 + 表单: 手动选择,完全控制参数。最适合需要精确控制时。",
"help.modes.diagnose": "🩺 CLI 诊断: 生成 Python 命令在你的本地机器上测量 γ (transformers + numpy)。快速 ≈5 分钟 CPU;完整 ≈20–60 分钟 GPU。结果 JSON 可通过 Inspect 重新上传。",
"help.modes.phase": "📊 相图: 23 个面板模型在 (log θ, γ) 平面上的散点图。Hagedorn 线 γ=1 分隔 A 相和 B 相。点击点将该模型加载到配方表单。",
"help.recipes.title": "可用的 8 个配方",
"help.recipe.x1.title": "X-1 自定义训练 vs API — 比较训练自己模型的成本与付费使用 API 的成本。",
"help.recipe.x1.example": "尝试: \"我应该训练 8B 自定义模型还是使用 GPT-4o 处理每月 50M tokens?\"
答案: 是 (自定义) / 否 (API),含损益平衡月数。",
"help.recipe.x2.title": "X-2 长上下文可行性 — 预测模型是否能可靠地服务目标上下文长度。",
"help.recipe.x2.example": "尝试: \"Meta-Llama-3-8B 能处理 32000 tokens 检索吗?\"
链: γ_Padé → 分解 → d_horizon → NIAH 上限 → 幻觉 → KV 内存。
判定: 是 / 降级 / 否,如需则提供缓解措施。",
"help.recipe.x3.title": "X-3 预算预飞行 — 给定 $ 预算,可行训练什么模型?",
"help.recipe.x3.example": "尝试: \"我有 $5000,可以训练什么模型?\"
答案: GO / TINY-MODEL / MEMORY-LIMITED 含具体的 N (参数) 和 D (tokens)。",
"help.recipe.x5.title": "X-5 硬件选择 — 应该使用哪个 GPU 以达到目标吞吐量?",
"help.recipe.x5.example": "尝试: \"以每天 1000 万 tokens 提供 Llama-3-8B 的最便宜硬件\"
答案: 最佳 GPU + $/Mtok + 容量 vs 目标。",
"help.recipe.x19.title": "X-19 KV 压缩决策 — 应该使用 soft decay、hard cutoff 还是文献方法?",
"help.recipe.x21.title": "X-21 Imprint 纯度诊断 — 通过 ν=−1/(2π) 预测 RANDOM token 上的 γ;模型的 RoPE 预测有多干净?",
"help.recipe.x22.title": "X-22 Compute-Context 不变量 — γ × log(N²·D) 是否落在 51.2 ± 16.8 区间内?检测 scaling/training 异常。",
"help.recipe.x23.title": "X-23 IH-Phase 检测器 — 前- 还是后-induction-head?通过 sign(γ_text − γ_random) 进行廉价探测。",
"help.recipe.x19.example": "尝试: \"如何为 Qwen2.5-7B 在 32K 压缩 KV 缓存?\"
答案: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.",
"help.recipe.x21.example": "尝试: \"Llama-3-8B 上的 RoPE 预测有多干净?\"
答案: 预测的 γ_random + 诊断 (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED)。",
"help.recipe.x22.example": "尝试: \"Mistral-7B 是否符合 compute-context 不变量?\"
答案: K = γ·log(N²·D)、z-score、IN-BAND 或 OUTLIER。",
"help.recipe.x23.example": "尝试: \"Qwen2.5-7B 是后-induction-head 吗?\"
答案: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY。",
"help.section.v04": "v0.4 新增 (第 29 次研究会话, 2026-04-28): 来自 cross-model panel 分析 (n=22 LLMs) 的三个诊断 recipes。",
"help.divider.v04_s29": "— v0.4 (第 29 次会话发现) —",
"footer.tech_stack": "计算:Pyodide · 综合:WebLLM (Qwen2.5-0.5B 本地) · 托管:GitHub Pages · 成本:$0",
"help.v04.imprint": "学习印记斜率 ν = −1/(2π): RoPE 旋转周期 2π 在权重上引发位置偏置, 与 log(N_params) 成正比。即使 random token 也显示此 scaling。ν 是 DERIVED — 非拟合 (经验误差 0.3%)。",
"help.v04.invariant": "Chinchilla-attention 不变量 K: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329)。将 compute scaling 和 attention 指数连接为单一无量纲数。",
"help.v04.ih_probe": "Δγ 作为 IH 探测: sign(γ_text − γ_random) > 0 ⟺ post-induction-head。比运行 in-context-learning 基准更便宜。",
"help.v04.constants": "γ 簇落在著名常数上 (有趣, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (黄金共轭, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e)。Caveat: 可能是巧合。",
"help.param.theta": "θ (rope_theta): RoPE 基础频率。越高 = 长程能力越强。典型: 10000 (早期),500000 (Llama-3),1000000 (Qwen2.5)。",
"help.param.T_train": "T_train: 模型训练时的最大上下文。来自 max_position_embeddings。",
"help.param.T_eval": "T_eval: 您的 目标推理上下文长度。关键旋钮。",
"help.param.gqa": "n_kv_heads < n_attention_heads: 模型使用 GQA (Grouped Query Attention)。减少 KV 内存但将 γ 推向 Hagedorn。",
"help.param.swa": "has_SWA: 模型使用 Sliding Window Attention (Mistral、gemma-2)。",
"help.param.nparams": "n_params: 总参数数量。诱导头出现的阈值约 400M。",
"help.add_models.title": "添加新模型 (3 种方式)",
"help.add_models.preset": "预设列表: 11 个流行模型已策划。从下拉菜单选择。",
"help.add_models.hf": "HF Hub 获取: 粘贴任意 id (例如 Qwen/Qwen2.5-32B-Instruct),点击 📥 获取。浏览器直接从 HuggingFace 下载 config.json,填充表单。适用于任何公共模型。",
"help.add_models.manual": "手动: 用模型卡的值直接填充表单字段。",
"help.audit.title": "可审计链",
"help.audit.body": "每个结果都显示完整的计算链 — 每个公式步骤及其输入、输出和解释。点击任意步骤展开。引用的章节号 (§26.1、§19.1 等) 指向论文中的推导。",
"help.synthesis.title": "自然语言回答",
"help.synthesis.body": "在确定性链运行后,浏览器中的 LLM (Qwen2.5-0.5B,首次加载后约 350MB 缓存) 综合自然语言摘要。上面的数字始终正确 (确定性 Python);综合由 LLM 生成 — 如有疑问,请对照链验证。",
"help.params.title": "常见参数解释",
"help.verdicts.title": "判定中要看什么",
"help.verdict.yes": "是 / GO — 自信地继续;数字支持选择。",
"help.verdict.deg": "降级 / TINY-MODEL — 有警告地工作;阅读操作。",
"help.verdict.no": "否 / MEMORY-LIMITED — 不要按原样进行;提供缓解措施。",
"help.privacy.title": "隐私",
"help.privacy.body": "一切都在您的浏览器中运行。无遥测,无分析,无数据发送到任何地方。即使是 LLM 模型也通过 WebGPU/WebAssembly 在本地运行。您的 model_ids 和问题永不离开此页面。",
"help.source.title": "源代码和论文",
"help.source.body": "源代码: github.com/karlesmarin/tafagent
论文: Marin 2026 — Predicting How Transformers Attend (Zenodo; arXiv 即将)
数据集: taf-attention-decay — 32个模型上的58次γ测量 (CC-BY-4.0)",
"footer.text": "© 2026 Carles Marin · Apache-2.0 · 独立研究 · 闭合论文回路的工具。",
},
};
// Code of the language currently in effect. t() reads it on every lookup;
// setLang() and initI18n() are the functions in this module that assign it.
let currentLang = "en";
/**
 * Return the code of the language currently in effect.
 *
 * @returns {string} The current value of the module-level `currentLang`.
 */
export function getLang() {
return currentLang;
}
/**
 * Switch the UI to another language.
 *
 * Unknown codes are ignored. On success the choice is stored under the
 * "tafagent_lang" localStorage key, every translated element is re-rendered,
 * and the "lang-active" class is moved to the matching `[data-lang]` element.
 *
 * @param {string} code - Language code; must be a key of TRANSLATIONS.
 * @returns {void}
 */
export function setLang(code) {
  // Own-property check: a plain `TRANSLATIONS[code]` lookup would also accept
  // inherited names such as "toString" or "__proto__" as if they were languages.
  if (!Object.prototype.hasOwnProperty.call(TRANSLATIONS, code)) return;

  currentLang = code;
  try {
    localStorage.setItem("tafagent_lang", code);
  } catch (e) {
    // Best effort: if storage is unavailable or throws, the language still
    // switches for this page; it just is not remembered.
  }
  applyTranslations();

  // Highlight the active flag.
  document.querySelectorAll("[data-lang]").forEach((el) => {
    el.classList.toggle("lang-active", el.dataset.lang === code);
  });
}
/**
 * Look up a translation.
 *
 * Resolution order: the active language's table, then the English table,
 * then the key itself (so a missing key stays visible instead of rendering
 * as an empty string).
 *
 * @param {string} key - Translation key, e.g. "hero.title".
 * @returns {string} The translated text, the English text, or `key`.
 */
export function t(key) {
  const owns = (obj, prop) => Object.prototype.hasOwnProperty.call(obj, prop);
  for (const lang of [currentLang, "en"]) {
    if (!owns(TRANSLATIONS, lang)) continue;
    const table = TRANSLATIONS[lang];
    // Only own entries count: an inherited member such as "toString" or
    // "constructor" is a function, not a translation, and must fall through.
    if (owns(table, key) && table[key] != null) return table[key];
  }
  return key;
}
/**
 * Re-render every translatable element in the document.
 *
 * Elements carrying `data-i18n` get their markup replaced with the
 * translation of that key; elements carrying `data-i18n-placeholder` get
 * their `placeholder` set the same way.
 *
 * @returns {void}
 */
export function applyTranslations() {
  // Translations may contain HTML (we control them), hence innerHTML.
  for (const node of document.querySelectorAll("[data-i18n]")) {
    node.innerHTML = t(node.dataset.i18n);
  }

  for (const field of document.querySelectorAll("[data-i18n-placeholder]")) {
    field.placeholder = t(field.dataset.i18nPlaceholder);
  }
}
// Publish the i18n entry points on `window` (skipped when no `window` exists):
//  - __taf_applyTranslations: lets dynamically-inserted DOM (renderProfile,
//    renderCompare) re-apply translations.
//  - __taf_t: the lookup itself, so non-import-based modules (e.g.
//    hf_autocomplete, which runs outside the main.js context) can localize
//    without a circular import.
if (typeof window !== "undefined") {
  Object.assign(window, {
    __taf_applyTranslations: applyTranslations,
    __taf_t: t,
  });
}
/**
 * Choose the initial language and render the page in it.
 *
 * Preference order:
 *   1. the value stored under the "tafagent_lang" localStorage key, if it
 *      names a supported language;
 *   2. otherwise the first supported language among `navigator.language`
 *      followed by `navigator.languages`, compared by primary subtag
 *      ("es-MX" -> "es");
 *   3. otherwise `currentLang` is left as it is.
 *
 * Afterwards all translations are applied and the "lang-active" class is put
 * on the matching `[data-lang]` element.
 *
 * @returns {void}
 */
export function initI18n() {
  // Own-property check so inherited names ("toString", "__proto__", ...) read
  // back from storage are never mistaken for a language.
  const isSupported = (code) =>
    typeof code === "string" &&
    Object.prototype.hasOwnProperty.call(TRANSLATIONS, code);

  let stored = null;
  try {
    stored = localStorage.getItem("tafagent_lang");
  } catch (e) {
    // Best effort: if storage is unavailable or throws, fall back to
    // browser language detection below.
  }

  if (isSupported(stored)) {
    currentLang = stored;
  } else {
    const nav = typeof navigator !== "undefined" ? navigator : {};
    // navigator.language stays first so existing matches are unchanged; the
    // remaining preferences are consulted only when it is not supported.
    const candidates = [
      nav.language || "en",
      ...(Array.isArray(nav.languages) ? nav.languages : []),
    ];
    const match = candidates
      // Primary subtag, lower-cased: "zh-CN" -> "zh", "fil-PH" -> "fil".
      .map((tag) => String(tag).split(/[-_]/)[0].toLowerCase())
      .find(isSupported);
    if (match) currentLang = match;
  }

  applyTranslations();

  // Mark the active flag.
  document.querySelectorAll("[data-lang]").forEach((el) => {
    el.classList.toggle("lang-active", el.dataset.lang === currentLang);
  });
}