Spaces:

karlexmarin
/

taf-agent

Running

File size: 361,757 Bytes

// TAF Agent i18n — minimal translation system.
// Add languages by extending TRANSLATIONS. Set data-i18n="key" on any element.
// Persist user choice in localStorage.

/**
 * Language descriptors exported alongside TRANSLATIONS.
 *
 * Every entry is a plain object with three string fields:
 *   - code:  short language code (e.g. "en")
 *   - flag:  flag emoji shown next to the language
 *   - label: the language's name written in that language
 *
 * NOTE(review): each `code` presumably matches a top-level key of
 * TRANSLATIONS — confirm when adding a new language.
 */
export const LANGUAGES = [
  // Tuple layout: [code, flag, label]
  ["en", "🇬🇧", "English"],
  ["es", "🇪🇸", "Español"],
  ["fr", "🇫🇷", "Français"],
  ["zh", "🇨🇳", "中文"],
].map(([code, flag, label]) => ({ code, flag, label }));

export const TRANSLATIONS = {
  en: {
    "hero.title":     "🔬 TAF Agent",
    "hero.tagline":   "Diagnose any transformer LLM in 30 seconds. Free. No GPU. No signup.",
    "hero.subtitle":  "Predicts whether a model will work for your use case <em>before</em> you spend money or time. Everything runs in your browser &mdash; your inputs never leave this tab.",
    "hero.help":      "📘 Manual &amp; examples",
    "hero.quickstart_btn": "⚡ Quick start",
    "hero.inventory_btn":  "🧰 What it gives you",
    "hero.about":     "Built by an independent researcher. Open source. Not affiliated with any model vendor.",

    "modes.title":    "🎯 Mode",
    "modes.profile":  "📇 Profile a model",
    "modes.compare":  "🆚 Compare models",
    "modes.inspector": "🔍 Inspect config",
    "modes.ask":      "💬 Ask plain English",
    "modes.recipe":   "📋 Pick recipe",
    "modes.diagnose": "🩺 Diagnose CLI",
    "diagnose.title": "🩺 Diagnose CLI Command Builder",
    "diagnose.tip":   "Browser predicts γ from config; the CLI measures γ_obs on real weights. Builder produces the exact command to run locally.",
    "diagnose.desc":  "Pick options and copy-paste the generated command on your local machine (Python + transformers + numpy). Fast mode ≈5 min CPU; full ≈20–60 min GPU.",
    "diagnose.model_label": "HF model id:",
    "diagnose.theta_label": "θ (auto if blank):",
    "diagnose.n_label": "Context N:",
    "diagnose.options_label": "Options:",
    "diagnose.opt_fast": "--fast (CPU, ~5 min)",
    "diagnose.opt_cpu": "--cpu (force CPU)",
    "diagnose.opt_4bit": "--load_in_4bit (≥7B models)",
    "diagnose.local_label": "--local path (optional):",
    "diagnose.build_btn": "📋 Build command",
    "diagnose.cmd_title": "Generated command:",
    "diagnose.copy_btn": "📋 Copy to clipboard",
    "diagnose.next_steps": "Next steps: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Run the command (4) Result JSON → upload via Inspect mode for full TAF analysis.",
    "modes.phase":    "📊 Phase diagram",
    "phase.title":    "📊 Phase diagram (γ × θ)",
    "phase.tip":      "Each dot is one model from the paper's empirical panel. x-axis log θ; y-axis γ. Hagedorn line γ=1 separates Phase A from Phase B. Hover for details, click to load into the recipe form.",
    "phase.desc":     "23 models in the panel; Padé curve at T=2000.",
    "modes.desc":     "<strong>Quickest start</strong>: paste any HuggingFace model id (e.g. <code>meta-llama/Meta-Llama-3-8B</code>), click Profile. See all 5 recipes scored in seconds.",

    "profile.title":           "📇 Profile a model",
    "profile.desc":            "<strong>For technicians</strong>: when you need a complete viability snapshot of a candidate model. One-click runs all 5 recipes and produces a unified TAF Card.",
    "profile.preset_label":    "Preset:",
    "profile.preset_default":  "— or pick from list —",
    "profile.hf_label":        "HF model id:",
    "profile.fetch_btn":       "📥 Fetch",
    "profile.btn":             "🚀 Generate full profile",
    "profile.quickstart":      "💡 Quick start: pick any preset → click Generate. Or paste a model id from <a href='https://huggingface.co/models?library=transformers&sort=trending' target='_blank'>HF Hub trending</a> → 📥 Fetch → Generate.",

    "compare.title":           "🆚 Compare models side-by-side",
    "compare.desc":            "<strong>For technicians</strong>: when choosing between 2-3 candidate models for a specific deployment scenario. Same recipe, multiple models, side-by-side verdicts.",
    "compare.recipe_label":    "Recipe:",
    "compare.T_eval_label":    "T_eval (target context):",
    "compare.models_title":    "Models to compare (add up to 3)",
    "compare.btn":             "🚀 Compare",
    "compare.example":         "💡 Try: paste 3 popular 7-8B models (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), pick recipe X-2, T_eval=16000. See which best handles long context.",

    "ask.title":               "❓ Your question",
    "ask.placeholder":         "e.g. Will Mistral-7B handle 16K NIAH retrieval? Or: I have $5,000, what model can I train? Or: Cheapest GPU to serve Llama-70B at 100M tokens/day?",
    "ask.btn":                 "🚀 Analyze",
    "ask.example_btn":         "💡 Try an example",

    "recipe.title":            "📋 Recipe",
    "recipe.default":          "— select a recipe —",
    "recipe.input_title":      "🎯 Inputs",

    "verdict.title":           "📊 Verdict",
    "chain.title":             "🔍 Computation Chain",
    "chain.desc":              "Every number below is deterministic Python. Click a step to expand.",
    "answer.title":            "💬 Plain-English Answer",
    "share.btn":               "🔗 Copy share link",
    "share.copied":            "✅ Copied to clipboard!",
    "share.download":          "💾 Download JSON",
    "share.download_md":       "📝 Markdown",
    "share.download_tex":      "📜 LaTeX",
    "share.submit":            "📤 Submit to registry",
    "share.submit_clip_ok":    "↗ Opened GitHub. Body copied to clipboard — paste it into the issue body.",
    "share.submit_clip_fail":  "↗ Opened GitHub. Clipboard blocked — body logged in browser console (F12).",
    "share.import_title":      "📂 Import a shared TAF result",
    "a11y.skip":               "Skip to main content",

    // v0.6.2 — landing rework: quick-start + inventory + arch tooltips
    "qs.title":                    "⚡ Quick start",
    "qs.step1":                    "Paste a HuggingFace model ID (e.g. <code>meta-llama/Meta-Llama-3-8B</code>)",
    "qs.step2":                    "Click <strong>📇 Profile a model</strong>",
    "qs.step3":                    "Read your TAF Card — verdict per use case + key numbers + math verified by Lean+Mathlib",
    "qs.cta":                      "↓ Start now",
    "inv.title":                   "🧰 What this tool gives you",
    "inv.recipes.title":           "🎯 8 recipes — does this model fit your use case?",
    "inv.recipes.x1.title":        "Custom train vs API",
    "inv.recipes.x1.body":         "which is cheaper for your traffic?",
    "inv.recipes.x2.title":        "Long context",
    "inv.recipes.x2.body":         "will it handle 32k / 128k tokens reliably?",
    "inv.recipes.x3.title":        "Budget",
    "inv.recipes.x3.body":         "with $X, what model can you train from scratch?",
    "inv.recipes.x5.title":        "Hardware",
    "inv.recipes.x5.body":         "which GPU to serve N tokens/day?",
    "inv.recipes.x19.title":       "KV cache",
    "inv.recipes.x19.body":        "how to compress without breaking quality?",
    "inv.recipes.x21.title":       "Imprint purity",
    "inv.recipes.x21.body":        "how clean is the model's positional encoding?",
    "inv.recipes.x22.title":       "Compute-context",
    "inv.recipes.x22.body":        "does the model fit the empirical band?",
    "inv.recipes.x23.title":       "IH-phase",
    "inv.recipes.x23.body":        "pre- or post-induction-head?",
    "inv.diag.title":              "🔬 Diagnostics",
    "inv.diag.gamma":              "<strong>γ predicted vs observed</strong> — auto-classifies the model into 5 regimes (normal · fraud / inflated context · compressed · over-Padé · sliding-window)",
    "inv.diag.cardy":              "<strong>Cardy ΔH</strong> — entropy shift between observed and nominal context",
    "inv.diag.fals":               "<strong>Falsification dashboard</strong> — checks 23 specific predictions (F1–F23)",
    "inv.diag.alg":                "<strong>Algebraic consistency</strong> — 8 mathematical identities the model must satisfy",
    "inv.verify.title":            "✓ Formally verified math",
    "inv.verify.count":            "<strong>37 theorems</strong> machine-proven in Lean 4 + Mathlib4",
    "inv.verify.click":            "Click any badge → opens the source line on GitHub",
    "inv.verify.reverify":         "Verify yourself: <code>lake build</code> (≈5 s after cache fetch)",
    "inv.export.title":            "📤 Export &amp; share",
    "inv.export.formats":          "<strong>JSON · Markdown · LaTeX</strong> (paper-ready)",
    "inv.export.share":            "Reproducible share link (state encoded in URL)",
    "inv.export.registry":         "Submit to community registry on GitHub",
    "arch.summary":                "Architectures supported",
    "arch.anyhf":                  "✓ Any HuggingFace public model",
    "tooltip.mha":                 "Multi-Head Attention: each token position attends through several parallel heads at once.",
    "tooltip.gqa":                 "Grouped Query Attention: queries share fewer keys/values than heads (saves memory but pushes γ toward Hagedorn).",
    "tooltip.alibi":               "Attention with Linear Biases: position info is a learned slope added to attention scores, no rotation.",
    "tooltip.abspe":               "Absolute Position Embeddings: each position has a fixed learned vector added to the token embedding.",
    "tooltip.swa":                 "Sliding Window Attention: each token only attends within a fixed local window (Mistral, gemma-2 use this).",
    "tooltip.ssm":                 "State Space Model: a sequence layer that maintains internal state instead of attention (Mamba, Jamba use this).",

    // v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker
    "modes.unmask":                "🪟 Unmask",
    "unmask.title":                "🪟 Context Unmasker",
    "unmask.tip":                  "Paste a HuggingFace model id (or raw config.json). The tool checks for sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), and GQA — anything that makes <code>max_position_embeddings</code> larger than the practical effective context. Mistral-7B-v0.1 is the canonical example: declared 32k, attends within ~4-8k.",
    "unmask.desc":                 "<strong>Are you about to spend money on a model that won't actually attend that far?</strong> Paste an id and find out in 1 second. No GPU, no inference — just config.json arithmetic.",
    "unmask.id_label":             "HF model id:",
    "unmask.fetch_btn":            "🔍 Unmask",
    "unmask.paste_summary":        "Or paste raw config.json (private / in-dev models)",
    "unmask.paste_btn":            "🔍 Unmask pasted config",
    "unmask.label.declared":       "Declared context",
    "unmask.label.effective":      "Effective (estimate)",
    "unmask.label.ratio":          "Ratio",
    "unmask.section.flags":        "Architecture flags",
    "unmask.section.warnings":     "Warnings",
    "unmask.section.reco":         "Recommendation",
    "unmask.flag.swa":             "SWA",
    "unmask.flag.rope":            "RoPE scaling",
    "unmask.flag.gqa":             "GQA",
    "unmask.flag.layers":          "Layers",
    "unmask.flag.dhead":           "d_head",
    "unmask.flag.theta":           "RoPE θ",
    "unmask.flag.yes":             "yes",
    "unmask.flag.no":              "no",
    "unmask.flag.full_mha":        "no (full MHA, {n} heads)",
    "unmask.verdict.honest":            "✅ HONEST",
    "unmask.verdict.inflated":          "⚠ INFLATED",
    "unmask.verdict.severely_inflated": "❌ SEVERELY INFLATED",
    "unmask.verdict.yarn_extended":     "⚠ YARN-EXTENDED",
    "unmask.verdict.unknown":           "❓ UNKNOWN",
    "unmask.warn.swa_window":      "SWA window: {window} tokens — each layer only attends within this window.",
    "unmask.warn.multihop":        "Multi-hop estimate: ~{multiHop} tokens (conservative: window × {factor}).",
    "unmask.warn.yarn":            "RoPE scaling ({type}) extends context {factor}× from ~{original} to {declared} tokens.",
    "unmask.warn.yarn_advice":     "RoPE-extended context — verify γ behavior at the full claimed length with the γ_check diagnostic.",
    "unmask.warn.gqa_small_dhead": "Small head dim ({d_head}) + GQA: KV cache compression at long context is likely (γ pushed toward Hagedorn).",
    "unmask.reco.honest":              "Standard full-attention model. Effective context matches declared ({declared} tokens).",
    "unmask.reco.inflated":            "Effective ~{effective} tokens via SWA. Use γ_check to verify behavior at your target evaluation length.",
    "unmask.reco.severely_inflated":   "Treat as a ~{effective}-token context model in practice. The {declared}-token claim only applies via cross-layer attention chains, which empirically degrade past ~2× the SWA window.",
    "unmask.reco.yarn_extended":       "RoPE-extended context. Run a long-context benchmark (NIAH at 8k / 16k / 32k / full) to confirm the extension holds. Use γ_check with T_eval = {declared}.",
    "unmask.reco.unknown":             "Could not parse config. Verify the URL is a valid HF model with public config.json.",
    "unmask.status.empty_id":      "⚠ Enter a model id (e.g. mistralai/Mistral-7B-v0.1).",
    "unmask.status.fetching":      "⏳ Fetching config.json for {modelId}...",
    "unmask.status.success":       "✅ Analyzed {modelId} (verdict: {verdict})",
    "unmask.status.empty_paste":   "⚠ Paste a config.json first.",
    "unmask.status.invalid_json":  "❌ Not valid JSON: {error}",
    "unmask.status.success_paste": "✅ Analyzed pasted config (verdict: {verdict})",
    "unmask.pasted_label":         "(pasted config)",
    "mode_desc.ask":               "Type a free-form question. The in-browser LLM picks the right recipe and runs it.",
    "mode_desc.recipe":            "Pick a recipe directly and fill the form. Full manual control.",
    "mode_desc.profile":           "Quickest start: paste any HuggingFace model id, click Profile. See all 5 recipes scored in seconds.",
    "mode_desc.compare":           "Pick 2-3 candidate models + one recipe. See verdicts side-by-side in a comparison table.",
    "mode_desc.inspector":         "Paste a config.json directly. Useful for private/in-development models not on HF Hub.",
    "mode_desc.diagnose":          "Build the diagnose_model.py CLI command to MEASURE γ_obs on real GPU. Browser predicts; CLI measures.",
    "mode_desc.phase":             "γ × θ scatter of the paper's empirical panel. Hover a dot for details, click to load into Diagnose / Recipe forms.",
    "mode_desc.unmask":            "Detects whether max_position_embeddings is misleading (SWA / YaRN / RoPE-scaling). Paste a model id, get a 1-line verdict.",
    "profile.preset_loaded":       "✅ Loaded preset for <strong>{id}</strong>. Form pre-filled. (Click 📥 Fetch to override with the latest config from HF Hub.)",

    // v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer
    "modes.template":              "📜 Chat-template",
    "mode_desc.template":          "Detects which chat-template family a model uses (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Gives the exact CLI flag for lm-eval / vLLM / transformers.",
    "template.title":              "📜 Chat-template Sniffer",
    "template.tip":                "Paste an HF model id (or raw tokenizer_config.json). Detects the chat-template family and gives you the exact framework command to use it correctly. lm-eval-harness silently halves accuracy if you forget to apply it (issue #1841).",
    "template.desc":               "<strong>Did you forget <code>--apply_chat_template</code>?</strong> Most multi-turn evals fail by ~50% because the chat template wasn't applied. Paste a model id, get the exact CLI flag for your stack.",
    "template.id_label":           "HF model id:",
    "template.fetch_btn":          "📜 Sniff",
    "template.paste_summary":      "Or paste raw tokenizer_config.json (private models)",
    "template.paste_btn":          "📜 Sniff pasted config",
    "template.label.family":       "Detected family",
    "template.label.markers":      "Matched markers",
    "template.label.tpl_len":      "Template length",
    "template.section.warnings":   "Warnings",
    "template.section.commands":   "Commands by framework",
    "template.section.raw":        "Raw template (preview)",
    "template.family.custom":      "custom (unknown family)",
    "template.family.none":        "(no chat_template)",
    "template.verdict.ok":         "✅ TEMPLATE DETECTED",
    "template.verdict.custom":     "⚠ CUSTOM TEMPLATE",
    "template.verdict.missing":    "❌ NO CHAT TEMPLATE",
    "template.verdict.base_model": "ℹ BASE MODEL (no chat)",
    "template.verdict.unknown":    "❓ UNKNOWN",
    "template.warn.no_chat_template": "No <code>chat_template</code> field in tokenizer_config.json. This is typical for base / pretrained-only models. If you intended an instruct-tuned model, the wrong file may be loaded.",
    "template.warn.custom_template":  "Template is non-standard ({length} chars). The tool could not match it against known families. Inspect the raw preview below and verify your eval framework supports it.",
    "template.warn.lm_eval_apply":    "<strong>lm-eval-harness:</strong> add <code>--apply_chat_template</code> or your accuracy will silently drop ~50% on multi-turn evals (issue #1841).",
    "template.warn.vllm_apply":       "<strong>vLLM serve:</strong> verify <code>--chat-template</code> is set (auto-detection sometimes fails for fine-tuned variants). Suggested: <code>{name}</code>.",
    "template.status.empty_id":    "⚠ Enter a model id (e.g. mistralai/Mistral-7B-Instruct-v0.3).",
    "template.status.fetching":    "⏳ Fetching tokenizer_config.json for {modelId}...",
    "template.status.success":     "✅ Sniffed {modelId} (verdict: {verdict})",
    "template.status.empty_paste": "⚠ Paste a tokenizer_config.json first.",
    "template.status.invalid_json":"❌ Not valid JSON: {error}",
    "template.status.success_paste":"✅ Sniffed pasted config (verdict: {verdict})",
    "template.pasted_label":       "(pasted tokenizer_config)",

    // v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor
    "modes.arena":                 "🎯 Arena CI",
    "mode_desc.arena":             "Recovers confidence intervals from raw pairwise vote data (Bradley-Terry MLE + bootstrap). Detects statistically tied pairs that the public Arena leaderboard hides.",
    "arena.title":                 "🎯 Arena-Elo CI Reconstructor",
    "arena.tip":                   "Chatbot Arena strips confidence intervals from the public leaderboard. A 5-Elo gap can be statistically meaningless. Paste raw vote data (model_a, model_b, winner) — the tool computes Bradley-Terry MLE + bootstrap CIs and lists statistical ties (CI overlap).",
    "arena.desc":                  "<strong>Is GPT-4 actually better than Claude — or are they tied?</strong> Paste pairwise vote CSV (or click <em>Load sample</em>). Bradley-Terry MLE + 200-iteration bootstrap → ranked Elos with 95% CIs and statistical-tie detection. All in browser.",
    "arena.sample_btn":            "📊 Load sample data",
    "arena.run_btn":                "🎯 Compute CIs",
    "arena.clear_btn":             "🗑️ Clear",
    "arena.csv_summary":           "Vote CSV (header: <code>model_a,model_b,winner</code>; winner ∈ a/b/tie)",
    "arena.section.ranked":        "Ranked Elos with 95% CIs",
    "arena.section.ties":          "Statistical ties (CI overlap)",
    "arena.section.summary":       "Summary",
    "arena.col.rank":              "#",
    "arena.col.model":             "Model",
    "arena.col.elo":               "Elo",
    "arena.col.ci":                "95% CI",
    "arena.col.ci_width":          "± half-width",
    "arena.col.matches":           "Matches",
    "arena.col.wins":              "W / L / T",
    "arena.col.tie_pair":          "Pair",
    "arena.col.tie_diff":          "Elo gap",
    "arena.col.tie_overlap":       "CI overlap",
    "arena.no_ties":               "No statistical ties — all pairs distinguishable at 95% CI.",
    "arena.summary.votes":         "Total votes",
    "arena.summary.models":        "Models",
    "arena.summary.ties":          "Statistical ties",
    "arena.summary.bootstrap":     "Bootstrap iters",
    "arena.summary.ci_level":      "CI level",
    "arena.status.empty":          "⚠ Paste vote CSV or click Load sample.",
    "arena.status.too_few":        "⚠ Only {n} valid votes — need at least 10 to fit Bradley-Terry reliably.",
    "arena.status.computing":      "⏳ Computing Bradley-Terry MLE + bootstrap on {n} votes...",
    "arena.status.done":           "✅ {n} votes · {models} models · {ties} statistical ties · {ms} ms",
    "arena.status.sample_loaded":  "✅ Sample loaded (synthetic 6-model Arena data). Click Compute CIs.",

    // v0.7.3 — anti-bullshit pack #4: Contamination Prior
    "modes.contam":                "🧪 Contamination",
    "mode_desc.contam":            "Bayesian-ish prior on whether a benchmark score is contaminated. Enter your model's training cutoff → rates 20+ popular benchmarks (MMLU, GSM8K, HumanEval, MMLU-Pro…).",
    "contam.title":                "🧪 Contamination Prior",
    "contam.tip":                  "Computes a Bayesian-ish prior on whether a benchmark score is contaminated, based on (model training cutoff date) × (benchmark release date) × (known corpus inclusion + leak history). Open LLM Leaderboard v1 was killed in 2024 after MMLU/HellaSwag scores became contaminated.",
    "contam.desc":                 "<strong>Should you trust your model's MMLU score?</strong> Enter the model's training cutoff date — the tool rates 20+ popular benchmarks (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) and tells you which scores are likely contaminated.",
    "contam.cutoff_label":         "Training cutoff:",
    "contam.run_btn":              "🧪 Rate all benchmarks",
    "contam.section.ranked":       "Benchmark contamination priors",
    "contam.section.high":         "🔴 High-risk benchmarks (treat scores as unreliable)",
    "contam.section.medium":       "🟡 Medium-risk (verify with alternates)",
    "contam.section.low":          "🟢 Low-risk (likely clean)",
    "contam.col.benchmark":        "Benchmark",
    "contam.col.released":         "Released",
    "contam.col.gap":              "Gap (months)",
    "contam.col.prior":            "P(contam)",
    "contam.col.level":            "Level",
    "contam.col.corpora":          "In corpora",
    "contam.col.category":         "Category",
    "contam.label.high":           "High risk",
    "contam.label.medium":         "Medium",
    "contam.label.low":            "Low",
    "contam.no_entries":           "(none in this category)",
    "contam.advice.high":          "Treat these scores as unreliable. Replace with newer / private-test alternates (MMLU-Pro, GPQA, MUSR, MATH-500).",
    "contam.advice.medium":        "Take with caution. Look for replication on a held-out subset or community reproductions.",
    "contam.advice.low":           "Score likely uncontaminated, but absence of leak is not proof — still cross-check with alternate test.",
    "contam.summary.headline":     "Cutoff <code>{cutoff}</code> · {n} benchmarks rated",
    "contam.status.empty":         "⚠ Enter a model training cutoff date (e.g. 2023-12).",
    "contam.status.bad_date":      "⚠ Bad date format. Use YYYY-MM or YYYY-MM-DD.",
    "contam.status.done":          "✅ Cutoff {cutoff} · {n} benchmarks rated · {high} high-risk",

    // v0.7 — Help modal section
    "help.v07.title":              "🆕 v0.7 — Anti-bullshit pack (4 new modes)",
    "help.v07.intro":              "<em>v0.7 (2026-05-06): four new modes that solve concrete pain points reported by the HuggingFace community. Each one runs in your browser with no inference — pure metadata + math.</em>",
    "help.v07.unmask.title":       "🪟 Context Unmasker",
    "help.v07.unmask.body":        "Detects when <code>max_position_embeddings</code> is misleading. Mistral-7B-v0.1 declares 32k but attends within ~4-8k via SWA. Paste an HF model id → 1-second verdict (HONEST / INFLATED / SEVERELY INFLATED / YARN-EXTENDED). Catches SWA, RoPE-scaling (YaRN/linear/dynamic NTK), small-d_head + GQA. <em>Use case</em>: before paying GPU for 32k context, verify the model actually attends that far.",
    "help.v07.template.title":     "📜 Chat-template Sniffer",
    "help.v07.template.body":      "Detects which chat-template family a model uses (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) and gives you the exact CLI flag for lm-evaluation-harness, vLLM, and transformers. Solves issue #1841 in lm-eval-harness: forgetting <code>--apply_chat_template</code> silently halves multi-turn accuracy. <em>Use case</em>: before reporting a benchmark score, confirm you applied the template correctly.",
    "help.v07.arena.title":        "🎯 Arena-Elo CI Reconstructor",
    "help.v07.arena.body":         "Chatbot Arena strips confidence intervals from its public leaderboard — a 5-Elo gap can be statistically meaningless. Paste raw pairwise vote data (model_a, model_b, winner) → Bradley-Terry MLE + 200-iteration bootstrap → ranked Elos with 95% CIs and a \"statistical ties\" panel listing pairs whose CIs overlap. Try the Load sample button. <em>Use case</em>: before declaring \"model A beats model B\", verify their CIs don't overlap.",
    "help.v07.contam.title":       "🧪 Contamination Prior",
    "help.v07.contam.body":        "Bayesian-ish prior on whether a benchmark score is contaminated. Enter your model's training cutoff date → tool rates 20+ popular benchmarks (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) by P(contamination) based on time gap, corpus inclusion, and known leak history. Open LLM Leaderboard v1 was killed in 2024 after MMLU/HellaSwag scores became contaminated. <em>Use case</em>: decide which scores to trust when comparing two models.",
    "help.v07.quant.title":        "⚖️ Quant-regime Classifier",
    "help.v07.quant.body":         "Predicts γ-shift and ΔPPL for any (model × quant scheme: NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8, …). Architecture-aware: small d_head + aggressive GQA → more sensitive; calibrated schemes (AWQ) absorb shift better than uncalibrated (NF4). Recommends safer alternatives if a cliff is detected. <em>Use case</em>: before quantizing, predict whether your specific architecture × scheme combo will keep PPL acceptable, with a concrete switch-to suggestion otherwise.",
    "help.v07.drift.title":        "🔀 Cross-framework Drift Bound",
    "help.v07.drift.body":         "Same model, different scores on different setups. Tool predicts the maximum drift admissible from numerical noise alone (dtype, framework, batch). If the observed gap exceeds it → real bug, typically chat-template mismatch (lm-eval-harness issue #1841) or KV-cache layout. Try the &quot;Load sample&quot; button for the canonical chat-template bug. <em>Use case</em>: before reporting a regression or claiming reproducibility, verify whether the gap between two evals is bigger than what numerical noise can explain.",
    "inv.v07.drift":               "<strong>🔀 Drift</strong> — bug or noise? Predict max admissible gap between two evals",
    "help.v07.niah.title":         "🔍 NIAH → Reasoning Gap",
    "help.v07.niah.body":          "RULER paper (NVIDIA 2024) shows that long-context models often pass NIAH (needle retrieval) but fail multi-hop reasoning at the same context. Tool predicts both pass rates from architecture (γ_Padé + d_horizon + arch pressure: small d_head, GQA, SWA), reports the gap, and finds your model's \"safe reasoning context\" where reasoning stays ≥65%. Sweep mode shows the curve across 1k/4k/16k/64k/T_train. <em>Use case</em>: before deploying at the claimed context, find out whether the model will actually reason there or just retrieve.",
    "inv.v07.niah":                "<strong>🔍 NIAH→Reason</strong> — does your \"128k context\" actually reason there, or just retrieve?",

    // v0.7 — Inventory modal 5th card
    "inv.v07.title":               "🆕 v0.7 anti-bullshit pack",
    "inv.v07.unmask":              "<strong>🪟 Unmask</strong> — config.json claims 32k? See if it actually attends that far",
    "inv.v07.template":            "<strong>📜 Chat-template</strong> — exact CLI flag so lm-eval doesn't silently halve your accuracy",
    "inv.v07.arena":               "<strong>🎯 Arena CI</strong> — recover the confidence intervals Chatbot Arena hides",
    "inv.v07.contam":              "<strong>🧪 Contamination</strong> — rate 20+ benchmarks for contamination probability",
    "inv.v07.quant":               "<strong>⚖️ Quant</strong> — predict γ shift + ΔPPL for any (model × quant scheme) combo",

    // v0.7.3 — anti-bullshit pack #5: Quant-regime classifier
    "modes.quant":                 "⚖️ Quant",
    "mode_desc.quant":             "Predicts γ-shift and ΔPPL for any (model × quant scheme). Architecture-aware: small d_head + GQA → more sensitive. Recommends safer alternatives if a cliff is detected.",
    "quant.title":                 "⚖️ Quant-regime Classifier",
    "quant.tip":                   "Predicts γ-shift (and downstream ΔPPL) for a given (model × quant scheme). Generic claims like 'AWQ ~95% retention' are too vague — TAF uses d_head, GQA ratio, SWA flag, and model size to give an architecture-specific verdict. Solves: HF community widely reports unpredictable quant cliffs (NF4 -2 PPL on Phi-3 but fine on Llama-3-8B).",
    "quant.desc":                  "<strong>Will quantizing your model break it?</strong> Paste an HF model id, pick a quant scheme — get predicted γ-shift, expected ΔPPL band, and a recommended alternative if it's a cliff. Browser-only, no GPU, no calibration set required.",
    "quant.id_label":              "HF model id:",
    "quant.fetch_btn":             "📥 Fetch config",
    "quant.scheme_label":          "Quant scheme:",
    "quant.run_btn":                "⚖️ Predict",
    "quant.all_btn":               "📊 Compare all schemes",
    "quant.regime.safe":           "✅ SAFE",
    "quant.regime.mild":           "✅ MILD COMPRESSION",
    "quant.regime.significant":    "⚠ SIGNIFICANT DEGRADATION",
    "quant.regime.cliff":          "❌ HEAVY CLIFF",
    "quant.label.gamma_shift":     "γ shift",
    "quant.label.delta_ppl":       "ΔPPL (est.)",
    "quant.label.arch_mult":       "Arch multiplier",
    "quant.section.breakdown":     "Breakdown",
    "quant.section.reco":          "Recommendation",
    "quant.section.compare":       "All schemes (sorted by safety)",
    "quant.field.scheme":          "Scheme",
    "quant.field.calibrated":      "calibrated",
    "quant.field.uncalibrated":    "uncalibrated",
    "quant.field.base_penalty":    "Base penalty",
    "quant.field.arch_mult_full":  "Architecture multiplier",
    "quant.field.gamma_shift":     "Predicted γ shift",
    "quant.field.ppl_band":        "ΔPPL band (est.)",
    "quant.field.params":          "Parameters",
    "quant.col.scheme":            "Scheme",
    "quant.col.bits":              "Bits",
    "quant.col.gamma_shift":       "γ shift",
    "quant.col.ppl_band":          "ΔPPL band",
    "quant.col.regime":            "Regime",
    "quant.reco.switch_to_awq":    "<strong>Switch to {scheme}</strong> — calibrated 4-bit handles small d_head + GQA much better than NF4. Expected ΔPPL drops ~2-3×.",
    "quant.reco.switch_to_q5_km":  "<strong>Switch to {scheme}</strong> — Q5 keeps more head dimensions intact at low cost (only ~25% bigger file).",
    "quant.reco.switch_to_q4_km":  "<strong>Switch to {scheme}</strong> — Q3/Q2 are too aggressive for this architecture.",
    "quant.reco.consider_awq":     "<strong>Consider {scheme}</strong> — calibration meaningfully reduces γ-shift on this architecture.",
    "quant.reco.use_higher_bits":  "<strong>Use higher-bit alternative</strong> — this architecture cannot absorb 4-bit cleanly. Try 5- or 8-bit.",
    "quant.reco.verify_with_eval": "<strong>Verify with a real eval</strong> — predicted shift is borderline. Run NIAH at your target context before deploying.",
    "quant.reco.no_action":        "No action needed — quantization is safe for this architecture.",
    "quant.summary.headline_all":  "All schemes for <code>{modelId}</code>",
    "quant.status.empty_id":       "⚠ Enter a model id (e.g. meta-llama/Llama-3.2-1B).",
    "quant.status.fetching":       "⏳ Fetching config.json for {modelId}...",
    "quant.status.fetched":        "✅ Config fetched for {modelId}. Pick a scheme and click Predict (or Compare all schemes).",
    "quant.status.no_scheme":      "⚠ Pick a quant scheme from the dropdown.",
    "quant.status.done":           "✅ Predicted regime: {regime}",
    "quant.status.done_all":       "✅ Compared {n} schemes — sorted by safety.",

    // v0.7.4 — HF Hub autocomplete privacy + rate-limit notices
    "hf_auto.privacy":             "🔒 Queries sent to huggingface.co/api · cached locally 5 min",
    "hf_auto.rate_limited":        "⚠ HuggingFace rate limit — try again in a moment, or type the full model id manually",
    "hf_auto.gated_msg":           "is gated. Accept the license here:",

    // v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound
    "modes.drift":                 "🔀 Drift",
    "mode_desc.drift":             "Predicts max-allowable drift between two benchmark scores given (framework, dtype, batch, chat-template). Flags real bugs vs numerical noise.",
    "drift.title":                 "🔀 Cross-framework Drift Bound",
    "drift.tip":                   "Same model, different scores on different setups. Is the gap noise or a real bug? Enter two scores with their (framework, dtype, batch, chat-template) — tool predicts the maximum allowable drift from numerical noise alone. If observed gap exceeds it → real bug, usually chat-template mismatch (lm-eval issue #1841) or KV-cache layout.",
    "drift.desc":                  "<strong>Your model gives 67.2 on lm-eval-hf and 65.1 on vLLM-served. Bug or noise?</strong> Enter both scores with (framework, dtype, batch, chat-template applied?). Tool predicts the noise band and flags real bugs. arxiv 2506.09501 documents this as a major eval reproducibility problem.",
    "drift.setup_a":               "Setup A",
    "drift.setup_b":               "Setup B",
    "drift.score":                 "Score",
    "drift.framework":             "Framework",
    "drift.dtype":                 "Dtype",
    "drift.batch":                 "Batch",
    "drift.template":              "Chat-template",
    "drift.template.applied":      "applied",
    "drift.template.not_applied":  "not applied",
    "drift.template.unknown":      "unknown",
    "drift.run_btn":               "🔀 Compute drift bound",
    "drift.sample_btn":            "📊 Load sample (chat-template bug)",
    "drift.label.observed":        "Observed gap",
    "drift.label.band":            "Numerical band",
    "drift.label.ratio":           "Gap / band",
    "drift.section.setups":        "Setups",
    "drift.section.breakdown":     "Drift contributors (numerical band)",
    "drift.section.verdict":       "Verdict & recommendation",
    "drift.contrib.dtype":         "Dtype mismatch",
    "drift.contrib.framework":     "Framework",
    "drift.contrib.batch":         "Batch difference",
    "drift.contrib.template":      "Chat-template MISMATCH",
    "drift.dominant_cause":        "Dominant cause",
    "drift.cause.dtype":           "dtype precision difference",
    "drift.cause.framework":       "framework / kernel difference",
    "drift.cause.batch":           "batch normalization paths",
    "drift.cause.template_mismatch": "chat-template applied on one side but not the other (lm-eval-harness #1841 pattern — typical -50% drop on multi-turn)",
    "drift.verdict.noise":         "✅ NUMERICAL NOISE",
    "drift.verdict.suspicious":    "⚠ SUSPICIOUS — verify",
    "drift.verdict.bug":           "❌ REAL BUG — investigate",
    "drift.verdict.bug_template":  "❌ CHAT-TEMPLATE BUG",
    "drift.reco.noise":            "Gap fits within the expected numerical-noise band. No action needed; the difference is consistent with framework/dtype/batch variation alone.",
    "drift.reco.suspicious":       "Gap is 1–2× the predicted noise band. Borderline — possibly a real bug. Try aligning the dominant contributor (e.g. match framework or dtype) and re-test.",
    "drift.reco.bug":              "Gap is &gt; 2× the predicted noise band. This is a real bug. Inspect the dominant contributor — most likely tokenizer / chat-template / KV-cache layout difference. Run lm-eval-harness with <code>--apply_chat_template</code> and confirm.",
    "drift.reco.bug_template":     "Chat-template mismatch detected. This is the most common cause of large eval discrepancies (lm-eval-harness issue #1841). Re-run the &quot;not applied&quot; side with <code>--apply_chat_template</code> (or set vLLM <code>--chat-template &lt;name&gt;</code>) and re-test.",
    "drift.status.empty_scores":   "⚠ Enter both scores.",
    "drift.status.done":           "✅ Verdict: {verdict}",
    "drift.status.sample_loaded":  "✅ Sample loaded (canonical chat-template bug). Click Compute drift bound.",

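    // v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor.
    // This group also holds the Benchmark Saturation and Solutions Hub strings
    // (modes.* / mode_desc.*, saturation.*, hub.*, and their help.* / inv.* entries).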
    "modes.niah":                  "🔍 NIAH→Reason",
    "mode_desc.niah":              "Predicts NIAH (retrieval) and multi-hop reasoning pass rates at any context. Solves: long-context models often pass NIAH but fail reasoning at the same context (RULER paper).",
    "modes.saturation":            "📈 Saturation",
    "mode_desc.saturation":        "Tells you whether a benchmark still discriminates frontier models or has saturated (e.g. MMLU 88-94% top, AIME 2025 already 96-100%). Returns top-3 + verdict + recommended replacements.",
    "modes.hub":                   "🧭 Solutions",
    "mode_desc.hub":               "Map of every documented LLM-eval pain → tafagent mode (if covered) + curated external tools. Find the right solution without rebuilding it. 30+ pains, 7 categories.",
    "niah.title":                  "🔍 NIAH → Reasoning Gap",
    "niah.tip":                    "NIAH (Needle in a Haystack) tests retrieval: 'find this fact in long text'. Multi-hop reasoning tests inference: 'combine facts X+Y at the start with fact Z at the end'. RULER paper (NVIDIA 2024) shows long-context models often pass NIAH but fail reasoning at the same context. This tool predicts both pass rates from architecture alone.",
    "niah.desc":                   "<strong>Your model claims 128k context. Will it actually reason at 64k, or just retrieve?</strong> Paste an HF model id and a target eval context — tool predicts NIAH and multi-hop reasoning pass rates, the gap, and a 'safe context' where reasoning stays ≥65%.",
    "niah.id_label":               "HF model id:",
    "niah.fetch_btn":              "📥 Fetch config",
    "niah.teval_label":            "Target context (T_eval):",
    "niah.run_btn":                "🔍 Predict",
    "niah.sweep_btn":              "📊 Sweep contexts",
    "niah.label.niah":             "NIAH pass rate",
    "niah.label.reasoning":        "Reasoning pass rate",
    "niah.label.gap":              "Gap",
    "niah.label.safe_ctx":         "Safe reasoning context",
    "niah.section.breakdown":      "Architecture breakdown",
    "niah.section.reco":           "Recommendation",
    "niah.section.sweep":          "Pass rate sweep across context lengths",
    "niah.field.dhorizon":         "d_horizon (effective)",
    "niah.field.ratio":            "T_eval / d_horizon",
    "niah.field.arch_pressure":    "Arch pressure (small d_head + GQA + SWA)",
    "niah.field.theta":            "RoPE θ",
    "niah.field.t_train":          "T_train (claimed)",
    "niah.col.context":            "T_eval",
    "niah.col.niah":               "NIAH",
    "niah.col.reasoning":          "Reasoning",
    "niah.col.gap":                "Gap",
    "niah.col.verdict":            "Verdict",
    "niah.verdict.robust":         "✅ ROBUST",
    "niah.verdict.marginal":       "⚠ MARGINAL",
    "niah.verdict.degraded":       "⚠ DEGRADED",
    "niah.verdict.retrieval_only": "❌ RETRIEVAL-ONLY",
    "niah.verdict.broken":         "❌ BROKEN",
    "niah.reco.robust":            "Both retrieval and reasoning hold up at this context. Safe to deploy for both lookup and inference tasks.",
    "niah.reco.marginal":          "Borderline. Retrieval works but reasoning is shaky. Use for fact-lookup, not multi-step inference.",
    "niah.reco.degraded":          "Significant reasoning drop. The model can find facts but struggles to combine them. Avoid multi-hop tasks at this length.",
    "niah.reco.retrieval_only":    "Canonical RULER finding: model passes NIAH but fails reasoning. Useful for retrieval-augmented setups (where the LLM only locates facts) but NOT for chained inference. Cut your context to the 'safe' value below.",
    "niah.reco.broken":            "Model fails even basic retrieval at this context. Treat as out-of-distribution — re-test at a shorter context.",
    "niah.safe_context":           "≤ {ctx} tokens (reasoning ≥ 65%)",
    "niah.safe_context_none":      "No safe context found below your target — model fails reasoning even at small contexts.",
    "niah.summary.sweep":          "<code>{modelId}</code> — pass rates by context",
    "niah.status.empty_id":        "⚠ Enter a model id (e.g. meta-llama/Llama-3.1-8B-Instruct).",
    "niah.status.bad_teval":       "⚠ Enter a target context (≥ 512 tokens).",
    "niah.status.fetching":        "⏳ Fetching config.json for {modelId}...",
    "niah.status.fetched":        "✅ Config fetched for {modelId}. Set T_eval and click Predict (or Sweep contexts).",
    "niah.status.done":            "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%",
    "niah.status.sweep_done":      "✅ Swept {n} context lengths.",
    "saturation.title":            "📈 Benchmark Saturation Detector",
    "saturation.tip":              "MMLU is saturated (88-94% all frontier models). Reporting '92% on MMLU' is now meaningless. This tool tells you which benchmarks still discriminate frontier models, which are saturated, and what to use instead. Data: DemandSphere AI Frontier Tracker (CC BY-NC 4.0) refreshed 2026-05.",
    "saturation.desc":             "<strong>Is your benchmark still useful?</strong> Pick a benchmark to see top-3 frontier scores, spread, and a verdict (saturated / near-saturated / discriminative) plus recommended replacements.",
    "saturation.select_label":     "Benchmark:",
    "saturation.select.all":       "— show all benchmarks —",
    "saturation.run_btn":          "📈 Classify",
    "saturation.all_btn":          "📊 Show all",
    "saturation.col.spread":       "Top-3 spread",
    "saturation.col.mean":         "Top-3 mean",
    "saturation.col.n":            "Models",
    "saturation.col.bench":        "Benchmark",
    "saturation.col.verdict":      "Verdict",
    "saturation.col.reco":         "Top reco",
    "saturation.col.model":        "Model",
    "saturation.col.score":        "Score",
    "saturation.section.top3":     "Top-3 frontier scores",
    "saturation.section.recommendations": "Recommended alternatives",
    "saturation.section.note":     "Notes",
    "saturation.section.all":      "All tracked benchmarks",
    "saturation.verdict.saturated":      "🚨 SATURATED",
    "saturation.verdict.near_saturated": "⚠ NEAR SATURATED",
    "saturation.verdict.discriminative": "✅ DISCRIMINATIVE",
    "saturation.verdict.sparse_data":    "ℹ SPARSE DATA",
    "saturation.borderline":       "Borderline — within ±1pp of a threshold cutoff. Treat verdict as 'check carefully'.",
    "saturation.unknown":          "Unknown benchmark.",
    "saturation.attribution":      "Data: DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (open-weight historical) · last fetch 2026-05-05.",
    "saturation.status.live":      "✅ Live data loaded — {count} models.",
    "saturation.status.baked":     "ℹ Using baked snapshot (live fetch unavailable).",
    "saturation.status.kb_fail":   "⚠ Could not load saturation KB.",
    "saturation.status.done":      "✅ {name} — {verdict}",
    "saturation.status.all_done":  "✅ Classified {n} benchmarks.",
    "help.v08.saturation.title":   "📈 Benchmark Saturation Detector",
    "help.v08.saturation.body":    "MMLU is saturated (88-94% top), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.",
    "inv.v08.saturation":          "<strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?",
    "inv.v081.hub":                "<strong>🧭 Solutions Hub</strong> — every documented pain mapped to a tafagent mode or curated external tool. Don't reinvent — find.",
    "help.v081.hub.title":         "🧭 Solutions Hub",
    "help.v081.hub.body":          "tafagent as integrator, not silo. 30+ pains across 7 categories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), each mapped to (a) the tafagent mode that addresses it, if any, and (b) the best-of-breed external tools the community already trusts (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Search box matches across pain, scenario, and tool name. <em>Use case</em>: 'I have problem X — does tafagent solve it, and if not, who does?'",
    "hub.title":                   "🧭 Solutions Hub",
    "hub.tip":                     "Map of every documented LLM-eval pain we know about: which tafagent mode addresses it (if any), and the best-of-breed external tools the community already trusts. Goal: full coverage. If a canonical tool exists elsewhere, we link rather than rebuild.",
    "hub.desc":                    "<strong>Don't reinvent — find.</strong> 30+ pains mapped to tafagent modes + curated external tools. Browse by category, search by keyword, or see the gaps where new modes would help most.",
    "hub.clear_btn":               "✕ Clear",
    "hub.no_mode":                 "external",
    "hub.planned":                 "planned:",
    "hub.best_for":                "Best for",
    "hub.not_for":                 "Not for",
    "hub.tools":                   "External tools",
    "hub.status.loaded":           "✅ Loaded {total} pains across {categories} categories — {covered} covered by tafagent modes, {externalLinks} external links curated. Compiled {compiled}.",
    "hub.status.fail":             "⚠ Could not load Solutions Hub.",
    "hub.search.empty":            "No matches for '{query}'. Try broader terms (e.g. 'eval', 'rag', 'tokenizer').",
    "hub.search.results":          "Found {n} match(es) for '{query}'.",

    // v0.7.7 — Task tiles (UX restructure: 14 modes grouped by user intent)
    "tiles.title":                 "🎯 What do you want to do?",
    "tiles.subtitle":              "Pick a task. Each one opens the right tool below. Or scroll down for the full list of 14 modes.",
    "tile.diagnose.title":         "🔬 Diagnose a model",
    "tile.diagnose.desc":          "Will this specific model work for my use case?",
    "tile.trust.title":            "✓ Trust a benchmark score",
    "tile.trust.desc":             "Should I believe this number? Bug or noise?",
    "tile.eval.title":              "⚙️ Set up an eval correctly",
    "tile.eval.desc":              "Get the exact CLI flag for lm-eval / vLLM / transformers.",
    "tile.compare.title":          "🆚 Compare models",
    "tile.compare.desc":           "Side-by-side, or browse the empirical model landscape.",
    "tile.manual.title":           "📋 Manual / free-form",
    "tile.manual.desc":            "Pick a specific recipe by hand, or ask in plain English.",
    "tile.diagnose.tip":           "Start here when you have a specific model id and want a full diagnostic: <strong>Profile</strong> runs all 5 recipes at once. <strong>Unmask</strong> checks if max_position_embeddings is honest. <strong>NIAH→Reason</strong> predicts retrieval-vs-reasoning gap. <strong>Quant</strong> predicts whether quantizing will break it. <strong>Inspect</strong> lets you paste raw config.json for private/in-dev models.",
    "tile.trust.tip":              "When you see a score and want to know if it's real. <strong>Contamination</strong> rates 20+ benchmarks for likelihood the model saw them during training. <strong>Drift</strong> tells you if a gap between two evals is numerical noise or a real bug (chat-template mismatch, KV-cache layout, etc.). <strong>Arena CI</strong> reconstructs the confidence intervals Chatbot Arena hides — many top-Elo &quot;wins&quot; are statistically tied.",
    "tile.eval.tip":               "Before you run lm-eval-harness or vLLM serve, get the right CLI flag. <strong>Chat-template Sniffer</strong> detects the template family (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) and emits the exact <code>--apply_chat_template</code> / <code>--chat-template</code> invocation. Solves issue #1841 in lm-eval-harness (silent ÷2 accuracy). <strong>Diagnose CLI</strong> generates the Python command to measure γ_obs on your local GPU.",
    "tile.compare.tip":            "<strong>Compare</strong>: pick 2-3 candidate models + one recipe, see verdicts in a side-by-side table (e.g. Llama-3-8B vs Mistral-7B at 32k context). <strong>Phase diagram</strong>: scatter of 23 empirical models on the (log θ, γ) plane, with the Padé curve overlaid. Hover dots for details, click to load that model into the Recipe form.",
    "tile.manual.tip":             "<strong>Recipe</strong>: pick a specific X-N recipe (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 KV compression, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) and fill the form by hand for full control. <strong>Ask</strong>: type a free-form question; an in-browser 0.5B LLM (Qwen2.5) picks the right recipe and runs it. Best for &quot;what would happen if...&quot; exploration.",
    "share.import_desc":       "Got a JSON file from someone else's TAF analysis? Load it here to see the verdict + chain locally. Same view as if you'd run it yourself.",
    "share.import_btn":        "📂 Load shared JSON",
    "synthesis.system":        "You are a precise transformer LLM diagnostic assistant. Given pre-computed TAF formula results, write a clear plain-English summary in 4-6 sentences. Cite the section number (§X.Y) for each number you mention. Always give a concrete recommendation. Do NOT invent numbers.",

    // INSPECTOR mode
    "inspector.title":         "🔍 Architecture Inspector",
    "inspector.desc":          "Paste the raw <code>config.json</code> contents. The tool extracts the architectural parameters and runs the full 5-recipe Profile.",
    "inspector.tip":           "<strong>Paste any config.json directly</strong>. Tool parses it and runs the full Profile. Useful for: private models, in-development configs, models not yet on HuggingFace, or comparing what your custom architecture would do.",
    "inspector.quickstart":    "💡 Use case: you have a private model not on HF Hub, or a config you're designing. Paste the raw JSON below and get a full TAF profile.",
    "inspector.placeholder":   "{\n  \"model_type\": \"llama\",\n  \"rope_theta\": 500000,\n  \"max_position_embeddings\": 8192,\n  \"num_attention_heads\": 32,\n  \"num_key_value_heads\": 8,\n  \"hidden_size\": 4096,\n  \"num_hidden_layers\": 32\n}",
    "inspector.T_eval":        "T_eval (your target context):",
    "inspector.btn":           "🚀 Inspect & profile",

    // WHAT-IF slider
    "whatif.title":            "🎚 What-if: drag T_eval to see γ change live",
    "whatif.desc":             "Pure JS recompute (no Pyodide call). Shows the geometric γ_Padé and d_horizon as you slide. The full chain re-runs on click.",
    "whatif.T_eval":           "<strong>T_eval</strong>",
    "whatif.gamma_pade":       "<strong>γ_Padé</strong>",
    "whatif.d_horizon":        "<strong>d_horizon</strong>",
    "whatif.l_niah":           "<strong>L_NIAH ceiling</strong>",
    "whatif.predicted":        "<strong>Predicted geometric verdict</strong>",
    "whatif.rerun":            "↻ Recompute full chain at this T_eval",

    // COMMUNITY feed
    "community.title":         "🌐 Recent community submissions",
    "community.desc":          "Live feed from the public registry. Click any submission to view full analysis.",
    "community.browse_all":    "Browse all →",
    "community.loading":       "Loading...",
    "community.no_repo":       "The registry repo isn't created yet. Once it exists with submissions, they'll appear here live.",
    "community.no_submissions": "No submissions yet. Be the first — generate a Profile and click 📤 Submit to registry.",

    // FALSIFICATION dashboard
    "falsification.title":     "🔬 Paper predictions — falsification status",
    "falsification.desc":      "The TAF framework rests on falsifiable predictions (F1-F23). Each is empirically tested. Here's the live status of every prediction in the paper.",
    "falsification.summary":   "{confirmed} confirmed · {partial} partial · {refuted} refuted · {untested} untested (out of {total} total predictions)",
    "falsification.col.id":    "ID",
    "falsification.col.claim": "Claim",
    "falsification.col.status": "Status",
    "falsification.col.evidence": "Evidence",

    "tafcard.title":           "📇 TAF Card — full model profile",
    "tafcard.recipes_title":   "📋 Recipes — verdict per dimension",
    "tafcard.recipes_count_label": "dimensions",
    "tafcard.numbers_title":   "🔢 Key numbers (paper §26)",
    "tafcard.fals_title":      "🔬 Falsification status (F1-F23)",
    "tafcard.fals_none":       "No falsifications applicable.",
    "tafcard.diag_title":      "🔬 Diagnostics — numbers · γ check · what-if",
    "tafcard.verify_title":    "✓ Verification — Lean + Sage + falsification",
    "tafcard.share_title":     "📂 Provenance & share",
    "tafcard.whatif_title":    "🎚️ What-if explorer",
    "verdict.go":              "GO",
    "verdict.no":              "NO",
    "verdict.degraded":        "DEGRADED",

    "compare.title_out":       "🆚 Comparison Table",

    "status.loading_pyodide":  "⏳ Loading Python runtime (~10MB, first time only)...",
    "status.loading_taf":      "⏳ Loading TAF formulas + recipes...",
    "status.ready":            "✅ Ready. Pick a model and click Profile to start.",
    "status.computing":        "🧮 Computing TAF chain...",
    "status.done":             "✅ Done.",

    "profile.hf_placeholder":  "e.g. meta-llama/Meta-Llama-3-8B or Qwen/Qwen2.5-7B",
    "compare.hf_placeholder":  "HF model id (e.g. meta-llama/Meta-Llama-3-8B)",
    "compare.slot1_placeholder": "HF model id (e.g. meta-llama/Meta-Llama-3-8B)",
    "compare.slot2_placeholder": "HF model id #2",
    "compare.slot3_placeholder": "HF model id #3 (optional)",
    "compare.preset_default": "— or preset —",

    // Form parameters
    "param.theta":         "θ (rope_theta)",
    "param.theta.tip":     "<strong>RoPE base frequency</strong> from <code>config.rope_theta</code>. Higher = more long-range capacity.",
    "param.T_train":       "T_train",
    "param.T_train.tip":   "<strong>Max training context</strong>. From <code>max_position_embeddings</code>. Beyond this is extrapolation.",
    "param.T_eval":        "T_eval (your target)",
    "param.T_eval.tip":    "<strong>Your target inference context</strong>. The whole question is: will the model behave well at THIS length?",
    "param.n_attn":        "n_attention_heads",
    "param.n_attn.tip":    "<strong>Number of attention heads</strong> per layer. From <code>num_attention_heads</code>.",
    "param.n_kv":          "n_kv_heads",
    "param.n_kv.tip":      "<strong>KV heads</strong>. If &lt; n_attention_heads → GQA (Grouped Query Attention). Reduces KV memory but pushes γ toward Hagedorn.",
    "param.d_head":        "head_dim",
    "param.d_head.tip":    "<strong>Per-head dimension</strong>. Typical 64, 96, 128. From <code>head_dim</code> or <code>hidden_size / num_attention_heads</code>.",
    "param.n_layers":      "n_layers",
    "param.n_layers.tip":  "<strong>Number of transformer blocks</strong>. From <code>num_hidden_layers</code>.",
    "param.n_params":      "n_params (e.g. 8e9)",
    "param.n_params.tip":  "<strong>Total parameter count</strong>. Threshold ~400M for induction-head emergence. Affects KV memory and budget recipes.",
    "param.has_swa":       "Has SWA?",
    "param.has_swa.tip":   "<strong>Sliding Window Attention</strong>. <code>true</code> for Mistral, gemma-2, phi-3. v0.5.3 calibration audit disabled the historical δ_SWA correction (n=1 fit).",
    "common.yes":          "Yes",
    "common.no":           "No",

    // Mode tooltips
    "modes.tip":           "<strong>Fourteen ways to use the tool</strong>.<br><strong>📇 Profile</strong>: paste a model id → 5-recipe TAF Card.<br><strong>🆚 Compare</strong>: 2-3 models side-by-side on one recipe.<br><strong>🔍 Inspect config</strong>: paste raw config.json → full Profile.<br><strong>💬 Ask</strong>: free-form question, browser LLM picks the recipe.<br><strong>📋 Recipe</strong>: manual selection with full form control.<br><strong>🩺 Diagnose CLI</strong>: generate Python command for local γ measurement.<br><strong>📊 Phase diagram</strong>: 23-model panel on (log θ, γ) plane.<br><strong>🪟 Unmask</strong>: detect misleading max_position_embeddings (SWA / YaRN / RoPE-scaling).<br><strong>📜 Chat-template</strong>: detect family + give exact CLI flag for lm-eval / vLLM / transformers.<br><strong>🎯 Arena CI</strong>: reconstruct confidence intervals from raw pairwise vote data; detect statistical ties Arena hides.<br><strong>🧪 Contamination</strong>: rate 20+ benchmarks for contamination probability based on training cutoff vs release date.<br><strong>⚖️ Quant</strong>: predict γ-shift and ΔPPL for any (model × quant scheme); recommend safer alternative on cliff.<br><strong>🔀 Drift</strong>: same model, different scores on two setups — bug or noise? Predict numerical-noise band and flag real bugs.<br><strong>🔍 NIAH→Reason</strong>: predict NIAH and multi-hop reasoning pass rates from architecture; find your model's safe reasoning context.",
    "profile.tip":         "<strong>One-click full diagnosis</strong>. Paste any HF model id (or pick preset). Tool runs all 5 recipes (long-context, KV-compression, custom-vs-API, budget, hardware) and produces a single <strong>TAF Card</strong> with verdict per dimension + key numbers + architecture classification.<br><br><strong>Use case</strong>: \"I'm evaluating Qwen2.5-32B for production — what's its full viability profile?\" → paste id → Profile → done.",
    "compare.tip":         "<strong>Same recipe, multiple models</strong>. Pick 2-3 candidate models and one recipe. See verdicts in a single comparison table.<br><br><strong>Use case</strong>: \"I need long-context retrieval at 16K — which is best: Llama-3-8B, Mistral-7B, or Qwen-7B?\" → pick 3 + X-2 + 16K → see winner.",

    // Help modal
    "help.title":               "📘 TAF Agent — User Manual",
    "help.what.title":          "What does it do?",
    "help.what.body":           "Predicts <strong>practical viability</strong> of any transformer LLM <em>before you spend GPU/$</em>. Answers questions like \"will this model work at L=32K?\" or \"should I train custom or use API?\" using deterministic Python formulas (TAF — Thermodynamic Attention Framework).",
    "help.modes.title":         "How to use — 7 modes",
    "help.modes.profile":       "<strong>📇 Profile</strong>: paste model id → all recipes at once = TAF Card. <strong>Best starting point</strong>.",
    "help.modes.compare":       "<strong>🆚 Compare</strong>: 2-3 models side-by-side on same recipe. Best when choosing between candidates.",
    "help.modes.inspector":     "<strong>🔍 Inspect config</strong>: paste raw <code>config.json</code> → tool parses + runs full Profile. For private models, in-development configs, or models not yet on HF Hub.",
    "help.modes.ask":           "<strong>💬 Ask plain English</strong>: free-form question, in-browser LLM picks the recipe. Best for casual exploration.",
    "help.modes.recipe":        "<strong>📋 Recipe + form</strong>: manual selection, full parameter control. Best when you want exact control.",
    "help.modes.diagnose":      "<strong>🩺 Diagnose CLI</strong>: generate Python command to measure γ on your local machine (transformers + numpy). Fast ≈5 min CPU; full ≈20–60 min GPU. Output JSON re-uploadable via Inspect.",
    "help.modes.phase":         "<strong>📊 Phase diagram</strong>: scatter plot of 23 panel models on (log θ, γ) plane. Hagedorn line γ=1 separates Phase A from Phase B. Click a dot to load that model into Recipe form.",
    "help.recipes.title":       "The 8 recipes available",
    "help.recipe.x1.title":     "<strong>X-1 Custom training vs API</strong> — compares cost of training your own model vs paying for API access.",
    "help.recipe.x1.example":   "Try: <em>\"Should I train an 8B custom model or use GPT-4o for 50M tokens/month?\"</em><br>Answer types: YES (custom) / NO (API) with break-even months.",
    "help.recipe.x2.title":     "<strong>X-2 Long Context Viability</strong> — predicts if a model serves a target context length reliably.",
    "help.recipe.x2.example":   "Try: <em>\"Will Meta-Llama-3-8B handle 32000 tokens for retrieval?\"</em><br>Chains: γ_Padé → decomposition → d_horizon → NIAH ceiling → hallucination → KV memory.<br>Verdict: YES / DEGRADED / NO with mitigation if needed.",
    "help.recipe.x3.title":     "<strong>X-3 Budget pre-flight</strong> — given $ budget, what model is feasible to train?",
    "help.recipe.x3.example":   "Try: <em>\"I have $5000, what model can I train?\"</em><br>Answer: GO / TINY-MODEL / MEMORY-LIMITED with concrete N (params) and D (tokens).",
    "help.recipe.x5.title":     "<strong>X-5 Hardware selection</strong> — which GPU should I use to serve at target throughput?",
    "help.recipe.x5.example":   "Try: <em>\"Cheapest hardware to serve Llama-3-8B at 10M tokens/day\"</em><br>Answer: best GPU + $/Mtok + capacity vs target.",
    "help.recipe.x19.title":    "<strong>X-19 KV Compression decision</strong> — should I use soft decay, hard cutoff, or literature methods?",
    "help.recipe.x21.title":    "<strong>X-21 Imprint Purity Diagnostic</strong> — predicts γ on RANDOM tokens via ν=−1/(2π); how clean is the model's RoPE prediction?",
    "help.recipe.x22.title":    "<strong>X-22 Compute-Context Invariant</strong> — does γ × log(N²·D) lie in panel band 51.2 ± 16.8? Detects scaling/training anomalies.",
    "help.recipe.x23.title":    "<strong>X-23 IH-Phase Detector</strong> — pre- or post-induction-head? Cheap probe via sign(γ_text − γ_random).",
    "help.recipe.x19.example":  "Try: <em>\"How to compress KV cache for Qwen2.5-7B at 32K?\"</em><br>Answer: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.",
    "help.recipe.x21.example":  "Try: <em>\"How clean is the RoPE prediction on Llama-3-8B?\"</em><br>Answer: predicted γ_random + purity diagnostic (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).",
    "help.recipe.x22.example":  "Try: <em>\"Does Mistral-7B fit the compute-context invariant?\"</em><br>Answer: K = γ·log(N²·D), z-score, IN-BAND or OUTLIER.",
    "help.recipe.x23.example":  "Try: <em>\"Is Qwen2.5-7B post-induction-head?\"</em><br>Answer: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY (with size-vs-Δγ consistency check).",
    "help.section.v04":         "<strong>What's new in v0.4</strong> (sesión 29 findings 2026-04-28): three diagnostic recipes derived from cross-model panel analysis (n=22 LLMs).",
    "help.divider.v04_s29":     "— v0.4 (sesión 29 findings) —",
    "footer.tech_stack":        "Computation: Pyodide · Synthesis: WebLLM (Qwen2.5-0.5B local) · Hosting: GitHub Pages · Cost: $0",
    "help.v04.imprint":         "<strong>Learned-imprint slope ν = −1/(2π)</strong>: RoPE rotation period 2π drives a positional bias on weights, proportional to log(N_params). Even random tokens show this scaling. ν is DERIVED — not fitted (empirical err 0.3%).",
    "help.v04.invariant":       "<strong>Chinchilla-attention invariant K</strong>: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Connects compute scaling and attention exponent into a single dimensionless number.",
    "help.v04.ih_probe":        "<strong>Δγ as IH probe</strong>: sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Cheaper than running an in-context-learning benchmark.",
    "help.v04.constants":       "<strong>γ-cluster on famous constants</strong> (intriguing, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (golden conjugate, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat: could be coincidence.",
    "help.param.theta":         "<strong>θ (rope_theta)</strong>: RoPE base frequency. Higher = more long-range capacity. Typical: 10000 (early), 500000 (Llama-3), 1000000 (Qwen2.5).",
    "help.param.T_train":       "<strong>T_train</strong>: max context the model was trained on. From <code>max_position_embeddings</code>.",
    "help.param.T_eval":        "<strong>T_eval</strong>: <em>your target</em> inference context length. The key knob.",
    "help.param.gqa":           "<strong>n_kv_heads &lt; n_attention_heads</strong>: model uses GQA (Grouped Query Attention). Reduces KV memory but pushes γ toward Hagedorn.",
    "help.param.swa":           "<strong>has_SWA</strong>: model uses Sliding Window Attention (Mistral, gemma-2).",
    "help.param.nparams":       "<strong>n_params</strong>: total parameter count. Threshold ~400M for induction-head emergence.",
    "help.add_models.title":    "Adding new models (3 ways)",
    "help.add_models.preset":   "<strong>Preset list</strong>: 11 popular models curated. Just select from dropdown.",
    "help.add_models.hf":       "<strong>HF Hub fetch</strong>: paste any model id (e.g. <code>Qwen/Qwen2.5-32B-Instruct</code>), click 📥 Fetch. Browser downloads <code>config.json</code> directly from HuggingFace, fills the form. Works for any public model.",
    "help.add_models.manual":   "<strong>Manual</strong>: fill the form fields directly with values from the model card.",
    "help.audit.title":         "The audit chain",
    "help.audit.body":          "Every result shows the full <strong>Computation Chain</strong> — each formula step with its inputs, output, and interpretation. Click any step to expand. Cite section numbers (§26.1, §19.1, etc.) refer to the underlying paper for derivation.",
    "help.synthesis.title":     "The plain-English answer",
    "help.synthesis.body":      "After the deterministic chain runs, an in-browser LLM (Qwen2.5-0.5B, ~350MB cached after first load) synthesizes a plain-English summary. The numbers above are <em>always correct</em> (deterministic Python); the synthesis is LLM-generated — verify against the chain if in doubt.",
    "help.params.title":        "Common parameters explained",
    "help.verdicts.title":      "What to look for in verdicts",
    "help.verdict.yes":         "<strong style=\"color:#3fb950;\">YES / GO</strong> — proceed with confidence; numbers support the choice.",
    "help.verdict.deg":         "<strong style=\"color:#d29922;\">DEGRADED / TINY-MODEL</strong> — works but with caveats; read the action.",
    "help.verdict.no":          "<strong style=\"color:#f85149;\">NO / MEMORY-LIMITED</strong> — don't proceed as-is; mitigation provided.",
    "help.privacy.title":       "Privacy",
    "help.privacy.body":        "Everything runs in your browser. No telemetry, no analytics, no data sent anywhere. Even the LLM model runs locally via WebGPU/WebAssembly. Your model_ids and questions never leave this page.",
    "help.source.title":        "Source & paper",
    "help.source.body":         "Source code: <a href=\"https://github.com/karlesmarin/tafagent\" target=\"_blank\">github.com/karlesmarin/tafagent</a><br>Paper: <em>Marin 2026 — Predicting How Transformers Attend</em> (<a href=\"https://zenodo.org/records/19826343\" target=\"_blank\">Zenodo</a>; arXiv forthcoming)<br>Dataset: <a href=\"https://huggingface.co/datasets/karlexmarin/taf-attention-decay\" target=\"_blank\">taf-attention-decay</a> — 58 γ-measurements across 32 models (CC-BY-4.0)",

    "footer.text":             "© 2026 Carles Marin · Apache-2.0 · independent research · the tool that closes the loop of the paper.",

    // §33 v0.4 (session 31, 2026-04-30) — new diagnostic functions
    "v04.title":                  "🆕 v0.4 — New diagnostics (sesion 31)",
    "v04.section.intro":          "Four new diagnostic functions derived sesion 31 (2026-04-30) from cross-of-crosses formula games + Sócratic interrogation. Available in <code>taf_browser.py</code> §33.",
    "v04.arch.label":             "Architectural Concentration",
    "v04.arch.desc":              "γ_text ≈ γ_Padé − 0.012·n_kv. Cross-panel correlational law (R²=0.30). Caveat: not per-model predictor.",
    "v04.pdi.label":              "PDI — Padé Deviation Index",
    "v04.pdi.desc":               "PDI = d_horizon_obs/T_eval. Traffic light: green (≈1), orange (>>1), yellow (<<1), red (Phase B negative).",
    "v04.4bit.label":             "4-bit Shift Predictor",
    "v04.4bit.desc":              "MHA: R²(bf16)<0.9 → γ rises; R²>0.99 → γ drops. GQA: precision-robust regardless.",
    "v04.crit.label":             "Critical Exponents Bundle",
    "v04.crit.desc":              "ν_c, β_c, η_c (=γ−1, CORRECTED), α_C, γ_susc with AM-GM minimum at γ=1−1/√2≈0.293.",

    // §34 v0.5 (session 32, 2026-05-01) — Machine-verified framework consistency
    "v05.title":                  "🔬 v0.5 — Machine-verified consistency (sesion 32)",
    "v05.section.intro":          "Sage Groebner basis + Lean Mathlib4 dual-tool verification of <strong>15 algebraic identities</strong> of TAF critical exponents. First transformer-attention framework with formal machine-proof backing.",
    "v05.verify.label":           "Algebraic Consistency Check",
    "v05.verify.desc":            "Given measured γ, verifies 12 D-SAGE identities (D-SAGE-1: 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). All passing = framework intact. Failures indicate bf16 outliers / quantization artifacts.",
    "v05.dsage1.label":           "D-SAGE-1 (★★ core)",
    "v05.dsage1.desc":             "Quadratic identity 2η² + η·γ_χ + 1 = 0 (Sage Groebner-discovered, Lean-verified). Replaces incorrect 'triple closure' claim. Refutes paper 1's η=2γ algebraically.",
    "v05.erratum.label":          "Paper 1 erratum — η correction",
    "v05.erratum.desc":            "Paper 1 originally claimed η = 2γ. Sage Groebner + Lean Mathlib4 proved this fails (residual (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Phase A). Correct value: η = γ−1, satisfying D-SAGE-1.",
    "v05.repro.label":            "Reproducibility",
    "v05.repro.desc":              "All 15 theorems machine-proof in Lean Mathlib4 (1973 jobs build success). Sage script: <code>analysis/sage_recursive_sweep_2026-04-30.sage</code>. Lean code: <code>lean_taf/taf/Taf/Identities.lean</code>.",

    // v0.5.1 — TAF Card consistency check button
    "v05.consistency.title":      "🔬 Algebraic consistency check (Sage + Lean v0.5)",
    "v05.consistency.desc":       "Verifies 12 D-SAGE algebraic identities of TAF critical exponents (machine-proof Sage Groebner basis + Lean Mathlib4). Pass = framework intact. Fail = bf16 outlier / quantization artifact.",
    "v05.consistency.btn":        "🔬 Verify algebraic consistency",

    // v0.5.2 — Anti-Ising universality class badge
    "v05.antiising.badge":        "🧲 Anti-Ising class (β=γ−1&lt;0, machine-verified)",

    // v0.5.2 — Per-identity tooltips (plain English explanations)
    "v05.tooltip.D_SAGE_1":       "Quadratic algebraic identity connecting anomalous dimension η and susceptibility γ_χ. The CORE identity discovered by Sage Groebner basis (machine-proof). Replaces earlier wrong claim of triple closure.",
    "v05.tooltip.D_SAGE_2":       "In Phase A, β = γ−1 is negative (anti-Ising). Multiplied by χ = 1/(1−γ) gives exactly −1. Signature of TAF's negative-β regime.",
    "v05.tooltip.D_SAGE_4":       "The specific heat exponent α and susceptibility χ sum to exactly 2 in TAF. Algebraic consequence of Josephson hyperscaling.",
    "v05.tooltip.D_SAGE_5":       "Linear sum identity: α + γ_χ = 2(2−γ). Means as γ approaches 1 (Hagedorn), the sum approaches 2; at γ=0 it's 4.",
    "v05.tooltip.D_SAGE_6":       "Order parameter exponent times susceptibility exponent equals a specific quadratic in γ. Factored algebraic relation.",
    "v05.tooltip.Rushbrooke_tautology": "Standard Rushbrooke hyperscaling 2β + γ_χ = ν·d at d=1. In TAF this is a TAUTOLOGY — γ_χ is defined exactly so this holds. Confirmed by Sage Groebner basis.",
    "v05.tooltip.Josephson_tautology": "Standard Josephson hyperscaling 2 − α = ν·d at d=1. In TAF this is a TAUTOLOGY — α is defined exactly so this holds.",
    "v05.tooltip.Fisher_independent": "Fisher relation γ_χ = (2−η)·ν. In TAF this is INDEPENDENT (does NOT close as identity, contrary to triple-closure claim). Residual is γ(2γ−3)/(1−γ).",
    "v05.tooltip.eta_2gamma_REFUTED": "Paper 1 originally claimed η=2γ. This identity refutes it: residual is positive throughout Phase A. Lean Mathlib4 machine-proof refutation.",
    "v05.tooltip.D_14_nu_imprint": "The learned imprint slope ν = −1/(2π) times 2π equals −1. Trivial dimensional check from paper 1.",
    "v05.tooltip.D_SAGE_7":       "The central charge c=3 times |ν_imprint| times 2π equals 3. Dimensional closure connecting CFT and training imprint.",
    "v05.tooltip.nu_beta_id":     "Correlation length exponent ν times order parameter exponent β equals −1 in Phase A. Variant of D-SAGE-2.",

    "v053.calibration.title":     "🔬 v0.5.3 — Calibration audit (2026-05-02)",
    "v053.calibration.note":      "<strong>SWA correction disabled</strong> — original δ_SWA = -0.21 was fit on n=1 model (insufficient data; group-mean +0.355 with single yes-case). <strong>post_IH correction marked exploratory</strong> — group-mean ≈ 0 in re-audit (n=22 panel) does not replicate the OLS fit. <strong>GQA correction replicates</strong> (panel +0.115 vs hardcoded +0.11). <strong>D_f formula corrected for Phase B (γ&gt;1)</strong> — uses discrete cumulative sum instead of continuum approximation. LLaMA-3, Mistral, Gemma now report correct compression values.",
    "v053.release.banner":        "🔧 v0.5.3 — Audit-driven fixes: KV compression D_f now uses discrete sum (correct for all γ); δ_SWA disabled (n=1 calibration); paper §5.2 C_V coefficient erratum (1/4 → 1/12).",

    // §35 v0.6 — γ predicted-vs-observed diagnostic
    "gamma_check.title":           "🔍 γ predicted vs observed",
    "gamma_check.desc":            "Enter your empirically measured γ. Tool detects regime: fraud (θ inflated) / compressed / over-Padé / SWA-random / normal.",
    "gamma_check.gobs_label":      "γ_observed",
    "gamma_check.gobs_tip":        "Empirically measured γ from your model's attention scores. Use the Diagnose CLI to obtain this from real weights.",
    "gamma_check.random_label":    "Random corpus?",
    "gamma_check.random_tip":      "Tick if γ_observed was measured on random/unstructured tokens. Distinguishes SWA signature (γ_obs > 1) from anomaly.",
    "gamma_check.regime":          "Regime",
    "gamma_check.regime.normal":         "Normal",
    "gamma_check.regime.fraud":          "Fraud (θ inflated)",
    "gamma_check.regime.compressed":     "Compressed context",
    "gamma_check.regime.overpade":       "Over-Padé",
    "gamma_check.regime.swa":            "SWA random-corpus signature",
    "gamma_check.regime.unknown":        "Unknown",
    "gamma_check.regime.normal.desc":    "η ∈ [0.85, 1.15]: model uses its full nominal context, no anomaly.",
    "gamma_check.regime.fraud.desc":     "η < 0.01: nominal θ inflated. Model behaves as if θ ≪ advertised. Likely YaRN/marketing inflation without true context extension.",
    "gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5): context is compressed (model attends less far than nominal θ predicts). Common in instruction-tuned / RLHF models.",
    "gamma_check.regime.overpade.desc":  "η > 1.5: model attends farther than Padé predicts. Possible Lerch-corrected regime or undertrained early-checkpoint.",
    "gamma_check.regime.swa.desc":       "γ_obs > 1.05 on random corpus = sliding-window attention signature (Mistral / Gemma family).",
    "gamma_check.regime.unknown.desc":   "Inputs out of range or γ_obs > 1 without random-corpus flag. Verify measurement.",
    "gamma_check.glossary.title":        "ⓘ Glossary — what these variables mean",
    "gamma_check.glossary.gamma_pade":   "<strong>γ_Padé</strong>: closed-form prediction (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
    "gamma_check.glossary.gamma_obs":    "<strong>γ_observed</strong>: empirically measured from your model's attention scores (run the Diagnose CLI on real weights).",
    "gamma_check.glossary.theta_eff_obs":"<strong>θ_eff (observed)</strong>: inverted from γ_obs via T√2 / (1 − γ_obs). Effective θ implied by your measurement.",
    "gamma_check.glossary.theta_eff_pade":"<strong>θ_eff (Padé)</strong>: θ + T/√2. Effective θ predicted by closed-form.",
    "gamma_check.glossary.efficiency":   "<strong>η</strong>: ratio θ_eff_obs / θ_eff_Padé. ≈1 = normal · &lt;0.01 = fraud · &lt;0.5 = compressed · &gt;1.5 = over-Padé.",
    "gamma_check.glossary.delta_h":      "<strong>ΔH_Cardy</strong>: log(θ_eff_obs / θ_nominal). Cardy entropy shift. Negative = compression entropy. ~0 = nominal match.",
    "gamma_check.glossary.regime":       "<strong>Regime</strong>: automatic classifier from η + γ_obs + random_corpus flag.",

    // §36 v0.6 — Tooltips for inline ⓘ icons (per-variable explanations)
    "tooltip.gamma_pade":          "<strong>γ_Padé(T_eval)</strong>: closed-form prediction (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
    "tooltip.gamma_decomposed":    "<strong>γ_decomposed</strong>: γ from full architectural decomposition. Padé baseline + GQA shift + post-IH shift (calibrated audit-replicated subset).",
    "tooltip.d_horizon":           "<strong>d_horizon</strong>: effective attention horizon. Beyond this position, scores fall below noise floor (paper §26).",
    "tooltip.L_NIAH":              "<strong>L_NIAH ceiling</strong>: predicted ceiling for needle-in-a-haystack retrieval reliability at current d_horizon.",
    "tooltip.chi":                 "<strong>χ susceptibility</strong>: χ = 1/(1−γ). Diverges at the Hagedorn line γ=1.",
    "tooltip.kv_memory":           "<strong>KV memory @ T_eval (BF16)</strong>: per-request KV cache = 2 · n_layers · n_kv_heads · d_head · T_eval bytes.",
    "tooltip.theta_eff_obs":       "<strong>θ_eff (observed)</strong>: effective θ implied by your γ_observed: T√2 / (1 − γ_obs).",
    "tooltip.theta_eff_pade":      "<strong>θ_eff (Padé)</strong>: effective θ predicted by closed-form: θ + T/√2.",
    "tooltip.efficiency":          "<strong>η = θ_eff_obs / θ_eff_Padé</strong>: efficiency ratio. ≈1 = normal · &lt;0.01 = fraud · &lt;0.5 = compressed · &gt;1.5 = over-Padé.",
    "tooltip.delta_h_cardy":       "<strong>ΔH_Cardy</strong>: log(θ_eff_obs / θ_nominal). Cardy entropy shift. Negative = compression entropy. ~0 = nominal match.",
    "tooltip.verdict_aggregate":   "<strong>Verdict</strong>: worst-of across all recipes. ✅ GO = all green · ⚠ DEGRADED = ≥1 yellow · ❌ NO = ≥1 red.",
    "tooltip.verdict_breakdown":   "<strong>Per-recipe breakdown</strong>: each recipe tests an <em>independent</em> decision axis (long-context · budget · hardware · custom-vs-API · KV-compression). A ❌ on X-1 means \"use the API for your volume\" not \"the model fails\" — open the Recipes section for per-axis context.",
    "tooltip.gamma_pill":          "<strong>γ headline</strong>: γ_decomposed (or γ_Padé fallback). Range (0,1) = Phase A (anti-Ising). γ ≥ 1 = Hagedorn / Phase B.",
    "tooltip.anti_ising":          "<strong>Anti-Ising class</strong>: Phase A → β = γ−1 &lt; 0. Machine-verified (Sage + Lean Mathlib4). See §35 v0.5.",

    // §37 v0.6 — Lean+Mathlib theorem table
    "lean.table.title":            "📑 Lean+Mathlib theorem table",
    "lean.table.desc":             "Every entry below is machine-proven against Lean 4 + Mathlib4. Click any L# link to jump to the source line on GitHub. Grouped by topic — click a header to expand.",
    "lean.table.theorem":          "Theorem",
    "lean.table.claim":            "Claim",
    "lean.table.tactic":           "Tactic",
    "lean.table.source":           "Source",
    "lean.table.lean":             "Lean",
    "lean.findings.title":         "🔎 Substantive findings",
    "lean.findings.detected_by":   "Detected by",
    "lean.findings.fixed_by":      "Fixed by",
    "lean.findings.recommendation":"Recommendation",
    "lean.meta.repo":              "Repo",
    "lean.meta.build":             "Build",
    "lean.meta.theorems":          "Theorems",
    "lean.meta.verified":          "verified",
    "lean.meta.rejected":          "rejected",
    "lean.meta.sorry":             "sorry",
    "lean.meta.findings":          "substantive findings",
    "lean.manifest.loading":       "Loading Lean manifest…",
    "lean.manifest.error":         "Lean manifest unavailable",

    // Help modal — v0.6 section
    "help.v06.title":              "🆕 v0.6 — γ predicted-vs-observed + Cardy ΔH + Lean badges",
    "help.v06.intro":              "<em>v0.6 (2026-05-06): three new diagnostics live in the TAF Card under <strong>🔬 Diagnostics</strong>. All run in your browser; γ_observed comes from the Diagnose CLI on real weights.</em>",
    "help.v06.layout.title":       "TAF Card layout (new in v0.6)",
    "help.v06.layout.body":        "After clicking <strong>🚀 Generate full profile</strong> the card shows: a <strong>hero strip</strong> on top (architecture class + meta + 3 pills: aggregate verdict ✅/⚠/❌, γ headline, 🧲 Anti-Ising if Phase A) and four <strong>expandable sections</strong>: <strong>📋 Recipes</strong> (open by default — verdict per dimension), <strong>🔬 Diagnostics</strong> (key numbers, γ predicted vs observed, what-if explorer), <strong>✓ Verification</strong> (Sage+Lean algebraic consistency, falsification F1-F23), <strong>📂 Provenance &amp; share</strong> (calibration audit + JSON download / share link / registry submit). Click any header to expand. Every variable has an inline <strong>ⓘ</strong> tooltip.",
    "help.v06.gamma_check.title":  "γ predicted vs observed",
    "help.v06.gamma_check.body":   "Enter the empirically-measured γ from your model and the tool computes <strong>η = θ_eff_obs / θ_eff_Padé</strong> and classifies into one of 5 regimes:",
    "help.v06.case.normal":        "<strong>Normal</strong> (η ∈ [0.85, 1.15]) — model uses its full nominal context. <em>Use case</em>: validate a new release before adopting it.",
    "help.v06.case.fraud":         "<strong>Fraud</strong> (η &lt; 0.01) — nominal θ inflated; model behaves as if θ ≪ advertised. <em>Use case</em>: detect YaRN/marketing inflation (CodeLlama / Mistral-Nemo pattern).",
    "help.v06.case.compressed":    "<strong>Compressed</strong> (η &lt; 0.5) — context compressed; model attends shorter than nominal θ. <em>Use case</em>: spot RLHF/instruction-tuning compression (LLaMA-2 pattern).",
    "help.v06.case.overpade":      "<strong>Over-Padé</strong> (η &gt; 1.5) — model attends farther than Padé predicts. <em>Use case</em>: identify Lerch-corrected regime or undertrained early checkpoints (pythia-1b pattern).",
    "help.v06.case.swa":           "<strong>SWA random-corpus</strong> (γ_obs &gt; 1.05 with random_corpus=Yes) — sliding-window attention signature. <em>Use case</em>: confirm Mistral / Gemma SWA on random tokens.",
    "help.v06.cardy.title":        "Cardy ΔH diagnostic",
    "help.v06.cardy.body":         "<strong>ΔH_Cardy = log(θ_eff_obs / θ_nominal)</strong>. Entropy shift between observed effective θ and nominal θ. Strong negative = compression entropy; near zero = nominal match. Complements η for borderline cases.",
    "help.v06.lean.title":         "Lean + Mathlib verification badges",
    "help.v06.lean.body":          "TAF identities are formally machine-proven in Lean Mathlib4: <strong>37 theorems</strong> in 7 groups (Padé, RG flow, Cayley, D-SAGE, audit findings, erratum CV, misc) + <strong>1 substantive finding</strong> (V-derivative factor-2, theorem <code>V_derivative_ne_RG_beta</code>). Source: <a href=\"https://github.com/karlesmarin/lean-taf\" target=\"_blank\">github.com/karlesmarin/lean-taf</a> (commit 25c77fd). Re-verify locally: <code>git clone --depth=1 https://github.com/karlesmarin/lean-taf &amp;&amp; cd lean-taf &amp;&amp; lake exe cache get &amp;&amp; lake env lean Taf/Identities.lean</code>. The 🧲 Anti-Ising pill in the hero strip and the Verification accordion link to specific source lines.",
    "help.v06.glossary.title":     "Variable glossary (also embedded in TAF Card)",
    "help.v06.glossary.body":      "Every variable in the TAF Card has an inline ⓘ tooltip. The complete list: γ, γ_Padé, γ_decomposed, γ_observed, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, KV memory, regime. Hover any ⓘ for the definition + paper section.",
  },

  // ────────────────────────────────────────────────────────────────────────
  // ES — Español
  // ────────────────────────────────────────────────────────────────────────
  es: {
    // §33 v0.4 (session 31, 2026-04-30) — new diagnostic functions
    "v04.title":                  "🆕 v0.4 — Nuevos diagnósticos (sesion 31)",
    "v04.section.intro":          "Cuatro nuevas funciones diagnósticas derivadas en sesión 31 (2026-04-30) desde juegos de fórmulas cross-of-crosses + interrogación socrática. Disponibles en <code>taf_browser.py</code> §33.",
    "v04.arch.label":             "Concentración Arquitectural",
    "v04.arch.desc":              "γ_text ≈ γ_Padé − 0.012·n_kv. Ley correlacional cross-panel (R²=0.30). Caveat: no es predictor per-model.",
    "v04.pdi.label":              "PDI — Índice de Desviación de Padé",
    "v04.pdi.desc":               "PDI = d_horizon_obs/T_eval. Semáforo: verde (≈1), naranja (>>1), amarillo (<<1), rojo (Phase B negativo).",
    "v04.4bit.label":             "Predictor de Shift 4-bit",
    "v04.4bit.desc":              "MHA: R²(bf16)<0.9 → γ sube; R²>0.99 → γ baja. GQA: precision-robusto.",
    "v04.crit.label":             "Bundle de Exponentes Críticos",
    "v04.crit.desc":              "ν_c, β_c, η_c (=γ−1, CORREGIDO), α_C, γ_susc con mínimo AM-GM en γ=1−1/√2≈0.293.",

    // §34 v0.5 (session 32, 2026-05-01) — Machine-verified algebraic consistency
    "v05.title":                  "🔬 v0.5 — Consistencia verificada por máquina (sesion 32)",
    "v05.section.intro":          "Verificación dual con Sage Groebner basis + Lean Mathlib4 de <strong>15 identidades algebraicas</strong> de los exponentes críticos TAF. Primer framework transformer-attention con respaldo formal machine-proof.",
    "v05.verify.label":           "Comprobación de Consistencia Algebraica",
    "v05.verify.desc":            "Dado γ medido, verifica 12 identidades D-SAGE (D-SAGE-1: 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). Todas pasando = framework intacto. Fallos indican bf16 outliers / artefactos de cuantización.",
    "v05.dsage1.label":           "D-SAGE-1 (★★ core)",
    "v05.dsage1.desc":             "Identidad cuadrática 2η² + η·γ_χ + 1 = 0 (descubierta por Sage Groebner, verificada Lean). Reemplaza claim incorrecto de 'cierre triple'. Refuta η=2γ del paper 1 algebraicamente.",
    "v05.erratum.label":          "Erratum paper 1 — corrección η",
    "v05.erratum.desc":            "Paper 1 afirmaba η = 2γ. Sage Groebner + Lean Mathlib4 demostraron que falla (residual (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Fase A). Valor correcto: η = γ−1, satisface D-SAGE-1.",
    "v05.repro.label":            "Reproducibilidad",
    "v05.repro.desc":              "Los 15 teoremas son machine-proof en Lean Mathlib4 (build exitoso 1973 jobs). Script Sage: <code>analysis/sage_recursive_sweep_2026-04-30.sage</code>. Código Lean: <code>lean_taf/taf/Taf/Identities.lean</code>.",

    // v0.5.1 — TAF Card consistency check button
    "v05.consistency.title":      "🔬 Comprobación de consistencia algebraica (Sage + Lean v0.5)",
    "v05.consistency.desc":       "Verifica 12 identidades algebraicas D-SAGE de los exponentes críticos TAF (machine-proof Sage Groebner basis + Lean Mathlib4). Pasa = framework intacto. Falla = bf16 outlier / artefacto de cuantización.",
    "v05.consistency.btn":        "🔬 Verificar consistencia algebraica",

    // v0.5.2 — Anti-Ising universality class badge
    "v05.antiising.badge":        "🧲 Clase Anti-Ising (β=γ−1&lt;0, verificado por máquina)",

    // v0.5.2 — Per-identity tooltips (plain-language explanations)
    "v05.tooltip.D_SAGE_1":       "Identidad algebraica cuadrática que conecta dimensión anómala η con susceptibilidad γ_χ. Identidad CORE descubierta por Sage Groebner basis (machine-proof). Reemplaza claim incorrecto de triple closure.",
    "v05.tooltip.D_SAGE_2":       "En Fase A, β = γ−1 es negativo (anti-Ising). Multiplicado por χ = 1/(1−γ) da exactamente −1. Signature del régimen negativo-β de TAF.",
    "v05.tooltip.D_SAGE_4":       "El exponente de calor específico α y la susceptibilidad χ suman exactamente 2 en TAF. Consecuencia algebraica del hiperescalado de Josephson.",
    "v05.tooltip.D_SAGE_5":       "Identidad lineal: α + γ_χ = 2(2−γ). Significa que cuando γ se acerca a 1 (Hagedorn), la suma se acerca a 2; en γ=0 vale 4.",
    "v05.tooltip.D_SAGE_6":       "Exponente de parámetro de orden multiplicado por exponente de susceptibilidad da una cuadrática específica en γ. Relación algebraica factorizada.",
    "v05.tooltip.Rushbrooke_tautology": "Hiperescalado de Rushbrooke estándar 2β + γ_χ = ν·d en d=1. En TAF es TAUTOLOGÍA — γ_χ se define exactamente para que se cumpla. Confirmado por Sage Groebner basis.",
    "v05.tooltip.Josephson_tautology": "Hiperescalado de Josephson estándar 2 − α = ν·d en d=1. En TAF es TAUTOLOGÍA — α se define exactamente para que se cumpla.",
    "v05.tooltip.Fisher_independent": "Relación de Fisher γ_χ = (2−η)·ν. En TAF es INDEPENDIENTE (NO cierra como identidad, contrario al claim de triple closure). El residuo es γ(2γ−3)/(1−γ).",
    "v05.tooltip.eta_2gamma_REFUTED": "Paper 1 afirmaba η=2γ. Esta identidad lo refuta: el residuo es positivo en toda Fase A. Refutación machine-proof por Lean Mathlib4.",
    "v05.tooltip.D_14_nu_imprint": "La pendiente de imprint aprendido ν = −1/(2π) multiplicada por 2π da −1. Verificación dimensional trivial del paper 1.",
    "v05.tooltip.D_SAGE_7":       "La carga central c=3 multiplicada por |ν_imprint| multiplicada por 2π da 3. Cierre dimensional conectando CFT con imprint de entrenamiento.",
    "v05.tooltip.nu_beta_id":     "Exponente de longitud de correlación ν multiplicado por exponente de parámetro de orden β da −1 en Fase A. Variante de D-SAGE-2.",

    "v053.calibration.title":     "🔬 v0.5.3 — Auditoría de calibración (2026-05-02)",
    "v053.calibration.note":      "<strong>Corrección SWA desactivada</strong> — δ_SWA = -0.21 original se ajustó con n=1 modelo (datos insuficientes; promedio del único caso +0.355). <strong>Corrección post_IH marcada exploratoria</strong> — promedio de grupo ≈ 0 en re-auditoría (panel n=22) no replica el ajuste OLS. <strong>Corrección GQA replica</strong> (panel +0.115 vs hardcoded +0.11). <strong>Fórmula D_f corregida para Fase B (γ&gt;1)</strong> — usa suma cumulativa discreta en lugar de aproximación continua. LLaMA-3, Mistral, Gemma ahora reportan valores correctos de compresión.",
    "v053.release.banner":        "🔧 v0.5.3 — Correcciones por audit: D_f de compresión KV ahora usa suma discreta (correcto para todo γ); δ_SWA desactivado (calibración n=1); erratum coeficiente C_V paper §5.2 (1/4 → 1/12).",

    // §35 v0.6 — γ predicted-vs-observed diagnostic
    "gamma_check.title":           "🔍 γ predicho vs observado",
    "gamma_check.desc":            "Introduce tu γ medido empíricamente. La herramienta detecta el régimen: fraude (θ inflado) / comprimido / sobre-Padé / SWA-aleatorio / normal.",
    "gamma_check.gobs_label":      "γ_observado",
    "gamma_check.gobs_tip":        "γ medido empíricamente desde los attention scores de tu modelo. Usa la CLI de Diagnose para obtenerlo desde pesos reales.",
    "gamma_check.random_label":    "¿Corpus aleatorio?",
    "gamma_check.random_tip":      "Marca sí si γ_observado se midió sobre tokens aleatorios/no estructurados. Distingue la firma SWA (γ_obs > 1) de una anomalía.",
    "gamma_check.regime":          "Régimen",
    "gamma_check.regime.normal":         "Normal",
    "gamma_check.regime.fraud":          "Fraude (θ inflado)",
    "gamma_check.regime.compressed":     "Contexto comprimido",
    "gamma_check.regime.overpade":       "Sobre-Padé",
    "gamma_check.regime.swa":            "Firma SWA (corpus aleatorio)",
    "gamma_check.regime.unknown":        "Desconocido",
    "gamma_check.regime.normal.desc":    "η ∈ [0.85, 1.15]: el modelo usa su contexto nominal completo, sin anomalías.",
    "gamma_check.regime.fraud.desc":     "η < 0.01: θ nominal inflado. El modelo se comporta como si θ ≪ del anunciado. Probable inflación tipo YaRN/marketing sin extensión real de contexto.",
    "gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5): contexto comprimido (el modelo atiende menos lejos de lo que predice θ nominal). Común en modelos instruction-tuned / RLHF.",
    "gamma_check.regime.overpade.desc":  "η > 1.5: el modelo atiende más lejos de lo que Padé predice. Posible régimen Lerch-corregido o checkpoint temprano sub-entrenado.",
    "gamma_check.regime.swa.desc":       "γ_obs > 1.05 sobre corpus aleatorio = firma de sliding-window attention (familias Mistral / Gemma).",
    "gamma_check.regime.unknown.desc":   "Entradas fuera de rango o γ_obs > 1 sin flag de corpus aleatorio. Verifica la medición.",
    "gamma_check.glossary.title":        "ⓘ Glosario — significado de las variables",
    "gamma_check.glossary.gamma_pade":   "<strong>γ_Padé</strong>: predicción cerrada (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
    "gamma_check.glossary.gamma_obs":    "<strong>γ_observado</strong>: medido empíricamente desde los attention scores (ejecuta Diagnose CLI sobre pesos reales).",
    "gamma_check.glossary.theta_eff_obs":"<strong>θ_eff (observado)</strong>: invertido desde γ_obs vía T√2 / (1 − γ_obs). θ efectivo implicado por tu medición.",
    "gamma_check.glossary.theta_eff_pade":"<strong>θ_eff (Padé)</strong>: θ + T/√2. θ efectivo predicho por la fórmula cerrada.",
    "gamma_check.glossary.efficiency":   "<strong>η</strong>: ratio θ_eff_obs / θ_eff_Padé. ≈1 = normal · &lt;0.01 = fraude · &lt;0.5 = comprimido · &gt;1.5 = sobre-Padé.",
    "gamma_check.glossary.delta_h":      "<strong>ΔH_Cardy</strong>: log(θ_eff_obs / θ_nominal). Cambio de entropía de Cardy. Negativo = entropía de compresión. ~0 = coincide con nominal.",
    "gamma_check.glossary.regime":       "<strong>Régimen</strong>: clasificador automático a partir de η + γ_obs + flag corpus_aleatorio.",

    // §36 v0.6 — Tooltips for inline ⓘ icons
    "tooltip.gamma_pade":          "<strong>γ_Padé(T_eval)</strong>: predicción cerrada (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
    "tooltip.gamma_decomposed":    "<strong>γ_descompuesto</strong>: γ desde descomposición arquitectural completa. Línea base Padé + shift GQA + shift post-IH (subconjunto replicado en audit calibrado).",
    "tooltip.d_horizon":           "<strong>d_horizon</strong>: horizonte efectivo de atención. Más allá los scores caen bajo el suelo de ruido (paper §26).",
    "tooltip.L_NIAH":              "<strong>Techo L_NIAH</strong>: techo predicho de fiabilidad needle-in-a-haystack al d_horizon actual.",
    "tooltip.chi":                 "<strong>χ susceptibilidad</strong>: χ = 1/(1−γ). Diverge en la línea Hagedorn γ=1.",
    "tooltip.kv_memory":           "<strong>Memoria KV @ T_eval (BF16)</strong>: caché KV por petición = 2 · n_layers · n_kv_heads · d_head · T_eval bytes.",
    "tooltip.theta_eff_obs":       "<strong>θ_eff (observado)</strong>: θ efectivo implicado por tu γ_observado: T√2 / (1 − γ_obs).",
    "tooltip.theta_eff_pade":      "<strong>θ_eff (Padé)</strong>: θ efectivo predicho por la fórmula cerrada: θ + T/√2.",
    "tooltip.efficiency":          "<strong>η = θ_eff_obs / θ_eff_Padé</strong>: ratio de eficiencia. ≈1 = normal · &lt;0.01 = fraude · &lt;0.5 = comprimido · &gt;1.5 = sobre-Padé.",
    "tooltip.delta_h_cardy":       "<strong>ΔH_Cardy</strong>: log(θ_eff_obs / θ_nominal). Cambio de entropía de Cardy. Negativo = entropía de compresión. ~0 = coincide con nominal.",
    "tooltip.verdict_aggregate":   "<strong>Veredicto</strong>: peor-de entre todas las recipes. ✅ ADELANTE = todo verde · ⚠ DEGRADADO = ≥1 amarillo · ❌ NO = ≥1 rojo.",
    "tooltip.verdict_breakdown":   "<strong>Desglose por recipe</strong>: cada recipe evalúa un eje de decisión <em>independiente</em> (contexto-largo · presupuesto · hardware · custom-vs-API · compresión-KV). Un ❌ en X-1 significa «usa la API para tu volumen» no «el modelo falla» — abre la sección Recipes para contexto por eje.",
    "tooltip.gamma_pill":          "<strong>γ titular</strong>: γ_descompuesto (o γ_Padé como fallback). Rango (0,1) = Fase A (anti-Ising). γ ≥ 1 = Hagedorn / Fase B.",
    "tooltip.anti_ising":          "<strong>Clase Anti-Ising</strong>: Fase A → β = γ−1 &lt; 0. Machine-verified (Sage + Lean Mathlib4). Ver §35 v0.5.",

    // §37 v0.6 — Lean+Mathlib theorem table
    "lean.table.title":            "📑 Tabla de teoremas Lean+Mathlib",
    "lean.table.desc":             "Cada entrada está machine-proven contra Lean 4 + Mathlib4. Click en cualquier link L# para saltar a la línea fuente en GitHub. Agrupado por tema — click en cabecera para expandir.",
    "lean.table.theorem":          "Teorema",
    "lean.table.claim":            "Afirmación",
    "lean.table.tactic":           "Táctica",
    "lean.table.source":           "Fuente",
    "lean.table.lean":             "Lean",
    "lean.findings.title":         "🔎 Findings sustantivos",
    "lean.findings.detected_by":   "Detectado por",
    "lean.findings.fixed_by":      "Arreglado por",
    "lean.findings.recommendation":"Recomendación",
    "lean.meta.repo":              "Repo",
    "lean.meta.build":             "Build",
    "lean.meta.theorems":          "Teoremas",
    "lean.meta.verified":          "verificados",
    "lean.meta.rejected":          "rechazados",
    "lean.meta.sorry":             "sorry",
    "lean.meta.findings":          "findings sustantivos",
    "lean.manifest.loading":       "Cargando manifest Lean…",
    "lean.manifest.error":         "Manifest Lean no disponible",

    // Help modal — v0.6 section
    "help.v06.title":              "🆕 v0.6 — γ predicho-vs-observado + Cardy ΔH + badges Lean",
    "help.v06.intro":              "<em>v0.6 (2026-05-06): tres diagnósticos nuevos viven en el TAF Card bajo <strong>🔬 Diagnósticos</strong>. Todo corre en tu navegador; γ_observado lo obtienes con la Diagnose CLI sobre pesos reales.</em>",
    "help.v06.layout.title":       "Layout del TAF Card (nuevo en v0.6)",
    "help.v06.layout.body":        "Tras click en <strong>🚀 Generar perfil completo</strong> la tarjeta muestra: una <strong>tira hero</strong> arriba (clase de arquitectura + meta + 3 pills: veredicto agregado ✅/⚠/❌, γ titular, 🧲 Anti-Ising si Fase A) y cuatro <strong>secciones plegables</strong>: <strong>📋 Recipes</strong> (abierto por defecto — veredicto por dimensión), <strong>🔬 Diagnósticos</strong> (números clave, γ predicho vs observado, explorador what-if), <strong>✓ Verificación</strong> (consistencia algebraica Sage+Lean, falsificación F1-F23), <strong>📂 Procedencia y compartir</strong> (auditoría de calibración + descarga JSON / enlace / submit al registro). Click en cualquier cabecera para expandir. Cada variable tiene tooltip <strong>ⓘ</strong> inline.",
    "help.v06.gamma_check.title":  "γ predicho vs observado",
    "help.v06.gamma_check.body":   "Introduces el γ medido empíricamente y la herramienta calcula <strong>η = θ_eff_obs / θ_eff_Padé</strong> y clasifica en uno de 5 regímenes:",
    "help.v06.case.normal":        "<strong>Normal</strong> (η ∈ [0.85, 1.15]) — el modelo usa su contexto nominal completo. <em>Caso de uso</em>: validar un release nuevo antes de adoptarlo.",
    "help.v06.case.fraud":         "<strong>Fraude</strong> (η &lt; 0.01) — θ nominal inflado; el modelo se comporta como si θ ≪ del anunciado. <em>Caso de uso</em>: detectar inflación YaRN/marketing (patrón CodeLlama / Mistral-Nemo).",
    "help.v06.case.compressed":    "<strong>Comprimido</strong> (η &lt; 0.5) — contexto comprimido; el modelo atiende menos lejos que θ nominal. <em>Caso de uso</em>: detectar compresión por RLHF/instruction-tuning (patrón LLaMA-2).",
    "help.v06.case.overpade":      "<strong>Sobre-Padé</strong> (η &gt; 1.5) — el modelo atiende más lejos que Padé predice. <em>Caso de uso</em>: identificar régimen Lerch-corregido o checkpoints tempranos sub-entrenados (patrón pythia-1b).",
    "help.v06.case.swa":           "<strong>SWA corpus aleatorio</strong> (γ_obs &gt; 1.05 con corpus_aleatorio=Sí) — firma de sliding-window attention. <em>Caso de uso</em>: confirmar SWA en Mistral / Gemma sobre tokens random.",
    "help.v06.cardy.title":        "Diagnóstico Cardy ΔH",
    "help.v06.cardy.body":         "<strong>ΔH_Cardy = log(θ_eff_obs / θ_nominal)</strong>. Cambio de entropía entre el θ efectivo observado y el θ nominal. Negativo fuerte = entropía de compresión; cerca de cero = coincide con nominal. Complementa a η para casos borderline.",
    "help.v06.lean.title":         "Badges de verificación Lean + Mathlib",
    "help.v06.lean.body":          "Las identidades TAF están formalmente machine-proven en Lean Mathlib4: <strong>37 teoremas</strong> en 7 grupos (Padé, flujo RG, Cayley, D-SAGE, hallazgos de auditoría, erratum CV, misc) + <strong>1 hallazgo sustantivo</strong> (factor 2 en derivada V, teorema <code>V_derivative_ne_RG_beta</code>). Fuente: <a href=\"https://github.com/karlesmarin/lean-taf\" target=\"_blank\">github.com/karlesmarin/lean-taf</a> (commit 25c77fd). Re-verifica localmente: <code>git clone --depth=1 https://github.com/karlesmarin/lean-taf &amp;&amp; cd lean-taf &amp;&amp; lake exe cache get &amp;&amp; lake env lean Taf/Identities.lean</code>. La pill 🧲 Anti-Ising del hero y la sección Verificación enlazan a líneas específicas del código fuente.",
    "help.v06.glossary.title":     "Glosario de variables (también embebido en TAF Card)",
    "help.v06.glossary.body":      "Cada variable del TAF Card tiene un tooltip ⓘ inline. Lista completa: γ, γ_Padé, γ_descompuesto, γ_observado, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, memoria KV, régimen. Pasa el ratón sobre cualquier ⓘ para la definición + sección del paper.",

    "hero.title":     "🔬 TAF Agent",
    "hero.tagline":   "Diagnostica cualquier LLM transformer en 30 segundos. Gratis. Sin GPU. Sin registro.",
    "hero.subtitle":  "Predice si un modelo te servirá para tu caso de uso <em>antes</em> de gastar dinero o tiempo. Todo corre en tu navegador &mdash; tus datos nunca salen de esta pestaña.",
    "hero.help":      "📘 Manual y ejemplos",
    "hero.quickstart_btn": "⚡ Inicio rápido",
    "hero.inventory_btn":  "🧰 Qué te ofrece",
    "hero.about":     "Construido por un investigador independiente. Código abierto. Sin afiliación con ningún proveedor de modelos.",

    "modes.title":    "🎯 Modo",
    "modes.profile":  "📇 Perfilar un modelo",
    "modes.compare":  "🆚 Comparar modelos",
    "modes.inspector": "🔍 Inspeccionar config",
    "modes.ask":      "💬 Pregunta libre",
    "modes.recipe":   "📋 Elegir receta",
    "modes.diagnose": "🩺 Diagnóstico CLI",
    "diagnose.title": "🩺 Generador del comando Diagnose CLI",
    "diagnose.tip":   "El navegador predice γ desde config; el CLI mide γ_obs sobre los pesos reales. Este generador produce el comando exacto para ejecutar localmente.",
    "diagnose.desc":  "Elige opciones y copia-pega el comando generado en tu máquina local (Python + transformers + numpy). Modo rápido ≈5 min CPU; completo ≈20–60 min GPU.",
    "diagnose.model_label": "ID del modelo HF:",
    "diagnose.theta_label": "θ (auto si vacío):",
    "diagnose.n_label": "Contexto N:",
    "diagnose.options_label": "Opciones:",
    "diagnose.opt_fast": "--fast (CPU, ~5 min)",
    "diagnose.opt_cpu": "--cpu (forzar CPU)",
    "diagnose.opt_4bit": "--load_in_4bit (modelos ≥7B)",
    "diagnose.local_label": "--local path (opcional):",
    "diagnose.build_btn": "📋 Generar comando",
    "diagnose.cmd_title": "Comando generado:",
    "diagnose.copy_btn": "📋 Copiar al portapapeles",
    "diagnose.next_steps": "Siguientes pasos: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Ejecuta el comando (4) JSON resultado → subir vía modo Inspect para análisis TAF completo.",
    "modes.phase":    "📊 Diagrama de fase",
    "phase.title":    "📊 Diagrama de fase (γ × θ)",
    "phase.tip":      "Cada punto es un modelo del panel empírico del paper. x: log θ; y: γ. La línea Hagedorn γ=1 separa Fase A de Fase B. Hover para detalles, click para cargar en el formulario.",
    "phase.desc":     "23 modelos en el panel; curva Padé a T=2000.",
    "modes.desc":     "<strong>Inicio rápido</strong>: pega cualquier id de modelo HuggingFace (ej. <code>meta-llama/Meta-Llama-3-8B</code>), click Perfilar. Verás las 5 recetas evaluadas en segundos.",

    "profile.title":           "📇 Perfilar un modelo",
    "profile.desc":            "<strong>Para técnicos</strong>: cuando necesitas una foto completa de viabilidad de un modelo candidato. Un click ejecuta las 5 recetas y produce una TAF Card unificada.",
    "profile.preset_label":    "Preset:",
    "profile.preset_default":  "— o elige de la lista —",
    "profile.hf_label":        "ID modelo HF:",
    "profile.fetch_btn":       "📥 Cargar",
    "profile.btn":             "🚀 Generar perfil completo",
    "profile.quickstart":      "💡 Inicio rápido: elige cualquier preset → click Generar. O pega un id desde <a href='https://huggingface.co/models?library=transformers&sort=trending' target='_blank'>HF Hub trending</a> → 📥 Cargar → Generar.",

    "compare.title":           "🆚 Comparar modelos lado a lado",
    "compare.desc":            "<strong>Para técnicos</strong>: cuando eliges entre 2-3 modelos candidatos para un escenario de despliegue específico. Misma receta, múltiples modelos, veredictos lado a lado.",
    "compare.recipe_label":    "Receta:",
    "compare.T_eval_label":    "T_eval (contexto objetivo):",
    "compare.models_title":    "Modelos a comparar (hasta 3)",
    "compare.btn":             "🚀 Comparar",
    "compare.example":         "💡 Prueba: pega 3 modelos populares de 7-8B (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), receta X-2, T_eval=16000. Mira cuál maneja mejor contexto largo.",

    "ask.title":               "❓ Tu pregunta",
    "ask.placeholder":         "ej. ¿Mistral-7B aguanta 16K NIAH retrieval? O: Tengo 5,000$, ¿qué modelo puedo entrenar? O: ¿GPU más barato para servir Llama-70B a 100M tokens/día?",
    "ask.btn":                 "🚀 Analizar",
    "ask.example_btn":         "💡 Probar ejemplo",

    "recipe.title":            "📋 Receta",
    "recipe.default":          "— elige una receta —",
    "recipe.input_title":      "🎯 Entradas",

    "verdict.title":           "📊 Veredicto",
    "chain.title":             "🔍 Cadena de cálculo",
    "chain.desc":              "Cada número de abajo es Python determinista. Click en un paso para expandir.",
    "answer.title":            "💬 Respuesta en lenguaje natural",
    "share.btn":               "🔗 Copiar link",
    "share.copied":            "✅ ¡Copiado al portapapeles!",
    "share.download":          "💾 Descargar JSON",
    "share.download_md":       "📝 Markdown",
    "share.download_tex":      "📜 LaTeX",
    "share.submit":            "📤 Enviar al registry",
    "share.submit_clip_ok":    "↗ GitHub abierto. Cuerpo copiado al portapapeles — pégalo en el cuerpo del issue.",
    "share.submit_clip_fail":  "↗ GitHub abierto. Portapapeles bloqueado — cuerpo volcado en la consola del navegador (F12).",
    "share.import_title":      "📂 Importar un resultado TAF compartido",
    "a11y.skip":               "Saltar al contenido principal",

    // v0.6.2 — rework de landing: inicio rápido + inventario + tooltips de arquitectura
    "qs.title":                    "⚡ Inicio rápido",
    "qs.step1":                    "Pega un model ID de HuggingFace (ej. <code>meta-llama/Meta-Llama-3-8B</code>)",
    "qs.step2":                    "Click en <strong>📇 Profile a model</strong>",
    "qs.step3":                    "Lee tu TAF Card — veredicto por caso de uso + números clave + matemáticas verificadas con Lean+Mathlib",
    "qs.cta":                      "↓ Empezar ahora",
    "inv.title":                   "🧰 Qué te ofrece esta herramienta",
    "inv.recipes.title":           "🎯 8 recetas — ¿sirve este modelo para tu caso?",
    "inv.recipes.x1.title":        "Entrenar propio vs API",
    "inv.recipes.x1.body":         "¿cuál sale más barato para tu tráfico?",
    "inv.recipes.x2.title":        "Contexto largo",
    "inv.recipes.x2.body":         "¿aguanta 32k / 128k tokens de forma fiable?",
    "inv.recipes.x3.title":        "Presupuesto",
    "inv.recipes.x3.body":         "con $X, ¿qué modelo puedes entrenar desde cero?",
    "inv.recipes.x5.title":        "Hardware",
    "inv.recipes.x5.body":         "¿qué GPU para servir N tokens/día?",
    "inv.recipes.x19.title":       "KV cache",
    "inv.recipes.x19.body":        "¿cómo comprimir sin romper la calidad?",
    "inv.recipes.x21.title":       "Pureza de imprint",
    "inv.recipes.x21.body":        "¿cómo de limpia es la codificación posicional del modelo?",
    "inv.recipes.x22.title":       "Compute-context",
    "inv.recipes.x22.body":        "¿el modelo entra en la banda empírica?",
    "inv.recipes.x23.title":       "Fase IH",
    "inv.recipes.x23.body":        "¿pre- o post-induction-head?",
    "inv.diag.title":              "🔬 Diagnósticos",
    "inv.diag.gamma":              "<strong>γ predicho vs observado</strong> — auto-clasifica el modelo en 5 regímenes (normal · fraude / contexto inflado · comprimido · over-Padé · sliding-window)",
    "inv.diag.cardy":              "<strong>Cardy ΔH</strong> — desplazamiento de entropía entre contexto observado y nominal",
    "inv.diag.fals":               "<strong>Tabla de falsabilidad</strong> — comprueba 23 predicciones específicas (F1–F23)",
    "inv.diag.alg":                "<strong>Consistencia algebraica</strong> — 8 identidades matemáticas que el modelo debe cumplir",
    "inv.verify.title":            "✓ Matemáticas formalmente verificadas",
    "inv.verify.count":            "<strong>37 teoremas</strong> machine-proven en Lean 4 + Mathlib4",
    "inv.verify.click":            "Click en cualquier badge → abre la línea fuente en GitHub",
    "inv.verify.reverify":         "Verifícalo tú: <code>lake build</code> (≈5 s tras cache)",
    "inv.export.title":            "📤 Exportar y compartir",
    "inv.export.formats":          "<strong>JSON · Markdown · LaTeX</strong> (listo para paper)",
    "inv.export.share":            "Link reproducible (estado codificado en URL)",
    "inv.export.registry":         "Envía al registro comunitario en GitHub",
    "arch.summary":                "Arquitecturas soportadas",
    "arch.anyhf":                  "✓ Cualquier modelo público de HuggingFace",
    "tooltip.mha":                 "Multi-Head Attention: cada posición atiende mediante varios heads paralelos a la vez.",
    "tooltip.gqa":                 "Grouped Query Attention: las queries comparten menos keys/values que heads (ahorra memoria pero empuja γ hacia Hagedorn).",
    "tooltip.alibi":               "Attention with Linear Biases: la info de posición es una pendiente aprendida añadida a los scores, sin rotación.",
    "tooltip.abspe":               "Absolute Position Embeddings: cada posición tiene un vector fijo aprendido sumado al embedding del token.",
    "tooltip.swa":                 "Sliding Window Attention: cada token solo atiende dentro de una ventana local fija (Mistral, gemma-2 lo usan).",
    "tooltip.ssm":                 "State Space Model: capa de secuencia que mantiene estado interno en lugar de atención (Mamba, Jamba lo usan).",

    // v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker
    "modes.unmask":                "🪟 Desenmascarar",
    "unmask.title":                "🪟 Desenmascarador de contexto",
    "unmask.tip":                  "Pega un id de modelo HuggingFace (o config.json crudo). La herramienta detecta sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), y GQA — todo lo que hace que <code>max_position_embeddings</code> sea mayor que el contexto efectivo real. Mistral-7B-v0.1 es el ejemplo canónico: declara 32k, atiende dentro de ~4-8k.",
    "unmask.desc":                 "<strong>¿Estás a punto de gastar dinero en un modelo que en realidad no atiende tan lejos?</strong> Pega un id y descúbrelo en 1 segundo. Sin GPU, sin inferencia — solo aritmética sobre config.json.",
    "unmask.id_label":             "ID modelo HF:",
    "unmask.fetch_btn":            "🔍 Desenmascarar",
    "unmask.paste_summary":        "O pega config.json crudo (modelos privados / en desarrollo)",
    "unmask.paste_btn":            "🔍 Desenmascarar config pegado",
    "unmask.label.declared":       "Contexto declarado",
    "unmask.label.effective":      "Efectivo (estimado)",
    "unmask.label.ratio":          "Ratio",
    "unmask.section.flags":        "Banderas de arquitectura",
    "unmask.section.warnings":     "Avisos",
    "unmask.section.reco":         "Recomendación",
    "unmask.flag.swa":             "SWA",
    "unmask.flag.rope":            "RoPE scaling",
    "unmask.flag.gqa":             "GQA",
    "unmask.flag.layers":          "Capas",
    "unmask.flag.dhead":           "d_head",
    "unmask.flag.theta":           "RoPE θ",
    "unmask.flag.yes":             "sí",
    "unmask.flag.no":              "no",
    "unmask.flag.full_mha":        "no (MHA completo, {n} heads)",
    "unmask.verdict.honest":            "✅ HONESTO",
    "unmask.verdict.inflated":          "⚠ INFLADO",
    "unmask.verdict.severely_inflated": "❌ GRAVEMENTE INFLADO",
    "unmask.verdict.yarn_extended":     "⚠ YARN-EXTENDIDO",
    "unmask.verdict.unknown":           "❓ DESCONOCIDO",
    "unmask.warn.swa_window":      "Ventana SWA: {window} tokens — cada capa solo atiende dentro de esta ventana.",
    "unmask.warn.multihop":        "Estimación multi-hop: ~{multiHop} tokens (conservador: ventana × {factor}).",
    "unmask.warn.yarn":            "RoPE scaling ({type}) extiende contexto {factor}× desde ~{original} hasta {declared} tokens.",
    "unmask.warn.yarn_advice":     "Contexto RoPE-extendido — verifica el comportamiento de γ a la longitud declarada con el diagnóstico γ_check.",
    "unmask.warn.gqa_small_dhead": "head dim pequeño ({d_head}) + GQA: probable compresión de KV cache a contexto largo (γ empujado hacia Hagedorn).",
    "unmask.reco.honest":              "Modelo de atención completa estándar. Contexto efectivo coincide con declarado ({declared} tokens).",
    "unmask.reco.inflated":            "Efectivo ~{effective} tokens vía SWA. Usa γ_check para verificar el comportamiento a tu longitud objetivo.",
    "unmask.reco.severely_inflated":   "Trátalo como un modelo de ~{effective} tokens en la práctica. El claim de {declared} tokens solo aplica vía cadenas de atención cross-layer, que empíricamente degradan más allá de ~2× la ventana SWA.",
    "unmask.reco.yarn_extended":       "Contexto RoPE-extendido. Corre un benchmark long-context (NIAH a 8k / 16k / 32k / full) para confirmar que la extensión se sostiene. Usa γ_check con T_eval = {declared}.",
    "unmask.reco.unknown":             "No se pudo parsear el config. Verifica que la URL sea un modelo HF válido con config.json público.",
    "unmask.status.empty_id":      "⚠ Introduce un model id (ej. mistralai/Mistral-7B-v0.1).",
    "unmask.status.fetching":      "⏳ Obteniendo config.json para {modelId}...",
    "unmask.status.success":       "✅ Analizado {modelId} (veredicto: {verdict})",
    "unmask.status.empty_paste":   "⚠ Pega un config.json primero.",
    "unmask.status.invalid_json":  "❌ JSON inválido: {error}",
    "unmask.status.success_paste": "✅ Config pegado analizado (veredicto: {verdict})",
    "unmask.pasted_label":         "(config pegado)",
    "mode_desc.ask":               "Escribe una pregunta libre. El LLM en el navegador elige la receta correcta y la ejecuta.",
    "mode_desc.recipe":            "Selecciona una receta directamente y rellena el formulario. Control manual completo.",
    "mode_desc.profile":           "Inicio más rápido: pega cualquier model id de HuggingFace, click Profile. Mira las 5 recetas en segundos.",
    "mode_desc.compare":           "Elige 2-3 modelos candidatos + una receta. Ve veredictos lado a lado en tabla.",
    "mode_desc.inspector":         "Pega un config.json directamente. Útil para modelos privados / en desarrollo no en HF Hub.",
    "mode_desc.diagnose":          "Construye el comando CLI diagnose_model.py para MEDIR γ_obs en GPU real. El navegador predice; el CLI mide.",
    "mode_desc.phase":             "Scatter γ × θ del panel empírico del paper. Hover sobre puntos para detalles, click para cargar en Diagnose / Recipe.",
    "mode_desc.unmask":            "Detecta si max_position_embeddings es engañoso (SWA / YaRN / RoPE-scaling). Pega un model id, obtén un veredicto en 1 línea.",
    "profile.preset_loaded":       "✅ Preset cargado para <strong>{id}</strong>. Formulario pre-rellenado. (Click 📥 Fetch para sobreescribir con el último config de HF Hub.)",

    // v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer
    "modes.template":              "📜 Chat-template",
    "mode_desc.template":          "Detecta qué familia de chat-template usa un modelo (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Da el flag CLI exacto para lm-eval / vLLM / transformers.",
    "template.title":              "📜 Detector de Chat-template",
    "template.tip":                "Pega un model id de HF (o tokenizer_config.json crudo). Detecta la familia del chat-template y te da el comando exacto para usarlo bien. lm-eval-harness divide la accuracy entre 2 silenciosamente si te olvidas de aplicarlo (issue #1841).",
    "template.desc":               "<strong>¿Olvidaste <code>--apply_chat_template</code>?</strong> La mayoría de evals multi-turn fallan ~50% porque el chat template no se aplicó. Pega un model id, obtén el flag CLI exacto para tu stack.",
    "template.id_label":           "ID modelo HF:",
    "template.fetch_btn":          "📜 Detectar",
    "template.paste_summary":      "O pega tokenizer_config.json crudo (modelos privados)",
    "template.paste_btn":          "📜 Detectar config pegado",
    "template.label.family":       "Familia detectada",
    "template.label.markers":      "Marcadores coincidentes",
    "template.label.tpl_len":      "Longitud template",
    "template.section.warnings":   "Avisos",
    "template.section.commands":   "Comandos por framework",
    "template.section.raw":        "Template crudo (preview)",
    "template.family.custom":      "custom (familia desconocida)",
    "template.family.none":        "(sin chat_template)",
    "template.verdict.ok":         "✅ TEMPLATE DETECTADO",
    "template.verdict.custom":     "⚠ TEMPLATE CUSTOM",
    "template.verdict.missing":    "❌ SIN CHAT TEMPLATE",
    "template.verdict.base_model": "ℹ MODELO BASE (sin chat)",
    "template.verdict.unknown":    "❓ DESCONOCIDO",
    "template.warn.no_chat_template": "Sin campo <code>chat_template</code> en tokenizer_config.json. Típico de modelos base / pretrained. Si esperabas un modelo instruct-tuned, puede que el archivo cargado sea incorrecto.",
    "template.warn.custom_template":  "Template no estándar ({length} chars). La herramienta no lo encajó en familias conocidas. Revisa el preview y verifica que tu framework de eval lo soporta.",
    "template.warn.lm_eval_apply":    "<strong>lm-eval-harness:</strong> añade <code>--apply_chat_template</code> o tu accuracy bajará ~50% silenciosamente en evals multi-turn (issue #1841).",
    "template.warn.vllm_apply":       "<strong>vLLM serve:</strong> verifica que <code>--chat-template</code> esté puesto (la auto-detección a veces falla en variantes fine-tuned). Sugerido: <code>{name}</code>.",
    "template.status.empty_id":    "⚠ Introduce un model id (ej. mistralai/Mistral-7B-Instruct-v0.3).",
    "template.status.fetching":    "⏳ Obteniendo tokenizer_config.json para {modelId}...",
    "template.status.success":     "✅ Detectado {modelId} (veredicto: {verdict})",
    "template.status.empty_paste": "⚠ Pega un tokenizer_config.json primero.",
    "template.status.invalid_json":"❌ JSON inválido: {error}",
    "template.status.success_paste":"✅ Config pegado detectado (veredicto: {verdict})",
    "template.pasted_label":       "(tokenizer_config pegado)",

    // v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor
    "modes.arena":                 "🎯 Arena CI",
    "mode_desc.arena":             "Recupera intervalos de confianza desde datos crudos de votos pairwise (MLE Bradley-Terry + bootstrap). Detecta pares estadísticamente empatados que el leaderboard público de Arena oculta.",
    "arena.title":                 "🎯 Reconstructor Arena-Elo CI",
    "arena.tip":                   "Chatbot Arena oculta los intervalos de confianza en el leaderboard público. Una diferencia de 5 Elo puede ser estadísticamente irrelevante. Pega datos crudos de votos (model_a, model_b, winner) — la herramienta calcula MLE Bradley-Terry + bootstrap CIs y lista los empates estadísticos (overlap de CI).",
    "arena.desc":                  "<strong>¿GPT-4 es realmente mejor que Claude — o están empatados?</strong> Pega CSV de votos pairwise (o click <em>Cargar sample</em>). MLE Bradley-Terry + 200 iteraciones de bootstrap → Elos ranked con CIs 95% y detección de empates estadísticos. Todo en el navegador.",
    "arena.sample_btn":            "📊 Cargar datos sample",
    "arena.run_btn":                "🎯 Calcular CIs",
    "arena.clear_btn":             "🗑️ Limpiar",
    "arena.csv_summary":           "CSV de votos (header: <code>model_a,model_b,winner</code>; winner ∈ a/b/tie)",
    "arena.section.ranked":        "Elos ranked con CIs 95%",
    "arena.section.ties":          "Empates estadísticos (overlap CI)",
    "arena.section.summary":       "Resumen",
    "arena.col.rank":              "#",
    "arena.col.model":             "Modelo",
    "arena.col.elo":               "Elo",
    "arena.col.ci":                "CI 95%",
    "arena.col.ci_width":          "± semi-anchura",
    "arena.col.matches":           "Partidas",
    "arena.col.wins":              "V / D / E",
    "arena.col.tie_pair":          "Par",
    "arena.col.tie_diff":          "Brecha Elo",
    "arena.col.tie_overlap":       "Overlap CI",
    "arena.no_ties":               "Sin empates estadísticos — todos los pares distinguibles al CI 95%.",
    "arena.summary.votes":         "Votos totales",
    "arena.summary.models":        "Modelos",
    "arena.summary.ties":          "Empates estadísticos",
    "arena.summary.bootstrap":     "Iteraciones bootstrap",
    "arena.summary.ci_level":      "Nivel CI",
    "arena.status.empty":          "⚠ Pega un CSV de votos o click en Cargar sample.",
    "arena.status.too_few":        "⚠ Solo {n} votos válidos — se necesitan al menos 10 para ajustar Bradley-Terry de forma fiable.",
    "arena.status.computing":      "⏳ Calculando MLE Bradley-Terry + bootstrap sobre {n} votos...",
    "arena.status.done":           "✅ {n} votos · {models} modelos · {ties} empates estadísticos · {ms} ms",
    "arena.status.sample_loaded":  "✅ Sample cargado (datos sintéticos Arena de 6 modelos). Click en Calcular CIs.",

    // v0.7.3 — anti-bullshit pack #4: Contamination Prior
    "modes.contam":                "🧪 Contaminación",
    "mode_desc.contam":            "Prior bayesiano-ish sobre si un score de benchmark está contaminado. Introduce la fecha de cutoff de entrenamiento → puntúa 20+ benchmarks populares (MMLU, GSM8K, HumanEval, MMLU-Pro…).",
    "contam.title":                "🧪 Prior de Contaminación",
    "contam.tip":                  "Calcula un prior bayesiano-ish sobre si un score de benchmark está contaminado, basado en (fecha de cutoff de entrenamiento) × (fecha de release del benchmark) × (inclusión conocida en corpus + historial de leaks). Open LLM Leaderboard v1 fue cancelado en 2024 tras la contaminación de MMLU/HellaSwag.",
    "contam.desc":                 "<strong>¿Deberías confiar en el MMLU de tu modelo?</strong> Introduce la fecha cutoff de entrenamiento — la herramienta puntúa 20+ benchmarks populares (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) y te dice qué scores son probablemente contaminados.",
    "contam.cutoff_label":         "Cutoff entrenamiento:",
    "contam.run_btn":              "🧪 Puntuar todos los benchmarks",
    "contam.section.ranked":       "Priors de contaminación por benchmark",
    "contam.section.high":         "🔴 Benchmarks de alto riesgo (trata los scores como no fiables)",
    "contam.section.medium":       "🟡 Riesgo medio (verifica con alternativas)",
    "contam.section.low":          "🟢 Bajo riesgo (probablemente limpios)",
    "contam.col.benchmark":        "Benchmark",
    "contam.col.released":         "Release",
    "contam.col.gap":              "Gap (meses)",
    "contam.col.prior":            "P(contam)",
    "contam.col.level":            "Nivel",
    "contam.col.corpora":          "En corpus",
    "contam.col.category":         "Categoría",
    "contam.label.high":           "Alto riesgo",
    "contam.label.medium":         "Medio",
    "contam.label.low":            "Bajo",
    "contam.no_entries":           "(ninguno en esta categoría)",
    "contam.advice.high":          "Trata estos scores como no fiables. Sustituye por alternativas más recientes / con test privado (MMLU-Pro, GPQA, MUSR, MATH-500).",
    "contam.advice.medium":        "Toma con cautela. Busca replicación sobre subset held-out o reproducciones comunitarias.",
    "contam.advice.low":           "Score probablemente no contaminado, pero ausencia de leak no es prueba — verifica también con test alternativo.",
    "contam.summary.headline":     "Cutoff <code>{cutoff}</code> · {n} benchmarks puntuados",
    "contam.status.empty":         "⚠ Introduce una fecha cutoff de entrenamiento (ej. 2023-12).",
    "contam.status.bad_date":      "⚠ Formato de fecha incorrecto. Usa YYYY-MM o YYYY-MM-DD.",
    "contam.status.done":          "✅ Cutoff {cutoff} · {n} benchmarks puntuados · {high} de alto riesgo",

    // v0.7 — Help modal section
    "help.v07.title":              "🆕 v0.7 — Pack anti-bullshit (4 modos nuevos)",
    "help.v07.intro":              "<em>v0.7 (2026-05-06): cuatro modos nuevos que resuelven problemas concretos reportados por la comunidad HuggingFace. Cada uno corre en tu navegador sin inferencia — pura metadata + matemáticas.</em>",
    "help.v07.unmask.title":       "🪟 Desenmascarador de Contexto",
    "help.v07.unmask.body":        "Detecta cuándo <code>max_position_embeddings</code> es engañoso. Mistral-7B-v0.1 declara 32k pero atiende dentro de ~4-8k vía SWA. Pega un id HF → veredicto en 1 segundo (HONESTO / INFLADO / GRAVEMENTE INFLADO / YARN-EXTENDIDO). Pilla SWA, RoPE-scaling (YaRN/linear/dynamic NTK), d_head pequeño + GQA. <em>Caso de uso</em>: antes de pagar GPU para 32k de contexto, verifica que el modelo realmente atiende tan lejos.",
    "help.v07.template.title":     "📜 Detector de Chat-template",
    "help.v07.template.body":      "Detecta qué familia de chat-template usa un modelo (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) y te da el flag CLI exacto para lm-evaluation-harness, vLLM, y transformers. Resuelve el issue #1841 de lm-eval-harness: olvidar <code>--apply_chat_template</code> divide la accuracy multi-turn por 2 silenciosamente. <em>Caso de uso</em>: antes de reportar un score, confirma que aplicaste el template correctamente.",
    "help.v07.arena.title":        "🎯 Reconstructor Arena-Elo CI",
    "help.v07.arena.body":         "Chatbot Arena oculta los intervalos de confianza en su leaderboard público — una diferencia de 5 Elo puede ser estadísticamente irrelevante. Pega datos crudos de votos pairwise (model_a, model_b, winner) → MLE Bradley-Terry + bootstrap de 200 iteraciones → Elos ranked con CIs 95% y un panel de \"empates estadísticos\" listando pares cuyos CIs se solapan. Prueba el botón Cargar sample. <em>Caso de uso</em>: antes de afirmar \"modelo A vence a modelo B\", verifica que sus CIs no se solapen.",
    "help.v07.contam.title":       "🧪 Prior de Contaminación",
    "help.v07.contam.body":        "Prior bayesiano-ish sobre si un score de benchmark está contaminado. Introduce la fecha cutoff de entrenamiento de tu modelo → la herramienta puntúa 20+ benchmarks populares (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) por P(contaminación) según gap temporal, inclusión en corpus y historial de leaks conocidos. Open LLM Leaderboard v1 fue cancelado en 2024 tras la contaminación de MMLU/HellaSwag. <em>Caso de uso</em>: decide qué scores te puedes creer al comparar dos modelos.",
    "help.v07.quant.title":        "⚖️ Clasificador de régimen de cuantización",
    "help.v07.quant.body":         "Predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización: NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8…). Arch-aware: d_head pequeño + GQA agresivo → más sensible; los esquemas calibrados (AWQ) absorben mejor el shift que los no calibrados (NF4). Recomienda alternativas más seguras si detecta cliff. <em>Caso de uso</em>: antes de cuantizar, predice si tu combo arquitectura × esquema mantendrá la PPL aceptable, con sugerencia concreta de switch si no.",
    "help.v07.drift.title":        "🔀 Cota de drift entre frameworks",
    "help.v07.drift.body":         "Mismo modelo, scores distintos en setups distintos. La herramienta predice el drift máximo admisible solo por ruido numérico (dtype, framework, batch). Si el gap observado lo excede → bug real, normalmente chat-template mismatch (issue #1841 de lm-eval-harness) o layout de KV-cache. Prueba el botón &quot;Cargar sample&quot; para el bug canónico de chat-template. <em>Caso de uso</em>: antes de reportar una regresión o reclamar reproducibilidad, verifica si el gap entre dos evals es mayor de lo que el ruido numérico puede explicar.",
    "inv.v07.drift":               "<strong>🔀 Drift</strong> — ¿bug o ruido? Predice el gap máximo admisible entre dos evals",
    "help.v07.niah.title":         "🔍 Gap NIAH → Reasoning",
    "help.v07.niah.body":          "El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH (retrieval de needle) pero fallan reasoning multi-hop al mismo contexto. La herramienta predice ambas tasas de pass desde la arquitectura (γ_Padé + d_horizon + presión arq: d_head pequeño, GQA, SWA), reporta el gap, y encuentra el \"contexto seguro de reasoning\" donde reasoning se mantiene ≥65%. Modo barrido muestra la curva a 1k/4k/16k/64k/T_train. <em>Caso de uso</em>: antes de desplegar al contexto declarado, descubre si el modelo realmente razonará ahí o solo encontrará.",
    "inv.v07.niah":                "<strong>🔍 NIAH→Reason</strong> — ¿tu \"128k\" realmente razona ahí, o solo encuentra?",

    // v0.7 — Inventory modal, 5th card
    "inv.v07.title":               "🆕 Pack anti-bullshit v0.7",
    "inv.v07.unmask":              "<strong>🪟 Unmask</strong> — ¿config.json declara 32k? Mira si de verdad atiende tan lejos",
    "inv.v07.template":            "<strong>📜 Chat-template</strong> — flag CLI exacto para que lm-eval no divida tu accuracy entre 2 silenciosamente",
    "inv.v07.arena":               "<strong>🎯 Arena CI</strong> — recupera los intervalos de confianza que Chatbot Arena oculta",
    "inv.v07.contam":              "<strong>🧪 Contaminación</strong> — puntúa 20+ benchmarks por probabilidad de contaminación",
    "inv.v07.quant":               "<strong>⚖️ Quant</strong> — predice γ-shift + ΔPPL para cualquier combo (modelo × esquema de cuantización)",

    // v0.7.3 — anti-bullshit pack #5: Quant-regime classifier
    "modes.quant":                 "⚖️ Quant",
    "mode_desc.quant":             "Predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización). Arch-aware: d_head pequeño + GQA → más sensible. Recomienda alternativas más seguras si detecta cliff.",
    "quant.title":                 "⚖️ Clasificador de régimen de cuantización",
    "quant.tip":                   "Predice γ-shift (y la ΔPPL resultante) para un par (modelo × esquema). Claims genéricos como 'AWQ ~95% retención' son demasiado vagos — TAF usa d_head, ratio GQA, flag SWA y tamaño del modelo para dar veredicto arquitectura-específico. Resuelve: la comunidad HF reporta cliffs de cuantización impredecibles (NF4 -2 PPL en Phi-3 pero bien en Llama-3-8B).",
    "quant.desc":                  "<strong>¿Cuantizar romperá tu modelo?</strong> Pega un id HF, elige esquema de cuantización — obtén γ-shift predicho, banda ΔPPL esperada y alternativa recomendada si es un cliff. Solo navegador, sin GPU, sin set de calibración.",
    "quant.id_label":              "ID modelo HF:",
    "quant.fetch_btn":             "📥 Fetch config",
    "quant.scheme_label":          "Esquema cuant:",
    "quant.run_btn":                "⚖️ Predecir",
    "quant.all_btn":               "📊 Comparar todos los esquemas",
    "quant.regime.safe":           "✅ SEGURO",
    "quant.regime.mild":           "✅ COMPRESIÓN LEVE",
    "quant.regime.significant":    "⚠ DEGRADACIÓN SIGNIFICATIVA",
    "quant.regime.cliff":          "❌ CLIFF FUERTE",
    "quant.label.gamma_shift":     "γ shift",
    "quant.label.delta_ppl":       "ΔPPL (est.)",
    "quant.label.arch_mult":       "Multiplicador arch",
    "quant.section.breakdown":     "Desglose",
    "quant.section.reco":          "Recomendación",
    "quant.section.compare":       "Todos los esquemas (ordenados por seguridad)",
    "quant.field.scheme":          "Esquema",
    "quant.field.calibrated":      "calibrado",
    "quant.field.uncalibrated":    "no calibrado",
    "quant.field.base_penalty":    "Penalización base",
    "quant.field.arch_mult_full":  "Multiplicador arquitectónico",
    "quant.field.gamma_shift":     "γ shift predicho",
    "quant.field.ppl_band":        "Banda ΔPPL (est.)",
    "quant.field.params":          "Parámetros",
    "quant.col.scheme":            "Esquema",
    "quant.col.bits":              "Bits",
    "quant.col.gamma_shift":       "γ shift",
    "quant.col.ppl_band":          "Banda ΔPPL",
    "quant.col.regime":            "Régimen",
    "quant.reco.switch_to_awq":    "<strong>Cambia a {scheme}</strong> — el 4-bit calibrado maneja d_head pequeño + GQA mucho mejor que NF4. ΔPPL esperada cae ~2-3×.",
    "quant.reco.switch_to_q5_km":  "<strong>Cambia a {scheme}</strong> — Q5 mantiene más dimensiones de head intactas a bajo coste (solo ~25% más grande).",
    "quant.reco.switch_to_q4_km":  "<strong>Cambia a {scheme}</strong> — Q3/Q2 son demasiado agresivos para esta arquitectura.",
    "quant.reco.consider_awq":     "<strong>Considera {scheme}</strong> — la calibración reduce γ-shift significativamente en esta arquitectura.",
    "quant.reco.use_higher_bits":  "<strong>Usa alternativa de mayor bit</strong> — esta arquitectura no absorbe 4-bit limpiamente. Prueba 5 u 8-bit.",
    "quant.reco.verify_with_eval": "<strong>Verifica con eval real</strong> — el shift predicho está en el límite. Corre NIAH a tu contexto objetivo antes de desplegar.",
    "quant.reco.no_action":        "No requiere acción — la cuantización es segura para esta arquitectura.",
    "quant.summary.headline_all":  "Todos los esquemas para <code>{modelId}</code>",
    "quant.status.empty_id":       "⚠ Introduce un model id (ej. meta-llama/Llama-3.2-1B).",
    "quant.status.fetching":       "⏳ Obteniendo config.json para {modelId}...",
    "quant.status.fetched":        "✅ Config obtenido para {modelId}. Elige un esquema y click Predecir (o Comparar todos).",
    "quant.status.no_scheme":      "⚠ Elige un esquema de cuantización del dropdown.",
    "quant.status.done":           "✅ Régimen predicho: {regime}",
    "quant.status.done_all":       "✅ Comparados {n} esquemas — ordenados por seguridad.",

    // v0.7.4 — autocomplete HF Hub: privacy + rate-limit
    "hf_auto.privacy":             "🔒 Queries enviadas a huggingface.co/api · caché local 5 min",
    "hf_auto.rate_limited":        "⚠ Rate limit de HuggingFace — espera un momento, o teclea el id completo manualmente",
    "hf_auto.gated_msg":           "es gated. Acepta la licencia aquí:",

    // v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound
    "modes.drift":                 "🔀 Drift",
    "mode_desc.drift":             "Predice el drift máximo permitido entre dos scores de benchmark dados (framework, dtype, batch, chat-template). Distingue bugs reales de ruido numérico.",
    "drift.title":                 "🔀 Cota de drift entre frameworks",
    "drift.tip":                   "Mismo modelo, scores distintos en setups distintos. ¿La diferencia es ruido o un bug real? Introduce dos scores con su (framework, dtype, batch, chat-template) — la herramienta predice el drift máximo permitido por ruido numérico solo. Si el gap observado lo excede → bug real, normalmente chat-template mismatch (issue #1841 de lm-eval) o layout de KV-cache.",
    "drift.desc":                  "<strong>Tu modelo da 67.2 en lm-eval-hf y 65.1 en vLLM-served. ¿Bug o ruido?</strong> Introduce ambos scores con (framework, dtype, batch, ¿chat-template aplicado?). La herramienta predice la banda de ruido y flagea bugs reales. arxiv 2506.09501 documenta esto como problema mayor de reproducibilidad de evals.",
    "drift.setup_a":               "Setup A",
    "drift.setup_b":               "Setup B",
    "drift.score":                 "Score",
    "drift.framework":             "Framework",
    "drift.dtype":                 "Dtype",
    "drift.batch":                 "Batch",
    "drift.template":              "Chat-template",
    "drift.template.applied":      "aplicado",
    "drift.template.not_applied":  "no aplicado",
    "drift.template.unknown":      "desconocido",
    "drift.run_btn":               "🔀 Calcular cota de drift",
    "drift.sample_btn":            "📊 Cargar sample (bug de chat-template)",
    "drift.label.observed":        "Gap observado",
    "drift.label.band":            "Banda numérica",
    "drift.label.ratio":           "Gap / banda",
    "drift.section.setups":        "Setups",
    "drift.section.breakdown":     "Contribuyentes al drift (banda numérica)",
    "drift.section.verdict":       "Veredicto y recomendación",
    "drift.contrib.dtype":         "Mismatch de dtype",
    "drift.contrib.framework":     "Framework",
    "drift.contrib.batch":         "Diferencia de batch",
    "drift.contrib.template":      "MISMATCH de chat-template",
    "drift.dominant_cause":        "Causa dominante",
    "drift.cause.dtype":           "diferencia de precisión dtype",
    "drift.cause.framework":       "diferencia de framework / kernel",
    "drift.cause.batch":           "paths de normalización por batch",
    "drift.cause.template_mismatch": "chat-template aplicado en un lado pero no en el otro (patrón #1841 de lm-eval-harness — típico -50% en multi-turn)",
    "drift.verdict.noise":         "✅ RUIDO NUMÉRICO",
    "drift.verdict.suspicious":    "⚠ SOSPECHOSO — verifica",
    "drift.verdict.bug":           "❌ BUG REAL — investiga",
    "drift.verdict.bug_template":  "❌ BUG DE CHAT-TEMPLATE",
    "drift.reco.noise":            "El gap encaja en la banda esperada de ruido numérico. No requiere acción; la diferencia es consistente con variación de framework/dtype/batch sola.",
    "drift.reco.suspicious":       "El gap es 1–2× la banda predicha. Borderline — posible bug real. Intenta alinear el contribuyente dominante (ej. iguala framework o dtype) y re-testea.",
    "drift.reco.bug":              "El gap es &gt; 2× la banda predicha. Es un bug real. Inspecciona el contribuyente dominante — probablemente diferencia de tokenizer / chat-template / layout de KV-cache. Corre lm-eval-harness con <code>--apply_chat_template</code> y confirma.",
    "drift.reco.bug_template":     "Mismatch de chat-template detectado. Es la causa más común de gaps grandes en evals (issue #1841 de lm-eval-harness). Re-corre el lado &quot;no aplicado&quot; con <code>--apply_chat_template</code> (o pon vLLM <code>--chat-template &lt;name&gt;</code>) y re-testea.",
    "drift.status.empty_scores":   "⚠ Introduce ambos scores.",
    "drift.status.done":           "✅ Veredicto: {verdict}",
    "drift.status.sample_loaded":  "✅ Sample cargado (bug canónico de chat-template). Click en Calcular cota de drift.",

    // v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor
    "modes.niah":                  "🔍 NIAH→Reason",
    "mode_desc.niah":              "Predice tasas de pass de NIAH (retrieval) y reasoning multi-hop a cualquier contexto. Resuelve: modelos long-context pasan NIAH pero fallan reasoning al mismo contexto (paper RULER).",
    "modes.saturation":            "📈 Saturación",
    "mode_desc.saturation":        "Te dice si un benchmark sigue discriminando frontier models o ya está saturado (ej. MMLU 88-94% top, AIME 2025 ya 96-100%). Devuelve top-3 + veredicto + reemplazos recomendados.",
    "modes.hub":                   "🧭 Soluciones",
    "mode_desc.hub":               "Mapa de cada problema documentado de LLM-eval → mode tafagent (si cubierto) + herramientas externas curadas. Encuentra la solución sin reinventarla. 30+ pains, 7 categorías.",
    "niah.title":                  "🔍 Gap NIAH → Reasoning",
    "niah.tip":                    "NIAH (Needle in a Haystack) testea retrieval: 'encuentra este hecho en texto largo'. Reasoning multi-hop testea inferencia: 'combina hechos X+Y del principio con hecho Z del final'. El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH pero fallan reasoning al mismo contexto. Esta herramienta predice ambas tasas desde la arquitectura sola.",
    "niah.desc":                   "<strong>Tu modelo dice 128k de contexto. ¿Razonará realmente a 64k, o solo encontrará?</strong> Pega un model id HF y un contexto objetivo — la herramienta predice tasas de pass NIAH y reasoning multi-hop, el gap, y un 'contexto seguro' donde reasoning se mantiene ≥65%.",
    "niah.id_label":               "ID modelo HF:",
    "niah.fetch_btn":              "📥 Fetch config",
    "niah.teval_label":            "Contexto objetivo (T_eval):",
    "niah.run_btn":                "🔍 Predecir",
    "niah.sweep_btn":              "📊 Barrer contextos",
    "niah.label.niah":             "Tasa pass NIAH",
    "niah.label.reasoning":        "Tasa pass Reasoning",
    "niah.label.gap":              "Gap",
    "niah.label.safe_ctx":         "Contexto seguro de reasoning",
    "niah.section.breakdown":      "Desglose arquitectónico",
    "niah.section.reco":           "Recomendación",
    "niah.section.sweep":          "Barrido de tasas pass por longitud de contexto",
    "niah.field.dhorizon":         "d_horizon (efectivo)",
    "niah.field.ratio":            "T_eval / d_horizon",
    "niah.field.arch_pressure":    "Presión arq (d_head pequeño + GQA + SWA)",
    "niah.field.theta":            "RoPE θ",
    "niah.field.t_train":          "T_train (declarado)",
    "niah.col.context":            "T_eval",
    "niah.col.niah":               "NIAH",
    "niah.col.reasoning":          "Reasoning",
    "niah.col.gap":                "Gap",
    "niah.col.verdict":            "Veredicto",
    "niah.verdict.robust":         "✅ ROBUSTO",
    "niah.verdict.marginal":       "⚠ MARGINAL",
    "niah.verdict.degraded":       "⚠ DEGRADADO",
    "niah.verdict.retrieval_only": "❌ SOLO RETRIEVAL",
    "niah.verdict.broken":         "❌ ROTO",
    "niah.reco.robust":            "Tanto retrieval como reasoning aguantan a este contexto. Seguro para desplegar tareas de lookup e inferencia.",
    "niah.reco.marginal":          "Borderline. Retrieval funciona pero reasoning está flojo. Úsalo para lookup, no para inferencia multi-paso.",
    "niah.reco.degraded":          "Caída significativa de reasoning. El modelo encuentra hechos pero le cuesta combinarlos. Evita tareas multi-hop a esta longitud.",
    "niah.reco.retrieval_only":    "Hallazgo canónico de RULER: el modelo pasa NIAH pero falla reasoning. Útil para setups RAG (donde el LLM solo localiza hechos) pero NO para inferencia encadenada. Reduce tu contexto al valor 'seguro' de abajo.",
    "niah.reco.broken":            "El modelo falla incluso retrieval básico a este contexto. Trátalo como out-of-distribution — re-testea a contexto más corto.",
    "niah.safe_context":           "≤ {ctx} tokens (reasoning ≥ 65%)",
    "niah.safe_context_none":      "No se encontró contexto seguro bajo tu objetivo — el modelo falla reasoning incluso a contextos pequeños.",
    "niah.summary.sweep":          "<code>{modelId}</code> — tasas pass por contexto",
    "niah.status.empty_id":        "⚠ Introduce un model id (ej. meta-llama/Llama-3.1-8B-Instruct).",
    "niah.status.bad_teval":       "⚠ Introduce un contexto objetivo (≥ 512 tokens).",
    "niah.status.fetching":        "⏳ Obteniendo config.json para {modelId}...",
    "niah.status.fetched":        "✅ Config obtenido para {modelId}. Pon T_eval y click Predecir (o Barrer contextos).",
    "niah.status.done":            "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%",
    "niah.status.sweep_done":      "✅ Barridos {n} largos de contexto.",
    "saturation.title":            "📈 Detector de saturación de benchmarks",
    "saturation.tip":              "MMLU está saturado (88-94% en todos los frontier). Reportar '92% en MMLU' ya no significa nada. Esta herramienta te dice qué benchmarks aún discriminan frontier models, cuáles están saturados, y qué usar en su lugar. Datos: DemandSphere AI Frontier Tracker (CC BY-NC 4.0) refrescado 2026-05.",
    "saturation.desc":             "<strong>¿Sigue siendo útil tu benchmark?</strong> Elige un benchmark para ver top-3 frontier scores, spread, y un veredicto (saturated / near-saturated / discriminative) + reemplazos recomendados.",
    "saturation.select_label":     "Benchmark:",
    "saturation.select.all":       "— mostrar todos los benchmarks —",
    "saturation.run_btn":          "📈 Clasificar",
    "saturation.all_btn":          "📊 Mostrar todos",
    "saturation.col.spread":       "Spread top-3",
    "saturation.col.mean":         "Media top-3",
    "saturation.col.n":            "Modelos",
    "saturation.col.bench":        "Benchmark",
    "saturation.col.verdict":      "Veredicto",
    "saturation.col.reco":         "Mejor reco",
    "saturation.col.model":        "Modelo",
    "saturation.col.score":        "Score",
    "saturation.section.top3":     "Top-3 frontier scores",
    "saturation.section.recommendations": "Alternativas recomendadas",
    "saturation.section.note":     "Notas",
    "saturation.section.all":      "Todos los benchmarks rastreados",
    "saturation.verdict.saturated":      "🚨 SATURADO",
    "saturation.verdict.near_saturated": "⚠ CASI SATURADO",
    "saturation.verdict.discriminative": "✅ DISCRIMINATIVO",
    "saturation.verdict.sparse_data":    "ℹ DATOS ESCASOS",
    "saturation.borderline":       "Borderline — dentro de ±1pp de un umbral. Trata el veredicto como 'verifica con cuidado'.",
    "saturation.unknown":          "Benchmark desconocido.",
    "saturation.attribution":      "Datos: DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (histórico open-weight) · último fetch 2026-05-05.",
    "saturation.status.live":      "✅ Datos en vivo cargados — {count} modelos.",
    "saturation.status.baked":     "ℹ Usando snapshot baked (fetch en vivo no disponible).",
    "saturation.status.kb_fail":   "⚠ No se pudo cargar el KB de saturación.",
    "saturation.status.done":      "✅ {name} — {verdict}",
    "saturation.status.all_done":  "✅ Clasificados {n} benchmarks.",
    "help.v08.saturation.title":   "📈 Detector de saturación de benchmarks",
    "help.v08.saturation.body":    "MMLU está saturado (top 88-94%), AIME 2025 saturó a los pocos meses de salir, HumanEval near-saturated. Elige cualquier benchmark y la herramienta retorna top-3 frontier scores, spread, media, y un veredicto — saturated / near-saturated / discriminative — más un reemplazo recomendado (ej. MMLU → MMLU-Pro / GPQA / HLE). Fetch en vivo desde DemandSphere AI Frontier Tracker (CC BY-NC 4.0) cuando llega; snapshot baked 2026-05-05 cuando no. <em>Caso de uso</em>: antes de citar '92% en MMLU' o diseñar una eval, verifica si el benchmark aún discrimina algo.",
    "inv.v08.saturation":          "<strong>📈 Saturation</strong> — ¿sigue siendo útil tu benchmark, o están todos los frontiers empatados arriba?",
    "inv.v081.hub":                "<strong>🧭 Solutions Hub</strong> — cada pain documentado mapeado a un mode tafagent o herramienta externa curada. No reinventes — encuentra.",
    "help.v081.hub.title":         "🧭 Solutions Hub",
    "help.v081.hub.body":          "tafagent como integrador, no silo. 30+ pains en 7 categorías (eval reliability · diagnósticos · setup · training · retrieval · multimodal · observability), cada uno mapeado a (a) el mode tafagent que lo resuelve, si existe, y (b) las herramientas externas best-of-breed que la comunidad ya usa (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Caja de búsqueda matchea pain, scenario, y nombre de herramienta. <em>Caso de uso</em>: 'tengo problema X — ¿lo resuelve tafagent, y si no, quién?'",
    "hub.title":                   "🧭 Solutions Hub",
    "hub.tip":                     "Mapa de cada pain de LLM-eval documentado: qué mode tafagent lo resuelve (si alguno), y las herramientas externas best-of-breed que la comunidad ya usa. Objetivo: cobertura total. Si la herramienta canónica existe en otra parte, enlazamos en vez de rebuildear.",
    "hub.desc":                    "<strong>No reinventes — encuentra.</strong> 30+ pains mapeados a modes tafagent + herramientas externas curadas. Navega por categoría, busca por keyword, o ve los huecos donde nuevos modes ayudarían más.",
    "hub.clear_btn":               "✕ Limpiar",
    "hub.no_mode":                 "externo",
    "hub.planned":                 "planeado:",
    "hub.best_for":                "Mejor para",
    "hub.not_for":                 "No para",
    "hub.tools":                   "Herramientas externas",
    "hub.status.loaded":           "✅ Cargados {total} pains en {categories} categorías — {covered} cubiertos por modes tafagent, {externalLinks} enlaces externos curados. Compilado {compiled}.",
    "hub.status.fail":             "⚠ No se pudo cargar Solutions Hub.",
    "hub.search.empty":            "Sin coincidencias para '{query}'. Prueba términos más amplios (ej. 'eval', 'rag', 'tokenizer').",
    "hub.search.results":          "Encontradas {n} coincidencia(s) para '{query}'.",

    // v0.7.7 — Task tiles (UX restructure: 14 modes grouped by intent)
    "tiles.title":                 "🎯 ¿Qué quieres hacer?",
    "tiles.subtitle":              "Elige una tarea. Cada una abre la herramienta adecuada debajo. O baja para la lista completa de 14 modos.",
    "tile.diagnose.title":         "🔬 Diagnosticar un modelo",
    "tile.diagnose.desc":          "¿Servirá este modelo concreto para mi caso de uso?",
    "tile.trust.title":            "✓ Confiar en un score de benchmark",
    "tile.trust.desc":             "¿Me creo este número? ¿Es bug o ruido?",
    "tile.eval.title":              "⚙️ Configurar bien una eval",
    "tile.eval.desc":              "Obtén el flag CLI exacto para lm-eval / vLLM / transformers.",
    "tile.compare.title":          "🆚 Comparar modelos",
    "tile.compare.desc":           "Lado a lado, o explora el panel empírico de modelos.",
    "tile.manual.title":           "📋 Manual / libre",
    "tile.manual.desc":            "Elige una receta concreta a mano, o pregunta en inglés llano.",
    "tile.diagnose.tip":           "Empieza aquí cuando tengas un id de modelo concreto y quieras diagnóstico completo: <strong>Profile</strong> corre las 5 recetas a la vez. <strong>Unmask</strong> comprueba si max_position_embeddings es honesto. <strong>NIAH→Reason</strong> predice el gap retrieval-vs-reasoning. <strong>Quant</strong> predice si cuantizar lo romperá. <strong>Inspect</strong> permite pegar config.json crudo para modelos privados / en desarrollo.",
    "tile.trust.tip":              "Cuando ves un score y quieres saber si es real. <strong>Contamination</strong> puntúa 20+ benchmarks por probabilidad de que el modelo los viera en entrenamiento. <strong>Drift</strong> te dice si el gap entre dos evals es ruido numérico o bug real (chat-template mismatch, layout KV-cache, etc.). <strong>Arena CI</strong> reconstruye los intervalos de confianza que Chatbot Arena oculta — muchas &quot;victorias&quot; top-Elo están estadísticamente empatadas.",
    "tile.eval.tip":               "Antes de correr lm-eval-harness o vLLM serve, obtén el flag CLI correcto. <strong>Chat-template Sniffer</strong> detecta la familia de template (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) y emite la invocación exacta <code>--apply_chat_template</code> / <code>--chat-template</code>. Resuelve el issue #1841 de lm-eval-harness (÷2 accuracy silencioso). <strong>Diagnose CLI</strong> genera el comando Python para medir γ_obs en tu GPU local.",
    "tile.compare.tip":            "<strong>Compare</strong>: elige 2-3 modelos candidatos + una receta, ve veredictos en tabla lado a lado (ej. Llama-3-8B vs Mistral-7B a 32k). <strong>Phase diagram</strong>: scatter de 23 modelos empíricos en el plano (log θ, γ), con la curva Padé superpuesta. Hover puntos para detalles, click para cargar ese modelo en la Recipe form.",
    "tile.manual.tip":             "<strong>Recipe</strong>: elige una receta X-N específica (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 compresión KV, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) y rellena la form a mano para control total. <strong>Ask</strong>: escribe una pregunta libre; un LLM 0.5B (Qwen2.5) en tu navegador elige la receta correcta y la ejecuta. Ideal para exploración &quot;qué pasaría si...&quot;.",
    "share.import_desc":       "¿Tienes un fichero JSON del análisis TAF de alguien? Cárgalo aquí para ver el veredicto + cadena localmente. La misma vista que si lo hubieras ejecutado tú.",
    "share.import_btn":        "📂 Cargar JSON compartido",
    "synthesis.system":        "Eres un asistente de diagnóstico preciso para LLMs transformer. Dados resultados de fórmulas TAF pre-calculados, escribe un resumen claro en español de 4-6 frases. Cita el número de sección (§X.Y) para cada número que menciones. Da siempre una recomendación concreta. NO inventes números.",

    // INSPECTOR mode
    "inspector.title":         "🔍 Inspector de Arquitectura",
    "inspector.desc":          "Pega el contenido crudo de <code>config.json</code>. La herramienta extrae los parámetros arquitectónicos y ejecuta el Profile completo de 5 recetas.",
    "inspector.tip":           "<strong>Pega cualquier config.json directamente</strong>. La herramienta lo parsea y ejecuta el Profile completo. Útil para: modelos privados, configs en desarrollo, modelos aún no en HuggingFace, o comparar qué haría tu arquitectura custom.",
    "inspector.quickstart":    "💡 Caso de uso: tienes un modelo privado no en HF Hub, o una config que estás diseñando. Pega el JSON crudo abajo y obtén un perfil TAF completo.",
    "inspector.placeholder":   "{\n  \"model_type\": \"llama\",\n  \"rope_theta\": 500000,\n  \"max_position_embeddings\": 8192,\n  \"num_attention_heads\": 32,\n  \"num_key_value_heads\": 8,\n  \"hidden_size\": 4096,\n  \"num_hidden_layers\": 32\n}",
    "inspector.T_eval":        "T_eval (tu contexto objetivo):",
    "inspector.btn":           "🚀 Inspeccionar y perfilar",

    // WHAT-IF slider
    "whatif.title":            "🎚 What-if: arrastra T_eval para ver γ cambiar en vivo",
    "whatif.desc":             "Recálculo puro JS (sin llamada Pyodide). Muestra γ_Padé y d_horizon geométricos mientras deslizas. Click en el botón para re-ejecutar la cadena completa.",
    "whatif.T_eval":           "<strong>T_eval</strong>",
    "whatif.gamma_pade":       "<strong>γ_Padé</strong>",
    "whatif.d_horizon":        "<strong>d_horizon</strong>",
    "whatif.l_niah":           "<strong>Techo L_NIAH</strong>",
    "whatif.predicted":        "<strong>Veredicto geométrico predicho</strong>",
    "whatif.rerun":            "↻ Re-calcular cadena completa con este T_eval",

    // COMMUNITY feed
    "community.title":         "🌐 Envíos recientes de la comunidad",
    "community.desc":          "Feed en vivo del registry público. Click en cualquier envío para ver análisis completo.",
    "community.browse_all":    "Ver todo →",
    "community.loading":       "Cargando...",
    "community.no_repo":       "El repo del registry aún no está creado. Cuando exista con envíos, aparecerán aquí en vivo.",
    "community.no_submissions": "Sin envíos aún. Sé el primero — genera un Profile y click 📤 Enviar al registry.",

    // FALSIFICATION dashboard
    "falsification.title":     "🔬 Predicciones del paper — estado de falsificación",
    "falsification.desc":      "El framework TAF se basa en predicciones falsificables (F1-F23). Cada una está empíricamente testada. Aquí está el estado en vivo de cada predicción del paper.",
    "falsification.summary":   "{confirmed} confirmadas · {partial} parciales · {refuted} refutadas · {untested} sin testear (de {total} predicciones totales)",
    "falsification.col.id":    "ID",
    "falsification.col.claim": "Claim",
    "falsification.col.status": "Estado",
    "falsification.col.evidence": "Evidencia",

    "tafcard.title":           "📇 TAF Card — perfil completo del modelo",
    "tafcard.recipes_title":   "📋 Recetas — veredicto por dimensión",
    "tafcard.recipes_count_label": "dimensiones",
    "tafcard.numbers_title":   "🔢 Números clave (paper §26)",
    "tafcard.fals_title":      "🔬 Estado de falsificación (F1-F23)",
    "tafcard.fals_none":       "Sin falsificaciones aplicables.",
    "tafcard.diag_title":      "🔬 Diagnósticos — números · γ check · what-if",
    "tafcard.verify_title":    "✓ Verificación — Lean + Sage + falsificación",
    "tafcard.share_title":     "📂 Procedencia y compartir",
    "tafcard.whatif_title":    "🎚️ Explorador what-if",
    "verdict.go":              "ADELANTE",
    "verdict.no":              "NO",
    "verdict.degraded":        "DEGRADADO",

    "compare.title_out":       "🆚 Tabla comparativa",

    "status.loading_pyodide":  "⏳ Cargando runtime Python (~10MB, solo primera vez)...",
    "status.loading_taf":      "⏳ Cargando fórmulas TAF + recetas...",
    "status.ready":            "✅ Listo. Elige un modelo y click Perfilar para empezar.",
    "status.computing":        "🧮 Calculando cadena TAF...",
    "status.done":             "✅ Hecho.",

    "profile.hf_placeholder":  "ej. meta-llama/Meta-Llama-3-8B o Qwen/Qwen2.5-7B",
    "compare.hf_placeholder":  "ID modelo HF (ej. meta-llama/Meta-Llama-3-8B)",
    "compare.slot1_placeholder": "ID modelo HF (ej. meta-llama/Meta-Llama-3-8B)",
    "compare.slot2_placeholder": "ID modelo HF #2",
    "compare.slot3_placeholder": "ID modelo HF #3 (opcional)",
    "compare.preset_default": "— o preset —",

    // Parámetros del formulario
    "param.theta":         "θ (rope_theta)",
    "param.theta.tip":     "<strong>Frecuencia base RoPE</strong> de <code>config.rope_theta</code>. Mayor = más capacidad de largo alcance.",
    "param.T_train":       "T_train",
    "param.T_train.tip":   "<strong>Contexto máximo de entrenamiento</strong>. De <code>max_position_embeddings</code>. Más allá es extrapolación.",
    "param.T_eval":        "T_eval (tu objetivo)",
    "param.T_eval.tip":    "<strong>Tu contexto de inferencia objetivo</strong>. La pregunta clave: ¿se comportará bien el modelo a ESTA longitud?",
    "param.n_attn":        "n_attention_heads",
    "param.n_attn.tip":    "<strong>Número de attention heads</strong> por capa. De <code>num_attention_heads</code>.",
    "param.n_kv":          "n_kv_heads",
    "param.n_kv.tip":      "<strong>KV heads</strong>. Si &lt; n_attention_heads → GQA (Grouped Query Attention). Reduce memoria KV pero empuja γ hacia Hagedorn.",
    "param.d_head":        "head_dim",
    "param.d_head.tip":    "<strong>Dimensión por head</strong>. Típico 64, 96, 128. De <code>head_dim</code> o <code>hidden_size / num_attention_heads</code>.",
    "param.n_layers":      "n_layers",
    "param.n_layers.tip":  "<strong>Número de bloques transformer</strong>. De <code>num_hidden_layers</code>.",
    "param.n_params":      "n_params (ej. 8e9)",
    "param.n_params.tip":  "<strong>Número total de parámetros</strong>. Umbral ~400M para emergencia de induction heads. Afecta memoria KV y recipes de presupuesto.",
    "param.has_swa":       "¿Tiene SWA?",
    "param.has_swa.tip":   "<strong>Sliding Window Attention</strong>. <code>true</code> para Mistral, gemma-2, phi-3. El audit de calibración v0.5.3 desactivó la corrección histórica δ_SWA (ajuste n=1).",
    "common.yes":          "Sí",
    "common.no":           "No",

    // Tooltips de modos
    "modes.tip":           "<strong>Catorce formas de usar la herramienta</strong>.<br><strong>📇 Perfil</strong>: pega un id → TAF Card de 5 recetas.<br><strong>🆚 Comparar</strong>: 2-3 modelos lado a lado en una receta.<br><strong>🔍 Inspeccionar config</strong>: pega config.json crudo → Perfil completo.<br><strong>💬 Pregunta</strong>: pregunta libre, el LLM del navegador elige la receta.<br><strong>📋 Receta</strong>: selección manual con control total del formulario.<br><strong>🩺 Diagnóstico CLI</strong>: genera comando Python para medir γ localmente.<br><strong>📊 Diagrama de fase</strong>: panel de 23 modelos en plano (log θ, γ).<br><strong>🪟 Desenmascarar</strong>: detecta max_position_embeddings engañoso (SWA / YaRN / RoPE-scaling).<br><strong>📜 Chat-template</strong>: detecta familia + da el flag CLI exacto para lm-eval / vLLM / transformers.<br><strong>🎯 Arena CI</strong>: reconstruye intervalos de confianza desde votos pairwise crudos; detecta empates estadísticos que Arena oculta.<br><strong>🧪 Contaminación</strong>: puntúa 20+ benchmarks por probabilidad de contaminación según cutoff de entrenamiento vs fecha de release.<br><strong>⚖️ Quant</strong>: predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización); recomienda alternativa segura si hay cliff.<br><strong>🔀 Drift</strong>: mismo modelo, scores distintos en dos setups — ¿bug o ruido? Predice banda de ruido numérico y flagea bugs reales.<br><strong>🔍 NIAH→Reason</strong>: predice tasas pass NIAH y reasoning multi-hop desde arquitectura; encuentra el contexto seguro de reasoning.",
    "profile.tip":         "<strong>Diagnóstico completo en un click</strong>. Pega cualquier id de modelo HF (o elige preset). La herramienta ejecuta las 5 recetas (contexto largo, compresión KV, custom vs API, presupuesto, hardware) y produce una única <strong>TAF Card</strong> con veredicto por dimensión + números clave + clasificación arquitectónica.<br><br><strong>Caso de uso</strong>: \"Estoy evaluando Qwen2.5-32B para producción — ¿cuál es su perfil completo de viabilidad?\" → pega id → Perfilar → listo.",
    "compare.tip":         "<strong>Misma receta, múltiples modelos</strong>. Elige 2-3 modelos candidatos y una receta. Ve los veredictos en una única tabla comparativa.<br><br><strong>Caso de uso</strong>: \"Necesito recuperación de contexto largo a 16K — ¿cuál es mejor: Llama-3-8B, Mistral-7B o Qwen-7B?\" → elige 3 + X-2 + 16K → ve el ganador.",

    // Modal de ayuda
    "help.title":               "📘 TAF Agent — Manual de Usuario",
    "help.what.title":          "¿Qué hace?",
    "help.what.body":           "Predice la <strong>viabilidad práctica</strong> de cualquier LLM transformer <em>antes de gastar GPU/€</em>. Responde preguntas como \"¿funcionará este modelo a L=32K?\" o \"¿debería entrenar custom o usar API?\" usando fórmulas Python deterministas (TAF — Thermodynamic Attention Framework).",
    "help.modes.title":         "Cómo usar — 7 modos",
    "help.modes.profile":       "<strong>📇 Perfilar</strong>: pega id de modelo → todas las recetas a la vez = TAF Card. <strong>Mejor punto de inicio</strong>.",
    "help.modes.compare":       "<strong>🆚 Comparar</strong>: 2-3 modelos lado a lado en la misma receta. Mejor al elegir entre candidatos.",
    "help.modes.inspector":     "<strong>🔍 Inspeccionar config</strong>: pega <code>config.json</code> crudo → la herramienta lo parsea y ejecuta el Perfil completo. Para modelos privados, configs en desarrollo, o modelos aún no en HF Hub.",
    "help.modes.ask":           "<strong>💬 Pregunta libre</strong>: pregunta en lenguaje natural, el LLM del navegador elige la receta. Mejor para exploración casual.",
    "help.modes.recipe":        "<strong>📋 Receta + formulario</strong>: selección manual, control total de parámetros. Mejor cuando quieres control exacto.",
    "help.modes.diagnose":      "<strong>🩺 Diagnóstico CLI</strong>: genera comando Python para medir γ en tu máquina local (transformers + numpy). Rápido ≈5 min CPU; completo ≈20–60 min GPU. JSON resultado re-subible por Inspect.",
    "help.modes.phase":         "<strong>📊 Diagrama de fase</strong>: scatter de 23 modelos del panel en plano (log θ, γ). Línea Hagedorn γ=1 separa Fase A de Fase B. Click en un punto para cargar ese modelo en el formulario de Receta.",
    "help.recipes.title":       "Las 8 recetas disponibles",
    "help.recipe.x1.title":     "<strong>X-1 Entrenamiento custom vs API</strong> — compara coste de entrenar tu propio modelo vs pagar API.",
    "help.recipe.x1.example":   "Prueba: <em>\"¿Entrenar 8B custom o usar GPT-4o para 50M tokens/mes?\"</em><br>Respuestas: SÍ (custom) / NO (API) con meses para break-even.",
    "help.recipe.x2.title":     "<strong>X-2 Viabilidad contexto largo</strong> — predice si un modelo sirve longitud objetivo de manera fiable.",
    "help.recipe.x2.example":   "Prueba: <em>\"¿Meta-Llama-3-8B maneja 32000 tokens para retrieval?\"</em><br>Cadena: γ_Padé → descomposición → d_horizon → techo NIAH → alucinación → memoria KV.<br>Veredicto: SÍ / DEGRADADO / NO con mitigación si hace falta.",
    "help.recipe.x3.title":     "<strong>X-3 Pre-flight presupuesto</strong> — dado un presupuesto $, ¿qué modelo es viable entrenar?",
    "help.recipe.x3.example":   "Prueba: <em>\"Tengo $5000, ¿qué modelo puedo entrenar?\"</em><br>Respuesta: GO / TINY-MODEL / MEMORY-LIMITED con N (params) y D (tokens) concretos.",
    "help.recipe.x5.title":     "<strong>X-5 Selección hardware</strong> — ¿qué GPU usar para servir al throughput objetivo?",
    "help.recipe.x5.example":   "Prueba: <em>\"Hardware más barato para servir Llama-3-8B a 10M tokens/día\"</em><br>Respuesta: mejor GPU + $/Mtok + capacidad vs objetivo.",
    "help.recipe.x19.title":    "<strong>X-19 Decisión compresión KV</strong> — ¿usar soft decay, hard cutoff, o métodos de literatura?",
    "help.recipe.x21.title":    "<strong>X-21 Diagnóstico Pureza Imprint</strong> — predice γ sobre tokens RANDOM via ν=−1/(2π); ¿cuán limpia es la predicción RoPE del modelo?",
    "help.recipe.x22.title":    "<strong>X-22 Invariante Compute-Context</strong> — ¿γ × log(N²·D) está en banda 51.2 ± 16.8? Detecta anomalías de scaling/training.",
    "help.recipe.x23.title":    "<strong>X-23 Detector Fase IH</strong> — ¿pre- o post-induction-head? Probe barato via sign(γ_text − γ_random).",
    "help.recipe.x21.example":  "Prueba: <em>«¿Cuán limpia es la predicción RoPE en Llama-3-8B?»</em><br>Respuesta: γ_random predicho + diagnóstico (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).",
    "help.recipe.x22.example":  "Prueba: <em>«¿Mistral-7B entra en el invariante compute-context?»</em><br>Respuesta: K = γ·log(N²·D), z-score, IN-BAND u OUTLIER.",
    "help.recipe.x23.example":  "Prueba: <em>«¿Qwen2.5-7B es post-induction-head?»</em><br>Respuesta: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY (chequeo consistencia tamaño vs Δγ).",
    "help.section.v04":         "<strong>Novedades v0.4</strong> (hallazgos sesión 29 del 2026-04-28): tres recipes diagnósticas derivadas del análisis panel cross-model (n=22 LLMs).",
    "help.divider.v04_s29":     "— v0.4 (hallazgos sesión 29) —",
    "footer.tech_stack":        "Cómputo: Pyodide · Síntesis: WebLLM (Qwen2.5-0.5B local) · Hosting: GitHub Pages · Coste: $0",
    "help.v04.imprint":         "<strong>Slope imprint aprendido ν = −1/(2π)</strong>: el periodo de rotación RoPE 2π provoca un sesgo posicional en los pesos, proporcional a log(N_params). Incluso tokens random muestran este scaling. ν es DERIVADO — no ajustado (err empírico 0.3%).",
    "help.v04.invariant":       "<strong>Invariante Chinchilla-atención K</strong>: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Conecta compute scaling y exponente de atención en un solo número adimensional.",
    "help.v04.ih_probe":        "<strong>Δγ como probe IH</strong>: sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Más barato que correr un benchmark in-context-learning.",
    "help.v04.constants":       "<strong>γ-cluster en constantes famosas</strong> (intrigante, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (conjugado áureo, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat: podría ser coincidencia.",
    "help.recipe.x19.example":  "Prueba: <em>\"¿Cómo comprimir caché KV para Qwen2.5-7B a 32K?\"</em><br>Respuesta: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.",
    "help.param.theta":         "<strong>θ (rope_theta)</strong>: frecuencia base RoPE. Mayor = más capacidad de largo alcance. Típico: 10000 (modelos antiguos), 500000 (Llama-3), 1000000 (Qwen2.5).",
    "help.param.T_train":       "<strong>T_train</strong>: contexto máximo que vio el modelo durante entrenamiento. De <code>max_position_embeddings</code>.",
    "help.param.T_eval":        "<strong>T_eval</strong>: <em>tu</em> longitud de contexto objetivo en inferencia. La perilla clave.",
    "help.param.gqa":           "<strong>n_kv_heads &lt; n_attention_heads</strong>: el modelo usa GQA (Grouped Query Attention). Reduce memoria KV pero empuja γ hacia Hagedorn.",
    "help.param.swa":           "<strong>has_SWA</strong>: el modelo usa Sliding Window Attention (Mistral, gemma-2).",
    "help.param.nparams":       "<strong>n_params</strong>: número total de parámetros. Umbral ~400M para emergencia de induction heads.",
    "help.add_models.title":    "Añadir nuevos modelos (3 maneras)",
    "help.add_models.preset":   "<strong>Lista de presets</strong>: 11 modelos populares curados. Selecciona del dropdown.",
    "help.add_models.hf":       "<strong>HF Hub fetch</strong>: pega cualquier id (ej. <code>Qwen/Qwen2.5-32B-Instruct</code>), click 📥 Cargar. El navegador descarga <code>config.json</code> directamente de HuggingFace, llena el formulario. Funciona con cualquier modelo público.",
    "help.add_models.manual":   "<strong>Manual</strong>: rellena los campos directamente con valores de la model card.",
    "help.audit.title":         "La cadena auditable",
    "help.audit.body":          "Cada resultado muestra la <strong>Cadena de Cálculo</strong> completa — cada paso de fórmula con sus entradas, salida e interpretación. Click en cualquier paso para expandir. Las referencias de sección (§26.1, §19.1, etc.) apuntan al paper para la derivación.",
    "help.synthesis.title":     "La respuesta en lenguaje natural",
    "help.synthesis.body":      "Tras ejecutar la cadena determinista, un LLM en el navegador (Qwen2.5-0.5B, ~350MB cacheado tras primera carga) sintetiza un resumen en lenguaje natural. Los números arriba son <em>siempre correctos</em> (Python determinista); la síntesis la genera el LLM — verifica contra la cadena si dudas.",
    "help.params.title":        "Parámetros comunes explicados",
    "help.verdicts.title":      "Qué mirar en los veredictos",
    "help.verdict.yes":         "<strong style=\"color:#3fb950;\">SÍ / GO</strong> — procede con confianza; los números apoyan la elección.",
    "help.verdict.deg":         "<strong style=\"color:#d29922;\">DEGRADADO / TINY-MODEL</strong> — funciona con caveats; lee la acción.",
    "help.verdict.no":          "<strong style=\"color:#f85149;\">NO / MEMORY-LIMITED</strong> — no procedas tal cual; se da mitigación.",
    "help.privacy.title":       "Privacidad",
    "help.privacy.body":        "Todo corre en tu navegador. Sin telemetría, sin analytics, sin datos enviados a ningún sitio. Incluso el modelo LLM corre localmente vía WebGPU/WebAssembly. Tus model_ids y preguntas nunca abandonan esta página.",
    "help.source.title":        "Código fuente y paper",
    "help.source.body":         "Código: <a href=\"https://github.com/karlesmarin/tafagent\" target=\"_blank\">github.com/karlesmarin/tafagent</a><br>Paper: <em>Marin 2026 — Predicting How Transformers Attend</em> (<a href=\"https://zenodo.org/records/19826343\" target=\"_blank\">Zenodo</a>; arXiv próximamente)<br>Dataset: <a href=\"https://huggingface.co/datasets/karlexmarin/taf-attention-decay\" target=\"_blank\">taf-attention-decay</a> — 58 mediciones γ sobre 32 modelos (CC-BY-4.0)",

    "footer.text":             "© 2026 Carles Marin · Apache-2.0 · investigación independiente · la herramienta que cierra el círculo del paper.",
  },

  // ────────────────────────────────────────────────────────────────────────
  // FR — Français
  // ────────────────────────────────────────────────────────────────────────
  fr: {
    // §33 v0.4 (session 31, 2026-04-30) — nouvelles fonctions de diagnostic
    "v04.title":                  "🆕 v0.4 — Nouveaux diagnostics (sesion 31)",
    "v04.section.intro":          "Quatre nouvelles fonctions diagnostiques dérivées en session 31 (2026-04-30) depuis jeux de formules cross-of-crosses + interrogation socratique. Disponibles dans <code>taf_browser.py</code> §33.",
    "v04.arch.label":             "Concentration Architecturale",
    "v04.arch.desc":              "γ_text ≈ γ_Padé − 0.012·n_kv. Loi corrélationnelle cross-panel (R²=0.30). Caveat : pas un prédicteur par-modèle.",
    "v04.pdi.label":              "PDI — Indice de Déviation de Padé",
    "v04.pdi.desc":               "PDI = d_horizon_obs/T_eval. Feu : vert (≈1), orange (>>1), jaune (<<1), rouge (Phase B négatif).",
    "v04.4bit.label":             "Prédicteur de Décalage 4-bit",
    "v04.4bit.desc":              "MHA : R²(bf16)<0.9 → γ monte ; R²>0.99 → γ descend. GQA : précision-robuste.",
    "v04.crit.label":             "Ensemble d'Exposants Critiques",
    "v04.crit.desc":              "ν_c, β_c, η_c (=γ−1, CORRIGÉ), α_C, γ_susc avec minimum AM-GM à γ=1−1/√2≈0.293.",

    // §34 v0.5 (session 32, 2026-05-01) — Cohérence algébrique vérifiée par machine
    "v05.title":                  "🔬 v0.5 — Cohérence vérifiée par machine (session 32)",
    "v05.section.intro":          "Vérification duale par Sage Groebner basis + Lean Mathlib4 de <strong>15 identités algébriques</strong> des exposants critiques TAF. Premier framework transformer-attention avec preuve formelle machine.",
    "v05.verify.label":           "Vérification de Cohérence Algébrique",
    "v05.verify.desc":            "Étant donné γ mesuré, vérifie 12 identités D-SAGE (D-SAGE-1 : 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). Toutes passantes = framework intact. Échecs = outliers bf16 / artefacts de quantification.",
    "v05.dsage1.label":           "D-SAGE-1 (★★ core)",
    "v05.dsage1.desc":             "Identité quadratique 2η² + η·γ_χ + 1 = 0 (découverte par Sage Groebner, vérifiée Lean). Remplace l'affirmation incorrecte de 'fermeture triple'. Réfute η=2γ du paper 1 algébriquement.",
    "v05.erratum.label":          "Erratum paper 1 — correction η",
    "v05.erratum.desc":            "Paper 1 affirmait η = 2γ. Sage Groebner + Lean Mathlib4 ont prouvé l'échec (résidu (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Phase A). Valeur correcte : η = γ−1, satisfaisant D-SAGE-1.",
    "v05.repro.label":            "Reproductibilité",
    "v05.repro.desc":              "Les 15 théorèmes sont machine-proof en Lean Mathlib4 (build réussi 1973 jobs). Script Sage : <code>analysis/sage_recursive_sweep_2026-04-30.sage</code>. Code Lean : <code>lean_taf/taf/Taf/Identities.lean</code>.",

    // v0.5.1 — TAF Card consistency check button
    "v05.consistency.title":      "🔬 Vérification de cohérence algébrique (Sage + Lean v0.5)",
    "v05.consistency.desc":       "Vérifie 12 identités algébriques D-SAGE des exposants critiques TAF (machine-proof Sage Groebner basis + Lean Mathlib4). Passe = framework intact. Échec = outlier bf16 / artefact de quantification.",
    "v05.consistency.btn":        "🔬 Vérifier cohérence algébrique",

    // v0.5.2 — Anti-Ising universality class badge
    "v05.antiising.badge":        "🧲 Classe Anti-Ising (β=γ−1&lt;0, vérifié par machine)",

    // v0.5.2 — Tooltips par identité (explications en langage clair)
    "v05.tooltip.D_SAGE_1":       "Identité algébrique quadratique reliant la dimension anormale η et la susceptibilité γ_χ. Identité CENTRALE découverte par Sage Groebner basis (machine-proof). Remplace l'ancienne affirmation incorrecte de triple closure.",
    "v05.tooltip.D_SAGE_2":       "En Phase A, β = γ−1 est négatif (anti-Ising). Multiplié par χ = 1/(1−γ) donne exactement −1. Signature du régime négatif-β de TAF.",
    "v05.tooltip.D_SAGE_4":       "L'exposant de chaleur spécifique α et la susceptibilité χ se somment exactement à 2 en TAF. Conséquence algébrique de l'hyperscaling de Josephson.",
    "v05.tooltip.D_SAGE_5":       "Identité linéaire : α + γ_χ = 2(2−γ). Signifie que quand γ s'approche de 1 (Hagedorn), la somme s'approche de 2 ; à γ=0 elle vaut 4.",
    "v05.tooltip.D_SAGE_6":       "Exposant de paramètre d'ordre multiplié par exposant de susceptibilité donne une quadratique spécifique en γ. Relation algébrique factorisée.",
    "v05.tooltip.Rushbrooke_tautology": "Hyperscaling de Rushbrooke standard 2β + γ_χ = ν·d à d=1. En TAF c'est une TAUTOLOGIE — γ_χ est défini exactement pour que cela soit vrai. Confirmé par Sage Groebner basis.",
    "v05.tooltip.Josephson_tautology": "Hyperscaling de Josephson standard 2 − α = ν·d à d=1. En TAF c'est une TAUTOLOGIE — α est défini exactement pour que cela soit vrai.",
    "v05.tooltip.Fisher_independent": "Relation de Fisher γ_χ = (2−η)·ν. En TAF est INDÉPENDANTE (ne ferme PAS comme identité, contrairement à l'affirmation de triple closure). Le résidu est γ(2γ−3)/(1−γ).",
    "v05.tooltip.eta_2gamma_REFUTED": "Paper 1 affirmait η=2γ. Cette identité le réfute : le résidu est positif dans toute la Phase A. Réfutation machine-proof par Lean Mathlib4.",
    "v05.tooltip.D_14_nu_imprint": "La pente d'empreinte apprise ν = −1/(2π) multipliée par 2π donne −1. Vérification dimensionnelle triviale du paper 1.",
    "v05.tooltip.D_SAGE_7":       "La charge centrale c=3 multipliée par |ν_imprint| multipliée par 2π donne 3. Fermeture dimensionnelle reliant CFT à l'empreinte d'entraînement.",
    "v05.tooltip.nu_beta_id":     "Exposant de longueur de corrélation ν multiplié par exposant de paramètre d'ordre β donne −1 en Phase A. Variante de D-SAGE-2.",

    "v053.calibration.title":     "🔬 v0.5.3 — Audit de calibrage (2026-05-02)",
    "v053.calibration.note":      "<strong>Correction SWA désactivée</strong> — δ_SWA = -0.21 d'origine était calibrée sur n=1 modèle (données insuffisantes ; moyenne du cas unique +0.355). <strong>Correction post_IH marquée exploratoire</strong> — moyenne de groupe ≈ 0 en ré-audit (panel n=22) ne réplique pas l'ajustement OLS. <strong>Correction GQA réplique</strong> (panel +0.115 vs hardcoded +0.11). <strong>Formule D_f corrigée pour Phase B (γ&gt;1)</strong> — utilise une somme cumulative discrète au lieu d'une approximation continue. LLaMA-3, Mistral, Gemma rapportent maintenant des valeurs de compression correctes.",
    "v053.release.banner":        "🔧 v0.5.3 — Corrections issues d'audit : D_f de compression KV utilise maintenant la somme discrète (correct pour tout γ) ; δ_SWA désactivé (calibrage n=1) ; erratum du coefficient C_V paper §5.2 (1/4 → 1/12).",

    // §35 v0.6 — Diagnostic γ prédit vs observé
    "gamma_check.title":           "🔍 γ prédit vs observé",
    "gamma_check.desc":            "Saisissez votre γ mesuré empiriquement. L'outil détecte le régime : fraude (θ gonflé) / comprimé / sur-Padé / SWA-aléatoire / normal.",
    "gamma_check.gobs_label":      "γ_observé",
    "gamma_check.gobs_tip":        "γ mesuré empiriquement à partir des attention scores de votre modèle. Utilisez la CLI Diagnose pour l'obtenir depuis les poids réels.",
    "gamma_check.random_label":    "Corpus aléatoire ?",
    "gamma_check.random_tip":      "Cochez si γ_observé a été mesuré sur des tokens aléatoires/non structurés. Distingue la signature SWA (γ_obs > 1) d'une anomalie.",
    "gamma_check.regime":          "Régime",
    "gamma_check.regime.normal":         "Normal",
    "gamma_check.regime.fraud":          "Fraude (θ gonflé)",
    "gamma_check.regime.compressed":     "Contexte comprimé",
    "gamma_check.regime.overpade":       "Sur-Padé",
    "gamma_check.regime.swa":            "Signature SWA (corpus aléatoire)",
    "gamma_check.regime.unknown":        "Inconnu",
    "gamma_check.regime.normal.desc":    "η ∈ [0.85, 1.15] : le modèle utilise son contexte nominal complet, sans anomalie.",
    "gamma_check.regime.fraud.desc":     "η < 0.01 : θ nominal gonflé. Le modèle se comporte comme si θ ≪ annoncé. Probable inflation YaRN/marketing sans vraie extension de contexte.",
    "gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5) : contexte comprimé (le modèle attend moins loin que ne le prédit θ nominal). Fréquent en instruction-tuned / RLHF.",
    "gamma_check.regime.overpade.desc":  "η > 1.5 : le modèle attend plus loin que Padé ne le prédit. Régime Lerch-corrigé possible ou checkpoint précoce sous-entraîné.",
    "gamma_check.regime.swa.desc":       "γ_obs > 1.05 sur corpus aléatoire = signature de sliding-window attention (familles Mistral / Gemma).",
    "gamma_check.regime.unknown.desc":   "Entrées hors plage ou γ_obs > 1 sans flag corpus_aléatoire. Vérifiez la mesure.",
    "gamma_check.glossary.title":        "ⓘ Glossaire — signification des variables",
    "gamma_check.glossary.gamma_pade":   "<strong>γ_Padé</strong> : prédiction fermée (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
    "gamma_check.glossary.gamma_obs":    "<strong>γ_observé</strong> : mesuré empiriquement à partir des attention scores (exécutez Diagnose CLI sur poids réels).",
    "gamma_check.glossary.theta_eff_obs":"<strong>θ_eff (observé)</strong> : inversé depuis γ_obs via T√2 / (1 − γ_obs). θ effectif impliqué par votre mesure.",
    "gamma_check.glossary.theta_eff_pade":"<strong>θ_eff (Padé)</strong> : θ + T/√2. θ effectif prédit par la formule fermée.",
    "gamma_check.glossary.efficiency":   "<strong>η</strong> : rapport θ_eff_obs / θ_eff_Padé. ≈1 = normal · &lt;0.01 = fraude · &lt;0.5 = comprimé · &gt;1.5 = sur-Padé.",
    "gamma_check.glossary.delta_h":      "<strong>ΔH_Cardy</strong> : log(θ_eff_obs / θ_nominal). Variation d'entropie de Cardy. Négatif = entropie de compression. ~0 = correspondance nominale.",
    "gamma_check.glossary.regime":       "<strong>Régime</strong> : classifieur automatique à partir de η + γ_obs + flag corpus_aléatoire.",

    // §36 v0.6 — Tooltips pour icônes ⓘ inline
    "tooltip.gamma_pade":          "<strong>γ_Padé(T_eval)</strong> : prédiction fermée (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.",
    "tooltip.gamma_decomposed":    "<strong>γ_décomposé</strong> : γ depuis la décomposition architecturale complète. Ligne de base Padé + shift GQA + shift post-IH (sous-ensemble répliqué dans audit calibré).",
    "tooltip.d_horizon":           "<strong>d_horizon</strong> : horizon d'attention effectif. Au-delà, les scores tombent sous le plancher de bruit (paper §26).",
    "tooltip.L_NIAH":              "<strong>Plafond L_NIAH</strong> : plafond prédit de fiabilité needle-in-a-haystack au d_horizon courant.",
    "tooltip.chi":                 "<strong>χ susceptibilité</strong> : χ = 1/(1−γ). Diverge à la ligne Hagedorn γ=1.",
    "tooltip.kv_memory":           "<strong>Mémoire KV @ T_eval (BF16)</strong> : cache KV par requête = 2 · n_layers · n_kv_heads · d_head · T_eval octets.",
    "tooltip.theta_eff_obs":       "<strong>θ_eff (observé)</strong> : θ effectif impliqué par votre γ_observé : T√2 / (1 − γ_obs).",
    "tooltip.theta_eff_pade":      "<strong>θ_eff (Padé)</strong> : θ effectif prédit par la formule fermée : θ + T/√2.",
    "tooltip.efficiency":          "<strong>η = θ_eff_obs / θ_eff_Padé</strong> : ratio d'efficacité. ≈1 = normal · &lt;0.01 = fraude · &lt;0.5 = comprimé · &gt;1.5 = sur-Padé.",
    "tooltip.delta_h_cardy":       "<strong>ΔH_Cardy</strong> : log(θ_eff_obs / θ_nominal). Variation d'entropie de Cardy. Négatif = entropie de compression. ~0 = correspondance nominale.",
    "tooltip.verdict_aggregate":   "<strong>Verdict</strong> : pire-de toutes les recettes. ✅ GO = tout vert · ⚠ DÉGRADÉ = ≥1 jaune · ❌ NON = ≥1 rouge.",
    "tooltip.verdict_breakdown":   "<strong>Décomposition par recette</strong> : chaque recette teste un axe de décision <em>indépendant</em> (contexte-long · budget · matériel · custom-vs-API · compression-KV). Un ❌ en X-1 signifie « utilisez l'API pour votre volume » et non « le modèle échoue » — ouvrez la section Recettes pour le contexte par axe.",
    "tooltip.gamma_pill":          "<strong>γ vedette</strong> : γ_décomposé (ou γ_Padé en fallback). Plage (0,1) = Phase A (anti-Ising). γ ≥ 1 = Hagedorn / Phase B.",
    "tooltip.anti_ising":          "<strong>Classe Anti-Ising</strong> : Phase A → β = γ−1 &lt; 0. Machine-verified (Sage + Lean Mathlib4). Voir §35 v0.5.",

    // §37 v0.6 — Table des théorèmes Lean+Mathlib
    "lean.table.title":            "📑 Table des théorèmes Lean+Mathlib",
    "lean.table.desc":             "Chaque entrée ci-dessous est machine-proven contre Lean 4 + Mathlib4. Cliquez sur un lien L# pour aller à la ligne source sur GitHub. Groupé par thème — cliquez sur un en-tête pour déplier.",
    "lean.table.theorem":          "Théorème",
    "lean.table.claim":            "Énoncé",
    "lean.table.tactic":           "Tactique",
    "lean.table.source":           "Source",
    "lean.table.lean":             "Lean",
    "lean.findings.title":         "🔎 Findings substantiels",
    "lean.findings.detected_by":   "Détecté par",
    "lean.findings.fixed_by":      "Corrigé par",
    "lean.findings.recommendation":"Recommandation",
    "lean.meta.repo":              "Repo",
    "lean.meta.build":             "Build",
    "lean.meta.theorems":          "Théorèmes",
    "lean.meta.verified":          "vérifiés",
    "lean.meta.rejected":          "rejetés",
    "lean.meta.sorry":             "sorry",
    "lean.meta.findings":          "findings substantiels",
    "lean.manifest.loading":       "Chargement du manifeste Lean…",
    "lean.manifest.error":         "Manifeste Lean indisponible",

    // Help modal — section v0.6
    "help.v06.title":              "🆕 v0.6 — γ prédit-vs-observé + Cardy ΔH + badges Lean",
    "help.v06.intro":              "<em>v0.6 (2026-05-06) : trois nouveaux diagnostics vivent dans la TAF Card sous <strong>🔬 Diagnostics</strong>. Tout tourne dans votre navigateur ; γ_observé provient de la Diagnose CLI sur poids réels.</em>",
    "help.v06.layout.title":       "Disposition de la TAF Card (nouveau en v0.6)",
    "help.v06.layout.body":        "Après avoir cliqué <strong>🚀 Générer profil complet</strong>, la carte affiche : une <strong>bande hero</strong> en haut (classe d'architecture + méta + 3 pills : verdict agrégé ✅/⚠/❌, γ vedette, 🧲 Anti-Ising si Phase A) et quatre <strong>sections pliables</strong> : <strong>📋 Recettes</strong> (ouverte par défaut — verdict par dimension), <strong>🔬 Diagnostics</strong> (nombres clés, γ prédit vs observé, explorateur what-if), <strong>✓ Vérification</strong> (cohérence algébrique Sage+Lean, falsification F1-F23), <strong>📂 Provenance &amp; partage</strong> (audit de calibration + téléchargement JSON / lien / soumission registre). Cliquez sur n'importe quel en-tête pour déplier. Chaque variable a un tooltip <strong>ⓘ</strong> inline.",
    "help.v06.gamma_check.title":  "γ prédit vs observé",
    "help.v06.gamma_check.body":   "Saisissez le γ mesuré empiriquement et l'outil calcule <strong>η = θ_eff_obs / θ_eff_Padé</strong> et classe en l'un de 5 régimes :",
    "help.v06.case.normal":        "<strong>Normal</strong> (η ∈ [0.85, 1.15]) — le modèle utilise son contexte nominal complet. <em>Cas d'usage</em> : valider une nouvelle release avant adoption.",
    "help.v06.case.fraud":         "<strong>Fraude</strong> (η &lt; 0.01) — θ nominal gonflé ; le modèle se comporte comme si θ ≪ annoncé. <em>Cas d'usage</em> : détecter inflation YaRN/marketing (motif CodeLlama / Mistral-Nemo).",
    "help.v06.case.compressed":    "<strong>Comprimé</strong> (η &lt; 0.5) — contexte comprimé ; le modèle attend moins loin que θ nominal. <em>Cas d'usage</em> : repérer compression par RLHF/instruction-tuning (motif LLaMA-2).",
    "help.v06.case.overpade":      "<strong>Sur-Padé</strong> (η &gt; 1.5) — le modèle attend plus loin que Padé ne le prédit. <em>Cas d'usage</em> : identifier régime Lerch-corrigé ou checkpoints précoces sous-entraînés (motif pythia-1b).",
    "help.v06.case.swa":           "<strong>SWA corpus aléatoire</strong> (γ_obs &gt; 1.05 avec corpus_aléatoire=Oui) — signature de sliding-window attention. <em>Cas d'usage</em> : confirmer SWA Mistral / Gemma sur tokens aléatoires.",
    "help.v06.cardy.title":        "Diagnostic Cardy ΔH",
    "help.v06.cardy.body":         "<strong>ΔH_Cardy = log(θ_eff_obs / θ_nominal)</strong>. Variation d'entropie entre le θ effectif observé et le θ nominal. Fortement négatif = entropie de compression ; proche de zéro = correspondance nominale. Complète η pour les cas borderline.",
    "help.v06.lean.title":         "Badges de vérification Lean + Mathlib",
    "help.v06.lean.body":          "Les identités TAF sont formellement machine-proven en Lean Mathlib4 : <strong>37 théorèmes</strong> en 7 groupes (Padé, flot RG, Cayley, D-SAGE, résultats d'audit, erratum CV, divers) + <strong>1 résultat substantiel</strong> (facteur 2 dans la dérivée V, théorème <code>V_derivative_ne_RG_beta</code>). Source : <a href=\"https://github.com/karlesmarin/lean-taf\" target=\"_blank\">github.com/karlesmarin/lean-taf</a> (commit 25c77fd). Re-vérifier localement : <code>git clone --depth=1 https://github.com/karlesmarin/lean-taf &amp;&amp; cd lean-taf &amp;&amp; lake exe cache get &amp;&amp; lake env lean Taf/Identities.lean</code>. La pill 🧲 Anti-Ising du hero et la section Vérification renvoient à des lignes sources spécifiques.",
    "help.v06.glossary.title":     "Glossaire des variables (également intégré dans la TAF Card)",
    "help.v06.glossary.body":      "Chaque variable de la TAF Card a un tooltip ⓘ inline. Liste complète : γ, γ_Padé, γ_décomposé, γ_observé, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, mémoire KV, régime. Survolez n'importe quel ⓘ pour la définition + section du paper.",

    "hero.title":     "🔬 TAF Agent",
    "hero.tagline":   "Diagnostiquez n'importe quel LLM transformer en 30 secondes. Gratuit. Sans GPU. Sans inscription.",
    "hero.subtitle":  "Prédit si un modèle conviendra à votre cas d'usage <em>avant</em> que vous ne dépensiez argent ou temps. Tout tourne dans votre navigateur &mdash; vos données ne quittent jamais cet onglet.",
    "hero.help":      "📘 Manuel et exemples",
    "hero.quickstart_btn": "⚡ Démarrage rapide",
    "hero.inventory_btn":  "🧰 Ce que ça offre",
    "hero.about":     "Conçu par un chercheur indépendant. Open source. Non affilié à un fournisseur de modèles.",

    "modes.title":    "🎯 Mode",
    "modes.profile":  "📇 Profiler un modèle",
    "modes.compare":  "🆚 Comparer des modèles",
    "modes.inspector": "🔍 Inspecter config",
    "modes.ask":      "💬 Question libre",
    "modes.recipe":   "📋 Choisir une recette",
    "modes.diagnose": "🩺 Diagnose CLI",
    "diagnose.title": "🩺 Générateur de commande Diagnose CLI",
    "diagnose.tip":   "Le navigateur prédit γ à partir de la config; le CLI mesure γ_obs sur les poids réels. Ce générateur produit la commande exacte à exécuter localement.",
    "diagnose.desc":  "Choisis les options et copie-colle la commande générée sur ta machine locale (Python + transformers + numpy). Mode rapide ≈5 min CPU; complet ≈20–60 min GPU.",
    "diagnose.model_label": "ID du modèle HF:",
    "diagnose.theta_label": "θ (auto si vide):",
    "diagnose.n_label": "Contexte N:",
    "diagnose.options_label": "Options:",
    "diagnose.opt_fast": "--fast (CPU, ~5 min)",
    "diagnose.opt_cpu": "--cpu (forcer CPU)",
    "diagnose.opt_4bit": "--load_in_4bit (modèles ≥7B)",
    "diagnose.local_label": "--local path (optionnel):",
    "diagnose.build_btn": "📋 Générer la commande",
    "diagnose.cmd_title": "Commande générée :",
    "diagnose.copy_btn": "📋 Copier dans le presse-papiers",
    "diagnose.next_steps": "Prochaines étapes: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Exécute la commande (4) JSON résultat → upload via mode Inspect pour analyse TAF complète.",
    "modes.phase":    "📊 Diagramme de phase",
    "phase.title":    "📊 Diagramme de phase (γ × θ)",
    "phase.tip":      "Chaque point est un modèle du panel empirique du paper. x: log θ; y: γ. La ligne Hagedorn γ=1 sépare Phase A de Phase B. Hover pour détails, click pour charger dans le formulaire.",
    "phase.desc":     "23 modèles dans le panel; courbe Padé à T=2000.",
    "modes.desc":     "<strong>Démarrage rapide</strong>: collez n'importe quel id de modèle HuggingFace (ex. <code>meta-llama/Meta-Llama-3-8B</code>), cliquez Profiler. Voyez les 5 recettes évaluées en quelques secondes.",

    "profile.title":           "📇 Profiler un modèle",
    "profile.desc":            "<strong>Pour techniciens</strong>: quand vous avez besoin d'un instantané complet de viabilité d'un modèle candidat. Un clic exécute les 5 recettes et produit une TAF Card unifiée.",
    "profile.preset_label":    "Préréglage:",
    "profile.preset_default":  "— ou choisir dans la liste —",
    "profile.hf_label":        "ID modèle HF:",
    "profile.fetch_btn":       "📥 Charger",
    "profile.btn":             "🚀 Générer profil complet",
    "profile.quickstart":      "💡 Démarrage rapide: choisissez un préréglage → cliquez Générer. Ou collez un id depuis <a href='https://huggingface.co/models?library=transformers&sort=trending' target='_blank'>HF Hub tendances</a> → 📥 Charger → Générer.",

    "compare.title":           "🆚 Comparer côte à côte",
    "compare.desc":            "<strong>Pour techniciens</strong>: quand vous choisissez entre 2-3 modèles candidats pour un scénario de déploiement spécifique. Même recette, plusieurs modèles, verdicts côte à côte.",
    "compare.recipe_label":    "Recette:",
    "compare.T_eval_label":    "T_eval (contexte cible):",
    "compare.models_title":    "Modèles à comparer (jusqu'à 3)",
    "compare.btn":             "🚀 Comparer",
    "compare.example":         "💡 Essayez: collez 3 modèles populaires de 7-8B (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), recette X-2, T_eval=16000. Voyez lequel gère le mieux le contexte long.",

    "ask.title":               "❓ Votre question",
    "ask.placeholder":         "ex. Mistral-7B gérera-t-il 16K NIAH? Ou: J'ai 5,000$, quel modèle puis-je entraîner? Ou: GPU le moins cher pour servir Llama-70B à 100M tokens/jour?",
    "ask.btn":                 "🚀 Analyser",
    "ask.example_btn":         "💡 Essayer un exemple",

    "recipe.title":            "📋 Recette",
    "recipe.default":          "— choisir une recette —",
    "recipe.input_title":      "🎯 Entrées",

    "verdict.title":           "📊 Verdict",
    "chain.title":             "🔍 Chaîne de calcul",
    "chain.desc":              "Chaque nombre ci-dessous est du Python déterministe. Cliquez sur une étape pour développer.",
    "answer.title":            "💬 Réponse en langage naturel",
    "share.btn":               "🔗 Copier le lien",
    "share.copied":            "✅ Copié dans le presse-papiers!",
    "share.download":          "💾 Télécharger JSON",
    "share.download_md":       "📝 Markdown",
    "share.download_tex":      "📜 LaTeX",
    "share.submit":            "📤 Soumettre au registry",
    "share.submit_clip_ok":    "↗ GitHub ouvert. Corps copié dans le presse-papiers — collez-le dans le corps de l'issue.",
    "share.submit_clip_fail":  "↗ GitHub ouvert. Presse-papiers bloqué — corps dans la console du navigateur (F12).",
    "share.import_title":      "📂 Importer un résultat TAF partagé",
    "a11y.skip":               "Aller au contenu principal",

    // v0.6.2 — refonte de la landing : démarrage rapide + inventaire + tooltips d'architecture
    "qs.title":                    "⚡ Démarrage rapide",
    "qs.step1":                    "Collez un model ID HuggingFace (ex. <code>meta-llama/Meta-Llama-3-8B</code>)",
    "qs.step2":                    "Cliquez sur <strong>📇 Profile a model</strong>",
    "qs.step3":                    "Lisez votre TAF Card — verdict par cas d'usage + chiffres clés + maths vérifiées par Lean+Mathlib",
    "qs.cta":                      "↓ Commencer",
    "inv.title":                   "🧰 Ce que cet outil vous offre",
    "inv.recipes.title":           "🎯 8 recettes — ce modèle convient-il à votre usage ?",
    "inv.recipes.x1.title":        "Entraînement propre vs API",
    "inv.recipes.x1.body":         "lequel coûte moins cher pour votre trafic ?",
    "inv.recipes.x2.title":        "Contexte long",
    "inv.recipes.x2.body":         "tient-il 32k / 128k tokens de manière fiable ?",
    "inv.recipes.x3.title":        "Budget",
    "inv.recipes.x3.body":         "avec $X, quel modèle pouvez-vous entraîner ?",
    "inv.recipes.x5.title":        "Matériel",
    "inv.recipes.x5.body":         "quel GPU pour servir N tokens/jour ?",
    "inv.recipes.x19.title":       "KV cache",
    "inv.recipes.x19.body":        "comment compresser sans casser la qualité ?",
    "inv.recipes.x21.title":       "Pureté d'imprint",
    "inv.recipes.x21.body":        "à quel point l'encodage positionnel est-il propre ?",
    "inv.recipes.x22.title":       "Compute-contexte",
    "inv.recipes.x22.body":        "le modèle entre-t-il dans la bande empirique ?",
    "inv.recipes.x23.title":       "Phase IH",
    "inv.recipes.x23.body":        "pré- ou post-induction-head ?",
    "inv.diag.title":              "🔬 Diagnostics",
    "inv.diag.gamma":              "<strong>γ prédit vs observé</strong> — auto-classe le modèle en 5 régimes (normal · fraude / contexte gonflé · compressé · over-Padé · sliding-window)",
    "inv.diag.cardy":              "<strong>Cardy ΔH</strong> — décalage d'entropie entre contexte observé et nominal",
    "inv.diag.fals":               "<strong>Tableau de falsifiabilité</strong> — vérifie 23 prédictions spécifiques (F1–F23)",
    "inv.diag.alg":                "<strong>Cohérence algébrique</strong> — 8 identités mathématiques que le modèle doit satisfaire",
    "inv.verify.title":            "✓ Maths formellement vérifiées",
    "inv.verify.count":            "<strong>37 théorèmes</strong> machine-proven en Lean 4 + Mathlib4",
    "inv.verify.click":            "Cliquez sur un badge → ouvre la ligne source sur GitHub",
    "inv.verify.reverify":         "Vérifiez vous-même : <code>lake build</code> (≈5 s après cache)",
    "inv.export.title":            "📤 Export et partage",
    "inv.export.formats":          "<strong>JSON · Markdown · LaTeX</strong> (prêt pour papier)",
    "inv.export.share":            "Lien reproductible (état encodé dans l'URL)",
    "inv.export.registry":         "Soumettre au registre communautaire sur GitHub",
    "arch.summary":                "Architectures prises en charge",
    "arch.anyhf":                  "✓ Tout modèle public HuggingFace",
    "tooltip.mha":                 "Multi-Head Attention : chaque position attend via plusieurs têtes parallèles à la fois.",
    "tooltip.gqa":                 "Grouped Query Attention : les queries partagent moins de keys/values que de heads (économise mémoire mais pousse γ vers Hagedorn).",
    "tooltip.alibi":               "Attention with Linear Biases : l'info de position est une pente apprise ajoutée aux scores, sans rotation.",
    "tooltip.abspe":               "Absolute Position Embeddings : chaque position a un vecteur fixe appris ajouté au token.",
    "tooltip.swa":                 "Sliding Window Attention : chaque token n'attend que dans une fenêtre locale fixe (Mistral, gemma-2 l'utilisent).",
    "tooltip.ssm":                 "State Space Model : couche de séquence qui maintient un état interne au lieu d'attention (Mamba, Jamba l'utilisent).",

    // v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker
    "modes.unmask":                "🪟 Démasquer",
    "unmask.title":                "🪟 Démasqueur de contexte",
    "unmask.tip":                  "Collez un id de modèle HuggingFace (ou config.json brut). L'outil détecte sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), et GQA — tout ce qui rend <code>max_position_embeddings</code> plus grand que le contexte effectif réel. Mistral-7B-v0.1 est l'exemple canonique : déclare 32k, attend dans ~4-8k.",
    "unmask.desc":                 "<strong>Êtes-vous sur le point de dépenser de l'argent sur un modèle qui n'attend pas vraiment aussi loin ?</strong> Collez un id et découvrez-le en 1 seconde. Sans GPU, sans inférence — juste de l'arithmétique sur config.json.",
    "unmask.id_label":             "ID modèle HF :",
    "unmask.fetch_btn":            "🔍 Démasquer",
    "unmask.paste_summary":        "Ou collez config.json brut (modèles privés / en dev)",
    "unmask.paste_btn":            "🔍 Démasquer config collé",
    "unmask.label.declared":       "Contexte déclaré",
    "unmask.label.effective":      "Effectif (estimé)",
    "unmask.label.ratio":          "Ratio",
    "unmask.section.flags":        "Drapeaux d'architecture",
    "unmask.section.warnings":     "Avertissements",
    "unmask.section.reco":         "Recommandation",
    "unmask.flag.swa":             "SWA",
    "unmask.flag.rope":            "RoPE scaling",
    "unmask.flag.gqa":             "GQA",
    "unmask.flag.layers":          "Couches",
    "unmask.flag.dhead":           "d_head",
    "unmask.flag.theta":           "RoPE θ",
    "unmask.flag.yes":             "oui",
    "unmask.flag.no":              "non",
    "unmask.flag.full_mha":        "non (MHA complet, {n} heads)",
    "unmask.verdict.honest":            "✅ HONNÊTE",
    "unmask.verdict.inflated":          "⚠ GONFLÉ",
    "unmask.verdict.severely_inflated": "❌ GRAVEMENT GONFLÉ",
    "unmask.verdict.yarn_extended":     "⚠ YARN-ÉTENDU",
    "unmask.verdict.unknown":           "❓ INCONNU",
    "unmask.warn.swa_window":      "Fenêtre SWA : {window} tokens — chaque couche n'attend que dans cette fenêtre.",
    "unmask.warn.multihop":        "Estimation multi-hop : ~{multiHop} tokens (conservateur : fenêtre × {factor}).",
    "unmask.warn.yarn":            "RoPE scaling ({type}) étend le contexte {factor}× de ~{original} à {declared} tokens.",
    "unmask.warn.yarn_advice":     "Contexte RoPE-étendu — vérifiez le comportement de γ à la longueur déclarée avec le diagnostic γ_check.",
    "unmask.warn.gqa_small_dhead": "Petite head dim ({d_head}) + GQA : compression de KV cache probable en contexte long (γ poussé vers Hagedorn).",
    "unmask.reco.honest":              "Modèle d'attention complète standard. Contexte effectif correspond au déclaré ({declared} tokens).",
    "unmask.reco.inflated":            "Effectif ~{effective} tokens via SWA. Utilisez γ_check pour vérifier le comportement à votre longueur cible.",
    "unmask.reco.severely_inflated":   "Traitez-le comme un modèle de ~{effective} tokens en pratique. Le claim de {declared} tokens ne s'applique que via des chaînes d'attention cross-layer, qui dégradent empiriquement au-delà de ~2× la fenêtre SWA.",
    "unmask.reco.yarn_extended":       "Contexte RoPE-étendu. Lancez un benchmark long-context (NIAH à 8k / 16k / 32k / full) pour confirmer que l'extension tient. Utilisez γ_check avec T_eval = {declared}.",
    "unmask.reco.unknown":             "Impossible de parser le config. Vérifiez que l'URL est un modèle HF valide avec config.json public.",
    "unmask.status.empty_id":      "⚠ Saisissez un model id (ex. mistralai/Mistral-7B-v0.1).",
    "unmask.status.fetching":      "⏳ Récupération config.json pour {modelId}...",
    "unmask.status.success":       "✅ {modelId} analysé (verdict : {verdict})",
    "unmask.status.empty_paste":   "⚠ Collez d'abord un config.json.",
    "unmask.status.invalid_json":  "❌ JSON invalide : {error}",
    "unmask.status.success_paste": "✅ Config collé analysé (verdict : {verdict})",
    "unmask.pasted_label":         "(config collé)",
    "mode_desc.ask":               "Tapez une question libre. Le LLM dans le navigateur choisit la recette et l'exécute.",
    "mode_desc.recipe":            "Sélectionnez une recette directement et remplissez le formulaire. Contrôle manuel complet.",
    "mode_desc.profile":           "Démarrage le plus rapide : collez n'importe quel model id HuggingFace, cliquez Profile. Voyez les 5 recettes en quelques secondes.",
    "mode_desc.compare":           "Choisissez 2-3 modèles candidats + une recette. Verdicts côte à côte dans un tableau.",
    "mode_desc.inspector":         "Collez un config.json directement. Utile pour modèles privés / en dev non publiés sur HF Hub.",
    "mode_desc.diagnose":          "Construit la commande CLI diagnose_model.py pour MESURER γ_obs sur GPU réel. Le navigateur prédit ; le CLI mesure.",
    "mode_desc.phase":             "Scatter γ × θ du panel empirique du papier. Survolez les points pour détails, cliquez pour charger dans Diagnose / Recipe.",
    "mode_desc.unmask":            "Détecte si max_position_embeddings est trompeur (SWA / YaRN / RoPE-scaling). Collez un model id, obtenez un verdict en 1 ligne.",
    "profile.preset_loaded":       "✅ Préréglage chargé pour <strong>{id}</strong>. Formulaire pré-rempli. (Cliquez 📥 Fetch pour écraser avec le dernier config depuis HF Hub.)",

    // v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer
    "modes.template":              "📜 Chat-template",
    "mode_desc.template":          "Détecte la famille de chat-template d'un modèle (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Donne le flag CLI exact pour lm-eval / vLLM / transformers.",
    "template.title":              "📜 Détecteur de Chat-template",
    "template.tip":                "Collez un model id HF (ou tokenizer_config.json brut). Détecte la famille du chat-template et donne le commande exacte pour l'utiliser correctement. lm-eval-harness divise l'accuracy par 2 silencieusement si vous oubliez de l'appliquer (issue #1841).",
    "template.desc":               "<strong>Avez-vous oublié <code>--apply_chat_template</code> ?</strong> La plupart des évals multi-tours échouent à ~50% parce que le chat template n'a pas été appliqué. Collez un model id, obtenez le flag CLI exact pour votre stack.",
    "template.id_label":           "ID modèle HF :",
    "template.fetch_btn":          "📜 Détecter",
    "template.paste_summary":      "Ou collez tokenizer_config.json brut (modèles privés)",
    "template.paste_btn":          "📜 Détecter config collé",
    "template.label.family":       "Famille détectée",
    "template.label.markers":      "Marqueurs correspondants",
    "template.label.tpl_len":      "Longueur du template",
    "template.section.warnings":   "Avertissements",
    "template.section.commands":   "Commandes par framework",
    "template.section.raw":        "Template brut (preview)",
    "template.family.custom":      "custom (famille inconnue)",
    "template.family.none":        "(pas de chat_template)",
    "template.verdict.ok":         "✅ TEMPLATE DÉTECTÉ",
    "template.verdict.custom":     "⚠ TEMPLATE CUSTOM",
    "template.verdict.missing":    "❌ PAS DE CHAT TEMPLATE",
    "template.verdict.base_model": "ℹ MODÈLE DE BASE (sans chat)",
    "template.verdict.unknown":    "❓ INCONNU",
    "template.warn.no_chat_template": "Pas de champ <code>chat_template</code> dans tokenizer_config.json. Typique des modèles base / pré-entraînés. Si vous attendiez un modèle instruct-tuned, le mauvais fichier peut être chargé.",
    "template.warn.custom_template":  "Template non standard ({length} chars). L'outil n'a pas pu le faire correspondre aux familles connues. Inspectez le preview et vérifiez que votre framework d'éval le supporte.",
    "template.warn.lm_eval_apply":    "<strong>lm-eval-harness :</strong> ajoutez <code>--apply_chat_template</code> ou votre accuracy chutera silencieusement de ~50% sur les évals multi-tours (issue #1841).",
    "template.warn.vllm_apply":       "<strong>vLLM serve :</strong> vérifiez que <code>--chat-template</code> est défini (l'auto-détection échoue parfois sur les variantes fine-tunées). Suggéré : <code>{name}</code>.",
    "template.status.empty_id":    "⚠ Saisissez un model id (ex. mistralai/Mistral-7B-Instruct-v0.3).",
    "template.status.fetching":    "⏳ Récupération tokenizer_config.json pour {modelId}...",
    "template.status.success":     "✅ {modelId} détecté (verdict : {verdict})",
    "template.status.empty_paste": "⚠ Collez d'abord un tokenizer_config.json.",
    "template.status.invalid_json":"❌ JSON invalide : {error}",
    "template.status.success_paste":"✅ Config collé détecté (verdict : {verdict})",
    "template.pasted_label":       "(tokenizer_config collé)",

    // v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor
    "modes.arena":                 "🎯 Arena CI",
    "mode_desc.arena":             "Récupère les intervalles de confiance à partir des données brutes de votes pairwise (MLE Bradley-Terry + bootstrap). Détecte les paires statistiquement à égalité que le leaderboard public d'Arena cache.",
    "arena.title":                 "🎯 Reconstructeur Arena-Elo CI",
    "arena.tip":                   "Chatbot Arena masque les intervalles de confiance dans le leaderboard public. Un écart de 5 Elo peut être statistiquement insignifiant. Collez les données brutes de votes (model_a, model_b, winner) — l'outil calcule le MLE Bradley-Terry + bootstrap CIs et liste les égalités statistiques (overlap CI).",
    "arena.desc":                  "<strong>GPT-4 est-il vraiment meilleur que Claude — ou sont-ils à égalité ?</strong> Collez le CSV de votes pairwise (ou cliquez <em>Charger un échantillon</em>). MLE Bradley-Terry + 200 itérations de bootstrap → Elos classés avec CIs 95% et détection d'égalités statistiques. Tout dans le navigateur.",
    "arena.sample_btn":            "📊 Charger échantillon",
    "arena.run_btn":                "🎯 Calculer CIs",
    "arena.clear_btn":             "🗑️ Effacer",
    "arena.csv_summary":           "CSV de votes (header : <code>model_a,model_b,winner</code> ; winner ∈ a/b/tie)",
    "arena.section.ranked":        "Elos classés avec CIs 95%",
    "arena.section.ties":          "Égalités statistiques (overlap CI)",
    "arena.section.summary":       "Résumé",
    "arena.col.rank":              "#",
    "arena.col.model":             "Modèle",
    "arena.col.elo":               "Elo",
    "arena.col.ci":                "CI 95%",
    "arena.col.ci_width":          "± demi-largeur",
    "arena.col.matches":           "Matchs",
    "arena.col.wins":              "V / D / E",
    "arena.col.tie_pair":          "Paire",
    "arena.col.tie_diff":          "Écart Elo",
    "arena.col.tie_overlap":       "Overlap CI",
    "arena.no_ties":               "Aucune égalité statistique — toutes les paires sont distinguables à 95% CI.",
    "arena.summary.votes":         "Total des votes",
    "arena.summary.models":        "Modèles",
    "arena.summary.ties":          "Égalités statistiques",
    "arena.summary.bootstrap":     "Itérations bootstrap",
    "arena.summary.ci_level":      "Niveau CI",
    "arena.status.empty":          "⚠ Collez un CSV de votes ou cliquez sur Charger échantillon.",
    "arena.status.too_few":        "⚠ Seulement {n} votes valides — il en faut au moins 10 pour ajuster Bradley-Terry de manière fiable.",
    "arena.status.computing":      "⏳ Calcul MLE Bradley-Terry + bootstrap sur {n} votes...",
    "arena.status.done":           "✅ {n} votes · {models} modèles · {ties} égalités statistiques · {ms} ms",
    "arena.status.sample_loaded":  "✅ Échantillon chargé (données Arena synthétiques 6 modèles). Cliquez sur Calculer CIs.",

    // v0.7.3 — anti-bullshit pack #4: Contamination Prior
    "modes.contam":                "🧪 Contamination",
    "mode_desc.contam":            "Prior bayésien-ish sur la contamination d'un score de benchmark. Saisissez le cutoff d'entraînement → note 20+ benchmarks populaires (MMLU, GSM8K, HumanEval, MMLU-Pro…).",
    "contam.title":                "🧪 Prior de Contamination",
    "contam.tip":                  "Calcule un prior bayésien-ish indiquant si un score de benchmark est contaminé, basé sur (date de cutoff d'entraînement) × (date de sortie du benchmark) × (inclusion connue dans corpus + historique de leaks). Open LLM Leaderboard v1 a été tué en 2024 après la contamination de MMLU/HellaSwag.",
    "contam.desc":                 "<strong>Devez-vous faire confiance au score MMLU de votre modèle ?</strong> Saisissez la date de cutoff d'entraînement — l'outil note 20+ benchmarks populaires (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) et vous dit quels scores sont probablement contaminés.",
    "contam.cutoff_label":         "Cutoff entraînement :",
    "contam.run_btn":              "🧪 Noter tous les benchmarks",
    "contam.section.ranked":       "Priors de contamination par benchmark",
    "contam.section.high":         "🔴 Benchmarks à haut risque (traitez les scores comme non fiables)",
    "contam.section.medium":       "🟡 Risque moyen (vérifiez avec des alternatives)",
    "contam.section.low":          "🟢 Faible risque (probablement propres)",
    "contam.col.benchmark":        "Benchmark",
    "contam.col.released":         "Sorti",
    "contam.col.gap":              "Écart (mois)",
    "contam.col.prior":            "P(contam)",
    "contam.col.level":            "Niveau",
    "contam.col.corpora":          "Dans corpus",
    "contam.col.category":         "Catégorie",
    "contam.label.high":           "Haut risque",
    "contam.label.medium":         "Moyen",
    "contam.label.low":            "Faible",
    "contam.no_entries":           "(aucun dans cette catégorie)",
    "contam.advice.high":          "Traitez ces scores comme non fiables. Remplacez par des alternatives plus récentes / à test privé (MMLU-Pro, GPQA, MUSR, MATH-500).",
    "contam.advice.medium":        "À prendre avec précaution. Cherchez une réplication sur un subset held-out ou des reproductions communautaires.",
    "contam.advice.low":           "Score probablement non contaminé, mais absence de leak n'est pas une preuve — vérifiez avec un test alternatif.",
    "contam.summary.headline":     "Cutoff <code>{cutoff}</code> · {n} benchmarks notés",
    "contam.status.empty":         "⚠ Saisissez une date de cutoff d'entraînement (ex. 2023-12).",
    "contam.status.bad_date":      "⚠ Format de date incorrect. Utilisez YYYY-MM ou YYYY-MM-DD.",
    "contam.status.done":          "✅ Cutoff {cutoff} · {n} benchmarks notés · {high} à haut risque",

    // v0.7 — Help modal section
    "help.v07.title":              "🆕 v0.7 — Pack anti-bullshit (4 nouveaux modes)",
    "help.v07.intro":              "<em>v0.7 (2026-05-06) : quatre nouveaux modes qui résolvent des problèmes concrets remontés par la communauté HuggingFace. Chacun tourne dans votre navigateur sans inférence — pure métadonnée + maths.</em>",
    "help.v07.unmask.title":       "🪟 Démasqueur de Contexte",
    "help.v07.unmask.body":        "Détecte quand <code>max_position_embeddings</code> est trompeur. Mistral-7B-v0.1 déclare 32k mais attend dans ~4-8k via SWA. Collez un id HF → verdict en 1 seconde (HONNÊTE / GONFLÉ / GRAVEMENT GONFLÉ / YARN-ÉTENDU). Détecte SWA, RoPE-scaling (YaRN/linear/dynamic NTK), petit d_head + GQA. <em>Cas d'usage</em> : avant de payer un GPU pour 32k de contexte, vérifiez que le modèle attend vraiment aussi loin.",
    "help.v07.template.title":     "📜 Détecteur de Chat-template",
    "help.v07.template.body":      "Détecte la famille de chat-template d'un modèle (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) et donne le flag CLI exact pour lm-evaluation-harness, vLLM, et transformers. Résout l'issue #1841 de lm-eval-harness : oublier <code>--apply_chat_template</code> divise l'accuracy multi-tours par 2 silencieusement. <em>Cas d'usage</em> : avant de reporter un score, confirmez avoir appliqué le template correctement.",
    "help.v07.arena.title":        "🎯 Reconstructeur Arena-Elo CI",
    "help.v07.arena.body":         "Chatbot Arena masque les intervalles de confiance de son leaderboard public — un écart de 5 Elo peut être statistiquement insignifiant. Collez des données brutes de votes pairwise (model_a, model_b, winner) → MLE Bradley-Terry + bootstrap 200 itérations → Elos classés avec CIs 95% et un panneau \"égalités statistiques\" listant les paires dont les CIs se chevauchent. Essayez le bouton Charger échantillon. <em>Cas d'usage</em> : avant de déclarer \"modèle A bat modèle B\", vérifiez que leurs CIs ne se chevauchent pas.",
    "help.v07.contam.title":       "🧪 Prior de Contamination",
    "help.v07.contam.body":        "Prior bayésien-ish sur la contamination d'un score de benchmark. Saisissez la date de cutoff d'entraînement de votre modèle → l'outil note 20+ benchmarks populaires (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) par P(contamination) selon l'écart temporel, l'inclusion dans corpus et l'historique de leaks connus. Open LLM Leaderboard v1 a été tué en 2024 après la contamination de MMLU/HellaSwag. <em>Cas d'usage</em> : décidez quels scores croire en comparant deux modèles.",
    "help.v07.quant.title":        "⚖️ Classificateur de régime de quantification",
    "help.v07.quant.body":         "Prédit le γ-shift et ΔPPL pour tout (modèle × schéma de quantification : NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8…). Arch-aware : petit d_head + GQA agressif → plus sensible ; les schémas calibrés (AWQ) absorbent mieux le shift que les non calibrés (NF4). Recommande des alternatives plus sûres si un cliff est détecté. <em>Cas d'usage</em> : avant de quantifier, prédisez si votre combo architecture × schéma maintiendra la PPL acceptable, avec une suggestion concrète de switch sinon.",
    "help.v07.drift.title":        "🔀 Borne de drift inter-frameworks",
    "help.v07.drift.body":         "Même modèle, scores différents sur setups différents. L'outil prédit le drift max admissible dû au seul bruit numérique (dtype, framework, batch). Si l'écart observé le dépasse → vrai bug, généralement chat-template mismatch (issue #1841 lm-eval-harness) ou layout KV-cache. Essayez le bouton &quot;Charger échantillon&quot; pour le bug chat-template canonique. <em>Cas d'usage</em> : avant de reporter une régression ou de revendiquer la reproductibilité, vérifiez si l'écart entre deux évals est plus grand que ce que le bruit numérique peut expliquer.",
    "inv.v07.drift":               "<strong>🔀 Drift</strong> — bug ou bruit ? Prédit l'écart max admissible entre deux évals",
    "help.v07.niah.title":         "🔍 Gap NIAH → Reasoning",
    "help.v07.niah.body":          "Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH (retrieval de needle) mais échouent au reasoning multi-hop au même contexte. L'outil prédit les deux taux de réussite à partir de l'architecture (γ_Padé + d_horizon + pression arch : petit d_head, GQA, SWA), reporte le gap, et trouve le \"contexte sûr pour reasoning\" où le reasoning reste ≥65%. Mode balayage montre la courbe à 1k/4k/16k/64k/T_train. <em>Cas d'usage</em> : avant de déployer au contexte revendiqué, découvrez si le modèle va vraiment raisonner là ou seulement retrouver.",
    "inv.v07.niah":                "<strong>🔍 NIAH→Reason</strong> — votre \"128k\" raisonne-t-il vraiment là, ou seulement retrouve ?",

    // v0.7 — Inventory modal, 5th card
    "inv.v07.title":               "🆕 Pack anti-bullshit v0.7",
    "inv.v07.unmask":              "<strong>🪟 Unmask</strong> — config.json annonce 32k ? Voyez s'il attend vraiment aussi loin",
    "inv.v07.template":            "<strong>📜 Chat-template</strong> — flag CLI exact pour que lm-eval ne divise pas votre accuracy par 2 en silence",
    "inv.v07.arena":               "<strong>🎯 Arena CI</strong> — récupère les intervalles de confiance que Chatbot Arena cache",
    "inv.v07.contam":              "<strong>🧪 Contamination</strong> — note 20+ benchmarks par probabilité de contamination",
    "inv.v07.quant":               "<strong>⚖️ Quant</strong> — prédit le γ-shift + ΔPPL pour tout combo (modèle × schéma de quantification)",

    // v0.7.3 — anti-bullshit pack #5: Quant-regime classifier
    "modes.quant":                 "⚖️ Quant",
    "mode_desc.quant":             "Prédit le γ-shift et ΔPPL pour tout (modèle × schéma de quantification). Arch-aware : petit d_head + GQA → plus sensible. Recommande des alternatives plus sûres si un cliff est détecté.",
    "quant.title":                 "⚖️ Classificateur de régime de quantification",
    "quant.tip":                   "Prédit le γ-shift (et la ΔPPL résultante) pour une paire (modèle × schéma). Les claims génériques comme 'AWQ ~95% retention' sont trop vagues — TAF utilise d_head, ratio GQA, flag SWA et taille du modèle pour donner un verdict arch-spécifique. Résout : la communauté HF rapporte des cliffs de quantification imprédictibles (NF4 -2 PPL sur Phi-3 mais OK sur Llama-3-8B).",
    "quant.desc":                  "<strong>La quantification cassera-t-elle votre modèle ?</strong> Collez un id HF, choisissez un schéma — obtenez le γ-shift prédit, la bande ΔPPL attendue et une alternative recommandée si c'est un cliff. Navigateur uniquement, sans GPU, sans set de calibration.",
    "quant.id_label":              "ID modèle HF :",
    "quant.fetch_btn":             "📥 Récupérer config",
    "quant.scheme_label":          "Schéma quant :",
    "quant.run_btn":                "⚖️ Prédire",
    "quant.all_btn":               "📊 Comparer tous les schémas",
    "quant.regime.safe":           "✅ SÛR",
    "quant.regime.mild":           "✅ COMPRESSION LÉGÈRE",
    "quant.regime.significant":    "⚠ DÉGRADATION SIGNIFICATIVE",
    "quant.regime.cliff":          "❌ CLIFF SÉVÈRE",
    "quant.label.gamma_shift":     "γ shift",
    "quant.label.delta_ppl":       "ΔPPL (est.)",
    "quant.label.arch_mult":       "Multiplicateur arch",
    "quant.section.breakdown":     "Détail",
    "quant.section.reco":          "Recommandation",
    "quant.section.compare":       "Tous les schémas (triés par sécurité)",
    "quant.field.scheme":          "Schéma",
    "quant.field.calibrated":      "calibré",
    "quant.field.uncalibrated":    "non calibré",
    "quant.field.base_penalty":    "Pénalité de base",
    "quant.field.arch_mult_full":  "Multiplicateur architectural",
    "quant.field.gamma_shift":     "γ shift prédit",
    "quant.field.ppl_band":        "Bande ΔPPL (est.)",
    "quant.field.params":          "Paramètres",
    "quant.col.scheme":            "Schéma",
    "quant.col.bits":              "Bits",
    "quant.col.gamma_shift":       "γ shift",
    "quant.col.ppl_band":          "Bande ΔPPL",
    "quant.col.regime":            "Régime",
    "quant.reco.switch_to_awq":    "<strong>Passez à {scheme}</strong> — le 4-bit calibré gère bien mieux les petits d_head + GQA que NF4. ΔPPL attendue chute ~2-3×.",
    "quant.reco.switch_to_q5_km":  "<strong>Passez à {scheme}</strong> — Q5 garde plus de dimensions de head intactes à faible coût (~25% plus grand seulement).",
    "quant.reco.switch_to_q4_km":  "<strong>Passez à {scheme}</strong> — Q3/Q2 sont trop agressifs pour cette architecture.",
    "quant.reco.consider_awq":     "<strong>Considérez {scheme}</strong> — la calibration réduit significativement le γ-shift sur cette architecture.",
    "quant.reco.use_higher_bits":  "<strong>Utilisez une alternative à plus de bits</strong> — cette architecture n'absorbe pas le 4-bit proprement. Essayez 5 ou 8-bit.",
    "quant.reco.verify_with_eval": "<strong>Vérifiez avec une vraie éval</strong> — le shift prédit est borderline. Lancez NIAH à votre contexte cible avant de déployer.",
    "quant.reco.no_action":        "Pas d'action requise — la quantification est sûre pour cette architecture.",
    "quant.summary.headline_all":  "Tous les schémas pour <code>{modelId}</code>",
    "quant.status.empty_id":       "⚠ Saisissez un model id (ex. meta-llama/Llama-3.2-1B).",
    "quant.status.fetching":       "⏳ Récupération config.json pour {modelId}...",
    "quant.status.fetched":        "✅ Config récupéré pour {modelId}. Choisissez un schéma et cliquez Prédire (ou Comparer tous).",
    "quant.status.no_scheme":      "⚠ Choisissez un schéma de quantification dans le dropdown.",
    "quant.status.done":           "✅ Régime prédit : {regime}",
    "quant.status.done_all":       "✅ Comparé {n} schémas — triés par sécurité.",

    // v0.7.4 — HF Hub autocomplete: privacy + rate-limit
    "hf_auto.privacy":             "🔒 Requêtes envoyées à huggingface.co/api · cache local 5 min",
    "hf_auto.rate_limited":        "⚠ Rate limit HuggingFace — réessayez dans un moment, ou tapez l'id complet manuellement",
    "hf_auto.gated_msg":           "est gated. Acceptez la licence ici :",

    // v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound
    "modes.drift":                 "🔀 Drift",
    "mode_desc.drift":             "Prédit le drift max admissible entre deux scores de benchmark donnés (framework, dtype, batch, chat-template). Distingue les vrais bugs du bruit numérique.",
    "drift.title":                 "🔀 Borne de drift inter-frameworks",
    "drift.tip":                   "Même modèle, scores différents sur des setups différents. L'écart est-il du bruit ou un vrai bug ? Saisissez deux scores avec leur (framework, dtype, batch, chat-template) — l'outil prédit le drift max admissible dû au seul bruit numérique. Si l'écart observé le dépasse → vrai bug, généralement chat-template mismatch (issue #1841 lm-eval) ou layout KV-cache.",
    "drift.desc":                  "<strong>Votre modèle donne 67.2 sur lm-eval-hf et 65.1 sur vLLM-served. Bug ou bruit ?</strong> Saisissez les deux scores avec (framework, dtype, batch, chat-template appliqué ?). L'outil prédit la bande de bruit et signale les vrais bugs. arxiv 2506.09501 documente cela comme un problème majeur de reproductibilité d'évals.",
    "drift.setup_a":               "Setup A",
    "drift.setup_b":               "Setup B",
    "drift.score":                 "Score",
    "drift.framework":             "Framework",
    "drift.dtype":                 "Dtype",
    "drift.batch":                 "Batch",
    "drift.template":              "Chat-template",
    "drift.template.applied":      "appliqué",
    "drift.template.not_applied":  "non appliqué",
    "drift.template.unknown":      "inconnu",
    "drift.run_btn":               "🔀 Calculer la borne de drift",
    "drift.sample_btn":            "📊 Charger échantillon (bug chat-template)",
    "drift.label.observed":        "Écart observé",
    "drift.label.band":            "Bande numérique",
    "drift.label.ratio":           "Écart / bande",
    "drift.section.setups":        "Setups",
    "drift.section.breakdown":     "Contributeurs au drift (bande numérique)",
    "drift.section.verdict":       "Verdict et recommandation",
    "drift.contrib.dtype":         "Mismatch de dtype",
    "drift.contrib.framework":     "Framework",
    "drift.contrib.batch":         "Différence de batch",
    "drift.contrib.template":      "MISMATCH de chat-template",
    "drift.dominant_cause":        "Cause dominante",
    "drift.cause.dtype":           "différence de précision dtype",
    "drift.cause.framework":       "différence de framework / kernel",
    "drift.cause.batch":           "chemins de normalisation par batch",
    "drift.cause.template_mismatch": "chat-template appliqué d'un côté mais pas de l'autre (motif #1841 lm-eval-harness — typiquement -50% sur multi-tours)",
    "drift.verdict.noise":         "✅ BRUIT NUMÉRIQUE",
    "drift.verdict.suspicious":    "⚠ SUSPECT — vérifiez",
    "drift.verdict.bug":           "❌ VRAI BUG — investiguez",
    "drift.verdict.bug_template":  "❌ BUG DE CHAT-TEMPLATE",
    "drift.reco.noise":            "L'écart entre dans la bande de bruit numérique attendue. Pas d'action requise ; la différence est cohérente avec la seule variation framework/dtype/batch.",
    "drift.reco.suspicious":       "L'écart est 1–2× la bande prédite. Borderline — possible vrai bug. Essayez d'aligner le contributeur dominant (ex. égalisez framework ou dtype) et re-testez.",
    "drift.reco.bug":              "L'écart est &gt; 2× la bande prédite. C'est un vrai bug. Inspectez le contributeur dominant — probablement une différence de tokenizer / chat-template / layout KV-cache. Lancez lm-eval-harness avec <code>--apply_chat_template</code> et confirmez.",
    "drift.reco.bug_template":     "Mismatch de chat-template détecté. C'est la cause la plus commune des grands écarts d'évals (issue #1841 lm-eval-harness). Relancez le côté &quot;non appliqué&quot; avec <code>--apply_chat_template</code> (ou réglez vLLM <code>--chat-template &lt;name&gt;</code>) et re-testez.",
    "drift.status.empty_scores":   "⚠ Saisissez les deux scores.",
    "drift.status.done":           "✅ Verdict : {verdict}",
    "drift.status.sample_loaded":  "✅ Échantillon chargé (bug chat-template canonique). Cliquez sur Calculer la borne de drift.",

    // v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor
    "modes.niah":                  "🔍 NIAH→Reason",
    "mode_desc.niah":              "Prédit les taux de réussite NIAH (retrieval) et reasoning multi-hop à n'importe quel contexte. Résout : les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte (paper RULER).",
    "modes.saturation":            "📈 Saturation",
    "mode_desc.saturation":        "Indique si un benchmark discrimine encore les frontier models ou s'il est saturé (ex. MMLU 88-94% top, AIME 2025 déjà 96-100%). Retourne top-3 + verdict + remplacements recommandés.",
    "modes.hub":                   "🧭 Solutions",
    "mode_desc.hub":               "Carte de chaque problème documenté de LLM-eval → mode tafagent (si couvert) + outils externes curés. Trouvez la solution sans la réinventer. 30+ pains, 7 catégories.",
    "niah.title":                  "🔍 Gap NIAH → Reasoning",
    "niah.tip":                    "NIAH (Needle in a Haystack) teste le retrieval : 'trouve ce fait dans un long texte'. Le reasoning multi-hop teste l'inférence : 'combine les faits X+Y au début avec le fait Z à la fin'. Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte. Cet outil prédit les deux taux à partir de la seule architecture.",
    "niah.desc":                   "<strong>Votre modèle revendique 128k de contexte. Va-t-il vraiment raisonner à 64k, ou seulement retrouver ?</strong> Collez un model id HF et un contexte cible — l'outil prédit les taux de réussite NIAH et reasoning multi-hop, le gap, et un 'contexte sûr' où le reasoning reste ≥65%.",
    "niah.id_label":               "ID modèle HF :",
    "niah.fetch_btn":              "📥 Récupérer config",
    "niah.teval_label":            "Contexte cible (T_eval) :",
    "niah.run_btn":                "🔍 Prédire",
    "niah.sweep_btn":              "📊 Balayer les contextes",
    "niah.label.niah":             "Taux NIAH",
    "niah.label.reasoning":        "Taux Reasoning",
    "niah.label.gap":              "Gap",
    "niah.label.safe_ctx":         "Contexte sûr pour reasoning",
    "niah.section.breakdown":      "Détail architectural",
    "niah.section.reco":           "Recommandation",
    "niah.section.sweep":          "Balayage des taux par longueur de contexte",
    "niah.field.dhorizon":         "d_horizon (effectif)",
    "niah.field.ratio":            "T_eval / d_horizon",
    "niah.field.arch_pressure":    "Pression arch (petit d_head + GQA + SWA)",
    "niah.field.theta":            "RoPE θ",
    "niah.field.t_train":          "T_train (revendiqué)",
    "niah.col.context":            "T_eval",
    "niah.col.niah":               "NIAH",
    "niah.col.reasoning":          "Reasoning",
    "niah.col.gap":                "Gap",
    "niah.col.verdict":            "Verdict",
    "niah.verdict.robust":         "✅ ROBUSTE",
    "niah.verdict.marginal":       "⚠ MARGINAL",
    "niah.verdict.degraded":       "⚠ DÉGRADÉ",
    "niah.verdict.retrieval_only": "❌ RETRIEVAL UNIQUEMENT",
    "niah.verdict.broken":         "❌ CASSÉ",
    "niah.reco.robust":            "Retrieval et reasoning tiennent tous deux à ce contexte. Sûr de déployer pour les tâches de lookup et d'inférence.",
    "niah.reco.marginal":          "Borderline. Le retrieval fonctionne mais le reasoning est fragile. À utiliser pour le lookup, pas pour l'inférence multi-étapes.",
    "niah.reco.degraded":          "Chute significative du reasoning. Le modèle trouve des faits mais peine à les combiner. Évitez les tâches multi-hop à cette longueur.",
    "niah.reco.retrieval_only":    "Constat canonique de RULER : le modèle passe NIAH mais échoue au reasoning. Utile pour les setups RAG (où le LLM ne fait que localiser les faits) mais PAS pour l'inférence chaînée. Réduisez votre contexte à la valeur 'sûre' ci-dessous.",
    "niah.reco.broken":            "Le modèle échoue même au retrieval basique à ce contexte. Traitez-le comme hors-distribution — re-testez à un contexte plus court.",
    "niah.safe_context":           "≤ {ctx} tokens (reasoning ≥ 65%)",
    "niah.safe_context_none":      "Aucun contexte sûr trouvé sous votre cible — le modèle échoue au reasoning même à de petits contextes.",
    "niah.summary.sweep":          "<code>{modelId}</code> — taux par contexte",
    "niah.status.empty_id":        "⚠ Saisissez un model id (ex. meta-llama/Llama-3.1-8B-Instruct).",
    "niah.status.bad_teval":       "⚠ Saisissez un contexte cible (≥ 512 tokens).",
    "niah.status.fetching":        "⏳ Récupération config.json pour {modelId}...",
    "niah.status.fetched":        "✅ Config récupéré pour {modelId}. Réglez T_eval et cliquez Prédire (ou Balayer les contextes).",
    "niah.status.done":            "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%",
    "niah.status.sweep_done":      "✅ Balayé {n} longueurs de contexte.",
    "saturation.title":            "📈 Détecteur de saturation des benchmarks",
    "saturation.tip":              "MMLU est saturé (88-94% sur tous les frontier models). Annoncer '92% sur MMLU' n'a plus de sens. Cet outil vous dit quels benchmarks discriminent encore les frontier models, lesquels sont saturés, et quoi utiliser à la place. Données : DemandSphere AI Frontier Tracker (CC BY-NC 4.0) rafraîchi 2026-05.",
    "saturation.desc":             "<strong>Votre benchmark est-il encore utile ?</strong> Choisissez un benchmark pour voir top-3 frontier scores, spread, et un verdict (saturated / near-saturated / discriminative) + remplacements recommandés.",
    "saturation.select_label":     "Benchmark :",
    "saturation.select.all":       "— afficher tous les benchmarks —",
    "saturation.run_btn":          "📈 Classer",
    "saturation.all_btn":          "📊 Afficher tout",
    "saturation.col.spread":       "Écart top-3",
    "saturation.col.mean":         "Moyenne top-3",
    "saturation.col.n":            "Modèles",
    "saturation.col.bench":        "Benchmark",
    "saturation.col.verdict":      "Verdict",
    "saturation.col.reco":         "Reco principale",
    "saturation.col.model":        "Modèle",
    "saturation.col.score":        "Score",
    "saturation.section.top3":     "Top-3 frontier scores",
    "saturation.section.recommendations": "Alternatives recommandées",
    "saturation.section.note":     "Notes",
    "saturation.section.all":      "Tous les benchmarks suivis",
    "saturation.verdict.saturated":      "🚨 SATURÉ",
    "saturation.verdict.near_saturated": "⚠ PRESQUE SATURÉ",
    "saturation.verdict.discriminative": "✅ DISCRIMINATIF",
    "saturation.verdict.sparse_data":    "ℹ DONNÉES RARES",
    "saturation.borderline":       "Borderline — à ±1pp d'un seuil de coupure. Traitez le verdict comme 'à vérifier soigneusement'.",
    "saturation.unknown":          "Benchmark inconnu.",
    "saturation.attribution":      "Données : DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (historique open-weight) · dernier fetch 2026-05-05.",
    "saturation.status.live":      "✅ Données en direct chargées — {count} modèles.",
    "saturation.status.baked":     "ℹ Utilisation du snapshot baked (fetch en direct indisponible).",
    "saturation.status.kb_fail":   "⚠ Impossible de charger le KB de saturation.",
    "saturation.status.done":      "✅ {name} — {verdict}",
    "saturation.status.all_done":  "✅ {n} benchmarks classés.",
    "help.v08.saturation.title":   "📈 Détecteur de saturation des benchmarks",
    "help.v08.saturation.body":    "MMLU est saturé (top 88-94%), AIME 2025 saturé en quelques mois après sa sortie, HumanEval presque saturé. Choisissez un benchmark et l'outil retourne top-3 frontier scores, spread, moyenne, et un verdict — saturated / near-saturated / discriminative — plus un remplacement recommandé (ex. MMLU → MMLU-Pro / GPQA / HLE). Fetch en direct depuis DemandSphere AI Frontier Tracker (CC BY-NC 4.0) si accessible ; snapshot baked 2026-05-05 sinon. <em>Cas d'usage</em> : avant de citer '92% sur MMLU' ou de concevoir une eval, vérifiez si le benchmark discrimine encore quelque chose.",
    "inv.v08.saturation":          "<strong>📈 Saturation</strong> — votre benchmark est-il encore utile, ou tous les frontiers sont-ils à égalité au sommet ?",
    "inv.v081.hub":                "<strong>🧭 Solutions Hub</strong> — chaque pain documenté mappé à un mode tafagent ou outil externe curé. Ne réinventez pas — trouvez.",
    "help.v081.hub.title":         "🧭 Solutions Hub",
    "help.v081.hub.body":          "tafagent comme intégrateur, pas silo. 30+ pains à travers 7 catégories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), chacun mappé à (a) le mode tafagent qui le résout, s'il existe, et (b) les outils externes best-of-breed que la communauté utilise déjà (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). La barre de recherche matche pain, scénario, et nom d'outil. <em>Cas d'usage</em> : 'j'ai le problème X — tafagent le résout-il, et sinon, qui ?'",
    "hub.title":                   "🧭 Solutions Hub",
    "hub.tip":                     "Carte de chaque pain de LLM-eval documenté : quel mode tafagent l'adresse (si applicable), et les outils externes best-of-breed que la communauté utilise déjà. Objectif : couverture totale. Si l'outil canonique existe ailleurs, nous lions plutôt que de reconstruire.",
    "hub.desc":                    "<strong>Ne réinventez pas — trouvez.</strong> 30+ pains mappés à des modes tafagent + outils externes curés. Naviguez par catégorie, recherchez par mot-clé, ou voyez les lacunes où de nouveaux modes aideraient le plus.",
    "hub.clear_btn":               "✕ Effacer",
    "hub.no_mode":                 "externe",
    "hub.planned":                 "prévu :",
    "hub.best_for":                "Idéal pour",
    "hub.not_for":                 "Pas pour",
    "hub.tools":                   "Outils externes",
    "hub.status.loaded":           "✅ Chargés {total} pains dans {categories} catégories — {covered} couverts par des modes tafagent, {externalLinks} liens externes curés. Compilé {compiled}.",
    "hub.status.fail":             "⚠ Impossible de charger Solutions Hub.",
    "hub.search.empty":            "Aucune correspondance pour '{query}'. Essayez des termes plus larges (ex. 'eval', 'rag', 'tokenizer').",
    "hub.search.results":          "{n} correspondance(s) trouvée(s) pour '{query}'.",

    // v0.7.7 — Task tiles (UX redesign: 14 modes grouped by intent)
    "tiles.title":                 "🎯 Que voulez-vous faire ?",
    "tiles.subtitle":              "Choisissez une tâche. Chacune ouvre l'outil adéquat ci-dessous. Ou faites défiler pour la liste complète des 14 modes.",
    "tile.diagnose.title":         "🔬 Diagnostiquer un modèle",
    "tile.diagnose.desc":          "Ce modèle conviendra-t-il à mon cas d'usage ?",
    "tile.trust.title":            "✓ Faire confiance à un score",
    "tile.trust.desc":             "Dois-je croire ce nombre ? Bug ou bruit ?",
    "tile.eval.title":              "⚙️ Configurer une éval correctement",
    "tile.eval.desc":              "Obtenez le flag CLI exact pour lm-eval / vLLM / transformers.",
    "tile.compare.title":          "🆚 Comparer des modèles",
    "tile.compare.desc":           "Côte à côte, ou explorez le panel empirique de modèles.",
    "tile.manual.title":           "📋 Manuel / libre",
    "tile.manual.desc":            "Choisissez une recette à la main, ou demandez en langage naturel.",
    "tile.diagnose.tip":           "Commencez ici quand vous avez un id de modèle spécifique et voulez un diagnostic complet : <strong>Profile</strong> lance les 5 recettes d'un coup. <strong>Unmask</strong> vérifie si max_position_embeddings est honnête. <strong>NIAH→Reason</strong> prédit le gap retrieval-vs-reasoning. <strong>Quant</strong> prédit si quantifier va le casser. <strong>Inspect</strong> permet de coller un config.json brut pour modèles privés / en dev.",
    "tile.trust.tip":              "Quand vous voyez un score et voulez savoir s'il est réel. <strong>Contamination</strong> note 20+ benchmarks selon la probabilité que le modèle les ait vus en entraînement. <strong>Drift</strong> vous dit si l'écart entre deux évals est du bruit numérique ou un vrai bug (chat-template mismatch, layout KV-cache, etc.). <strong>Arena CI</strong> reconstruit les intervalles de confiance que Chatbot Arena cache — beaucoup de &quot;victoires&quot; top-Elo sont statistiquement à égalité.",
    "tile.eval.tip":               "Avant de lancer lm-eval-harness ou vLLM serve, obtenez le bon flag CLI. <strong>Chat-template Sniffer</strong> détecte la famille de template (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) et émet l'invocation exacte <code>--apply_chat_template</code> / <code>--chat-template</code>. Résout l'issue #1841 de lm-eval-harness (÷2 accuracy silencieux). <strong>Diagnose CLI</strong> génère la commande Python pour mesurer γ_obs sur votre GPU local.",
    "tile.compare.tip":            "<strong>Compare</strong> : choisissez 2-3 modèles candidats + une recette, voyez les verdicts dans un tableau côte à côte (ex. Llama-3-8B vs Mistral-7B à 32k). <strong>Phase diagram</strong> : nuage de 23 modèles empiriques dans le plan (log θ, γ), avec la courbe Padé superposée. Survolez les points pour détails, cliquez pour charger ce modèle dans le formulaire Recipe.",
    "tile.manual.tip":             "<strong>Recipe</strong> : choisissez une recette X-N spécifique (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 compression KV, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) et remplissez le formulaire à la main pour contrôle total. <strong>Ask</strong> : tapez une question libre ; un LLM 0.5B (Qwen2.5) dans votre navigateur choisit la bonne recette et la lance. Idéal pour explorer &quot;que se passerait-il si...&quot;.",
    "share.import_desc":       "Vous avez un fichier JSON de l'analyse TAF de quelqu'un ? Chargez-le ici pour voir le verdict + la chaîne localement. La même vue que si vous l'aviez exécuté vous-même.",
    "share.import_btn":        "📂 Charger JSON partagé",
    "synthesis.system":        "Vous êtes un assistant de diagnostic précis pour LLMs transformer. Étant donné des résultats de formules TAF pré-calculés, écrivez un résumé clair en français de 4-6 phrases. Citez le numéro de section (§X.Y) pour chaque nombre mentionné. Donnez toujours une recommandation concrète. N'INVENTEZ PAS de nombres.",

    // INSPECTOR mode
    "inspector.title":         "🔍 Inspecteur d'Architecture",
    "inspector.desc":          "Collez le contenu brut de <code>config.json</code>. L'outil extrait les paramètres architecturaux et exécute le Profil complet à 5 recettes.",
    "inspector.tip":           "<strong>Collez n'importe quel config.json directement</strong>. L'outil le parse et exécute le Profil complet. Utile pour : modèles privés, configs en développement, modèles pas encore sur HuggingFace, ou comparer ce que ferait votre architecture custom.",
    "inspector.quickstart":    "💡 Cas d'usage : vous avez un modèle privé pas sur HF Hub, ou une config que vous concevez. Collez le JSON brut ci-dessous et obtenez un profil TAF complet.",
    "inspector.placeholder":   "{\n  \"model_type\": \"llama\",\n  \"rope_theta\": 500000,\n  \"max_position_embeddings\": 8192,\n  \"num_attention_heads\": 32,\n  \"num_key_value_heads\": 8,\n  \"hidden_size\": 4096,\n  \"num_hidden_layers\": 32\n}",
    "inspector.T_eval":        "T_eval (votre contexte cible) :",
    "inspector.btn":           "🚀 Inspecter et profiler",

    // WHAT-IF slider
    "whatif.title":            "🎚 What-if : faites glisser T_eval pour voir γ changer en direct",
    "whatif.desc":             "Recalcul pur JS (sans appel Pyodide). Montre γ_Padé et d_horizon géométriques pendant que vous glissez. Cliquez pour ré-exécuter la chaîne complète.",
    "whatif.T_eval":           "<strong>T_eval</strong>",
    "whatif.gamma_pade":       "<strong>γ_Padé</strong>",
    "whatif.d_horizon":        "<strong>d_horizon</strong>",
    "whatif.l_niah":           "<strong>Plafond L_NIAH</strong>",
    "whatif.predicted":        "<strong>Verdict géométrique prédit</strong>",
    "whatif.rerun":            "↻ Recalculer la chaîne complète à ce T_eval",

    // COMMUNITY feed
    "community.title":         "🌐 Soumissions récentes de la communauté",
    "community.desc":          "Flux en direct du registre public. Cliquez sur n'importe quelle soumission pour voir l'analyse complète.",
    "community.browse_all":    "Voir tout →",
    "community.loading":       "Chargement...",
    "community.no_repo":       "Le repo du registre n'est pas encore créé. Une fois qu'il existe avec des soumissions, elles apparaîtront ici en direct.",
    "community.no_submissions": "Aucune soumission. Soyez le premier — générez un Profil et cliquez 📤 Soumettre au registry.",

    // FALSIFICATION dashboard
    "falsification.title":     "🔬 Prédictions du paper — statut de falsification",
    "falsification.desc":      "Le framework TAF repose sur des prédictions falsifiables (F1-F23). Chacune est empiriquement testée. Voici le statut en direct de chaque prédiction du paper.",
    "falsification.summary":   "{confirmed} confirmées · {partial} partielles · {refuted} réfutées · {untested} non testées (sur {total} prédictions au total)",
    "falsification.col.id":    "ID",
    "falsification.col.claim": "Claim",
    "falsification.col.status": "Statut",
    "falsification.col.evidence": "Preuve",

    "tafcard.title":           "📇 TAF Card — profil complet du modèle",
    "tafcard.recipes_title":   "📋 Recettes — verdict par dimension",
    "tafcard.recipes_count_label": "dimensions",
    "tafcard.numbers_title":   "🔢 Nombres clés (paper §26)",
    "tafcard.fals_title":      "🔬 État de falsification (F1-F23)",
    "tafcard.fals_none":       "Aucune falsification applicable.",
    "tafcard.diag_title":      "🔬 Diagnostics — nombres · contrôle γ · what-if",
    "tafcard.verify_title":    "✓ Vérification — Lean + Sage + falsification",
    "tafcard.share_title":     "📂 Provenance & partage",
    "tafcard.whatif_title":    "🎚️ Explorateur what-if",
    "verdict.go":              "GO",
    "verdict.no":              "NON",
    "verdict.degraded":        "DÉGRADÉ",

    "compare.title_out":       "🆚 Tableau comparatif",

    "status.loading_pyodide":  "⏳ Chargement du runtime Python (~10MB, première fois)...",
    "status.loading_taf":      "⏳ Chargement des formules TAF + recettes...",
    "status.ready":            "✅ Prêt. Choisissez un modèle et cliquez Profiler pour commencer.",
    "status.computing":        "🧮 Calcul de la chaîne TAF...",
    "status.done":             "✅ Terminé.",

    "profile.hf_placeholder":  "ex. meta-llama/Meta-Llama-3-8B ou Qwen/Qwen2.5-7B",
    "compare.hf_placeholder":  "ID modèle HF (ex. meta-llama/Meta-Llama-3-8B)",
    "compare.slot1_placeholder": "ID modèle HF (ex. meta-llama/Meta-Llama-3-8B)",
    "compare.slot2_placeholder": "ID modèle HF #2",
    "compare.slot3_placeholder": "ID modèle HF #3 (optionnel)",
    "compare.preset_default": "— ou préréglage —",

    // Form parameters
    "param.theta":         "θ (rope_theta)",
    "param.theta.tip":     "<strong>Fréquence de base RoPE</strong> de <code>config.rope_theta</code>. Plus haut = plus de capacité longue portée.",
    "param.T_train":       "T_train",
    "param.T_train.tip":   "<strong>Contexte max d'entraînement</strong>. De <code>max_position_embeddings</code>. Au-delà c'est de l'extrapolation.",
    "param.T_eval":        "T_eval (votre cible)",
    "param.T_eval.tip":    "<strong>Votre contexte d'inférence cible</strong>. La question clé : le modèle se comportera-t-il bien à CETTE longueur ?",
    "param.n_attn":        "n_attention_heads",
    "param.n_attn.tip":    "<strong>Nombre d'attention heads</strong> par couche. De <code>num_attention_heads</code>.",
    "param.n_kv":          "n_kv_heads",
    "param.n_kv.tip":      "<strong>KV heads</strong>. Si &lt; n_attention_heads → GQA (Grouped Query Attention). Réduit la mémoire KV mais pousse γ vers Hagedorn.",
    "param.d_head":        "head_dim",
    "param.d_head.tip":    "<strong>Dimension par head</strong>. Typique 64, 96, 128. De <code>head_dim</code> ou <code>hidden_size / num_attention_heads</code>.",
    "param.n_layers":      "n_layers",
    "param.n_layers.tip":  "<strong>Nombre de blocs transformer</strong>. De <code>num_hidden_layers</code>.",
    "param.n_params":      "n_params (ex. 8e9)",
    "param.n_params.tip":  "<strong>Nombre total de paramètres</strong>. Seuil ~400M pour l'émergence d'induction heads. Affecte la mémoire KV et les recettes de budget.",
    "param.has_swa":       "A SWA ?",
    "param.has_swa.tip":   "<strong>Sliding Window Attention</strong>. <code>true</code> pour Mistral, gemma-2, phi-3. L'audit de calibration v0.5.3 a désactivé la correction historique δ_SWA (ajustement n=1).",
    "common.yes":          "Oui",
    "common.no":           "Non",

    // Mode tooltips
    "modes.tip":           "<strong>Quatorze façons d'utiliser l'outil</strong>.<br><strong>📇 Profil</strong>: collez un id → TAF Card avec 5 recettes.<br><strong>🆚 Comparer</strong>: 2-3 modèles côte à côte sur une recette.<br><strong>🔍 Inspecter config</strong>: collez config.json brut → Profil complet.<br><strong>💬 Question</strong>: question libre, le LLM du navigateur choisit la recette.<br><strong>📋 Recette</strong>: sélection manuelle avec contrôle total du formulaire.<br><strong>🩺 Diagnostic CLI</strong>: génère commande Python pour mesurer γ localement.<br><strong>📊 Diagramme de phase</strong>: panel de 23 modèles dans le plan (log θ, γ).<br><strong>🪟 Démasquer</strong>: détecte un max_position_embeddings trompeur (SWA / YaRN / RoPE-scaling).<br><strong>📜 Chat-template</strong>: détecte la famille + donne le flag CLI exact pour lm-eval / vLLM / transformers.<br><strong>🎯 Arena CI</strong>: reconstruit les intervalles de confiance depuis les votes pairwise bruts ; détecte les égalités statistiques qu'Arena cache.<br><strong>🧪 Contamination</strong>: note 20+ benchmarks pour leur probabilité de contamination selon le cutoff d'entraînement vs la date de sortie.<br><strong>⚖️ Quant</strong>: prédit γ-shift et ΔPPL pour tout (modèle × schéma de quantification) ; recommande une alternative sûre en cas de cliff.<br><strong>🔀 Drift</strong>: même modèle, scores différents sur deux setups — bug ou bruit ? Prédit la bande de bruit numérique et signale les vrais bugs.<br><strong>🔍 NIAH→Reason</strong>: prédit les taux NIAH et reasoning multi-hop depuis l'architecture ; trouve le contexte sûr pour reasoning.",
    "profile.tip":         "<strong>Diagnostic complet en un clic</strong>. Collez n'importe quel id de modèle HF (ou choisissez préréglage). L'outil exécute les 5 recettes (contexte long, compression KV, custom vs API, budget, hardware) et produit une <strong>TAF Card</strong> unique avec verdict par dimension + nombres clés + classification architecturale.<br><br><strong>Cas d'usage</strong>: « J'évalue Qwen2.5-32B pour la production — quel est son profil complet de viabilité ? » → collez id → Profiler → fait.",
    "compare.tip":         "<strong>Même recette, plusieurs modèles</strong>. Choisissez 2-3 modèles candidats et une recette. Voyez les verdicts dans un seul tableau comparatif.<br><br><strong>Cas d'usage</strong>: « J'ai besoin de récupération longue contexte à 16K — quel est le meilleur : Llama-3-8B, Mistral-7B ou Qwen-7B ? » → choisissez 3 + X-2 + 16K → voyez le gagnant.",

    // Help modal
    "help.title":               "📘 TAF Agent — Manuel d'utilisation",
    "help.what.title":          "Que fait-il ?",
    "help.what.body":           "Prédit la <strong>viabilité pratique</strong> de tout LLM transformer <em>avant de dépenser du GPU/€</em>. Répond à des questions comme « ce modèle fonctionnera-t-il à L=32K ? » ou « dois-je entraîner sur mesure ou utiliser une API ? » via des formules Python déterministes (TAF — Thermodynamic Attention Framework).",
    "help.modes.title":         "Comment l'utiliser — 7 modes",
    "help.modes.profile":       "<strong>📇 Profiler</strong>: collez id de modèle → toutes les recettes à la fois = TAF Card. <strong>Meilleur point de départ</strong>.",
    "help.modes.compare":       "<strong>🆚 Comparer</strong>: 2-3 modèles côte à côte sur la même recette. Mieux pour choisir entre candidats.",
    "help.modes.inspector":     "<strong>🔍 Inspecter config</strong>: collez <code>config.json</code> brut → l'outil le parse et lance le Profil complet. Pour modèles privés, configs en développement, ou modèles pas encore sur HF Hub.",
    "help.modes.ask":           "<strong>💬 Question libre</strong>: question en langage naturel, le LLM du navigateur choisit la recette. Mieux pour exploration casuelle.",
    "help.modes.recipe":        "<strong>📋 Recette + formulaire</strong>: sélection manuelle, contrôle total des paramètres. Mieux quand vous voulez un contrôle exact.",
    "help.modes.diagnose":      "<strong>🩺 Diagnostic CLI</strong>: génère commande Python pour mesurer γ sur votre machine locale (transformers + numpy). Rapide ≈5 min CPU; complet ≈20–60 min GPU. JSON résultat ré-uploadable via Inspect.",
    "help.modes.phase":         "<strong>📊 Diagramme de phase</strong>: nuage de 23 modèles du panel dans le plan (log θ, γ). Ligne Hagedorn γ=1 sépare Phase A de Phase B. Cliquer un point pour charger ce modèle dans le formulaire Recette.",
    "help.recipes.title":       "Les 8 recettes disponibles",
    "help.recipe.x1.title":     "<strong>X-1 Entraînement custom vs API</strong> — compare le coût d'entraîner votre propre modèle vs payer l'accès API.",
    "help.recipe.x1.example":   "Essayez: <em>« Dois-je entraîner un 8B custom ou utiliser GPT-4o pour 50M tokens/mois ? »</em><br>Réponses: OUI (custom) / NON (API) avec mois pour break-even.",
    "help.recipe.x2.title":     "<strong>X-2 Viabilité contexte long</strong> — prédit si un modèle sert une longueur cible de manière fiable.",
    "help.recipe.x2.example":   "Essayez: <em>« Meta-Llama-3-8B gérera-t-il 32000 tokens pour récupération ? »</em><br>Chaîne: γ_Padé → décomposition → d_horizon → plafond NIAH → hallucination → mémoire KV.<br>Verdict: OUI / DÉGRADÉ / NON avec mitigation si nécessaire.",
    "help.recipe.x3.title":     "<strong>X-3 Pre-flight budget</strong> — étant donné un budget $, quel modèle est faisable à entraîner ?",
    "help.recipe.x3.example":   "Essayez: <em>« J'ai $5000, quel modèle puis-je entraîner ? »</em><br>Réponse: GO / TINY-MODEL / MEMORY-LIMITED avec N (params) et D (tokens) concrets.",
    "help.recipe.x5.title":     "<strong>X-5 Sélection hardware</strong> — quel GPU utiliser pour servir au throughput cible ?",
    "help.recipe.x5.example":   "Essayez: <em>« Hardware le moins cher pour servir Llama-3-8B à 10M tokens/jour »</em><br>Réponse: meilleur GPU + $/Mtok + capacité vs cible.",
    "help.recipe.x19.title":    "<strong>X-19 Décision compression KV</strong> — utiliser soft decay, hard cutoff, ou méthodes de littérature ?",
    "help.recipe.x21.title":    "<strong>X-21 Diagnostic Pureté Imprint</strong> — prédit γ sur tokens RANDOM via ν=−1/(2π); à quel point la prédiction RoPE du modèle est-elle propre ?",
    "help.recipe.x22.title":    "<strong>X-22 Invariant Compute-Context</strong> — γ × log(N²·D) est-il dans la bande 51.2 ± 16.8 ? Détecte anomalies de scaling/training.",
    "help.recipe.x23.title":    "<strong>X-23 Détecteur Phase IH</strong> — pré- ou post-induction-head ? Probe peu coûteux via sign(γ_text − γ_random).",
    "help.recipe.x19.example":  "Essayez: <em>« Comment compresser le cache KV pour Qwen2.5-7B à 32K ? »</em><br>Réponse: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.",
    "help.recipe.x21.example":  "Essayez: <em>« Quelle est la pureté de la prédiction RoPE sur Llama-3-8B ? »</em><br>Réponse: γ_random prédit + diagnostic (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).",
    "help.recipe.x22.example":  "Essayez: <em>« Mistral-7B entre-t-il dans l'invariant compute-context ? »</em><br>Réponse: K = γ·log(N²·D), z-score, IN-BAND ou OUTLIER.",
    "help.recipe.x23.example":  "Essayez: <em>« Qwen2.5-7B est-il post-induction-head ? »</em><br>Réponse: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY.",
    "help.section.v04":         "<strong>Nouveautés v0.4</strong> (résultats session 29, 2026-04-28) : trois recettes de diagnostic dérivées de l'analyse panel cross-model (n=22 LLMs).",
    "help.divider.v04_s29":     "— v0.4 (résultats session 29) —",
    "footer.tech_stack":        "Calcul : Pyodide · Synthèse : WebLLM (Qwen2.5-0.5B local) · Hébergement : GitHub Pages · Coût : 0 $",
    "help.v04.imprint":         "<strong>Pente d'imprint apprise ν = −1/(2π)</strong> : la période de rotation RoPE 2π entraîne un biais positionnel dans les poids, proportionnel à log(N_params). Même les tokens aléatoires montrent ce scaling. ν est DÉRIVÉ — non ajusté (erreur empirique 0,3 %).",
    "help.v04.invariant":       "<strong>Invariant Chinchilla-attention K</strong> : γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Connecte le scaling de compute et l'exposant d'attention en un seul nombre sans dimension.",
    "help.v04.ih_probe":        "<strong>Δγ comme probe IH</strong> : sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Moins coûteux que de lancer un benchmark in-context-learning.",
    "help.v04.constants":       "<strong>γ-cluster sur constantes célèbres</strong> (intriguant, n=4) : CodeLlama-13b γ=0.382 ≈ 1−1/φ (conjugué doré, err 0,0003) ; pythia-1.4b γ=0.705 ≈ 1/√2 ; Llama-2-7b γ=0.287 ≈ 1−1/√2 ; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat : peut être coïncidence.",
    "help.param.theta":         "<strong>θ (rope_theta)</strong>: fréquence de base RoPE. Plus haut = plus de capacité longue portée. Typique: 10000 (anciens), 500000 (Llama-3), 1000000 (Qwen2.5).",
    "help.param.T_train":       "<strong>T_train</strong>: contexte max vu par le modèle pendant l'entraînement. De <code>max_position_embeddings</code>.",
    "help.param.T_eval":        "<strong>T_eval</strong>: <em>votre</em> longueur de contexte cible en inférence. Le bouton clé.",
    "help.param.gqa":           "<strong>n_kv_heads &lt; n_attention_heads</strong>: le modèle utilise GQA (Grouped Query Attention). Réduit la mémoire KV mais pousse γ vers Hagedorn.",
    "help.param.swa":           "<strong>has_SWA</strong>: le modèle utilise Sliding Window Attention (Mistral, gemma-2).",
    "help.param.nparams":       "<strong>n_params</strong>: nombre total de paramètres. Seuil ~400M pour l'émergence des induction heads.",
    "help.add_models.title":    "Ajouter de nouveaux modèles (3 façons)",
    "help.add_models.preset":   "<strong>Liste de préréglages</strong>: 11 modèles populaires curés. Sélectionnez dans le dropdown.",
    "help.add_models.hf":       "<strong>HF Hub fetch</strong>: collez n'importe quel id (ex. <code>Qwen/Qwen2.5-32B-Instruct</code>), cliquez 📥 Charger. Le navigateur télécharge <code>config.json</code> directement de HuggingFace, remplit le formulaire. Fonctionne avec tout modèle public.",
    "help.add_models.manual":   "<strong>Manuel</strong>: remplissez les champs directement avec les valeurs de la model card.",
    "help.audit.title":         "La chaîne auditable",
    "help.audit.body":          "Chaque résultat montre la <strong>Chaîne de Calcul</strong> complète — chaque étape de formule avec ses entrées, sortie et interprétation. Cliquez sur n'importe quelle étape pour développer. Les références de section (§26.1, §19.1, etc.) renvoient au paper pour la dérivation.",
    "help.synthesis.title":     "La réponse en langage naturel",
    "help.synthesis.body":      "Après exécution de la chaîne déterministe, un LLM dans le navigateur (Qwen2.5-0.5B, ~350MB cachés après premier chargement) synthétise un résumé en langage naturel. Les nombres ci-dessus sont <em>toujours corrects</em> (Python déterministe) ; la synthèse est générée par LLM — vérifiez contre la chaîne en cas de doute.",
    "help.params.title":        "Paramètres communs expliqués",
    "help.verdicts.title":      "Quoi regarder dans les verdicts",
    "help.verdict.yes":         "<strong style=\"color:#3fb950;\">OUI / GO</strong> — procédez avec confiance ; les nombres soutiennent le choix.",
    "help.verdict.deg":         "<strong style=\"color:#d29922;\">DÉGRADÉ / TINY-MODEL</strong> — fonctionne avec caveats ; lisez l'action.",
    "help.verdict.no":          "<strong style=\"color:#f85149;\">NON / MEMORY-LIMITED</strong> — ne procédez pas tel quel ; mitigation fournie.",
    "help.privacy.title":       "Confidentialité",
    "help.privacy.body":        "Tout s'exécute dans votre navigateur. Pas de télémétrie, pas d'analytique, pas de données envoyées ailleurs. Même le modèle LLM s'exécute localement via WebGPU/WebAssembly. Vos model_ids et questions ne quittent jamais cette page.",
    "help.source.title":        "Code source et paper",
    "help.source.body":         "Code : <a href=\"https://github.com/karlesmarin/tafagent\" target=\"_blank\">github.com/karlesmarin/tafagent</a><br>Paper : <em>Marin 2026 — Predicting How Transformers Attend</em> (<a href=\"https://zenodo.org/records/19826343\" target=\"_blank\">Zenodo</a> ; arXiv à venir)<br>Dataset : <a href=\"https://huggingface.co/datasets/karlexmarin/taf-attention-decay\" target=\"_blank\">taf-attention-decay</a> — 58 mesures γ sur 32 modèles (CC-BY-4.0)",

    "footer.text":             "© 2026 Carles Marin · Apache-2.0 · recherche indépendante · l'outil qui ferme la boucle du paper.",
  },

  // ────────────────────────────────────────────────────────────────────────
  // ZH — 中文
  // ────────────────────────────────────────────────────────────────────────
  zh: {
    // §33 v0.4 (session 31, 2026-04-30) — 新诊断功能
    "v04.title":                  "🆕 v0.4 — 新诊断 (会话 31)",
    "v04.section.intro":          "会话 31 (2026-04-30) 从公式 cross-of-crosses 游戏 + 苏格拉底质询中得出的四个新诊断函数。在 <code>taf_browser.py</code> §33 中可用。",
    "v04.arch.label":             "架构集中度",
    "v04.arch.desc":              "γ_text ≈ γ_Padé − 0.012·n_kv。跨面板相关性定律（R²=0.30）。警告：不是逐模型预测器。",
    "v04.pdi.label":              "PDI — Padé 偏差指数",
    "v04.pdi.desc":               "PDI = d_horizon_obs/T_eval。交通灯：绿色（≈1）、橙色（>>1）、黄色（<<1）、红色（B 阶段负值）。",
    "v04.4bit.label":             "4 位精度移位预测器",
    "v04.4bit.desc":              "MHA: R²(bf16)<0.9 → γ 上升；R²>0.99 → γ 下降。GQA: 精度稳健。",
    "v04.crit.label":             "临界指数捆绑",
    "v04.crit.desc":              "ν_c、β_c、η_c (=γ−1, 已修正)、α_C、γ_susc，AM-GM 最小值在 γ=1−1/√2≈0.293。",

    // §34 v0.5 (会话 32, 2026-05-01) — 机器验证的代数一致性
    "v05.title":                  "🔬 v0.5 — 机器验证一致性 (会话 32)",
    "v05.section.intro":          "Sage Groebner basis + Lean Mathlib4 双工具验证 TAF 临界指数的<strong>15 个代数恒等式</strong>。首个具有形式化机器证明支持的 transformer-attention 框架。",
    "v05.verify.label":           "代数一致性检查",
    "v05.verify.desc":            "给定测得的 γ，验证 12 个 D-SAGE 恒等式（D-SAGE-1：2η²+η·γ_χ+1=0、β·χ=−1、α+χ=2 等）。全部通过 = 框架完整。失败表明 bf16 异常值 / 量化伪影。",
    "v05.dsage1.label":           "D-SAGE-1 (★★ 核心)",
    "v05.dsage1.desc":             "二次恒等式 2η² + η·γ_χ + 1 = 0（Sage Groebner 发现, Lean 验证）。取代错误的 '三重闭合' 主张。从代数上反驳 paper 1 的 η=2γ。",
    "v05.erratum.label":          "Paper 1 勘误 — η 修正",
    "v05.erratum.desc":            "Paper 1 原本声明 η = 2γ。Sage Groebner + Lean Mathlib4 证明此为失败（残差 (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ A 相）。正确值：η = γ−1，满足 D-SAGE-1。",
    "v05.repro.label":            "可重现性",
    "v05.repro.desc":              "全部 15 个定理在 Lean Mathlib4 中机器证明（build 成功 1973 jobs）。Sage 脚本：<code>analysis/sage_recursive_sweep_2026-04-30.sage</code>。Lean 代码：<code>lean_taf/taf/Taf/Identities.lean</code>。",

    // v0.5.1 — TAF Card consistency check button
    "v05.consistency.title":      "🔬 代数一致性检查 (Sage + Lean v0.5)",
    "v05.consistency.desc":       "验证 TAF 临界指数的 12 个 D-SAGE 代数恒等式（Sage Groebner basis + Lean Mathlib4 机器证明）。通过 = 框架完整。失败 = bf16 异常值 / 量化伪影。",
    "v05.consistency.btn":        "🔬 验证代数一致性",

    // v0.5.2 — Anti-Ising universality class badge
    "v05.antiising.badge":        "🧲 反 Ising 类 (β=γ−1&lt;0，机器验证)",

    // v0.5.2 — 每个恒等式的工具提示（通俗解释）
    "v05.tooltip.D_SAGE_1":       "二次代数恒等式，连接异常维度 η 和磁化率 γ_χ。Sage Groebner basis 发现的核心恒等式（机器证明）。取代了之前关于三重闭合的错误声明。",
    "v05.tooltip.D_SAGE_2":       "在 A 相中，β = γ−1 为负（反 Ising）。乘以 χ = 1/(1−γ) 恰好等于 −1。TAF 负 β 体制的标志。",
    "v05.tooltip.D_SAGE_4":       "比热指数 α 和磁化率 χ 在 TAF 中精确加和为 2。Josephson 超标度的代数推论。",
    "v05.tooltip.D_SAGE_5":       "线性恒等式：α + γ_χ = 2(2−γ)。意味着当 γ 接近 1（Hagedorn）时，总和接近 2；在 γ=0 时为 4。",
    "v05.tooltip.D_SAGE_6":       "序参量指数乘以磁化率指数等于 γ 的特定二次式。因式分解的代数关系。",
    "v05.tooltip.Rushbrooke_tautology": "标准 Rushbrooke 超标度 2β + γ_χ = ν·d 在 d=1。在 TAF 中这是一个重言式 — γ_χ 的定义就是为了使其成立。Sage Groebner basis 确认。",
    "v05.tooltip.Josephson_tautology": "标准 Josephson 超标度 2 − α = ν·d 在 d=1。在 TAF 中这是一个重言式 — α 的定义就是为了使其成立。",
    "v05.tooltip.Fisher_independent": "Fisher 关系 γ_χ = (2−η)·ν。在 TAF 中是独立的（不作为恒等式闭合，与三重闭合声明相反）。残差为 γ(2γ−3)/(1−γ)。",
    "v05.tooltip.eta_2gamma_REFUTED": "Paper 1 声称 η=2γ。这个恒等式驳斥了它：残差在整个 A 相中为正。Lean Mathlib4 的机器证明驳斥。",
    "v05.tooltip.D_14_nu_imprint": "学习到的印记斜率 ν = −1/(2π) 乘以 2π 得 −1。来自 paper 1 的简单维度检查。",
    "v05.tooltip.D_SAGE_7":       "中心电荷 c=3 乘以 |ν_imprint| 乘以 2π 得 3。连接 CFT 和训练印记的维度闭合。",
    "v05.tooltip.nu_beta_id":     "关联长度指数 ν 乘以序参量指数 β 在 A 相中得 −1。D-SAGE-2 的变体。",

    "v053.calibration.title":     "🔬 v0.5.3 — 校准审计 (2026-05-02)",
    "v053.calibration.note":      "<strong>SWA 修正已禁用</strong> — 原 δ_SWA = -0.21 基于 n=1 模型拟合（数据不足；唯一案例的均值为 +0.355）。<strong>post_IH 修正标记为探索性</strong> — 重审中组均值 ≈ 0（n=22 面板）未能复现 OLS 拟合。<strong>GQA 修正可复现</strong>（面板 +0.115 vs 硬编码 +0.11）。<strong>D_f 公式修正 Phase B (γ&gt;1)</strong> — 使用离散累积和代替连续近似。LLaMA-3、Mistral、Gemma 现在报告正确的压缩值。",
    "v053.release.banner":        "🔧 v0.5.3 — 审计驱动的修复：KV 压缩 D_f 现使用离散和（适用于所有 γ）；δ_SWA 禁用（n=1 校准）；论文 §5.2 C_V 系数勘误 (1/4 → 1/12)。",

    // §35 v0.6 — γ 预测 vs 观测 诊断
    "gamma_check.title":           "🔍 γ 预测 vs 观测",
    "gamma_check.desc":            "输入你经验测量的 γ。工具自动检测体制：欺诈 (θ 虚高) / 压缩 / 超 Padé / SWA-随机 / 正常。",
    "gamma_check.gobs_label":      "γ_观测",
    "gamma_check.gobs_tip":        "从模型注意力分数经验测量的 γ。使用 Diagnose CLI 从真实权重获取。",
    "gamma_check.random_label":    "随机语料？",
    "gamma_check.random_tip":      "若 γ_观测在随机/无结构 token 上测得请勾选。区分 SWA 签名 (γ_obs > 1) 与异常。",
    "gamma_check.regime":          "体制",
    "gamma_check.regime.normal":         "正常",
    "gamma_check.regime.fraud":          "欺诈 (θ 虚高)",
    "gamma_check.regime.compressed":     "上下文压缩",
    "gamma_check.regime.overpade":       "超 Padé",
    "gamma_check.regime.swa":            "SWA 签名 (随机语料)",
    "gamma_check.regime.unknown":        "未知",
    "gamma_check.regime.normal.desc":    "η ∈ [0.85, 1.15]：模型完全利用名义上下文，无异常。",
    "gamma_check.regime.fraud.desc":     "η < 0.01：名义 θ 虚高。模型表现如同 θ 远小于宣称值。可能是 YaRN/营销虚标，无真实上下文扩展。",
    "gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5)：上下文压缩 (模型注意距离比名义 θ 预测更短)。常见于 instruction-tuned / RLHF 模型。",
    "gamma_check.regime.overpade.desc":  "η > 1.5：模型注意距离超过 Padé 预测。可能是 Lerch 修正体制或欠训练早期 checkpoint。",
    "gamma_check.regime.swa.desc":       "随机语料上 γ_obs > 1.05 = 滑动窗口注意力签名 (Mistral / Gemma 系列)。",
    "gamma_check.regime.unknown.desc":   "输入超范围或 γ_obs > 1 但未标记随机语料。请核验测量。",
    "gamma_check.glossary.title":        "ⓘ 词汇表 — 变量含义",
    "gamma_check.glossary.gamma_pade":   "<strong>γ_Padé</strong>：闭式预测 (2−z)/(2+z), z = T√2/θ。论文 §sec:gamma_decomposition。",
    "gamma_check.glossary.gamma_obs":    "<strong>γ_观测</strong>：从注意力分数经验测得 (在真实权重上运行 Diagnose CLI)。",
    "gamma_check.glossary.theta_eff_obs":"<strong>θ_eff (观测)</strong>：由 γ_obs 反演 T√2 / (1 − γ_obs)。测量隐含的有效 θ。",
    "gamma_check.glossary.theta_eff_pade":"<strong>θ_eff (Padé)</strong>：θ + T/√2。闭式公式预测的有效 θ。",
    "gamma_check.glossary.efficiency":   "<strong>η</strong>：θ_eff_obs / θ_eff_Padé 比值。≈1 = 正常 · &lt;0.01 = 欺诈 · &lt;0.5 = 压缩 · &gt;1.5 = 超 Padé。",
    "gamma_check.glossary.delta_h":      "<strong>ΔH_Cardy</strong>：log(θ_eff_obs / θ_nominal)。Cardy 熵变。负值 = 压缩熵。~0 = 与名义匹配。",
    "gamma_check.glossary.regime":       "<strong>体制</strong>：基于 η + γ_obs + 随机语料标志的自动分类器。",

    // §36 v0.6 — 内联 ⓘ 图标提示
    "tooltip.gamma_pade":          "<strong>γ_Padé(T_eval)</strong>：闭式预测 (2−z)/(2+z), z = T√2/θ。论文 §sec:gamma_decomposition。",
    "tooltip.gamma_decomposed":    "<strong>γ_分解</strong>：基于完整架构分解的 γ。Padé 基线 + GQA 偏移 + post-IH 偏移 (校准审计已复制子集)。",
    "tooltip.d_horizon":           "<strong>d_horizon</strong>：有效注意力视野。超过此位置分数低于噪声底 (论文 §26)。",
    "tooltip.L_NIAH":              "<strong>L_NIAH 上限</strong>：当前 d_horizon 下针-在-干草堆检索可靠性的预测上限。",
    "tooltip.chi":                 "<strong>χ 易感性</strong>：χ = 1/(1−γ)。在 Hagedorn 线 γ=1 处发散。",
    "tooltip.kv_memory":           "<strong>KV 内存 @ T_eval (BF16)</strong>：每请求 KV 缓存 = 2 · n_layers · n_kv_heads · d_head · T_eval 字节。",
    "tooltip.theta_eff_obs":       "<strong>θ_eff (观测)</strong>：由 γ_观测 隐含的有效 θ：T√2 / (1 − γ_obs)。",
    "tooltip.theta_eff_pade":      "<strong>θ_eff (Padé)</strong>：闭式公式预测的有效 θ：θ + T/√2。",
    "tooltip.efficiency":          "<strong>η = θ_eff_obs / θ_eff_Padé</strong>：效率比。≈1 = 正常 · &lt;0.01 = 欺诈 · &lt;0.5 = 压缩 · &gt;1.5 = 超 Padé。",
    "tooltip.delta_h_cardy":       "<strong>ΔH_Cardy</strong>：log(θ_eff_obs / θ_nominal)。Cardy 熵变。负值 = 压缩熵。~0 = 与名义匹配。",
    "tooltip.verdict_aggregate":   "<strong>判定</strong>：所有配方中最差。✅ 通过 = 全绿 · ⚠ 降级 = ≥1 黄 · ❌ 否 = ≥1 红。",
    "tooltip.verdict_breakdown":   "<strong>各配方分解</strong>：每个配方测试一个<em>独立</em>的决策轴 (长上下文 · 预算 · 硬件 · 自训 vs API · KV 压缩)。X-1 上的 ❌ 表示「按你的量级用 API」而非「模型失败」——展开 Recipes 节查看各轴上下文。",
    "tooltip.gamma_pill":          "<strong>γ 头条</strong>：γ_分解 (或 γ_Padé 回退)。范围 (0,1) = 相位 A (反伊辛)。γ ≥ 1 = Hagedorn / 相位 B。",
    "tooltip.anti_ising":          "<strong>反伊辛类</strong>：相位 A → β = γ−1 &lt; 0。机器证明 (Sage + Lean Mathlib4)。见 §35 v0.5。",

    // §37 v0.6 — Lean+Mathlib 定理表
    "lean.table.title":            "📑 Lean+Mathlib 定理表",
    "lean.table.desc":             "下方每一项都已机器证明对 Lean 4 + Mathlib4。点击任意 L# 链接跳转到 GitHub 源码行。按主题分组——点击标题展开。",
    "lean.table.theorem":          "定理",
    "lean.table.claim":            "陈述",
    "lean.table.tactic":           "策略",
    "lean.table.source":           "出处",
    "lean.table.lean":             "Lean",
    "lean.findings.title":         "🔎 实质性发现",
    "lean.findings.detected_by":   "检测于",
    "lean.findings.fixed_by":      "修正于",
    "lean.findings.recommendation":"建议",
    "lean.meta.repo":              "仓库",
    "lean.meta.build":             "构建",
    "lean.meta.theorems":          "定理",
    "lean.meta.verified":          "已验证",
    "lean.meta.rejected":          "已拒绝",
    "lean.meta.sorry":             "sorry",
    "lean.meta.findings":          "项实质性发现",
    "lean.manifest.loading":       "正在加载 Lean 清单…",
    "lean.manifest.error":         "Lean 清单不可用",

    // 帮助弹窗 — v0.6 节
    "help.v06.title":              "🆕 v0.6 — γ 预测-vs-观测 + Cardy ΔH + Lean 徽章",
    "help.v06.intro":              "<em>v0.6 (2026-05-06)：三个新诊断位于 TAF 卡的 <strong>🔬 诊断</strong> 下。全部在浏览器运行；γ_观测来自在真实权重上运行 Diagnose CLI。</em>",
    "help.v06.layout.title":       "TAF 卡布局 (v0.6 新增)",
    "help.v06.layout.body":        "点击 <strong>🚀 生成完整画像</strong> 后，卡片展示：顶部一条 <strong>hero 条</strong> (架构类 + 元信息 + 3 个 pill：聚合判定 ✅/⚠/❌、γ 头条、🧲 反伊辛若处于相位 A) 和四个 <strong>可展开节</strong>：<strong>📋 配方</strong> (默认展开 — 各维度判定)、<strong>🔬 诊断</strong> (关键数字、γ 预测 vs 观测、what-if 浏览器)、<strong>✓ 验证</strong> (Sage+Lean 代数一致性、可证伪 F1-F23)、<strong>📂 来源与分享</strong> (校准审计 + JSON 下载 / 链接 / 注册表提交)。点击任意标题展开。每个变量都有内联 <strong>ⓘ</strong> 提示。",
    "help.v06.gamma_check.title":  "γ 预测 vs 观测",
    "help.v06.gamma_check.body":   "输入经验测量的 γ，工具计算 <strong>η = θ_eff_obs / θ_eff_Padé</strong> 并分类到 5 种体制之一：",
    "help.v06.case.normal":        "<strong>正常</strong> (η ∈ [0.85, 1.15]) — 模型完整使用名义上下文。<em>用例</em>：在采用前验证新发布。",
    "help.v06.case.fraud":         "<strong>欺诈</strong> (η &lt; 0.01) — 名义 θ 虚高；模型表现如同 θ ≪ 宣称值。<em>用例</em>：检测 YaRN/营销虚标 (CodeLlama / Mistral-Nemo 模式)。",
    "help.v06.case.compressed":    "<strong>压缩</strong> (η &lt; 0.5) — 上下文压缩；模型注意距离比名义 θ 短。<em>用例</em>：识别 RLHF/指令调优引起的压缩 (LLaMA-2 模式)。",
    "help.v06.case.overpade":      "<strong>超 Padé</strong> (η &gt; 1.5) — 模型注意距离超过 Padé 预测。<em>用例</em>：识别 Lerch 修正体制或欠训练早期 checkpoint (pythia-1b 模式)。",
    "help.v06.case.swa":           "<strong>SWA 随机语料</strong> (γ_obs &gt; 1.05 且 随机语料=是) — 滑动窗口注意力签名。<em>用例</em>：在随机 token 上确认 Mistral / Gemma SWA。",
    "help.v06.cardy.title":        "Cardy ΔH 诊断",
    "help.v06.cardy.body":         "<strong>ΔH_Cardy = log(θ_eff_obs / θ_nominal)</strong>。观测有效 θ 与名义 θ 之间的熵变。强负值 = 压缩熵；接近零 = 与名义匹配。在边界情况下补充 η。",
    "help.v06.lean.title":         "Lean + Mathlib 验证徽章",
    "help.v06.lean.body":          "TAF 恒等式在 Lean Mathlib4 中形式化机器证明：<strong>37 个定理</strong>分布于 7 组（Padé、RG 流、Cayley、D-SAGE、审计发现、CV 勘误、杂项）+ <strong>1 项实质性发现</strong>（V 导数 2 倍因子，定理 <code>V_derivative_ne_RG_beta</code>）。源：<a href=\"https://github.com/karlesmarin/lean-taf\" target=\"_blank\">github.com/karlesmarin/lean-taf</a>（commit 25c77fd）。本地重新验证：<code>git clone --depth=1 https://github.com/karlesmarin/lean-taf &amp;&amp; cd lean-taf &amp;&amp; lake exe cache get &amp;&amp; lake env lean Taf/Identities.lean</code>。Hero 中的 🧲 反伊辛 pill 与验证手风琴链接到具体源码行。",
    "help.v06.glossary.title":     "变量词汇表 (亦嵌入 TAF 卡)",
    "help.v06.glossary.body":      "TAF 卡中每个变量都有内联 ⓘ 提示。完整列表：γ、γ_Padé、γ_分解、γ_观测、θ、θ_eff_obs、θ_eff_Padé、η、ΔH_Cardy、χ、d_horizon、L_NIAH、KV 内存、体制。鼠标悬停任意 ⓘ 查看定义 + 论文章节。",

    "hero.title":     "🔬 TAF Agent",
    "hero.tagline":   "30 秒诊断任意 transformer LLM。免费。无需 GPU。无需注册。",
    "hero.subtitle":  "在你花钱或花时间<em>之前</em>，预测某个模型是否适合你的用例。所有计算在浏览器本地运行 &mdash; 你的输入永远不会离开此标签页。",
    "hero.help":      "📘 手册与示例",
    "hero.quickstart_btn": "⚡ 快速开始",
    "hero.inventory_btn":  "🧰 它能给你什么",
    "hero.about":     "由独立研究员构建。开源。不隶属于任何模型供应商。",

    "modes.title":    "🎯 模式",
    "modes.profile":  "📇 模型画像",
    "modes.compare":  "🆚 比较模型",
    "modes.inspector": "🔍 检查 config",
    "modes.ask":      "💬 自由提问",
    "modes.recipe":   "📋 选择配方",
    "modes.diagnose": "🩺 诊断 CLI",
    "diagnose.title": "🩺 诊断 CLI 命令生成器",
    "diagnose.tip":   "浏览器从 config 预测 γ；CLI 在真实权重上测量 γ_obs。此生成器产生在本地运行的精确命令。",
    "diagnose.desc":  "选择选项并将生成的命令复制粘贴到本地机器（Python + transformers + numpy）。快速模式 ≈5 分钟 CPU；完整 ≈20–60 分钟 GPU。",
    "diagnose.model_label": "HF 模型 id:",
    "diagnose.theta_label": "θ（留空自动）:",
    "diagnose.n_label": "上下文 N:",
    "diagnose.options_label": "选项:",
    "diagnose.opt_fast": "--fast（CPU，≈5 分钟）",
    "diagnose.opt_cpu": "--cpu（强制 CPU）",
    "diagnose.opt_4bit": "--load_in_4bit（≥7B 模型）",
    "diagnose.local_label": "--local 路径（可选）:",
    "diagnose.build_btn": "📋 生成命令",
    "diagnose.cmd_title": "生成的命令:",
    "diagnose.copy_btn": "📋 复制到剪贴板",
    "diagnose.next_steps": "下一步: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) 运行命令 (4) JSON 结果 → 通过 Inspect 模式上传以进行完整 TAF 分析。",
    "modes.phase":    "📊 相图",
    "phase.title":    "📊 相图（γ × θ）",
    "phase.tip":      "每个点是论文经验数据集中的一个模型。x 轴: log θ; y 轴: γ。Hagedorn 线 γ=1 分隔 A 相和 B 相。悬停查看详情，点击加载到表单。",
    "phase.desc":     "数据集中 23 个模型；Padé 曲线在 T=2000。",
    "modes.desc":     "<strong>最快开始</strong>: 粘贴任意 HuggingFace 模型 id (例如 <code>meta-llama/Meta-Llama-3-8B</code>),点击 画像。秒内看到所有 5 个配方的评分。",

    "profile.title":           "📇 模型画像",
    "profile.desc":            "<strong>面向技术人员</strong>: 当您需要候选模型的完整可行性快照时。一键运行所有 5 个配方,生成统一的 TAF 卡。",
    "profile.preset_label":    "预设:",
    "profile.preset_default":  "— 或从列表中选择 —",
    "profile.hf_label":        "HF 模型 id:",
    "profile.fetch_btn":       "📥 获取",
    "profile.btn":             "🚀 生成完整画像",
    "profile.quickstart":      "💡 快速开始: 选择任意预设 → 点击生成。或从 <a href='https://huggingface.co/models?library=transformers&sort=trending' target='_blank'>HF Hub 热门</a> 粘贴一个 id → 📥 获取 → 生成。",

    "compare.title":           "🆚 模型并排比较",
    "compare.desc":            "<strong>面向技术人员</strong>: 当为特定部署场景在 2-3 个候选模型之间选择时。同一配方,多个模型,并排判定。",
    "compare.recipe_label":    "配方:",
    "compare.T_eval_label":    "T_eval (目标上下文):",
    "compare.models_title":    "要比较的模型(最多 3 个)",
    "compare.btn":             "🚀 比较",
    "compare.example":         "💡 尝试: 粘贴 3 个流行的 7-8B 模型 (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B),配方 X-2, T_eval=16000。查看哪个最适合长上下文。",

    "ask.title":               "❓ 您的问题",
    "ask.placeholder":         "例如: Mistral-7B 能处理 16K NIAH 检索吗?或: 我有 5,000 美元,可以训练什么模型?或: 以每天 1 亿 tokens 提供 Llama-70B 的最便宜 GPU?",
    "ask.btn":                 "🚀 分析",
    "ask.example_btn":         "💡 尝试示例",

    "recipe.title":            "📋 配方",
    "recipe.default":          "— 选择一个配方 —",
    "recipe.input_title":      "🎯 输入",

    "verdict.title":           "📊 判定",
    "chain.title":             "🔍 计算链",
    "chain.desc":              "下面每个数字都是确定性 Python。点击步骤展开。",
    "answer.title":            "💬 自然语言回答",
    "share.btn":               "🔗 复制分享链接",
    "share.copied":            "✅ 已复制到剪贴板!",
    "share.download":          "💾 下载 JSON",
    "share.download_md":       "📝 Markdown",
    "share.download_tex":      "📜 LaTeX",
    "share.submit":            "📤 提交到 registry",
    "share.submit_clip_ok":    "↗ 已打开 GitHub。正文已复制到剪贴板——粘贴到 issue 正文。",
    "share.submit_clip_fail":  "↗ 已打开 GitHub。剪贴板被阻止——正文已写入浏览器控制台 (F12)。",
    "share.import_title":      "📂 导入共享的 TAF 结果",
    "a11y.skip":               "跳到主要内容",

    // v0.6.2 — landing 重构：快速开始 + 功能清单 + 架构提示
    "qs.title":                    "⚡ 快速开始",
    "qs.step1":                    "粘贴 HuggingFace 模型 ID（例如 <code>meta-llama/Meta-Llama-3-8B</code>）",
    "qs.step2":                    "点击 <strong>📇 Profile a model</strong>",
    "qs.step3":                    "查看你的 TAF Card — 各用例的判定 + 关键数值 + 经 Lean+Mathlib 验证的数学",
    "qs.cta":                      "↓ 立即开始",
    "inv.title":                   "🧰 这个工具能给你什么",
    "inv.recipes.title":           "🎯 8 个 recipe — 这个模型符合你的用例吗？",
    "inv.recipes.x1.title":        "自训练 vs API",
    "inv.recipes.x1.body":         "对你的流量哪个更便宜？",
    "inv.recipes.x2.title":        "长上下文",
    "inv.recipes.x2.body":         "能可靠处理 32k / 128k tokens 吗？",
    "inv.recipes.x3.title":        "预算",
    "inv.recipes.x3.body":         "用 $X，你能从零训练什么模型？",
    "inv.recipes.x5.title":        "硬件",
    "inv.recipes.x5.body":         "用什么 GPU 服务 N tokens/天？",
    "inv.recipes.x19.title":       "KV 缓存",
    "inv.recipes.x19.body":        "如何压缩而不破坏质量？",
    "inv.recipes.x21.title":       "Imprint 纯度",
    "inv.recipes.x21.body":        "模型的位置编码有多干净？",
    "inv.recipes.x22.title":       "Compute-context",
    "inv.recipes.x22.body":        "模型是否落入经验带？",
    "inv.recipes.x23.title":       "IH 相位",
    "inv.recipes.x23.body":        "induction-head 之前还是之后？",
    "inv.diag.title":              "🔬 诊断",
    "inv.diag.gamma":              "<strong>γ 预测 vs 观测</strong> — 自动分入 5 种状态（正常 · 欺诈/夸大上下文 · 压缩 · over-Padé · sliding-window）",
    "inv.diag.cardy":              "<strong>Cardy ΔH</strong> — 观测上下文与名义上下文之间的熵偏移",
    "inv.diag.fals":               "<strong>可证伪面板</strong> — 检查 23 个具体预测（F1–F23）",
    "inv.diag.alg":                "<strong>代数一致性</strong> — 模型必须满足的 8 条数学恒等式",
    "inv.verify.title":            "✓ 形式化验证的数学",
    "inv.verify.count":            "<strong>37 个定理</strong>已在 Lean 4 + Mathlib4 机器证明",
    "inv.verify.click":            "点击任意徽章 → 在 GitHub 打开源码行",
    "inv.verify.reverify":         "自行验证：<code>lake build</code>（缓存后 ≈5 秒）",
    "inv.export.title":            "📤 导出与分享",
    "inv.export.formats":          "<strong>JSON · Markdown · LaTeX</strong>（论文级）",
    "inv.export.share":            "可复现的分享链接（状态编入 URL）",
    "inv.export.registry":         "提交到 GitHub 上的社区登记",
    "arch.summary":                "支持的架构",
    "arch.anyhf":                  "✓ 任意 HuggingFace 公开模型",
    "tooltip.mha":                 "Multi-Head Attention：每个 token 位置同时通过多个并行 head 进行注意力计算。",
    "tooltip.gqa":                 "Grouped Query Attention：queries 共享比 heads 更少的 keys/values（节省内存但把 γ 推向 Hagedorn）。",
    "tooltip.alibi":               "Attention with Linear Biases：位置信息以学习斜率加到注意力分数，无旋转。",
    "tooltip.abspe":               "Absolute Position Embeddings：每个位置有一个固定的学习向量加到 token embedding。",
    "tooltip.swa":                 "Sliding Window Attention：每个 token 仅在固定局部窗口内做注意力（Mistral、gemma-2 使用此机制）。",
    "tooltip.ssm":                 "State Space Model：维护内部状态的序列层（取代注意力，Mamba、Jamba 使用此机制）。",

    // v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling 揭示器
    "modes.unmask":                "🪟 揭示",
    "unmask.title":                "🪟 上下文揭示器",
    "unmask.tip":                  "粘贴 HuggingFace 模型 id（或原始 config.json）。工具检测 sliding-window attention、RoPE 缩放（YaRN/linear/dynamic NTK）和 GQA — 所有使 <code>max_position_embeddings</code> 大于实际有效上下文的因素。Mistral-7B-v0.1 是经典例子：声称 32k，实际只在 ~4-8k 范围内做注意力。",
    "unmask.desc":                 "<strong>你即将为一个实际上注意力不到那么远的模型花钱吗？</strong> 粘贴 id，1 秒内得知。无需 GPU，无需推理 — 只是对 config.json 做算术。",
    "unmask.id_label":             "HF 模型 id：",
    "unmask.fetch_btn":            "🔍 揭示",
    "unmask.paste_summary":        "或粘贴原始 config.json（私有 / 在研模型）",
    "unmask.paste_btn":            "🔍 揭示已粘贴的 config",
    "unmask.label.declared":       "声明上下文",
    "unmask.label.effective":      "有效（估计）",
    "unmask.label.ratio":          "比率",
    "unmask.section.flags":        "架构标志",
    "unmask.section.warnings":     "警告",
    "unmask.section.reco":         "建议",
    "unmask.flag.swa":             "SWA",
    "unmask.flag.rope":            "RoPE 缩放",
    "unmask.flag.gqa":             "GQA",
    "unmask.flag.layers":          "层数",
    "unmask.flag.dhead":           "d_head",
    "unmask.flag.theta":           "RoPE θ",
    "unmask.flag.yes":             "是",
    "unmask.flag.no":              "否",
    "unmask.flag.full_mha":        "否（完整 MHA，{n} heads）",
    "unmask.verdict.honest":            "✅ 诚实",
    "unmask.verdict.inflated":          "⚠ 夸大",
    "unmask.verdict.severely_inflated": "❌ 严重夸大",
    "unmask.verdict.yarn_extended":     "⚠ YARN 扩展",
    "unmask.verdict.unknown":           "❓ 未知",
    "unmask.warn.swa_window":      "SWA 窗口：{window} tokens — 每层仅在此窗口内做注意力。",
    "unmask.warn.multihop":        "多跳估计：~{multiHop} tokens（保守：窗口 × {factor}）。",
    "unmask.warn.yarn":            "RoPE 缩放（{type}）将上下文从 ~{original} 扩展 {factor}× 到 {declared} tokens。",
    "unmask.warn.yarn_advice":     "RoPE 扩展的上下文 — 用 γ_check 诊断在声称的全长度验证 γ 行为。",
    "unmask.warn.gqa_small_dhead": "小 head dim（{d_head}）+ GQA：长上下文下 KV 缓存压缩很可能（γ 推向 Hagedorn）。",
    "unmask.reco.honest":              "标准全注意力模型。有效上下文与声明一致（{declared} tokens）。",
    "unmask.reco.inflated":            "通过 SWA 有效 ~{effective} tokens。用 γ_check 验证你目标长度的行为。",
    "unmask.reco.severely_inflated":   "实际把它当作 ~{effective} tokens 上下文模型。{declared} tokens 的声明仅通过跨层注意力链生效，经验上超过 ~2× SWA 窗口后会退化。",
    "unmask.reco.yarn_extended":       "RoPE 扩展上下文。运行长上下文 benchmark（NIAH 在 8k / 16k / 32k / 全长度）以确认扩展是否成立。用 γ_check 设 T_eval = {declared}。",
    "unmask.reco.unknown":             "无法解析 config。验证 URL 是带公开 config.json 的有效 HF 模型。",
    "unmask.status.empty_id":      "⚠ 输入一个 model id（例如 mistralai/Mistral-7B-v0.1）。",
    "unmask.status.fetching":      "⏳ 正在获取 {modelId} 的 config.json...",
    "unmask.status.success":       "✅ 已分析 {modelId}（判定：{verdict}）",
    "unmask.status.empty_paste":   "⚠ 请先粘贴 config.json。",
    "unmask.status.invalid_json":  "❌ JSON 无效：{error}",
    "unmask.status.success_paste": "✅ 已分析粘贴的 config（判定：{verdict}）",
    "unmask.pasted_label":         "（已粘贴 config）",
    "mode_desc.ask":               "输入自由问题。浏览器内的 LLM 选择正确的 recipe 并运行。",
    "mode_desc.recipe":            "直接选择一个 recipe 并填表。完整手动控制。",
    "mode_desc.profile":           "最快开始：粘贴任意 HuggingFace model id，点击 Profile。几秒内看到 5 个 recipe。",
    "mode_desc.compare":           "选择 2-3 个候选模型 + 一个 recipe。在表格中并排查看判定。",
    "mode_desc.inspector":         "直接粘贴 config.json。适用于未发布 HF Hub 的私有 / 在研模型。",
    "mode_desc.diagnose":          "构建 diagnose_model.py 的 CLI 命令，在真实 GPU 上测量 γ_obs。浏览器预测；CLI 测量。",
    "mode_desc.phase":             "论文经验面板的 γ × θ 散点图。悬停点查看详情，点击加载到 Diagnose / Recipe 表单。",
    "mode_desc.unmask":            "检测 max_position_embeddings 是否误导（SWA / YaRN / RoPE 缩放）。粘贴 model id，1 行判定。",
    "profile.preset_loaded":       "✅ 已为 <strong>{id}</strong> 加载预设。表单已预填。（点击 📥 Fetch 用 HF Hub 最新 config 覆盖。）",

    // v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer
    "modes.template":              "📜 Chat-template",
    "mode_desc.template":          "检测模型使用的 chat-template 系列（Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek）。给出 lm-eval / vLLM / transformers 的精确 CLI flag。",
    "template.title":              "📜 Chat-template 检测器",
    "template.tip":                "粘贴 HF 模型 id（或原始 tokenizer_config.json）。检测 chat-template 系列并给出正确使用的精确框架命令。如果忘记应用，lm-eval-harness 会让 accuracy 静默对半（issue #1841）。",
    "template.desc":               "<strong>忘了 <code>--apply_chat_template</code> 吗？</strong> 大多数 multi-turn eval 因为 chat template 未应用而失败 ~50%。粘贴 model id，获取你 stack 的精确 CLI flag。",
    "template.id_label":           "HF 模型 id：",
    "template.fetch_btn":          "📜 检测",
    "template.paste_summary":      "或粘贴原始 tokenizer_config.json（私有模型）",
    "template.paste_btn":          "📜 检测已粘贴 config",
    "template.label.family":       "检测到的系列",
    "template.label.markers":      "匹配的标记",
    "template.label.tpl_len":      "Template 长度",
    "template.section.warnings":   "警告",
    "template.section.commands":   "各框架命令",
    "template.section.raw":        "原始 template（预览）",
    "template.family.custom":      "自定义（未知系列）",
    "template.family.none":        "（无 chat_template）",
    "template.verdict.ok":         "✅ 已检测到 TEMPLATE",
    "template.verdict.custom":     "⚠ 自定义 TEMPLATE",
    "template.verdict.missing":    "❌ 无 CHAT TEMPLATE",
    "template.verdict.base_model": "ℹ 基础模型（无 chat）",
    "template.verdict.unknown":    "❓ 未知",
    "template.warn.no_chat_template": "tokenizer_config.json 中无 <code>chat_template</code> 字段。基础 / 仅预训练模型的典型情况。如果你期待 instruct-tuned 模型，可能加载了错误的文件。",
    "template.warn.custom_template":  "非标准 template（{length} 字符）。工具无法将其匹配到已知系列。检查下方预览并验证你的 eval 框架是否支持。",
    "template.warn.lm_eval_apply":    "<strong>lm-eval-harness：</strong>添加 <code>--apply_chat_template</code>，否则 multi-turn eval 上 accuracy 会静默下降 ~50%（issue #1841）。",
    "template.warn.vllm_apply":       "<strong>vLLM serve：</strong>验证 <code>--chat-template</code> 已设置（fine-tuned 变体的自动检测有时失败）。建议：<code>{name}</code>。",
    "template.status.empty_id":    "⚠ 输入 model id（例如 mistralai/Mistral-7B-Instruct-v0.3）。",
    "template.status.fetching":    "⏳ 正在获取 {modelId} 的 tokenizer_config.json...",
    "template.status.success":     "✅ 已检测 {modelId}（判定：{verdict}）",
    "template.status.empty_paste": "⚠ 请先粘贴 tokenizer_config.json。",
    "template.status.invalid_json":"❌ JSON 无效：{error}",
    "template.status.success_paste":"✅ 已检测粘贴的 config（判定：{verdict}）",
    "template.pasted_label":       "（已粘贴 tokenizer_config）",

    // v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor
    "modes.arena":                 "🎯 Arena CI",
    "mode_desc.arena":             "从原始 pairwise 投票数据中恢复置信区间（Bradley-Terry MLE + bootstrap）。检测公开 Arena 排行榜隐藏的统计上并列对。",
    "arena.title":                 "🎯 Arena-Elo CI 重建器",
    "arena.tip":                   "Chatbot Arena 在公开排行榜中删除了置信区间。5 Elo 的差距在统计上可能毫无意义。粘贴原始投票数据（model_a, model_b, winner） — 工具计算 Bradley-Terry MLE + bootstrap CI 并列出统计上的并列（CI 重叠）。",
    "arena.desc":                  "<strong>GPT-4 真的比 Claude 强吗 — 还是它们打平？</strong> 粘贴 pairwise 投票 CSV（或点击 <em>加载样本</em>）。Bradley-Terry MLE + 200 次 bootstrap → 排序 Elo + 95% CI + 统计并列检测。全部在浏览器中。",
    "arena.sample_btn":            "📊 加载样本数据",
    "arena.run_btn":                "🎯 计算 CIs",
    "arena.clear_btn":             "🗑️ 清空",
    "arena.csv_summary":           "投票 CSV（header：<code>model_a,model_b,winner</code>；winner ∈ a/b/tie）",
    "arena.section.ranked":        "排序 Elo 与 95% CI",
    "arena.section.ties":          "统计并列（CI 重叠）",
    "arena.section.summary":       "摘要",
    "arena.col.rank":              "#",
    "arena.col.model":             "模型",
    "arena.col.elo":               "Elo",
    "arena.col.ci":                "95% CI",
    "arena.col.ci_width":          "± 半宽",
    "arena.col.matches":           "对局",
    "arena.col.wins":              "胜 / 负 / 平",
    "arena.col.tie_pair":          "配对",
    "arena.col.tie_diff":          "Elo 差距",
    "arena.col.tie_overlap":       "CI 重叠",
    "arena.no_ties":               "无统计并列 — 所有配对在 95% CI 下可区分。",
    "arena.summary.votes":         "总投票数",
    "arena.summary.models":        "模型数",
    "arena.summary.ties":          "统计并列",
    "arena.summary.bootstrap":     "Bootstrap 迭代",
    "arena.summary.ci_level":      "CI 水平",
    "arena.status.empty":          "⚠ 粘贴投票 CSV 或点击加载样本。",
    "arena.status.too_few":        "⚠ 仅 {n} 个有效投票 — 需要至少 10 个才能可靠拟合 Bradley-Terry。",
    "arena.status.computing":      "⏳ 在 {n} 个投票上计算 Bradley-Terry MLE + bootstrap...",
    "arena.status.done":           "✅ {n} 投票 · {models} 模型 · {ties} 统计并列 · {ms} ms",
    "arena.status.sample_loaded":  "✅ 样本已加载（合成 6 模型 Arena 数据）。点击计算 CIs。",

    // v0.7.3 — anti-bullshit pack #4: Contamination Prior
    "modes.contam":                "🧪 污染",
    "mode_desc.contam":            "对 benchmark 分数是否被污染做贝叶斯式的先验估计。输入模型训练 cutoff → 评估 20+ 主流 benchmark（MMLU、GSM8K、HumanEval、MMLU-Pro…）。",
    "contam.title":                "🧪 污染先验",
    "contam.tip":                  "基于 (模型训练 cutoff 日期) × (benchmark 发布日期) × (已知语料库纳入 + 泄漏历史)，对 benchmark 分数是否被污染做贝叶斯式的先验估计。Open LLM Leaderboard v1 在 2024 年因 MMLU/HellaSwag 分数被污染而停用。",
    "contam.desc":                 "<strong>你应该相信你模型的 MMLU 分数吗？</strong> 输入模型训练 cutoff 日期 — 工具评估 20+ 主流 benchmark（MMLU、HellaSwag、GSM8K、HumanEval、IFEval、MMLU-Pro、GPQA…）并告诉你哪些分数可能被污染。",
    "contam.cutoff_label":         "训练 cutoff：",
    "contam.run_btn":              "🧪 评估所有 benchmark",
    "contam.section.ranked":       "Benchmark 污染先验",
    "contam.section.high":         "🔴 高风险 benchmark（视分数为不可信）",
    "contam.section.medium":       "🟡 中等风险（用替代品验证）",
    "contam.section.low":          "🟢 低风险（可能干净）",
    "contam.col.benchmark":        "Benchmark",
    "contam.col.released":         "发布",
    "contam.col.gap":              "差距（月）",
    "contam.col.prior":            "P(污染)",
    "contam.col.level":            "等级",
    "contam.col.corpora":          "在语料库",
    "contam.col.category":         "类别",
    "contam.label.high":           "高风险",
    "contam.label.medium":         "中",
    "contam.label.low":            "低",
    "contam.no_entries":           "（此类别中无）",
    "contam.advice.high":          "视这些分数为不可信。用更新 / 私有测试的替代品替换（MMLU-Pro、GPQA、MUSR、MATH-500）。",
    "contam.advice.medium":        "谨慎对待。在 held-out 子集或社区复现上寻找复制。",
    "contam.advice.low":           "分数可能未被污染，但没有泄漏不等于证明 — 仍要用替代测试交叉验证。",
    "contam.summary.headline":     "Cutoff <code>{cutoff}</code> · {n} 个 benchmark 已评估",
    "contam.status.empty":         "⚠ 输入模型训练 cutoff 日期（例如 2023-12）。",
    "contam.status.bad_date":      "⚠ 日期格式错误。使用 YYYY-MM 或 YYYY-MM-DD。",
    "contam.status.done":          "✅ Cutoff {cutoff} · {n} benchmarks 已评估 · {high} 个高风险",

    // v0.7 — Help modal section
    "help.v07.title":              "🆕 v0.7 — Anti-bullshit 套件（4 个新模式）",
    "help.v07.intro":              "<em>v0.7（2026-05-06）：四个新模式，解决 HuggingFace 社区报告的具体痛点。每个都在浏览器中运行，无推理 — 纯元数据 + 数学。</em>",
    "help.v07.unmask.title":       "🪟 上下文揭示器",
    "help.v07.unmask.body":        "检测 <code>max_position_embeddings</code> 何时具有误导性。Mistral-7B-v0.1 声称 32k 但通过 SWA 实际只在 ~4-8k 内做注意力。粘贴 HF 模型 id → 1 秒判定（诚实 / 夸大 / 严重夸大 / YARN 扩展）。捕获 SWA、RoPE-scaling（YaRN/linear/dynamic NTK）、小 d_head + GQA。<em>用例</em>：在为 32k 上下文付 GPU 钱之前，验证模型是否真的注意那么远。",
    "help.v07.template.title":     "📜 Chat-template 检测器",
    "help.v07.template.body":      "检测模型使用的 chat-template 系列（Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / 自定义 / 无）并给出 lm-evaluation-harness、vLLM、transformers 的精确 CLI flag。解决 lm-eval-harness 的 issue #1841：忘记 <code>--apply_chat_template</code> 会让 multi-turn accuracy 静默对半。<em>用例</em>：报告 benchmark 分数前，确认你正确应用了 template。",
    "help.v07.arena.title":        "🎯 Arena-Elo CI 重建器",
    "help.v07.arena.body":         "Chatbot Arena 在公开排行榜中删除了置信区间 — 5 Elo 的差距在统计上可能毫无意义。粘贴原始 pairwise 投票数据（model_a, model_b, winner）→ Bradley-Terry MLE + 200 次 bootstrap → 排序 Elo + 95% CI + \"统计并列\" 面板，列出 CI 重叠的配对。尝试加载样本按钮。<em>用例</em>：宣称 \"模型 A 胜过模型 B\" 之前，验证它们的 CI 不重叠。",
    "help.v07.contam.title":       "🧪 污染先验",
    "help.v07.contam.body":        "对 benchmark 分数是否被污染做贝叶斯式的先验估计。输入模型训练 cutoff 日期 → 工具按 P(污染) 评估 20+ 主流 benchmark（MMLU、HellaSwag、GSM8K、HumanEval、IFEval、MMLU-Pro、GPQA、AIME、MATH-500、BBH、MUSR…），基于时间差距、语料库纳入和已知泄漏历史。Open LLM Leaderboard v1 在 2024 年因 MMLU/HellaSwag 分数被污染而停用。<em>用例</em>：比较两个模型时决定相信哪些分数。",
    "help.v07.quant.title":        "⚖️ 量化机制分类器",
    "help.v07.quant.body":         "预测任意（模型 × 量化方案：NF4、AWQ、GPTQ、GGUF Q4_K_M / Q5_K_M / Q8_0、int8、FP8…）的 γ-shift 与 ΔPPL。架构感知：小 d_head + 激进 GQA → 更敏感；校准方案（AWQ）比未校准方案（NF4）更好地吸收偏移。检测到 cliff 时推荐更安全的替代方案。<em>用例</em>：量化之前，预测你的特定架构 × 方案组合是否能保持 PPL 可接受，否则给出具体的切换建议。",
    "help.v07.drift.title":        "🔀 跨框架 Drift 界",
    "help.v07.drift.body":         "同一模型，不同 setup 下分数不同。工具预测仅由数值噪声（dtype、framework、batch）允许的最大 drift。若观测差距超过它 → 真实 bug，通常是 chat-template mismatch（lm-eval-harness issue #1841）或 KV-cache 布局。试试 &quot;加载样本&quot; 按钮看典型的 chat-template bug。<em>用例</em>：在报告回归或声称可复现性之前，验证两个评估之间的差距是否大于数值噪声能解释的范围。",
    "inv.v07.drift":               "<strong>🔀 Drift</strong> — bug 还是噪声？预测两个评估间的最大可允许差距",
    "help.v07.niah.title":         "🔍 NIAH → Reasoning Gap",
    "help.v07.niah.body":          "RULER 论文（NVIDIA 2024）显示长上下文模型经常通过 NIAH（needle 检索）但在相同上下文上多跳 reasoning 失败。工具仅根据架构（γ_Padé + d_horizon + 架构压力：小 d_head、GQA、SWA）预测两种通过率，报告 gap，并找到模型 reasoning 保持 ≥65% 的\"安全 reasoning 上下文\"。扫描模式显示在 1k/4k/16k/64k/T_train 的曲线。<em>用例</em>：在声称的上下文部署之前，搞清楚模型是真的能在那里 reasoning 还是只能检索。",
    "inv.v07.niah":                "<strong>🔍 NIAH→Reason</strong> — 你的\"128k 上下文\"真的能在那里 reasoning，还是只能检索？",

    // v0.7 — Inventory modal, card 5
    "inv.v07.title":               "🆕 v0.7 anti-bullshit 套件",
    "inv.v07.unmask":              "<strong>🪟 Unmask</strong> — config.json 声称 32k？看它是否真的注意那么远",
    "inv.v07.template":            "<strong>📜 Chat-template</strong> — 精确 CLI flag，让 lm-eval 不会静默对半你的 accuracy",
    "inv.v07.arena":               "<strong>🎯 Arena CI</strong> — 恢复 Chatbot Arena 隐藏的置信区间",
    "inv.v07.contam":              "<strong>🧪 污染</strong> — 按污染概率对 20+ benchmark 评级",
    "inv.v07.quant":               "<strong>⚖️ Quant</strong> — 预测任意（模型 × 量化方案）组合的 γ-shift + ΔPPL",

    // v0.7.3 — anti-bullshit pack #5: Quant-regime classifier
    "modes.quant":                 "⚖️ Quant",
    "mode_desc.quant":             "预测任意（模型 × 量化方案）的 γ-shift 与 ΔPPL。架构感知：小 d_head + GQA → 更敏感。检测到 cliff 时推荐更安全的替代方案。",
    "quant.title":                 "⚖️ 量化机制分类器",
    "quant.tip":                   "预测给定（模型 × 量化方案）的 γ-shift（及由此产生的 ΔPPL）。\"AWQ 保留 ~95%\" 这类通用说法太模糊 — TAF 利用 d_head、GQA 比、SWA 标志和模型大小给出特定于架构的判定。解决：HF 社区普遍报告不可预测的量化 cliff（NF4 在 Phi-3 上 -2 PPL，但在 Llama-3-8B 上没问题）。",
    "quant.desc":                  "<strong>量化会破坏你的模型吗？</strong>粘贴 HF 模型 id，选择量化方案 — 获取预测的 γ-shift、预期 ΔPPL 区间，以及在 cliff 情况下的推荐替代方案。仅浏览器，无 GPU，无需校准集。",
    "quant.id_label":              "HF 模型 id：",
    "quant.fetch_btn":             "📥 获取 config",
    "quant.scheme_label":          "量化方案：",
    "quant.run_btn":                "⚖️ 预测",
    "quant.all_btn":               "📊 比较所有方案",
    "quant.regime.safe":           "✅ 安全",
    "quant.regime.mild":           "✅ 轻度压缩",
    "quant.regime.significant":    "⚠ 显著退化",
    "quant.regime.cliff":          "❌ 重大 CLIFF",
    "quant.label.gamma_shift":     "γ 偏移",
    "quant.label.delta_ppl":       "ΔPPL（估）",
    "quant.label.arch_mult":       "架构乘数",
    "quant.section.breakdown":     "细节分解",
    "quant.section.reco":          "建议",
    "quant.section.compare":       "所有方案（按安全性排序）",
    "quant.field.scheme":          "方案",
    "quant.field.calibrated":      "已校准",
    "quant.field.uncalibrated":    "未校准",
    "quant.field.base_penalty":    "基础惩罚",
    "quant.field.arch_mult_full":  "架构乘数",
    "quant.field.gamma_shift":     "预测 γ 偏移",
    "quant.field.ppl_band":        "ΔPPL 区间（估）",
    "quant.field.params":          "参数量",
    "quant.col.scheme":            "方案",
    "quant.col.bits":              "比特",
    "quant.col.gamma_shift":       "γ 偏移",
    "quant.col.ppl_band":          "ΔPPL 区间",
    "quant.col.regime":            "机制",
    "quant.reco.switch_to_awq":    "<strong>切换到 {scheme}</strong> — 校准的 4-bit 处理小 d_head + GQA 比 NF4 好得多。预期 ΔPPL 下降 ~2-3 倍。",
    "quant.reco.switch_to_q5_km":  "<strong>切换到 {scheme}</strong> — Q5 以低成本保留更多 head 维度（仅大约 25% 文件更大）。",
    "quant.reco.switch_to_q4_km":  "<strong>切换到 {scheme}</strong> — Q3/Q2 对此架构过于激进。",
    "quant.reco.consider_awq":     "<strong>考虑 {scheme}</strong> — 在此架构上校准能显著降低 γ-shift。",
    "quant.reco.use_higher_bits":  "<strong>使用更高比特的替代</strong> — 此架构无法干净吸收 4-bit。尝试 5 或 8-bit。",
    "quant.reco.verify_with_eval": "<strong>用真实 eval 验证</strong> — 预测偏移在边缘。部署前在目标上下文运行 NIAH。",
    "quant.reco.no_action":        "无需操作 — 此架构下量化是安全的。",
    "quant.summary.headline_all":  "<code>{modelId}</code> 的所有方案",
    "quant.status.empty_id":       "⚠ 输入 model id（例如 meta-llama/Llama-3.2-1B）。",
    "quant.status.fetching":       "⏳ 正在获取 {modelId} 的 config.json...",
    "quant.status.fetched":        "✅ 已获取 {modelId} 的 config。选择方案并点击预测（或比较所有）。",
    "quant.status.no_scheme":      "⚠ 从下拉中选择一个量化方案。",
    "quant.status.done":           "✅ 预测机制：{regime}",
    "quant.status.done_all":       "✅ 已比较 {n} 个方案 — 按安全性排序。",

    // v0.7.4 — HF Hub autocomplete: privacy + rate-limit
    "hf_auto.privacy":             "🔒 查询发送到 huggingface.co/api · 本地缓存 5 分钟",
    "hf_auto.rate_limited":        "⚠ HuggingFace 速率限制 — 稍后再试，或手动键入完整 model id",
    "hf_auto.gated_msg":           "是 gated 模型。在此接受许可证：",

    // v0.7.5 — anti-bullshit pack #6: cross-framework drift bound
    "modes.drift":                 "🔀 Drift",
    "mode_desc.drift":             "在给定（framework、dtype、batch、chat-template）下预测两个 benchmark 分数之间的最大允许 drift。区分真实 bug 与数值噪声。",
    "drift.title":                 "🔀 跨框架 Drift 界",
    "drift.tip":                   "同一模型，不同 setup 下分数不同。差距是噪声还是真实 bug？输入两个分数及其（framework、dtype、batch、chat-template）— 工具预测仅由数值噪声允许的最大 drift。若观测差距超过它 → 真实 bug，通常是 chat-template mismatch（lm-eval issue #1841）或 KV-cache 布局。",
    "drift.desc":                  "<strong>你的模型在 lm-eval-hf 给 67.2，在 vLLM-served 给 65.1。Bug 还是噪声？</strong> 输入两个分数及（framework、dtype、batch、是否应用 chat-template）。工具预测噪声区间并标记真实 bug。arxiv 2506.09501 将此记录为评估再现性的主要问题。",
    "drift.setup_a":               "Setup A",
    "drift.setup_b":               "Setup B",
    "drift.score":                 "分数",
    "drift.framework":             "框架",
    "drift.dtype":                 "Dtype",
    "drift.batch":                 "Batch",
    "drift.template":              "Chat-template",
    "drift.template.applied":      "已应用",
    "drift.template.not_applied":  "未应用",
    "drift.template.unknown":      "未知",
    "drift.run_btn":               "🔀 计算 drift 界",
    "drift.sample_btn":            "📊 加载样本（chat-template bug）",
    "drift.label.observed":        "观测差距",
    "drift.label.band":            "数值区间",
    "drift.label.ratio":           "差距 / 区间",
    "drift.section.setups":        "Setups",
    "drift.section.breakdown":     "Drift 贡献者（数值区间）",
    "drift.section.verdict":       "判定与建议",
    "drift.contrib.dtype":         "Dtype 不匹配",
    "drift.contrib.framework":     "框架",
    "drift.contrib.batch":         "Batch 差异",
    "drift.contrib.template":      "Chat-template 不匹配",
    "drift.dominant_cause":        "主导原因",
    "drift.cause.dtype":           "dtype 精度差异",
    "drift.cause.framework":       "框架 / 内核差异",
    "drift.cause.batch":           "按 batch 的归一化路径",
    "drift.cause.template_mismatch": "一侧应用了 chat-template 而另一侧没有（lm-eval-harness #1841 模式 — 多轮通常 -50%）",
    "drift.verdict.noise":         "✅ 数值噪声",
    "drift.verdict.suspicious":    "⚠ 可疑 — 验证",
    "drift.verdict.bug":           "❌ 真实 BUG — 调查",
    "drift.verdict.bug_template":  "❌ CHAT-TEMPLATE BUG",
    "drift.reco.noise":            "差距落在预期的数值噪声区间内。无需操作；差异与单独的 framework/dtype/batch 变化一致。",
    "drift.reco.suspicious":       "差距是预测区间的 1–2×。边缘——可能是真实 bug。尝试对齐主导贡献者（例如匹配框架或 dtype）并重新测试。",
    "drift.reco.bug":              "差距 &gt; 预测区间的 2×。这是真实 bug。检查主导贡献者 — 很可能是 tokenizer / chat-template / KV-cache 布局差异。用 <code>--apply_chat_template</code> 运行 lm-eval-harness 并确认。",
    "drift.reco.bug_template":     "检测到 chat-template 不匹配。这是评估差距大的最常见原因（lm-eval-harness issue #1841）。用 <code>--apply_chat_template</code> 重跑 &quot;未应用&quot; 一侧（或设置 vLLM <code>--chat-template &lt;name&gt;</code>）并重测。",
    "drift.status.empty_scores":   "⚠ 输入两个分数。",
    "drift.status.done":           "✅ 判定：{verdict}",
    "drift.status.sample_loaded":  "✅ 样本已加载（典型 chat-template bug）。点击计算 drift 界。",

    // v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor
    "modes.niah":                  "🔍 NIAH→Reason",
    "mode_desc.niah":              "在任意上下文下预测 NIAH（检索）与多跳 reasoning 通过率。解决：长上下文模型常常通过 NIAH 但在同一上下文上 reasoning 失败（RULER 论文）。",
    "modes.saturation":            "📈 饱和度",
    "mode_desc.saturation":        "告诉你某个 benchmark 是否仍能区分 frontier 模型，或者已经饱和（例如 MMLU 88-94% 顶部，AIME 2025 已经 96-100%）。返回 top-3 + 判定 + 推荐替代品。",
    "modes.hub":                   "🧭 方案",
    "mode_desc.hub":               "每个 LLM-eval 问题的地图 → tafagent 模式（若覆盖）+ 精选外部工具。找到方案而非重新发明。30+ 问题，7 类别。",
    "niah.title":                  "🔍 NIAH → Reasoning Gap",
    "niah.tip":                    "NIAH（Needle in a Haystack）测试检索：\"在长文本中找到这个事实\"。多跳 reasoning 测试推理：\"把开头的事实 X+Y 与结尾的事实 Z 结合\"。RULER 论文（NVIDIA 2024）显示长上下文模型经常通过 NIAH 但在相同上下文上 reasoning 失败。本工具仅根据架构预测两种通过率。",
    "niah.desc":                   "<strong>你的模型声称 128k 上下文。它在 64k 是真的能 reasoning，还是只能检索？</strong>粘贴 HF 模型 id 和目标 eval 上下文 — 工具预测 NIAH 与多跳 reasoning 通过率、gap，以及 reasoning 保持 ≥65% 的 \"安全上下文\"。",
    "niah.id_label":               "HF 模型 id：",
    "niah.fetch_btn":              "📥 获取 config",
    "niah.teval_label":            "目标上下文 (T_eval)：",
    "niah.run_btn":                "🔍 预测",
    "niah.sweep_btn":              "📊 扫描上下文",
    "niah.label.niah":             "NIAH 通过率",
    "niah.label.reasoning":        "Reasoning 通过率",
    "niah.label.gap":              "Gap",
    "niah.label.safe_ctx":         "Reasoning 安全上下文",
    "niah.section.breakdown":      "架构细节",
    "niah.section.reco":           "建议",
    "niah.section.sweep":          "按上下文长度扫描通过率",
    "niah.field.dhorizon":         "d_horizon（有效）",
    "niah.field.ratio":            "T_eval / d_horizon",
    "niah.field.arch_pressure":    "架构压力（小 d_head + GQA + SWA）",
    "niah.field.theta":            "RoPE θ",
    "niah.field.t_train":          "T_train（声称）",
    "niah.col.context":            "T_eval",
    "niah.col.niah":               "NIAH",
    "niah.col.reasoning":          "Reasoning",
    "niah.col.gap":                "Gap",
    "niah.col.verdict":            "判定",
    "niah.verdict.robust":         "✅ 稳健",
    "niah.verdict.marginal":       "⚠ 边缘",
    "niah.verdict.degraded":       "⚠ 退化",
    "niah.verdict.retrieval_only": "❌ 仅检索",
    "niah.verdict.broken":         "❌ 失效",
    "niah.reco.robust":            "在此上下文下检索与 reasoning 都稳定。可安全部署用于查询和推理任务。",
    "niah.reco.marginal":          "边缘。检索可用但 reasoning 不稳。用于事实查询，不要用于多步推理。",
    "niah.reco.degraded":          "Reasoning 显著下降。模型能找到事实但难以组合它们。在此长度下避免多跳任务。",
    "niah.reco.retrieval_only":    "RULER 的典型发现：模型通过 NIAH 但 reasoning 失败。适用于 RAG 设置（LLM 仅定位事实），不适用于链式推理。把上下文降到下方的 \"安全\" 值。",
    "niah.reco.broken":            "在此上下文下模型连基本检索都失败。视为 out-of-distribution — 在更短上下文重测。",
    "niah.safe_context":           "≤ {ctx} tokens（reasoning ≥ 65%）",
    "niah.safe_context_none":      "在你的目标以下没找到安全上下文 — 模型即使在小上下文也 reasoning 失败。",
    "niah.summary.sweep":          "<code>{modelId}</code> — 按上下文的通过率",
    "niah.status.empty_id":        "⚠ 输入 model id（例如 meta-llama/Llama-3.1-8B-Instruct）。",
    "niah.status.bad_teval":       "⚠ 输入目标上下文（≥ 512 tokens）。",
    "niah.status.fetching":        "⏳ 正在获取 {modelId} 的 config.json...",
    "niah.status.fetched":        "✅ 已获取 {modelId} 的 config。设置 T_eval 并点击预测（或扫描上下文）。",
    "niah.status.done":            "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%",
    "niah.status.sweep_done":      "✅ 已扫描 {n} 个上下文长度。",
    "saturation.title":            "📈 Benchmark 饱和度检测器",
    "saturation.tip":              "MMLU 已饱和（所有 frontier 模型 88-94%）。报告\"92% on MMLU\"现在毫无意义。本工具告诉你哪些 benchmark 仍能区分 frontier 模型，哪些已饱和，以及替代方案。数据：DemandSphere AI Frontier Tracker（CC BY-NC 4.0），2026-05 刷新。",
    "saturation.desc":             "<strong>你的 benchmark 还有用吗？</strong>选一个 benchmark 查看 top-3 frontier 分数、spread 与判定（saturated / near-saturated / discriminative），并给出推荐替代品。",
    "saturation.select_label":     "Benchmark：",
    "saturation.select.all":       "— 显示所有 benchmark —",
    "saturation.run_btn":          "📈 分类",
    "saturation.all_btn":          "📊 显示全部",
    "saturation.col.spread":       "Top-3 spread",
    "saturation.col.mean":         "Top-3 平均",
    "saturation.col.n":            "模型数",
    "saturation.col.bench":        "Benchmark",
    "saturation.col.verdict":      "判定",
    "saturation.col.reco":         "首选替代",
    "saturation.col.model":        "模型",
    "saturation.col.score":        "分数",
    "saturation.section.top3":     "Top-3 frontier 分数",
    "saturation.section.recommendations": "推荐替代品",
    "saturation.section.note":     "备注",
    "saturation.section.all":      "所有跟踪的 benchmark",
    "saturation.verdict.saturated":      "🚨 已饱和",
    "saturation.verdict.near_saturated": "⚠ 接近饱和",
    "saturation.verdict.discriminative": "✅ 仍可区分",
    "saturation.verdict.sparse_data":    "ℹ 数据稀疏",
    "saturation.borderline":       "边缘 — 在阈值切点的 ±1pp 内。判定视为\"需仔细核对\"。",
    "saturation.unknown":          "未知 benchmark。",
    "saturation.attribution":      "数据：DemandSphere AI Frontier Model Tracker（CC BY-NC 4.0）· HF Open LLM Leaderboard v3（开源权重历史）· 最近一次 fetch 2026-05-05。",
    "saturation.status.live":      "✅ 实时数据已加载 — {count} 个模型。",
    "saturation.status.baked":     "ℹ 使用 baked 快照（实时 fetch 不可用）。",
    "saturation.status.kb_fail":   "⚠ 无法加载饱和度 KB。",
    "saturation.status.done":      "✅ {name} — {verdict}",
    "saturation.status.all_done":  "✅ 已分类 {n} 个 benchmark。",
    "help.v08.saturation.title":   "📈 Benchmark 饱和度检测器",
    "help.v08.saturation.body":    "MMLU 已饱和（top 88-94%），AIME 2025 上线几个月就饱和，HumanEval 接近饱和。选任何 benchmark，工具返回 top-3 frontier 分数、spread、平均，以及判定 — saturated / near-saturated / discriminative — 加上推荐替代品（例如 MMLU → MMLU-Pro / GPQA / HLE）。可达时从 DemandSphere AI Frontier Tracker（CC BY-NC 4.0）实时 fetch；不可达时使用 2026-05-05 的 baked 快照。<em>用例</em>：在引用\"92% on MMLU\"或设计 eval 之前，检查 benchmark 是否仍能区分任何东西。",
    "inv.v08.saturation":          "<strong>📈 Saturation</strong> — 你的 benchmark 还有用吗，还是所有 frontier 都在顶部并列？",
    "inv.v081.hub":                "<strong>🧭 Solutions Hub</strong> — 每个文档化的问题都映射到一个 tafagent 模式或精选外部工具。别重复发明 — 去找。",
    "help.v081.hub.title":         "🧭 Solutions Hub",
    "help.v081.hub.body":          "tafagent 作为集成者而非孤岛。30+ 问题跨 7 类别（评估可靠性 · 诊断 · 设置 · 训练 · 检索 · 多模态 · 可观测性），每个映射到（a）解决它的 tafagent 模式（若存在），以及（b）社区已信任的最佳外部工具（RAGAS、MTEB、HELM、MCP Schema Validator、llm-stats、llguidance、GlitchMiner 等）。搜索框匹配 pain、场景和工具名称。<em>用例</em>：'我有问题 X — tafagent 解决它吗，如果不，谁解决？'",
    "hub.title":                   "🧭 Solutions Hub",
    "hub.tip":                     "我们已知的每个 LLM-eval 问题的地图：哪个 tafagent 模式能解决它（若有），以及社区已信任的最佳外部工具。目标：全覆盖。如果规范工具已在别处，我们链接而非重建。",
    "hub.desc":                    "<strong>别重新发明 — 去找。</strong>30+ 问题映射到 tafagent 模式 + 精选外部工具。按类别浏览、按关键字搜索，或查看新模式最有帮助的空缺。",
    "hub.clear_btn":               "✕ 清空",
    "hub.no_mode":                 "外部",
    "hub.planned":                 "计划：",
    "hub.best_for":                "适合",
    "hub.not_for":                 "不适合",
    "hub.tools":                   "外部工具",
    "hub.status.loaded":           "✅ 已加载 {total} 个问题，跨 {categories} 类别 — {covered} 个由 tafagent 模式覆盖，精选 {externalLinks} 个外部链接。编译于 {compiled}。",
    "hub.status.fail":             "⚠ 无法加载 Solutions Hub。",
    "hub.search.empty":            "无 '{query}' 的匹配。尝试更宽泛的词（如 'eval'、'rag'、'tokenizer'）。",
    "hub.search.results":          "为 '{query}' 找到 {n} 个匹配。",

    // v0.7.7 — Task tiles (UX redesign: the 14 modes grouped by user intent)
    "tiles.title":                 "🎯 你想做什么？",
    "tiles.subtitle":              "选择一项任务。每一项会打开下方对应的工具。或往下滚动查看完整的 14 个模式列表。",
    "tile.diagnose.title":         "🔬 诊断一个模型",
    "tile.diagnose.desc":          "这个具体模型符合我的用例吗？",
    "tile.trust.title":            "✓ 相信 benchmark 分数",
    "tile.trust.desc":             "我该相信这个数字吗？是 bug 还是噪声？",
    "tile.eval.title":              "⚙️ 正确设置 eval",
    "tile.eval.desc":              "获取 lm-eval / vLLM / transformers 的精确 CLI flag。",
    "tile.compare.title":          "🆚 比较模型",
    "tile.compare.desc":           "并排，或浏览经验模型面板。",
    "tile.manual.title":           "📋 手动 / 自由",
    "tile.manual.desc":            "手动挑一个具体 recipe，或用自然语言提问。",
    "tile.diagnose.tip":           "当你有具体的 model id 并想要完整诊断时从这里开始：<strong>Profile</strong> 一次运行所有 5 个 recipe。<strong>Unmask</strong> 检查 max_position_embeddings 是否诚实。<strong>NIAH→Reason</strong> 预测 retrieval-vs-reasoning 的 gap。<strong>Quant</strong> 预测量化是否会破坏它。<strong>Inspect</strong> 允许粘贴原始 config.json，适用于私有 / 在研模型。",
    "tile.trust.tip":              "当你看到一个分数想知道它是否可靠。<strong>Contamination</strong> 按模型在训练时看到 benchmark 的可能性给 20+ 个 benchmark 评级。<strong>Drift</strong> 告诉你两个 eval 之间的 gap 是数值噪声还是真实 bug（chat-template 不匹配、KV-cache 布局等）。<strong>Arena CI</strong> 重建 Chatbot Arena 隐藏的置信区间——很多 top-Elo 的 &quot;胜利&quot; 在统计上是并列。",
    "tile.eval.tip":               "在运行 lm-eval-harness 或 vLLM serve 之前，获取正确的 CLI flag。<strong>Chat-template Sniffer</strong> 检测 template 系列（Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none）并输出精确的 <code>--apply_chat_template</code> / <code>--chat-template</code> 调用。解决 lm-eval-harness 的 issue #1841（accuracy 静默对半）。<strong>Diagnose CLI</strong> 生成 Python 命令在你的本地 GPU 上测量 γ_obs。",
    "tile.compare.tip":            "<strong>Compare</strong>：选择 2-3 个候选模型 + 一个 recipe，在并排表格中看判定（例如 Llama-3-8B vs Mistral-7B 在 32k 上下文）。<strong>Phase diagram</strong>：23 个经验模型在 (log θ, γ) 平面上的散点图，叠加 Padé 曲线。悬停点查看详情，点击将该模型加载到 Recipe 表单。",
    "tile.manual.tip":             "<strong>Recipe</strong>：挑选具体的 X-N recipe（X-1 自训 vs API、X-2 长上下文、X-3 预算、X-5 硬件、X-19 KV 压缩、X-21 imprint、X-22 compute-context 不变量、X-23 IH 相位）并手动填表，完全控制。<strong>Ask</strong>：输入自由问题；浏览器内的 0.5B LLM（Qwen2.5）选择合适的 recipe 并运行。最适合 &quot;如果……会怎样&quot; 的探索。",
    "share.import_desc":       "有他人 TAF 分析的 JSON 文件? 在这里加载以本地查看判定 + 链。与您自己运行的视图相同。",
    "share.import_btn":        "📂 加载共享的 JSON",
    "synthesis.system":        "您是 transformer LLM 的精确诊断助手。给定预先计算的 TAF 公式结果,用 4-6 句中文写出清晰的摘要。为每个提到的数字引用章节号 (§X.Y)。始终给出具体建议。不要编造数字。",

    // INSPECTOR mode
    "inspector.title":         "🔍 架构检查器",
    "inspector.desc":          "粘贴 <code>config.json</code> 的原始内容。工具提取架构参数并运行完整的 5 配方 Profile。",
    "inspector.tip":           "<strong>直接粘贴任意 config.json</strong>。工具解析它并运行完整 Profile。适用于:私有模型、开发中的 configs、尚未在 HuggingFace 的模型,或比较自定义架构的行为。",
    "inspector.quickstart":    "💡 用例:您有未在 HF Hub 上的私有模型,或正在设计的 config。粘贴下面的原始 JSON,获取完整 TAF 画像。",
    "inspector.placeholder":   "{\n  \"model_type\": \"llama\",\n  \"rope_theta\": 500000,\n  \"max_position_embeddings\": 8192,\n  \"num_attention_heads\": 32,\n  \"num_key_value_heads\": 8,\n  \"hidden_size\": 4096,\n  \"num_hidden_layers\": 32\n}",
    "inspector.T_eval":        "T_eval (您的目标上下文):",
    "inspector.btn":           "🚀 检查并画像",

    // WHAT-IF slider
    "whatif.title":            "🎚 What-if: 拖动 T_eval 实时查看 γ 变化",
    "whatif.desc":             "纯 JS 重新计算 (不调用 Pyodide)。滑动时显示几何 γ_Padé 和 d_horizon。点击按钮重新运行完整链。",
    "whatif.T_eval":           "<strong>T_eval</strong>",
    "whatif.gamma_pade":       "<strong>γ_Padé</strong>",
    "whatif.d_horizon":        "<strong>d_horizon</strong>",
    "whatif.l_niah":           "<strong>L_NIAH 上限</strong>",
    "whatif.predicted":        "<strong>预测几何判定</strong>",
    "whatif.rerun":            "↻ 在此 T_eval 重新计算完整链",

    // COMMUNITY feed (recent registry submissions)
    "community.title":         "🌐 社区最近提交",
    "community.desc":          "公共 registry 的实时反馈。点击任意提交查看完整分析。",
    "community.browse_all":    "浏览全部 →",
    "community.loading":       "加载中...",
    "community.no_repo":       "Registry 仓库尚未创建。一旦它存在并有提交,它们将在此处实时显示。",
    "community.no_submissions": "暂无提交。成为第一个 — 生成一个 Profile 并点击 📤 提交到 registry。",

    // FALSIFICATION dashboard
    "falsification.title":     "🔬 论文预测 — 可证伪状态",
    "falsification.desc":      "TAF 框架基于可证伪的预测 (F1-F23)。每一个都经过经验测试。这是论文中每个预测的实时状态。",
    "falsification.summary":   "{confirmed} 已确认 · {partial} 部分 · {refuted} 已反驳 · {untested} 未测试 (共 {total} 个预测)",
    "falsification.col.id":    "ID",
    "falsification.col.claim": "Claim",
    "falsification.col.status": "状态",
    "falsification.col.evidence": "证据",

    "tafcard.title":           "📇 TAF 卡 — 完整模型画像",
    "tafcard.recipes_title":   "📋 配方 — 各维度判定",
    "tafcard.recipes_count_label": "维度",
    "tafcard.numbers_title":   "🔢 关键数字 (paper §26)",
    "tafcard.fals_title":      "🔬 可证伪状态 (F1-F23)",
    "tafcard.fals_none":       "无适用的可证伪。",
    "tafcard.diag_title":      "🔬 诊断 — 数字 · γ 检验 · what-if",
    "tafcard.verify_title":    "✓ 验证 — Lean + Sage + 可证伪",
    "tafcard.share_title":     "📂 来源与分享",
    "tafcard.whatif_title":    "🎚️ What-if 浏览器",
    "verdict.go":              "通过",
    "verdict.no":              "否",
    "verdict.degraded":        "降级",

    "compare.title_out":       "🆚 比较表",

    "status.loading_pyodide":  "⏳ 加载 Python 运行时 (~10MB,首次加载)...",
    "status.loading_taf":      "⏳ 加载 TAF 公式 + 配方...",
    "status.ready":            "✅ 就绪。选择一个模型并点击画像开始。",
    "status.computing":        "🧮 计算 TAF 链...",
    "status.done":             "✅ 完成。",

    "profile.hf_placeholder":  "例如: meta-llama/Meta-Llama-3-8B 或 Qwen/Qwen2.5-7B",
    "compare.hf_placeholder":  "HF 模型 id (例如: meta-llama/Meta-Llama-3-8B)",
    "compare.slot1_placeholder": "HF 模型 id (例如: meta-llama/Meta-Llama-3-8B)",
    "compare.slot2_placeholder": "HF 模型 id #2",
    "compare.slot3_placeholder": "HF 模型 id #3 (可选)",
    "compare.preset_default": "— 或预设 —",

    // Form parameters
    "param.theta":         "θ (rope_theta)",
    "param.theta.tip":     "<strong>RoPE 基础频率</strong> 来自 <code>config.rope_theta</code>。越高 = 长程能力越强。",
    "param.T_train":       "T_train",
    "param.T_train.tip":   "<strong>训练最大上下文</strong>。来自 <code>max_position_embeddings</code>。超出此范围属于外推。",
    "param.T_eval":        "T_eval (您的目标)",
    "param.T_eval.tip":    "<strong>您的目标推理上下文</strong>。关键问题: 模型在 <em>这个</em> 长度下表现是否良好?",
    "param.n_attn":        "n_attention_heads",
    "param.n_attn.tip":    "<strong>每层 attention heads 数</strong>。来自 <code>num_attention_heads</code>。",
    "param.n_kv":          "n_kv_heads",
    "param.n_kv.tip":      "<strong>KV heads</strong>。若 &lt; n_attention_heads → GQA (Grouped Query Attention)。降低 KV 内存但将 γ 推向 Hagedorn。",
    "param.d_head":        "head_dim",
    "param.d_head.tip":    "<strong>每 head 维度</strong>。典型 64、96、128。来自 <code>head_dim</code> 或 <code>hidden_size / num_attention_heads</code>。",
    "param.n_layers":      "n_layers",
    "param.n_layers.tip":  "<strong>Transformer 块数</strong>。来自 <code>num_hidden_layers</code>。",
    "param.n_params":      "n_params (例如 8e9)",
    "param.n_params.tip":  "<strong>总参数量</strong>。约 400M 阈值出现 induction heads。影响 KV 内存和预算配方。",
    "param.has_swa":       "有 SWA 吗?",
    "param.has_swa.tip":   "<strong>Sliding Window Attention</strong>。Mistral、gemma-2、phi-3 为 <code>true</code>。v0.5.3 校准审计禁用了历史 δ_SWA 校正 (n=1 拟合)。",
    "common.yes":          "是",
    "common.no":           "否",

    // Mode tooltips
    "modes.tip":           "<strong>十四种使用方式</strong>。<br><strong>📇 画像</strong>: 粘贴模型 id → 5 个配方的 TAF 卡。<br><strong>🆚 比较</strong>: 2-3 个模型在一个配方上并排比较。<br><strong>🔍 检查 config</strong>: 粘贴原始 config.json → 完整画像。<br><strong>💬 提问</strong>: 自由形式问题,浏览器 LLM 选择配方。<br><strong>📋 配方</strong>: 手动选择,完全控制表单。<br><strong>🩺 CLI 诊断</strong>: 生成 Python 命令在本地测量 γ。<br><strong>📊 相图</strong>: 23 个面板模型在 (log θ, γ) 平面上。<br><strong>🪟 揭示</strong>: 检测误导的 max_position_embeddings（SWA / YaRN / RoPE 缩放）。<br><strong>📜 Chat-template</strong>: 检测系列 + 给出 lm-eval / vLLM / transformers 的精确 CLI flag。<br><strong>🎯 Arena CI</strong>: 从原始 pairwise 投票数据重建置信区间；检测 Arena 隐藏的统计并列。<br><strong>🧪 污染</strong>: 根据训练 cutoff 与发布日期，对 20+ benchmark 进行污染概率评估。<br><strong>⚖️ Quant</strong>: 预测任意（模型 × 量化方案）的 γ-shift 与 ΔPPL；cliff 时推荐更安全替代方案。<br><strong>🔀 Drift</strong>: 同一模型，两 setup 下分数不同 — bug 还是噪声？预测数值噪声区间并标记真实 bug。<br><strong>🔍 NIAH→Reason</strong>: 从架构预测 NIAH 与多跳 reasoning 通过率；找到模型的安全 reasoning 上下文。",
    "profile.tip":         "<strong>一键完整诊断</strong>。粘贴任意 HF 模型 id (或选择预设)。工具运行所有 5 个配方 (长上下文、KV 压缩、自定义 vs API、预算、硬件),生成单个 <strong>TAF 卡</strong>,显示每个维度的判定 + 关键数字 + 架构分类。<br><br><strong>用例</strong>: \"我正在为生产评估 Qwen2.5-32B — 它的完整可行性概况是什么?\" → 粘贴 id → 画像 → 完成。",
    "compare.tip":         "<strong>同一配方,多个模型</strong>。选择 2-3 个候选模型和一个配方。在单个比较表中查看判定。<br><br><strong>用例</strong>: \"我需要在 16K 进行长上下文检索 — 哪个最好: Llama-3-8B、Mistral-7B 或 Qwen-7B?\" → 选择 3 个 + X-2 + 16K → 看赢家。",

    // 帮助模态框
    "help.title":               "📘 TAF Agent — 用户手册",
    "help.what.title":          "它做什么?",
    "help.what.body":           "在<em>花费 GPU/$ 之前</em>,预测任意 transformer LLM 的<strong>实际可行性</strong>。回答诸如 \"这个模型能在 L=32K 工作吗?\" 或 \"我应该自定义训练还是使用 API?\" 等问题,使用确定性 Python 公式 (TAF — Thermodynamic Attention Framework)。",
    "help.modes.title":         "如何使用 — 7 种模式",
    "help.modes.profile":       "<strong>📇 画像</strong>: 粘贴模型 id → 同时运行所有配方 = TAF 卡。<strong>最佳起点</strong>。",
    "help.modes.compare":       "<strong>🆚 比较</strong>: 2-3 个模型在同一配方上并排。最适合在候选者之间选择。",
    "help.modes.inspector":     "<strong>🔍 检查 config</strong>: 粘贴原始 <code>config.json</code> → 工具解析并运行完整画像。适用于私有模型、开发中的配置、或尚未在 HF Hub 上的模型。",
    "help.modes.ask":           "<strong>💬 自由提问</strong>: 自然语言问题,浏览器 LLM 选择配方。最适合随意探索。",
    "help.modes.recipe":        "<strong>📋 配方 + 表单</strong>: 手动选择,完全控制参数。最适合需要精确控制时。",
    "help.modes.diagnose":      "<strong>🩺 CLI 诊断</strong>: 生成 Python 命令在你的本地机器上测量 γ (transformers + numpy)。快速 ≈5 分钟 CPU;完整 ≈20–60 分钟 GPU。结果 JSON 可通过 Inspect 重新上传。",
    "help.modes.phase":         "<strong>📊 相图</strong>: 23 个面板模型在 (log θ, γ) 平面上的散点图。Hagedorn 线 γ=1 分隔 A 相和 B 相。点击点将该模型加载到配方表单。",
    "help.recipes.title":       "可用的 8 个配方",
    "help.recipe.x1.title":     "<strong>X-1 自定义训练 vs API</strong> — 比较训练自己模型的成本与付费使用 API 的成本。",
    "help.recipe.x1.example":   "尝试: <em>\"我应该训练 8B 自定义模型还是使用 GPT-4o 处理每月 50M tokens?\"</em><br>答案: 是 (自定义) / 否 (API),含损益平衡月数。",
    "help.recipe.x2.title":     "<strong>X-2 长上下文可行性</strong> — 预测模型是否能可靠地服务目标上下文长度。",
    "help.recipe.x2.example":   "尝试: <em>\"Meta-Llama-3-8B 能处理 32000 tokens 检索吗?\"</em><br>链: γ_Padé → 分解 → d_horizon → NIAH 上限 → 幻觉 → KV 内存。<br>判定: 是 / 降级 / 否,如需则提供缓解措施。",
    "help.recipe.x3.title":     "<strong>X-3 预算预飞行</strong> — 给定 $ 预算,可行训练什么模型?",
    "help.recipe.x3.example":   "尝试: <em>\"我有 $5000,可以训练什么模型?\"</em><br>答案: GO / TINY-MODEL / MEMORY-LIMITED 含具体的 N (参数) 和 D (tokens)。",
    "help.recipe.x5.title":     "<strong>X-5 硬件选择</strong> — 应该使用哪个 GPU 以达到目标吞吐量?",
    "help.recipe.x5.example":   "尝试: <em>\"以每天 1000 万 tokens 提供 Llama-3-8B 的最便宜硬件\"</em><br>答案: 最佳 GPU + $/Mtok + 容量 vs 目标。",
    "help.recipe.x19.title":    "<strong>X-19 KV 压缩决策</strong> — 应该使用 soft decay、hard cutoff 还是文献方法?",
    "help.recipe.x21.title":    "<strong>X-21 Imprint 纯度诊断</strong> — 通过 ν=−1/(2π) 预测 RANDOM token 上的 γ；模型的 RoPE 预测有多干净?",
    "help.recipe.x22.title":    "<strong>X-22 Compute-Context 不变量</strong> — γ × log(N²·D) 是否落在 51.2 ± 16.8 区间内?检测 scaling/training 异常。",
    "help.recipe.x23.title":    "<strong>X-23 IH-Phase 检测器</strong> — 前- 还是后-induction-head?通过 sign(γ_text − γ_random) 进行廉价探测。",
    "help.recipe.x19.example":  "尝试: <em>\"如何为 Qwen2.5-7B 在 32K 压缩 KV 缓存?\"</em><br>答案: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.",
    "help.recipe.x21.example":  "尝试: <em>\"Llama-3-8B 上的 RoPE 预测有多干净?\"</em><br>答案: 预测的 γ_random + 诊断 (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED)。",
    "help.recipe.x22.example":  "尝试: <em>\"Mistral-7B 是否符合 compute-context 不变量?\"</em><br>答案: K = γ·log(N²·D)、z-score、IN-BAND 或 OUTLIER。",
    "help.recipe.x23.example":  "尝试: <em>\"Qwen2.5-7B 是后-induction-head 吗?\"</em><br>答案: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY。",
    "help.section.v04":         "<strong>v0.4 新增</strong> (第 29 次研究会话, 2026-04-28): 来自 cross-model panel 分析 (n=22 LLMs) 的三个诊断 recipes。",
    "help.divider.v04_s29":     "— v0.4 (第 29 次会话发现) —",
    "footer.tech_stack":        "计算：Pyodide · 综合：WebLLM (Qwen2.5-0.5B 本地) · 托管：GitHub Pages · 成本：$0",
    "help.v04.imprint":         "<strong>学习印记斜率 ν = −1/(2π)</strong>: RoPE 旋转周期 2π 在权重上引发位置偏置, 与 log(N_params) 成正比。即使 random token 也显示此 scaling。ν 是 DERIVED — 非拟合 (经验误差 0.3%)。",
    "help.v04.invariant":       "<strong>Chinchilla-attention 不变量 K</strong>: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329)。将 compute scaling 和 attention 指数连接为单一无量纲数。",
    "help.v04.ih_probe":        "<strong>Δγ 作为 IH 探测</strong>: sign(γ_text − γ_random) > 0 ⟺ post-induction-head。比运行 in-context-learning 基准更便宜。",
    "help.v04.constants":       "<strong>γ 簇落在著名常数上</strong> (有趣, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (黄金共轭, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e)。Caveat: 可能是巧合。",
    "help.param.theta":         "<strong>θ (rope_theta)</strong>: RoPE 基础频率。越高 = 长程能力越强。典型: 10000 (早期),500000 (Llama-3),1000000 (Qwen2.5)。",
    "help.param.T_train":       "<strong>T_train</strong>: 模型训练时的最大上下文。来自 <code>max_position_embeddings</code>。",
    "help.param.T_eval":        "<strong>T_eval</strong>: <em>您的</em> 目标推理上下文长度。关键旋钮。",
    "help.param.gqa":           "<strong>n_kv_heads &lt; n_attention_heads</strong>: 模型使用 GQA (Grouped Query Attention)。减少 KV 内存但将 γ 推向 Hagedorn。",
    "help.param.swa":           "<strong>has_SWA</strong>: 模型使用 Sliding Window Attention (Mistral、gemma-2)。",
    "help.param.nparams":       "<strong>n_params</strong>: 总参数数量。诱导头出现的阈值约 400M。",
    "help.add_models.title":    "添加新模型 (3 种方式)",
    "help.add_models.preset":   "<strong>预设列表</strong>: 11 个流行模型已策划。从下拉菜单选择。",
    "help.add_models.hf":       "<strong>HF Hub 获取</strong>: 粘贴任意 id (例如 <code>Qwen/Qwen2.5-32B-Instruct</code>),点击 📥 获取。浏览器直接从 HuggingFace 下载 <code>config.json</code>,填充表单。适用于任何公共模型。",
    "help.add_models.manual":   "<strong>手动</strong>: 用模型卡的值直接填充表单字段。",
    "help.audit.title":         "可审计链",
    "help.audit.body":          "每个结果都显示完整的<strong>计算链</strong> — 每个公式步骤及其输入、输出和解释。点击任意步骤展开。引用的章节号 (§26.1、§19.1 等) 指向论文中的推导。",
    "help.synthesis.title":     "自然语言回答",
    "help.synthesis.body":      "在确定性链运行后,浏览器中的 LLM (Qwen2.5-0.5B,首次加载后约 350MB 缓存) 综合自然语言摘要。上面的数字<em>始终正确</em> (确定性 Python);综合由 LLM 生成 — 如有疑问,请对照链验证。",
    "help.params.title":        "常见参数解释",
    "help.verdicts.title":      "判定中要看什么",
    "help.verdict.yes":         "<strong style=\"color:#3fb950;\">是 / GO</strong> — 自信地继续;数字支持选择。",
    "help.verdict.deg":         "<strong style=\"color:#d29922;\">降级 / TINY-MODEL</strong> — 有警告地工作;阅读操作。",
    "help.verdict.no":          "<strong style=\"color:#f85149;\">否 / MEMORY-LIMITED</strong> — 不要按原样进行;提供缓解措施。",
    "help.privacy.title":       "隐私",
    "help.privacy.body":        "一切都在您的浏览器中运行。无遥测,无分析,无数据发送到任何地方。即使是 LLM 模型也通过 WebGPU/WebAssembly 在本地运行。您的 model_ids 和问题永不离开此页面。",
    "help.source.title":        "源代码和论文",
    "help.source.body":         "源代码: <a href=\"https://github.com/karlesmarin/tafagent\" target=\"_blank\">github.com/karlesmarin/tafagent</a><br>论文: <em>Marin 2026 — Predicting How Transformers Attend</em> (<a href=\"https://zenodo.org/records/19826343\" target=\"_blank\">Zenodo</a>; arXiv 即将)<br>数据集: <a href=\"https://huggingface.co/datasets/karlexmarin/taf-attention-decay\" target=\"_blank\">taf-attention-decay</a> — 32个模型上的58次γ测量 (CC-BY-4.0)",

    "footer.text":             "© 2026 Carles Marin · Apache-2.0 · 独立研究 · 闭合论文回路的工具。",
  },
};

// Code of the language currently used for lookups. Starts as English and is
// reassigned by setLang() and initI18n().
let currentLang = "en";

/**
 * Returns the code of the language currently used for lookups.
 *
 * @returns {string} The current value of the module-level `currentLang`.
 */
export function getLang() {
  return currentLang;
}

/**
 * Switches the active language, persists the choice and re-renders the page.
 *
 * The call is a no-op when `code` is not a language table defined directly on
 * `TRANSLATIONS`. An own-property check is used so that names inherited from
 * `Object.prototype` (e.g. "constructor", "toString") are not mistaken for
 * languages.
 *
 * @param {string} code - Language code, e.g. "en" or "zh".
 * @returns {void}
 */
export function setLang(code) {
  if (!Object.prototype.hasOwnProperty.call(TRANSLATIONS, code)) return;

  currentLang = code;

  try {
    localStorage.setItem("tafagent_lang", code);
  } catch (e) {
    // Persisting is best effort: the language still changes for this page
    // view even when storage access throws.
  }

  applyTranslations();

  // Highlight the flag that matches the new language and clear the others.
  document.querySelectorAll("[data-lang]").forEach((el) => {
    el.classList.toggle("lang-active", el.dataset.lang === code);
  });
}

/**
 * Looks up the translation for `key`.
 *
 * Resolution order: the active language's table, then the English table, then
 * the key itself. Only own properties are consulted, so keys that collide with
 * `Object.prototype` members (e.g. "toString") fall through to the key instead
 * of returning an inherited function. A `null`/`undefined` entry is treated as
 * missing; an empty string is a valid translation.
 *
 * @param {string} key - Translation key, e.g. "hero.title".
 * @returns {string} The translated string, or `key` when nothing is defined.
 */
export function t(key) {
  // Own-property read that tolerates a missing table.
  const own = (obj, prop) =>
    obj != null && Object.prototype.hasOwnProperty.call(obj, prop) ? obj[prop] : undefined;

  return own(own(TRANSLATIONS, currentLang), key) ?? own(own(TRANSLATIONS, "en"), key) ?? key;
}

/**
 * Re-renders every translatable element in the document for the active
 * language.
 *
 * Elements carrying `data-i18n` get their markup replaced with the looked-up
 * string; elements carrying `data-i18n-placeholder` get their `placeholder`
 * replaced.
 *
 * @returns {void}
 */
export function applyTranslations() {
  // Translation strings are authored in this file and may contain markup, so
  // they are written as HTML rather than as plain text.
  for (const node of document.querySelectorAll("[data-i18n]")) {
    node.innerHTML = t(node.dataset.i18n);
  }

  for (const field of document.querySelectorAll("[data-i18n-placeholder]")) {
    field.placeholder = t(field.dataset.i18nPlaceholder);
  }
}

// Publish two entry points on `window` when one exists:
//  - __taf_applyTranslations lets dynamically-inserted DOM (renderProfile,
//    renderCompare) re-apply translations;
//  - __taf_t gives non-import-based modules (e.g. hf_autocomplete, which runs
//    outside the main.js context) the lookup without a circular import.
if (typeof window !== "undefined") {
  Object.assign(window, {
    __taf_applyTranslations: applyTranslations,
    __taf_t: t,
  });
}

/**
 * Chooses the initial language and renders the page in it.
 *
 * Selection order:
 *  1. the code stored under "tafagent_lang" in localStorage, if it names a
 *     language table;
 *  2. otherwise the first entry of the browser's preference list
 *     (`navigator.languages`, falling back to `navigator.language`) whose
 *     two-letter prefix names a language table;
 *  3. otherwise `currentLang` is left unchanged.
 *
 * Codes are matched against own properties of `TRANSLATIONS` only, so a stored
 * value such as "constructor" is not mistaken for a language.
 *
 * @returns {void}
 */
export function initI18n() {
  const isSupported = (code) =>
    typeof code === "string" && Object.prototype.hasOwnProperty.call(TRANSLATIONS, code);

  let stored = null;
  try {
    stored = localStorage.getItem("tafagent_lang");
  } catch (e) {
    // Reading the preference is best effort; fall through to browser detection
    // when storage access throws.
  }

  if (isSupported(stored)) {
    currentLang = stored;
  } else {
    // Walk the whole preference list rather than only the first entry, so a
    // supported second choice wins over the English default.
    const preferred =
      Array.isArray(navigator.languages) && navigator.languages.length > 0
        ? navigator.languages
        : [navigator.language || "en"];
    const match = preferred
      .map((tag) => String(tag).slice(0, 2).toLowerCase())
      .find((code) => isSupported(code));
    if (match) currentLang = match;
  }

  applyTranslations();

  // Mark the flag that matches the chosen language and clear the others.
  document.querySelectorAll("[data-lang]").forEach((el) => {
    el.classList.toggle("lang-active", el.dataset.lang === currentLang);
  });
}