Spaces:
Running
Running
| // TAF Agent i18n — minimal translation system. | |
| // Add languages by extending TRANSLATIONS. Set data-i18n="key" on any element. | |
| // Persist user choice in localStorage. | |
/**
 * Languages offered by the UI, in display order.
 *
 * Each entry is a plain object with:
 *   - code:  short language code (presumably the matching key in
 *            TRANSLATIONS — verify against the lookup code),
 *   - flag:  flag emoji shown next to the language,
 *   - label: the language's name written in that language.
 *
 * The rows are kept as compact [code, flag, label] tuples and expanded
 * into objects so that adding a language is a one-line change.
 */
export const LANGUAGES = [
  ["en", "🇬🇧", "English"],
  ["es", "🇪🇸", "Español"],
  ["fr", "🇫🇷", "Français"],
  ["zh", "🇨🇳", "中文"],
].map(([code, flag, label]) => ({ code, flag, label }));
| export const TRANSLATIONS = { | |
| en: { | |
| "hero.title": "🔬 TAF Agent", | |
| "hero.tagline": "Diagnose any transformer LLM in 30 seconds. Free. No GPU. No signup.", | |
| "hero.subtitle": "Predicts whether a model will work for your use case <em>before</em> you spend money or time. Everything runs in your browser — your inputs never leave this tab.", | |
| "hero.help": "📘 Manual & examples", | |
| "hero.quickstart_btn": "⚡ Quick start", | |
| "hero.inventory_btn": "🧰 What it gives you", | |
| "hero.about": "Built by an independent researcher. Open source. Not affiliated with any model vendor.", | |
| "modes.title": "🎯 Mode", | |
| "modes.profile": "📇 Profile a model", | |
| "modes.compare": "🆚 Compare models", | |
| "modes.inspector": "🔍 Inspect config", | |
| "modes.ask": "💬 Ask plain English", | |
| "modes.recipe": "📋 Pick recipe", | |
| "modes.diagnose": "🩺 Diagnose CLI", | |
| "diagnose.title": "🩺 Diagnose CLI Command Builder", | |
| "diagnose.tip": "Browser predicts γ from config; the CLI measures γ_obs on real weights. Builder produces the exact command to run locally.", | |
| "diagnose.desc": "Pick options and copy-paste the generated command on your local machine (Python + transformers + numpy). Fast mode ≈5 min CPU; full ≈20–60 min GPU.", | |
| "diagnose.model_label": "HF model id:", | |
| "diagnose.theta_label": "θ (auto if blank):", | |
| "diagnose.n_label": "Context N:", | |
| "diagnose.options_label": "Options:", | |
| "diagnose.opt_fast": "--fast (CPU, ~5 min)", | |
| "diagnose.opt_cpu": "--cpu (force CPU)", | |
| "diagnose.opt_4bit": "--load_in_4bit (≥7B models)", | |
| "diagnose.local_label": "--local path (optional):", | |
| "diagnose.build_btn": "📋 Build command", | |
| "diagnose.cmd_title": "Generated command:", | |
| "diagnose.copy_btn": "📋 Copy to clipboard", | |
| "diagnose.next_steps": "Next steps: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Run the command (4) Result JSON → upload via Inspect mode for full TAF analysis.", | |
| "modes.phase": "📊 Phase diagram", | |
| "phase.title": "📊 Phase diagram (γ × θ)", | |
| "phase.tip": "Each dot is one model from the paper's empirical panel. x-axis log θ; y-axis γ. Hagedorn line γ=1 separates Phase A from Phase B. Hover for details, click to load into the recipe form.", | |
| "phase.desc": "23 models in the panel; Padé curve at T=2000.", | |
| "modes.desc": "<strong>Quickest start</strong>: paste any HuggingFace model id (e.g. <code>meta-llama/Meta-Llama-3-8B</code>), click Profile. See all 5 recipes scored in seconds.", | |
| "profile.title": "📇 Profile a model", | |
| "profile.desc": "<strong>For technicians</strong>: when you need a complete viability snapshot of a candidate model. One-click runs all 5 recipes and produces a unified TAF Card.", | |
| "profile.preset_label": "Preset:", | |
| "profile.preset_default": "— or pick from list —", | |
| "profile.hf_label": "HF model id:", | |
| "profile.fetch_btn": "📥 Fetch", | |
| "profile.btn": "🚀 Generate full profile", | |
| "profile.quickstart": "💡 Quick start: pick any preset → click Generate. Or paste a model id from <a href='https://huggingface.co/models?library=transformers&sort=trending' target='_blank'>HF Hub trending</a> → 📥 Fetch → Generate.", | |
| "compare.title": "🆚 Compare models side-by-side", | |
| "compare.desc": "<strong>For technicians</strong>: when choosing between 2-3 candidate models for a specific deployment scenario. Same recipe, multiple models, side-by-side verdicts.", | |
| "compare.recipe_label": "Recipe:", | |
| "compare.T_eval_label": "T_eval (target context):", | |
| "compare.models_title": "Models to compare (add up to 3)", | |
| "compare.btn": "🚀 Compare", | |
| "compare.example": "💡 Try: paste 3 popular 7-8B models (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), pick recipe X-2, T_eval=16000. See which best handles long context.", | |
| "ask.title": "❓ Your question", | |
| "ask.placeholder": "e.g. Will Mistral-7B handle 16K NIAH retrieval? Or: I have $5,000, what model can I train? Or: Cheapest GPU to serve Llama-70B at 100M tokens/day?", | |
| "ask.btn": "🚀 Analyze", | |
| "ask.example_btn": "💡 Try an example", | |
| "recipe.title": "📋 Recipe", | |
| "recipe.default": "— select a recipe —", | |
| "recipe.input_title": "🎯 Inputs", | |
| "verdict.title": "📊 Verdict", | |
| "chain.title": "🔍 Computation Chain", | |
| "chain.desc": "Every number below is deterministic Python. Click a step to expand.", | |
| "answer.title": "💬 Plain-English Answer", | |
| "share.btn": "🔗 Copy share link", | |
| "share.copied": "✅ Copied to clipboard!", | |
| "share.download": "💾 Download JSON", | |
| "share.download_md": "📝 Markdown", | |
| "share.download_tex": "📜 LaTeX", | |
| "share.submit": "📤 Submit to registry", | |
| "share.submit_clip_ok": "↗ Opened GitHub. Body copied to clipboard — paste it into the issue body.", | |
| "share.submit_clip_fail": "↗ Opened GitHub. Clipboard blocked — body logged in browser console (F12).", | |
| "share.import_title": "📂 Import a shared TAF result", | |
| "a11y.skip": "Skip to main content", | |
| // v0.6.2 — landing rework: quick-start + inventory + arch tooltips | |
| "qs.title": "⚡ Quick start", | |
| "qs.step1": "Paste a HuggingFace model ID (e.g. <code>meta-llama/Meta-Llama-3-8B</code>)", | |
| "qs.step2": "Click <strong>📇 Profile a model</strong>", | |
| "qs.step3": "Read your TAF Card — verdict per use case + key numbers + math verified by Lean+Mathlib", | |
| "qs.cta": "↓ Start now", | |
| "inv.title": "🧰 What this tool gives you", | |
| "inv.recipes.title": "🎯 8 recipes — does this model fit your use case?", | |
| "inv.recipes.x1.title": "Custom train vs API", | |
| "inv.recipes.x1.body": "which is cheaper for your traffic?", | |
| "inv.recipes.x2.title": "Long context", | |
| "inv.recipes.x2.body": "will it handle 32k / 128k tokens reliably?", | |
| "inv.recipes.x3.title": "Budget", | |
| "inv.recipes.x3.body": "with $X, what model can you train from scratch?", | |
| "inv.recipes.x5.title": "Hardware", | |
| "inv.recipes.x5.body": "which GPU to serve N tokens/day?", | |
| "inv.recipes.x19.title": "KV cache", | |
| "inv.recipes.x19.body": "how to compress without breaking quality?", | |
| "inv.recipes.x21.title": "Imprint purity", | |
| "inv.recipes.x21.body": "how clean is the model's positional encoding?", | |
| "inv.recipes.x22.title": "Compute-context", | |
| "inv.recipes.x22.body": "does the model fit the empirical band?", | |
| "inv.recipes.x23.title": "IH-phase", | |
| "inv.recipes.x23.body": "pre- or post-induction-head?", | |
| "inv.diag.title": "🔬 Diagnostics", | |
| "inv.diag.gamma": "<strong>γ predicted vs observed</strong> — auto-classifies the model into 5 regimes (normal · fraud / inflated context · compressed · over-Padé · sliding-window)", | |
| "inv.diag.cardy": "<strong>Cardy ΔH</strong> — entropy shift between observed and nominal context", | |
| "inv.diag.fals": "<strong>Falsification dashboard</strong> — checks 23 specific predictions (F1–F23)", | |
| "inv.diag.alg": "<strong>Algebraic consistency</strong> — 8 mathematical identities the model must satisfy", | |
| "inv.verify.title": "✓ Formally verified math", | |
| "inv.verify.count": "<strong>37 theorems</strong> machine-proven in Lean 4 + Mathlib4", | |
| "inv.verify.click": "Click any badge → opens the source line on GitHub", | |
| "inv.verify.reverify": "Verify yourself: <code>lake build</code> (≈5 s after cache fetch)", | |
| "inv.export.title": "📤 Export & share", | |
| "inv.export.formats": "<strong>JSON · Markdown · LaTeX</strong> (paper-ready)", | |
| "inv.export.share": "Reproducible share link (state encoded in URL)", | |
| "inv.export.registry": "Submit to community registry on GitHub", | |
| "arch.summary": "Architectures supported", | |
| "arch.anyhf": "✓ Any HuggingFace public model", | |
| "tooltip.mha": "Multi-Head Attention: each token position attends through several parallel heads at once.", | |
| "tooltip.gqa": "Grouped Query Attention: queries share fewer keys/values than heads (saves memory but pushes γ toward Hagedorn).", | |
| "tooltip.alibi": "Attention with Linear Biases: position info is a learned slope added to attention scores, no rotation.", | |
| "tooltip.abspe": "Absolute Position Embeddings: each position has a fixed learned vector added to the token embedding.", | |
| "tooltip.swa": "Sliding Window Attention: each token only attends within a fixed local window (Mistral, gemma-2 use this).", | |
| "tooltip.ssm": "State Space Model: a sequence layer that maintains internal state instead of attention (Mamba, Jamba use this).", | |
| // v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker | |
| "modes.unmask": "🪟 Unmask", | |
| "unmask.title": "🪟 Context Unmasker", | |
| "unmask.tip": "Paste a HuggingFace model id (or raw config.json). The tool checks for sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), and GQA — anything that makes <code>max_position_embeddings</code> larger than the practical effective context. Mistral-7B-v0.1 is the canonical example: declared 32k, attends within ~4-8k.", | |
| "unmask.desc": "<strong>Are you about to spend money on a model that won't actually attend that far?</strong> Paste an id and find out in 1 second. No GPU, no inference — just config.json arithmetic.", | |
| "unmask.id_label": "HF model id:", | |
| "unmask.fetch_btn": "🔍 Unmask", | |
| "unmask.paste_summary": "Or paste raw config.json (private / in-dev models)", | |
| "unmask.paste_btn": "🔍 Unmask pasted config", | |
| "unmask.label.declared": "Declared context", | |
| "unmask.label.effective": "Effective (estimate)", | |
| "unmask.label.ratio": "Ratio", | |
| "unmask.section.flags": "Architecture flags", | |
| "unmask.section.warnings": "Warnings", | |
| "unmask.section.reco": "Recommendation", | |
| "unmask.flag.swa": "SWA", | |
| "unmask.flag.rope": "RoPE scaling", | |
| "unmask.flag.gqa": "GQA", | |
| "unmask.flag.layers": "Layers", | |
| "unmask.flag.dhead": "d_head", | |
| "unmask.flag.theta": "RoPE θ", | |
| "unmask.flag.yes": "yes", | |
| "unmask.flag.no": "no", | |
| "unmask.flag.full_mha": "no (full MHA, {n} heads)", | |
| "unmask.verdict.honest": "✅ HONEST", | |
| "unmask.verdict.inflated": "⚠ INFLATED", | |
| "unmask.verdict.severely_inflated": "❌ SEVERELY INFLATED", | |
| "unmask.verdict.yarn_extended": "⚠ YARN-EXTENDED", | |
| "unmask.verdict.unknown": "❓ UNKNOWN", | |
| "unmask.warn.swa_window": "SWA window: {window} tokens — each layer only attends within this window.", | |
| "unmask.warn.multihop": "Multi-hop estimate: ~{multiHop} tokens (conservative: window × {factor}).", | |
| "unmask.warn.yarn": "RoPE scaling ({type}) extends context {factor}× from ~{original} to {declared} tokens.", | |
| "unmask.warn.yarn_advice": "RoPE-extended context — verify γ behavior at the full claimed length with the γ_check diagnostic.", | |
| "unmask.warn.gqa_small_dhead": "Small head dim ({d_head}) + GQA: KV cache compression at long context is likely (γ pushed toward Hagedorn).", | |
| "unmask.reco.honest": "Standard full-attention model. Effective context matches declared ({declared} tokens).", | |
| "unmask.reco.inflated": "Effective ~{effective} tokens via SWA. Use γ_check to verify behavior at your target evaluation length.", | |
| "unmask.reco.severely_inflated": "Treat as a ~{effective}-token context model in practice. The {declared}-token claim only applies via cross-layer attention chains, which empirically degrade past ~2× the SWA window.", | |
| "unmask.reco.yarn_extended": "RoPE-extended context. Run a long-context benchmark (NIAH at 8k / 16k / 32k / full) to confirm the extension holds. Use γ_check with T_eval = {declared}.", | |
| "unmask.reco.unknown": "Could not parse config. Verify the URL is a valid HF model with public config.json.", | |
| "unmask.status.empty_id": "⚠ Enter a model id (e.g. mistralai/Mistral-7B-v0.1).", | |
| "unmask.status.fetching": "⏳ Fetching config.json for {modelId}...", | |
| "unmask.status.success": "✅ Analyzed {modelId} (verdict: {verdict})", | |
| "unmask.status.empty_paste": "⚠ Paste a config.json first.", | |
| "unmask.status.invalid_json": "❌ Not valid JSON: {error}", | |
| "unmask.status.success_paste": "✅ Analyzed pasted config (verdict: {verdict})", | |
| "unmask.pasted_label": "(pasted config)", | |
| "mode_desc.ask": "Type a free-form question. The in-browser LLM picks the right recipe and runs it.", | |
| "mode_desc.recipe": "Pick a recipe directly and fill the form. Full manual control.", | |
| "mode_desc.profile": "Quickest start: paste any HuggingFace model id, click Profile. See all 5 recipes scored in seconds.", | |
| "mode_desc.compare": "Pick 2-3 candidate models + one recipe. See verdicts side-by-side in a comparison table.", | |
| "mode_desc.inspector": "Paste a config.json directly. Useful for private/in-development models not on HF Hub.", | |
| "mode_desc.diagnose": "Build the diagnose_model.py CLI command to MEASURE γ_obs on real GPU. Browser predicts; CLI measures.", | |
| "mode_desc.phase": "γ × θ scatter of the paper's empirical panel. Hover a dot for details, click to load into Diagnose / Recipe forms.", | |
| "mode_desc.unmask": "Detects whether max_position_embeddings is misleading (SWA / YaRN / RoPE-scaling). Paste a model id, get a 1-line verdict.", | |
| "profile.preset_loaded": "✅ Loaded preset for <strong>{id}</strong>. Form pre-filled. (Click 📥 Fetch to override with the latest config from HF Hub.)", | |
| // v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer | |
| "modes.template": "📜 Chat-template", | |
| "mode_desc.template": "Detects which chat-template family a model uses (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Gives the exact CLI flag for lm-eval / vLLM / transformers.", | |
| "template.title": "📜 Chat-template Sniffer", | |
| "template.tip": "Paste an HF model id (or raw tokenizer_config.json). Detects the chat-template family and gives you the exact framework command to use it correctly. lm-eval-harness silently halves accuracy if you forget to apply it (issue #1841).", | |
| "template.desc": "<strong>Did you forget <code>--apply_chat_template</code>?</strong> Most multi-turn evals fail by ~50% because the chat template wasn't applied. Paste a model id, get the exact CLI flag for your stack.", | |
| "template.id_label": "HF model id:", | |
| "template.fetch_btn": "📜 Sniff", | |
| "template.paste_summary": "Or paste raw tokenizer_config.json (private models)", | |
| "template.paste_btn": "📜 Sniff pasted config", | |
| "template.label.family": "Detected family", | |
| "template.label.markers": "Matched markers", | |
| "template.label.tpl_len": "Template length", | |
| "template.section.warnings": "Warnings", | |
| "template.section.commands": "Commands by framework", | |
| "template.section.raw": "Raw template (preview)", | |
| "template.family.custom": "custom (unknown family)", | |
| "template.family.none": "(no chat_template)", | |
| "template.verdict.ok": "✅ TEMPLATE DETECTED", | |
| "template.verdict.custom": "⚠ CUSTOM TEMPLATE", | |
| "template.verdict.missing": "❌ NO CHAT TEMPLATE", | |
| "template.verdict.base_model": "ℹ BASE MODEL (no chat)", | |
| "template.verdict.unknown": "❓ UNKNOWN", | |
| "template.warn.no_chat_template": "No <code>chat_template</code> field in tokenizer_config.json. This is typical for base / pretrained-only models. If you intended an instruct-tuned model, the wrong file may be loaded.", | |
| "template.warn.custom_template": "Template is non-standard ({length} chars). The tool could not match it against known families. Inspect the raw preview below and verify your eval framework supports it.", | |
| "template.warn.lm_eval_apply": "<strong>lm-eval-harness:</strong> add <code>--apply_chat_template</code> or your accuracy will silently drop ~50% on multi-turn evals (issue #1841).", | |
| "template.warn.vllm_apply": "<strong>vLLM serve:</strong> verify <code>--chat-template</code> is set (auto-detection sometimes fails for fine-tuned variants). Suggested: <code>{name}</code>.", | |
| "template.status.empty_id": "⚠ Enter a model id (e.g. mistralai/Mistral-7B-Instruct-v0.3).", | |
| "template.status.fetching": "⏳ Fetching tokenizer_config.json for {modelId}...", | |
| "template.status.success": "✅ Sniffed {modelId} (verdict: {verdict})", | |
| "template.status.empty_paste": "⚠ Paste a tokenizer_config.json first.", | |
| "template.status.invalid_json":"❌ Not valid JSON: {error}", | |
| "template.status.success_paste":"✅ Sniffed pasted config (verdict: {verdict})", | |
| "template.pasted_label": "(pasted tokenizer_config)", | |
| // v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor | |
| "modes.arena": "🎯 Arena CI", | |
| "mode_desc.arena": "Recovers confidence intervals from raw pairwise vote data (Bradley-Terry MLE + bootstrap). Detects statistically tied pairs that the public Arena leaderboard hides.", | |
| "arena.title": "🎯 Arena-Elo CI Reconstructor", | |
| "arena.tip": "Chatbot Arena strips confidence intervals from the public leaderboard. A 5-Elo gap can be statistically meaningless. Paste raw vote data (model_a, model_b, winner) — the tool computes Bradley-Terry MLE + bootstrap CIs and lists statistical ties (CI overlap).", | |
| "arena.desc": "<strong>Is GPT-4 actually better than Claude — or are they tied?</strong> Paste pairwise vote CSV (or click <em>Load sample</em>). Bradley-Terry MLE + 200-iteration bootstrap → ranked Elos with 95% CIs and statistical-tie detection. All in browser.", | |
| "arena.sample_btn": "📊 Load sample data", | |
| "arena.run_btn": "🎯 Compute CIs", | |
| "arena.clear_btn": "🗑️ Clear", | |
| "arena.csv_summary": "Vote CSV (header: <code>model_a,model_b,winner</code>; winner ∈ a/b/tie)", | |
| "arena.section.ranked": "Ranked Elos with 95% CIs", | |
| "arena.section.ties": "Statistical ties (CI overlap)", | |
| "arena.section.summary": "Summary", | |
| "arena.col.rank": "#", | |
| "arena.col.model": "Model", | |
| "arena.col.elo": "Elo", | |
| "arena.col.ci": "95% CI", | |
| "arena.col.ci_width": "± half-width", | |
| "arena.col.matches": "Matches", | |
| "arena.col.wins": "W / L / T", | |
| "arena.col.tie_pair": "Pair", | |
| "arena.col.tie_diff": "Elo gap", | |
| "arena.col.tie_overlap": "CI overlap", | |
| "arena.no_ties": "No statistical ties — all pairs distinguishable at 95% CI.", | |
| "arena.summary.votes": "Total votes", | |
| "arena.summary.models": "Models", | |
| "arena.summary.ties": "Statistical ties", | |
| "arena.summary.bootstrap": "Bootstrap iters", | |
| "arena.summary.ci_level": "CI level", | |
| "arena.status.empty": "⚠ Paste vote CSV or click Load sample.", | |
| "arena.status.too_few": "⚠ Only {n} valid votes — need at least 10 to fit Bradley-Terry reliably.", | |
| "arena.status.computing": "⏳ Computing Bradley-Terry MLE + bootstrap on {n} votes...", | |
| "arena.status.done": "✅ {n} votes · {models} models · {ties} statistical ties · {ms} ms", | |
| "arena.status.sample_loaded": "✅ Sample loaded (synthetic 6-model Arena data). Click Compute CIs.", | |
| // v0.7.3 — anti-bullshit pack #4: Contamination Prior | |
| "modes.contam": "🧪 Contamination", | |
| "mode_desc.contam": "Bayesian-ish prior on whether a benchmark score is contaminated. Enter your model's training cutoff → rates 20+ popular benchmarks (MMLU, GSM8K, HumanEval, MMLU-Pro…).", | |
| "contam.title": "🧪 Contamination Prior", | |
| "contam.tip": "Computes a Bayesian-ish prior on whether a benchmark score is contaminated, based on (model training cutoff date) × (benchmark release date) × (known corpus inclusion + leak history). Open LLM Leaderboard v1 was killed in 2024 after MMLU/HellaSwag scores became contaminated.", | |
| "contam.desc": "<strong>Should you trust your model's MMLU score?</strong> Enter the model's training cutoff date — the tool rates 20+ popular benchmarks (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) and tells you which scores are likely contaminated.", | |
| "contam.cutoff_label": "Training cutoff:", | |
| "contam.run_btn": "🧪 Rate all benchmarks", | |
| "contam.section.ranked": "Benchmark contamination priors", | |
| "contam.section.high": "🔴 High-risk benchmarks (treat scores as unreliable)", | |
| "contam.section.medium": "🟡 Medium-risk (verify with alternates)", | |
| "contam.section.low": "🟢 Low-risk (likely clean)", | |
| "contam.col.benchmark": "Benchmark", | |
| "contam.col.released": "Released", | |
| "contam.col.gap": "Gap (months)", | |
| "contam.col.prior": "P(contam)", | |
| "contam.col.level": "Level", | |
| "contam.col.corpora": "In corpora", | |
| "contam.col.category": "Category", | |
| "contam.label.high": "High risk", | |
| "contam.label.medium": "Medium", | |
| "contam.label.low": "Low", | |
| "contam.no_entries": "(none in this category)", | |
| "contam.advice.high": "Treat these scores as unreliable. Replace with newer / private-test alternates (MMLU-Pro, GPQA, MUSR, MATH-500).", | |
| "contam.advice.medium": "Take with caution. Look for replication on a held-out subset or community reproductions.", | |
| "contam.advice.low": "Score likely uncontaminated, but absence of leak is not proof — still cross-check with alternate test.", | |
| "contam.summary.headline": "Cutoff <code>{cutoff}</code> · {n} benchmarks rated", | |
| "contam.status.empty": "⚠ Enter a model training cutoff date (e.g. 2023-12).", | |
| "contam.status.bad_date": "⚠ Bad date format. Use YYYY-MM or YYYY-MM-DD.", | |
| "contam.status.done": "✅ Cutoff {cutoff} · {n} benchmarks rated · {high} high-risk", | |
| // v0.7 — Help modal section | |
| "help.v07.title": "🆕 v0.7 — Anti-bullshit pack (4 new modes)", | |
| "help.v07.intro": "<em>v0.7 (2026-05-06): four new modes that solve concrete pain points reported by the HuggingFace community. Each one runs in your browser with no inference — pure metadata + math.</em>", | |
| "help.v07.unmask.title": "🪟 Context Unmasker", | |
| "help.v07.unmask.body": "Detects when <code>max_position_embeddings</code> is misleading. Mistral-7B-v0.1 declares 32k but attends within ~4-8k via SWA. Paste an HF model id → 1-second verdict (HONEST / INFLATED / SEVERELY INFLATED / YARN-EXTENDED). Catches SWA, RoPE-scaling (YaRN/linear/dynamic NTK), small-d_head + GQA. <em>Use case</em>: before paying GPU for 32k context, verify the model actually attends that far.", | |
| "help.v07.template.title": "📜 Chat-template Sniffer", | |
| "help.v07.template.body": "Detects which chat-template family a model uses (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) and gives you the exact CLI flag for lm-evaluation-harness, vLLM, and transformers. Solves issue #1841 in lm-eval-harness: forgetting <code>--apply_chat_template</code> silently halves multi-turn accuracy. <em>Use case</em>: before reporting a benchmark score, confirm you applied the template correctly.", | |
| "help.v07.arena.title": "🎯 Arena-Elo CI Reconstructor", | |
| "help.v07.arena.body": "Chatbot Arena strips confidence intervals from its public leaderboard — a 5-Elo gap can be statistically meaningless. Paste raw pairwise vote data (model_a, model_b, winner) → Bradley-Terry MLE + 200-iteration bootstrap → ranked Elos with 95% CIs and a \"statistical ties\" panel listing pairs whose CIs overlap. Try the Load sample button. <em>Use case</em>: before declaring \"model A beats model B\", verify their CIs don't overlap.", | |
| "help.v07.contam.title": "🧪 Contamination Prior", | |
| "help.v07.contam.body": "Bayesian-ish prior on whether a benchmark score is contaminated. Enter your model's training cutoff date → tool rates 20+ popular benchmarks (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) by P(contamination) based on time gap, corpus inclusion, and known leak history. Open LLM Leaderboard v1 was killed in 2024 after MMLU/HellaSwag scores became contaminated. <em>Use case</em>: decide which scores to trust when comparing two models.", | |
| "help.v07.quant.title": "⚖️ Quant-regime Classifier", | |
| "help.v07.quant.body": "Predicts γ-shift and ΔPPL for any (model × quant scheme: NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8, …). Architecture-aware: small d_head + aggressive GQA → more sensitive; calibrated schemes (AWQ) absorb shift better than uncalibrated (NF4). Recommends safer alternatives if a cliff is detected. <em>Use case</em>: before quantizing, predict whether your specific architecture × scheme combo will keep PPL acceptable, with a concrete switch-to suggestion otherwise.", | |
| "help.v07.drift.title": "🔀 Cross-framework Drift Bound", | |
| "help.v07.drift.body": "Same model, different scores on different setups. Tool predicts the maximum drift admissible from numerical noise alone (dtype, framework, batch). If the observed gap exceeds it → real bug, typically chat-template mismatch (lm-eval-harness issue #1841) or KV-cache layout. Try the "Load sample" button for the canonical chat-template bug. <em>Use case</em>: before reporting a regression or claiming reproducibility, verify whether the gap between two evals is bigger than what numerical noise can explain.", | |
| "inv.v07.drift": "<strong>🔀 Drift</strong> — bug or noise? Predict max admissible gap between two evals", | |
| "help.v07.niah.title": "🔍 NIAH → Reasoning Gap", | |
| "help.v07.niah.body": "RULER paper (NVIDIA 2024) shows that long-context models often pass NIAH (needle retrieval) but fail multi-hop reasoning at the same context. Tool predicts both pass rates from architecture (γ_Padé + d_horizon + arch pressure: small d_head, GQA, SWA), reports the gap, and finds your model's \"safe reasoning context\" where reasoning stays ≥65%. Sweep mode shows the curve across 1k/4k/16k/64k/T_train. <em>Use case</em>: before deploying at the claimed context, find out whether the model will actually reason there or just retrieve.", | |
| "inv.v07.niah": "<strong>🔍 NIAH→Reason</strong> — does your \"128k context\" actually reason there, or just retrieve?", | |
| // v0.7 — Inventory modal 5th card | |
| "inv.v07.title": "🆕 v0.7 anti-bullshit pack", | |
| "inv.v07.unmask": "<strong>🪟 Unmask</strong> — config.json claims 32k? See if it actually attends that far", | |
| "inv.v07.template": "<strong>📜 Chat-template</strong> — exact CLI flag so lm-eval doesn't silently halve your accuracy", | |
| "inv.v07.arena": "<strong>🎯 Arena CI</strong> — recover the confidence intervals Chatbot Arena hides", | |
| "inv.v07.contam": "<strong>🧪 Contamination</strong> — rate 20+ benchmarks for contamination probability", | |
| "inv.v07.quant": "<strong>⚖️ Quant</strong> — predict γ shift + ΔPPL for any (model × quant scheme) combo", | |
| // v0.7.3 — anti-bullshit pack #5: Quant-regime classifier | |
| "modes.quant": "⚖️ Quant", | |
| "mode_desc.quant": "Predicts γ-shift and ΔPPL for any (model × quant scheme). Architecture-aware: small d_head + GQA → more sensitive. Recommends safer alternatives if a cliff is detected.", | |
| "quant.title": "⚖️ Quant-regime Classifier", | |
| "quant.tip": "Predicts γ-shift (and downstream ΔPPL) for a given (model × quant scheme). Generic claims like 'AWQ ~95% retention' are too vague — TAF uses d_head, GQA ratio, SWA flag, and model size to give an architecture-specific verdict. Solves: HF community widely reports unpredictable quant cliffs (NF4 -2 PPL on Phi-3 but fine on Llama-3-8B).", | |
| "quant.desc": "<strong>Will quantizing your model break it?</strong> Paste an HF model id, pick a quant scheme — get predicted γ-shift, expected ΔPPL band, and a recommended alternative if it's a cliff. Browser-only, no GPU, no calibration set required.", | |
| "quant.id_label": "HF model id:", | |
| "quant.fetch_btn": "📥 Fetch config", | |
| "quant.scheme_label": "Quant scheme:", | |
| "quant.run_btn": "⚖️ Predict", | |
| "quant.all_btn": "📊 Compare all schemes", | |
| "quant.regime.safe": "✅ SAFE", | |
| "quant.regime.mild": "✅ MILD COMPRESSION", | |
| "quant.regime.significant": "⚠ SIGNIFICANT DEGRADATION", | |
| "quant.regime.cliff": "❌ HEAVY CLIFF", | |
| "quant.label.gamma_shift": "γ shift", | |
| "quant.label.delta_ppl": "ΔPPL (est.)", | |
| "quant.label.arch_mult": "Arch multiplier", | |
| "quant.section.breakdown": "Breakdown", | |
| "quant.section.reco": "Recommendation", | |
| "quant.section.compare": "All schemes (sorted by safety)", | |
| "quant.field.scheme": "Scheme", | |
| "quant.field.calibrated": "calibrated", | |
| "quant.field.uncalibrated": "uncalibrated", | |
| "quant.field.base_penalty": "Base penalty", | |
| "quant.field.arch_mult_full": "Architecture multiplier", | |
| "quant.field.gamma_shift": "Predicted γ shift", | |
| "quant.field.ppl_band": "ΔPPL band (est.)", | |
| "quant.field.params": "Parameters", | |
| "quant.col.scheme": "Scheme", | |
| "quant.col.bits": "Bits", | |
| "quant.col.gamma_shift": "γ shift", | |
| "quant.col.ppl_band": "ΔPPL band", | |
| "quant.col.regime": "Regime", | |
| "quant.reco.switch_to_awq": "<strong>Switch to {scheme}</strong> — calibrated 4-bit handles small d_head + GQA much better than NF4. Expected ΔPPL drops ~2-3×.", | |
| "quant.reco.switch_to_q5_km": "<strong>Switch to {scheme}</strong> — Q5 keeps more head dimensions intact at low cost (only ~25% bigger file).", | |
| "quant.reco.switch_to_q4_km": "<strong>Switch to {scheme}</strong> — Q3/Q2 are too aggressive for this architecture.", | |
| "quant.reco.consider_awq": "<strong>Consider {scheme}</strong> — calibration meaningfully reduces γ-shift on this architecture.", | |
| "quant.reco.use_higher_bits": "<strong>Use higher-bit alternative</strong> — this architecture cannot absorb 4-bit cleanly. Try 5- or 8-bit.", | |
| "quant.reco.verify_with_eval": "<strong>Verify with a real eval</strong> — predicted shift is borderline. Run NIAH at your target context before deploying.", | |
| "quant.reco.no_action": "No action needed — quantization is safe for this architecture.", | |
| "quant.summary.headline_all": "All schemes for <code>{modelId}</code>", | |
| "quant.status.empty_id": "⚠ Enter a model id (e.g. meta-llama/Llama-3.2-1B).", | |
| "quant.status.fetching": "⏳ Fetching config.json for {modelId}...", | |
| "quant.status.fetched": "✅ Config fetched for {modelId}. Pick a scheme and click Predict (or Compare all schemes).", | |
| "quant.status.no_scheme": "⚠ Pick a quant scheme from the dropdown.", | |
| "quant.status.done": "✅ Predicted regime: {regime}", | |
| "quant.status.done_all": "✅ Compared {n} schemes — sorted by safety.", | |
| // v0.7.4 — HF Hub autocomplete privacy + rate-limit notices | |
| "hf_auto.privacy": "🔒 Queries sent to huggingface.co/api · cached locally 5 min", | |
| "hf_auto.rate_limited": "⚠ HuggingFace rate limit — try again in a moment, or type the full model id manually", | |
| "hf_auto.gated_msg": "is gated. Accept the license here:", | |
| // v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound | |
| "modes.drift": "🔀 Drift", | |
| "mode_desc.drift": "Predicts max-allowable drift between two benchmark scores given (framework, dtype, batch, chat-template). Flags real bugs vs numerical noise.", | |
| "drift.title": "🔀 Cross-framework Drift Bound", | |
| "drift.tip": "Same model, different scores on different setups. Is the gap noise or a real bug? Enter two scores with their (framework, dtype, batch, chat-template) — tool predicts the maximum allowable drift from numerical noise alone. If observed gap exceeds it → real bug, usually chat-template mismatch (lm-eval issue #1841) or KV-cache layout.", | |
| "drift.desc": "<strong>Your model gives 67.2 on lm-eval-hf and 65.1 on vLLM-served. Bug or noise?</strong> Enter both scores with (framework, dtype, batch, chat-template applied?). Tool predicts the noise band and flags real bugs. arxiv 2506.09501 documents this as a major eval reproducibility problem.", | |
| "drift.setup_a": "Setup A", | |
| "drift.setup_b": "Setup B", | |
| "drift.score": "Score", | |
| "drift.framework": "Framework", | |
| "drift.dtype": "Dtype", | |
| "drift.batch": "Batch", | |
| "drift.template": "Chat-template", | |
| "drift.template.applied": "applied", | |
| "drift.template.not_applied": "not applied", | |
| "drift.template.unknown": "unknown", | |
| "drift.run_btn": "🔀 Compute drift bound", | |
| "drift.sample_btn": "📊 Load sample (chat-template bug)", | |
| "drift.label.observed": "Observed gap", | |
| "drift.label.band": "Numerical band", | |
| "drift.label.ratio": "Gap / band", | |
| "drift.section.setups": "Setups", | |
| "drift.section.breakdown": "Drift contributors (numerical band)", | |
| "drift.section.verdict": "Verdict & recommendation", | |
| "drift.contrib.dtype": "Dtype mismatch", | |
| "drift.contrib.framework": "Framework", | |
| "drift.contrib.batch": "Batch difference", | |
| "drift.contrib.template": "Chat-template MISMATCH", | |
| "drift.dominant_cause": "Dominant cause", | |
| "drift.cause.dtype": "dtype precision difference", | |
| "drift.cause.framework": "framework / kernel difference", | |
| "drift.cause.batch": "batch normalization paths", | |
| "drift.cause.template_mismatch": "chat-template applied on one side but not the other (lm-eval-harness #1841 pattern — typical -50% drop on multi-turn)", | |
| "drift.verdict.noise": "✅ NUMERICAL NOISE", | |
| "drift.verdict.suspicious": "⚠ SUSPICIOUS — verify", | |
| "drift.verdict.bug": "❌ REAL BUG — investigate", | |
| "drift.verdict.bug_template": "❌ CHAT-TEMPLATE BUG", | |
| "drift.reco.noise": "Gap fits within the expected numerical-noise band. No action needed; the difference is consistent with framework/dtype/batch variation alone.", | |
| "drift.reco.suspicious": "Gap is 1–2× the predicted noise band. Borderline — possibly a real bug. Try aligning the dominant contributor (e.g. match framework or dtype) and re-test.", | |
| "drift.reco.bug": "Gap is > 2× the predicted noise band. This is a real bug. Inspect the dominant contributor — most likely tokenizer / chat-template / KV-cache layout difference. Run lm-eval-harness with <code>--apply_chat_template</code> and confirm.", | |
| "drift.reco.bug_template": "Chat-template mismatch detected. This is the most common cause of large eval discrepancies (lm-eval-harness issue #1841). Re-run the \"not applied\" side with <code>--apply_chat_template</code> (or set vLLM <code>--chat-template &lt;name&gt;</code>) and re-test.", | |
| "drift.status.empty_scores": "⚠ Enter both scores.", | |
| "drift.status.done": "✅ Verdict: {verdict}", | |
| "drift.status.sample_loaded": "✅ Sample loaded (canonical chat-template bug). Click Compute drift bound.", | |
| // v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor | |
| "modes.niah": "🔍 NIAH→Reason", | |
| "mode_desc.niah": "Predicts NIAH (retrieval) and multi-hop reasoning pass rates at any context. Solves: long-context models often pass NIAH but fail reasoning at the same context (RULER paper).", | |
| "modes.saturation": "📈 Saturation", | |
| "mode_desc.saturation": "Tells you whether a benchmark still discriminates frontier models or has saturated (e.g. MMLU 88-94% top, AIME 2025 already 96-100%). Returns top-3 + verdict + recommended replacements.", | |
| "modes.hub": "🧭 Solutions", | |
| "mode_desc.hub": "Map of every documented LLM-eval pain → tafagent mode (if covered) + curated external tools. Find the right solution without rebuilding it. 30+ pains, 7 categories.", | |
| "niah.title": "🔍 NIAH → Reasoning Gap", | |
| "niah.tip": "NIAH (Needle in a Haystack) tests retrieval: 'find this fact in long text'. Multi-hop reasoning tests inference: 'combine facts X+Y at the start with fact Z at the end'. RULER paper (NVIDIA 2024) shows long-context models often pass NIAH but fail reasoning at the same context. This tool predicts both pass rates from architecture alone.", | |
| "niah.desc": "<strong>Your model claims 128k context. Will it actually reason at 64k, or just retrieve?</strong> Paste an HF model id and a target eval context — tool predicts NIAH and multi-hop reasoning pass rates, the gap, and a 'safe context' where reasoning stays ≥65%.", | |
| "niah.id_label": "HF model id:", | |
| "niah.fetch_btn": "📥 Fetch config", | |
| "niah.teval_label": "Target context (T_eval):", | |
| "niah.run_btn": "🔍 Predict", | |
| "niah.sweep_btn": "📊 Sweep contexts", | |
| "niah.label.niah": "NIAH pass rate", | |
| "niah.label.reasoning": "Reasoning pass rate", | |
| "niah.label.gap": "Gap", | |
| "niah.label.safe_ctx": "Safe reasoning context", | |
| "niah.section.breakdown": "Architecture breakdown", | |
| "niah.section.reco": "Recommendation", | |
| "niah.section.sweep": "Pass rate sweep across context lengths", | |
| "niah.field.dhorizon": "d_horizon (effective)", | |
| "niah.field.ratio": "T_eval / d_horizon", | |
| "niah.field.arch_pressure": "Arch pressure (small d_head + GQA + SWA)", | |
| "niah.field.theta": "RoPE θ", | |
| "niah.field.t_train": "T_train (claimed)", | |
| "niah.col.context": "T_eval", | |
| "niah.col.niah": "NIAH", | |
| "niah.col.reasoning": "Reasoning", | |
| "niah.col.gap": "Gap", | |
| "niah.col.verdict": "Verdict", | |
| "niah.verdict.robust": "✅ ROBUST", | |
| "niah.verdict.marginal": "⚠ MARGINAL", | |
| "niah.verdict.degraded": "⚠ DEGRADED", | |
| "niah.verdict.retrieval_only": "❌ RETRIEVAL-ONLY", | |
| "niah.verdict.broken": "❌ BROKEN", | |
| "niah.reco.robust": "Both retrieval and reasoning hold up at this context. Safe to deploy for both lookup and inference tasks.", | |
| "niah.reco.marginal": "Borderline. Retrieval works but reasoning is shaky. Use for fact-lookup, not multi-step inference.", | |
| "niah.reco.degraded": "Significant reasoning drop. The model can find facts but struggles to combine them. Avoid multi-hop tasks at this length.", | |
| "niah.reco.retrieval_only": "Canonical RULER finding: model passes NIAH but fails reasoning. Useful for retrieval-augmented setups (where the LLM only locates facts) but NOT for chained inference. Cut your context to the 'safe' value below.", | |
| "niah.reco.broken": "Model fails even basic retrieval at this context. Treat as out-of-distribution — re-test at a shorter context.", | |
| "niah.safe_context": "≤ {ctx} tokens (reasoning ≥ 65%)", | |
| "niah.safe_context_none": "No safe context found below your target — model fails reasoning even at small contexts.", | |
| "niah.summary.sweep": "<code>{modelId}</code> — pass rates by context", | |
| "niah.status.empty_id": "⚠ Enter a model id (e.g. meta-llama/Llama-3.1-8B-Instruct).", | |
| "niah.status.bad_teval": "⚠ Enter a target context (≥ 512 tokens).", | |
| "niah.status.fetching": "⏳ Fetching config.json for {modelId}...", | |
| "niah.status.fetched": "✅ Config fetched for {modelId}. Set T_eval and click Predict (or Sweep contexts).", | |
| "niah.status.done": "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%", | |
| "niah.status.sweep_done": "✅ Swept {n} context lengths.", | |
| "saturation.title": "📈 Benchmark Saturation Detector", | |
| "saturation.tip": "MMLU is saturated (88-94% all frontier models). Reporting '92% on MMLU' is now meaningless. This tool tells you which benchmarks still discriminate frontier models, which are saturated, and what to use instead. Data: DemandSphere AI Frontier Tracker (CC BY-NC 4.0) refreshed 2026-05.", | |
| "saturation.desc": "<strong>Is your benchmark still useful?</strong> Pick a benchmark to see top-3 frontier scores, spread, and a verdict (saturated / near-saturated / discriminative) plus recommended replacements.", | |
| "saturation.select_label": "Benchmark:", | |
| "saturation.select.all": "— show all benchmarks —", | |
| "saturation.run_btn": "📈 Classify", | |
| "saturation.all_btn": "📊 Show all", | |
| "saturation.col.spread": "Top-3 spread", | |
| "saturation.col.mean": "Top-3 mean", | |
| "saturation.col.n": "Models", | |
| "saturation.col.bench": "Benchmark", | |
| "saturation.col.verdict": "Verdict", | |
| "saturation.col.reco": "Top reco", | |
| "saturation.col.model": "Model", | |
| "saturation.col.score": "Score", | |
| "saturation.section.top3": "Top-3 frontier scores", | |
| "saturation.section.recommendations": "Recommended alternatives", | |
| "saturation.section.note": "Notes", | |
| "saturation.section.all": "All tracked benchmarks", | |
| "saturation.verdict.saturated": "🚨 SATURATED", | |
| "saturation.verdict.near_saturated": "⚠ NEAR SATURATED", | |
| "saturation.verdict.discriminative": "✅ DISCRIMINATIVE", | |
| "saturation.verdict.sparse_data": "ℹ SPARSE DATA", | |
| "saturation.borderline": "Borderline — within ±1pp of a threshold cutoff. Treat verdict as 'check carefully'.", | |
| "saturation.unknown": "Unknown benchmark.", | |
| "saturation.attribution": "Data: DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (open-weight historical) · last fetch 2026-05-05.", | |
| "saturation.status.live": "✅ Live data loaded — {count} models.", | |
| "saturation.status.baked": "ℹ Using baked snapshot (live fetch unavailable).", | |
| "saturation.status.kb_fail": "⚠ Could not load saturation KB.", | |
| "saturation.status.done": "✅ {name} — {verdict}", | |
| "saturation.status.all_done": "✅ Classified {n} benchmarks.", | |
| "help.v08.saturation.title": "📈 Benchmark Saturation Detector", | |
| "help.v08.saturation.body": "MMLU is saturated (88-94% top), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.", | |
| "inv.v08.saturation": "<strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?", | |
| "inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — every documented pain mapped to a tafagent mode or curated external tool. Don't reinvent — find.", | |
| "help.v081.hub.title": "🧭 Solutions Hub", | |
| "help.v081.hub.body": "tafagent as integrator, not silo. 30+ pains across 7 categories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), each mapped to (a) the tafagent mode that addresses it, if any, and (b) the best-of-breed external tools the community already trusts (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Search box matches across pain, scenario, and tool name. <em>Use case</em>: 'I have problem X — does tafagent solve it, and if not, who does?'", | |
| "hub.title": "🧭 Solutions Hub", | |
| "hub.tip": "Map of every documented LLM-eval pain we know about: which tafagent mode addresses it (if any), and the best-of-breed external tools the community already trusts. Goal: full coverage. If a canonical tool exists elsewhere, we link rather than rebuild.", | |
| "hub.desc": "<strong>Don't reinvent — find.</strong> 30+ pains mapped to tafagent modes + curated external tools. Browse by category, search by keyword, or see the gaps where new modes would help most.", | |
| "hub.clear_btn": "✕ Clear", | |
| "hub.no_mode": "external", | |
| "hub.planned": "planned:", | |
| "hub.best_for": "Best for", | |
| "hub.not_for": "Not for", | |
| "hub.tools": "External tools", | |
| "hub.status.loaded": "✅ Loaded {total} pains across {categories} categories — {covered} covered by tafagent modes, {externalLinks} external links curated. Compiled {compiled}.", | |
| "hub.status.fail": "⚠ Could not load Solutions Hub.", | |
| "hub.search.empty": "No matches for '{query}'. Try broader terms (e.g. 'eval', 'rag', 'tokenizer').", | |
| "hub.search.results": "Found {n} match(es) for '{query}'.", | |
| // v0.7.7 — Task tiles (UX restructure: 14 modes grouped by user intent) | |
| "tiles.title": "🎯 What do you want to do?", | |
| "tiles.subtitle": "Pick a task. Each one opens the right tool below. Or scroll down for the full list of 14 modes.", | |
| "tile.diagnose.title": "🔬 Diagnose a model", | |
| "tile.diagnose.desc": "Will this specific model work for my use case?", | |
| "tile.trust.title": "✓ Trust a benchmark score", | |
| "tile.trust.desc": "Should I believe this number? Bug or noise?", | |
| "tile.eval.title": "⚙️ Set up an eval correctly", | |
| "tile.eval.desc": "Get the exact CLI flag for lm-eval / vLLM / transformers.", | |
| "tile.compare.title": "🆚 Compare models", | |
| "tile.compare.desc": "Side-by-side, or browse the empirical model landscape.", | |
| "tile.manual.title": "📋 Manual / free-form", | |
| "tile.manual.desc": "Pick a specific recipe by hand, or ask in plain English.", | |
| "tile.diagnose.tip": "Start here when you have a specific model id and want a full diagnostic: <strong>Profile</strong> runs all 5 recipes at once. <strong>Unmask</strong> checks if max_position_embeddings is honest. <strong>NIAH→Reason</strong> predicts retrieval-vs-reasoning gap. <strong>Quant</strong> predicts whether quantizing will break it. <strong>Inspect</strong> lets you paste raw config.json for private/in-dev models.", | |
| "tile.trust.tip": "When you see a score and want to know if it's real. <strong>Contamination</strong> rates 20+ benchmarks for likelihood the model saw them during training. <strong>Drift</strong> tells you if a gap between two evals is numerical noise or a real bug (chat-template mismatch, KV-cache layout, etc.). <strong>Arena CI</strong> reconstructs the confidence intervals Chatbot Arena hides — many top-Elo \"wins\" are statistically tied.", | |
| "tile.eval.tip": "Before you run lm-eval-harness or vLLM serve, get the right CLI flag. <strong>Chat-template Sniffer</strong> detects the template family (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) and emits the exact <code>--apply_chat_template</code> / <code>--chat-template</code> invocation. Solves issue #1841 in lm-eval-harness (silent ÷2 accuracy). <strong>Diagnose CLI</strong> generates the Python command to measure γ_obs on your local GPU.", | |
| "tile.compare.tip": "<strong>Compare</strong>: pick 2-3 candidate models + one recipe, see verdicts in a side-by-side table (e.g. Llama-3-8B vs Mistral-7B at 32k context). <strong>Phase diagram</strong>: scatter of 23 empirical models on the (log θ, γ) plane, with the Padé curve overlaid. Hover dots for details, click to load that model into the Recipe form.", | |
| "tile.manual.tip": "<strong>Recipe</strong>: pick a specific X-N recipe (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 KV compression, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) and fill the form by hand for full control. <strong>Ask</strong>: type a free-form question; an in-browser 0.5B LLM (Qwen2.5) picks the right recipe and runs it. Best for \"what would happen if...\" exploration.", | |
| "share.import_desc": "Got a JSON file from someone else's TAF analysis? Load it here to see the verdict + chain locally. Same view as if you'd run it yourself.", | |
| "share.import_btn": "📂 Load shared JSON", | |
| "synthesis.system": "You are a precise transformer LLM diagnostic assistant. Given pre-computed TAF formula results, write a clear plain-English summary in 4-6 sentences. Cite the section number (§X.Y) for each number you mention. Always give a concrete recommendation. Do NOT invent numbers.", | |
| // INSPECTOR mode | |
| "inspector.title": "🔍 Architecture Inspector", | |
| "inspector.desc": "Paste the raw <code>config.json</code> contents. The tool extracts the architectural parameters and runs the full 5-recipe Profile.", | |
| "inspector.tip": "<strong>Paste any config.json directly</strong>. Tool parses it and runs the full Profile. Useful for: private models, in-development configs, models not yet on HuggingFace, or comparing what your custom architecture would do.", | |
| "inspector.quickstart": "💡 Use case: you have a private model not on HF Hub, or a config you're designing. Paste the raw JSON below and get a full TAF profile.", | |
| "inspector.placeholder": "{\n \"model_type\": \"llama\",\n \"rope_theta\": 500000,\n \"max_position_embeddings\": 8192,\n \"num_attention_heads\": 32,\n \"num_key_value_heads\": 8,\n \"hidden_size\": 4096,\n \"num_hidden_layers\": 32\n}", | |
| "inspector.T_eval": "T_eval (your target context):", | |
| "inspector.btn": "🚀 Inspect & profile", | |
| // WHAT-IF slider | |
| "whatif.title": "🎚 What-if: drag T_eval to see γ change live", | |
| "whatif.desc": "Pure JS recompute (no Pyodide call). Shows the geometric γ_Padé and d_horizon as you slide. The full chain re-runs on click.", | |
| "whatif.T_eval": "<strong>T_eval</strong>", | |
| "whatif.gamma_pade": "<strong>γ_Padé</strong>", | |
| "whatif.d_horizon": "<strong>d_horizon</strong>", | |
| "whatif.l_niah": "<strong>L_NIAH ceiling</strong>", | |
| "whatif.predicted": "<strong>Predicted geometric verdict</strong>", | |
| "whatif.rerun": "↻ Recompute full chain at this T_eval", | |
| // COMMUNITY feed | |
| "community.title": "🌐 Recent community submissions", | |
| "community.desc": "Live feed from the public registry. Click any submission to view full analysis.", | |
| "community.browse_all": "Browse all →", | |
| "community.loading": "Loading...", | |
| "community.no_repo": "The registry repo isn't created yet. Once it exists with submissions, they'll appear here live.", | |
| "community.no_submissions": "No submissions yet. Be the first — generate a Profile and click 📤 Submit to registry.", | |
| // FALSIFICATION dashboard | |
| "falsification.title": "🔬 Paper predictions — falsification status", | |
| "falsification.desc": "The TAF framework rests on falsifiable predictions (F1-F23). Each is empirically tested. Here's the live status of every prediction in the paper.", | |
| "falsification.summary": "{confirmed} confirmed · {partial} partial · {refuted} refuted · {untested} untested (out of {total} total predictions)", | |
| "falsification.col.id": "ID", | |
| "falsification.col.claim": "Claim", | |
| "falsification.col.status": "Status", | |
| "falsification.col.evidence": "Evidence", | |
| "tafcard.title": "📇 TAF Card — full model profile", | |
| "tafcard.recipes_title": "📋 Recipes — verdict per dimension", | |
| "tafcard.recipes_count_label": "dimensions", | |
| "tafcard.numbers_title": "🔢 Key numbers (paper §26)", | |
| "tafcard.fals_title": "🔬 Falsification status (F1-F23)", | |
| "tafcard.fals_none": "No falsifications applicable.", | |
| "tafcard.diag_title": "🔬 Diagnostics — numbers · γ check · what-if", | |
| "tafcard.verify_title": "✓ Verification — Lean + Sage + falsification", | |
| "tafcard.share_title": "📂 Provenance & share", | |
| "tafcard.whatif_title": "🎚️ What-if explorer", | |
| "verdict.go": "GO", | |
| "verdict.no": "NO", | |
| "verdict.degraded": "DEGRADED", | |
| "compare.title_out": "🆚 Comparison Table", | |
| "status.loading_pyodide": "⏳ Loading Python runtime (~10MB, first time only)...", | |
| "status.loading_taf": "⏳ Loading TAF formulas + recipes...", | |
| "status.ready": "✅ Ready. Pick a model and click Profile to start.", | |
| "status.computing": "🧮 Computing TAF chain...", | |
| "status.done": "✅ Done.", | |
| "profile.hf_placeholder": "e.g. meta-llama/Meta-Llama-3-8B or Qwen/Qwen2.5-7B", | |
| "compare.hf_placeholder": "HF model id (e.g. meta-llama/Meta-Llama-3-8B)", | |
| "compare.slot1_placeholder": "HF model id (e.g. meta-llama/Meta-Llama-3-8B)", | |
| "compare.slot2_placeholder": "HF model id #2", | |
| "compare.slot3_placeholder": "HF model id #3 (optional)", | |
| "compare.preset_default": "— or preset —", | |
| // Form parameters | |
| "param.theta": "θ (rope_theta)", | |
| "param.theta.tip": "<strong>RoPE base frequency</strong> from <code>config.rope_theta</code>. Higher = more long-range capacity.", | |
| "param.T_train": "T_train", | |
| "param.T_train.tip": "<strong>Max training context</strong>. From <code>max_position_embeddings</code>. Beyond this is extrapolation.", | |
| "param.T_eval": "T_eval (your target)", | |
| "param.T_eval.tip": "<strong>Your target inference context</strong>. The whole question is: will the model behave well at THIS length?", | |
| "param.n_attn": "n_attention_heads", | |
| "param.n_attn.tip": "<strong>Number of attention heads</strong> per layer. From <code>num_attention_heads</code>.", | |
| "param.n_kv": "n_kv_heads", | |
| "param.n_kv.tip": "<strong>KV heads</strong>. If < n_attention_heads → GQA (Grouped Query Attention). Reduces KV memory but pushes γ toward Hagedorn.", | |
| "param.d_head": "head_dim", | |
| "param.d_head.tip": "<strong>Per-head dimension</strong>. Typical 64, 96, 128. From <code>head_dim</code> or <code>hidden_size / num_attention_heads</code>.", | |
| "param.n_layers": "n_layers", | |
| "param.n_layers.tip": "<strong>Number of transformer blocks</strong>. From <code>num_hidden_layers</code>.", | |
| "param.n_params": "n_params (e.g. 8e9)", | |
| "param.n_params.tip": "<strong>Total parameter count</strong>. Threshold ~400M for induction-head emergence. Affects KV memory and budget recipes.", | |
| "param.has_swa": "Has SWA?", | |
| "param.has_swa.tip": "<strong>Sliding Window Attention</strong>. <code>true</code> for Mistral, gemma-2, phi-3. v0.5.3 calibration audit disabled the historical δ_SWA correction (n=1 fit).", | |
| "common.yes": "Yes", | |
| "common.no": "No", | |
| // Mode tooltips | |
| "modes.tip": "<strong>Fourteen ways to use the tool</strong>.<br><strong>📇 Profile</strong>: paste a model id → 5-recipe TAF Card.<br><strong>🆚 Compare</strong>: 2-3 models side-by-side on one recipe.<br><strong>🔍 Inspect config</strong>: paste raw config.json → full Profile.<br><strong>💬 Ask</strong>: free-form question, browser LLM picks the recipe.<br><strong>📋 Recipe</strong>: manual selection with full form control.<br><strong>🩺 Diagnose CLI</strong>: generate Python command for local γ measurement.<br><strong>📊 Phase diagram</strong>: 23-model panel on (log θ, γ) plane.<br><strong>🪟 Unmask</strong>: detect misleading max_position_embeddings (SWA / YaRN / RoPE-scaling).<br><strong>📜 Chat-template</strong>: detect family + give exact CLI flag for lm-eval / vLLM / transformers.<br><strong>🎯 Arena CI</strong>: reconstruct confidence intervals from raw pairwise vote data; detect statistical ties Arena hides.<br><strong>🧪 Contamination</strong>: rate 20+ benchmarks for contamination probability based on training cutoff vs release date.<br><strong>⚖️ Quant</strong>: predict γ-shift and ΔPPL for any (model × quant scheme); recommend safer alternative on cliff.<br><strong>🔀 Drift</strong>: same model, different scores on two setups — bug or noise? Predict numerical-noise band and flag real bugs.<br><strong>🔍 NIAH→Reason</strong>: predict NIAH and multi-hop reasoning pass rates from architecture; find your model's safe reasoning context.", | |
| "profile.tip": "<strong>One-click full diagnosis</strong>. Paste any HF model id (or pick preset). Tool runs all 5 recipes (long-context, KV-compression, custom-vs-API, budget, hardware) and produces a single <strong>TAF Card</strong> with verdict per dimension + key numbers + architecture classification.<br><br><strong>Use case</strong>: \"I'm evaluating Qwen2.5-32B for production — what's its full viability profile?\" → paste id → Profile → done.", | |
| "compare.tip": "<strong>Same recipe, multiple models</strong>. Pick 2-3 candidate models and one recipe. See verdicts in a single comparison table.<br><br><strong>Use case</strong>: \"I need long-context retrieval at 16K — which is best: Llama-3-8B, Mistral-7B, or Qwen-7B?\" → pick 3 + X-2 + 16K → see winner.", | |
| // Help modal | |
| "help.title": "📘 TAF Agent — User Manual", | |
| "help.what.title": "What does it do?", | |
| "help.what.body": "Predicts <strong>practical viability</strong> of any transformer LLM <em>before you spend GPU/$</em>. Answers questions like \"will this model work at L=32K?\" or \"should I train custom or use API?\" using deterministic Python formulas (TAF — Thermodynamic Attention Framework).", | |
| "help.modes.title": "How to use — 7 modes", | |
| "help.modes.profile": "<strong>📇 Profile</strong>: paste model id → all recipes at once = TAF Card. <strong>Best starting point</strong>.", | |
| "help.modes.compare": "<strong>🆚 Compare</strong>: 2-3 models side-by-side on same recipe. Best when choosing between candidates.", | |
| "help.modes.inspector": "<strong>🔍 Inspect config</strong>: paste raw <code>config.json</code> → tool parses + runs full Profile. For private models, in-development configs, or models not yet on HF Hub.", | |
| "help.modes.ask": "<strong>💬 Ask plain English</strong>: free-form question, in-browser LLM picks the recipe. Best for casual exploration.", | |
| "help.modes.recipe": "<strong>📋 Recipe + form</strong>: manual selection, full parameter control. Best when you want exact control.", | |
| "help.modes.diagnose": "<strong>🩺 Diagnose CLI</strong>: generate Python command to measure γ on your local machine (transformers + numpy). Fast ≈5 min CPU; full ≈20–60 min GPU. Output JSON re-uploadable via Inspect.", | |
| "help.modes.phase": "<strong>📊 Phase diagram</strong>: scatter plot of 23 panel models on (log θ, γ) plane. Hagedorn line γ=1 separates Phase A from Phase B. Click a dot to load that model into Recipe form.", | |
| "help.recipes.title": "The 8 recipes available", | |
| "help.recipe.x1.title": "<strong>X-1 Custom training vs API</strong> — compares cost of training your own model vs paying for API access.", | |
| "help.recipe.x1.example": "Try: <em>\"Should I train an 8B custom model or use GPT-4o for 50M tokens/month?\"</em><br>Answer types: YES (custom) / NO (API) with break-even months.", | |
| "help.recipe.x2.title": "<strong>X-2 Long Context Viability</strong> — predicts if a model serves a target context length reliably.", | |
| "help.recipe.x2.example": "Try: <em>\"Will Meta-Llama-3-8B handle 32000 tokens for retrieval?\"</em><br>Chains: γ_Padé → decomposition → d_horizon → NIAH ceiling → hallucination → KV memory.<br>Verdict: YES / DEGRADED / NO with mitigation if needed.", | |
| "help.recipe.x3.title": "<strong>X-3 Budget pre-flight</strong> — given $ budget, what model is feasible to train?", | |
| "help.recipe.x3.example": "Try: <em>\"I have $5000, what model can I train?\"</em><br>Answer: GO / TINY-MODEL / MEMORY-LIMITED with concrete N (params) and D (tokens).", | |
| "help.recipe.x5.title": "<strong>X-5 Hardware selection</strong> — which GPU should I use to serve at target throughput?", | |
| "help.recipe.x5.example": "Try: <em>\"Cheapest hardware to serve Llama-3-8B at 10M tokens/day\"</em><br>Answer: best GPU + $/Mtok + capacity vs target.", | |
| "help.recipe.x19.title": "<strong>X-19 KV Compression decision</strong> — should I use soft decay, hard cutoff, or literature methods?", | |
| "help.recipe.x21.title": "<strong>X-21 Imprint Purity Diagnostic</strong> — predicts γ on RANDOM tokens via ν=−1/(2π); how clean is the model's RoPE prediction?", | |
| "help.recipe.x22.title": "<strong>X-22 Compute-Context Invariant</strong> — does γ × log(N²·D) lie in panel band 51.2 ± 16.8? Detects scaling/training anomalies.", | |
| "help.recipe.x23.title": "<strong>X-23 IH-Phase Detector</strong> — pre- or post-induction-head? Cheap probe via sign(γ_text − γ_random).", | |
| "help.recipe.x19.example": "Try: <em>\"How to compress KV cache for Qwen2.5-7B at 32K?\"</em><br>Answer: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.", | |
| "help.recipe.x21.example": "Try: <em>\"How clean is the RoPE prediction on Llama-3-8B?\"</em><br>Answer: predicted γ_random + purity diagnostic (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).", | |
| "help.recipe.x22.example": "Try: <em>\"Does Mistral-7B fit the compute-context invariant?\"</em><br>Answer: K = γ·log(N²·D), z-score, IN-BAND or OUTLIER.", | |
| "help.recipe.x23.example": "Try: <em>\"Is Qwen2.5-7B post-induction-head?\"</em><br>Answer: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY (with size-vs-Δγ consistency check).", | |
| "help.section.v04": "<strong>What's new in v0.4</strong> (sesión 29 findings 2026-04-28): three diagnostic recipes derived from cross-model panel analysis (n=22 LLMs).", | |
| "help.divider.v04_s29": "— v0.4 (sesión 29 findings) —", | |
| "footer.tech_stack": "Computation: Pyodide · Synthesis: WebLLM (Qwen2.5-0.5B local) · Hosting: GitHub Pages · Cost: $0", | |
| "help.v04.imprint": "<strong>Learned-imprint slope ν = −1/(2π)</strong>: RoPE rotation period 2π drives a positional bias on weights, proportional to log(N_params). Even random tokens show this scaling. ν is DERIVED — not fitted (empirical err 0.3%).", | |
| "help.v04.invariant": "<strong>Chinchilla-attention invariant K</strong>: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Connects compute scaling and attention exponent into a single dimensionless number.", | |
| "help.v04.ih_probe": "<strong>Δγ as IH probe</strong>: sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Cheaper than running an in-context-learning benchmark.", | |
| "help.v04.constants": "<strong>γ-cluster on famous constants</strong> (intriguing, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (golden conjugate, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat: could be coincidence.", | |
| "help.param.theta": "<strong>θ (rope_theta)</strong>: RoPE base frequency. Higher = more long-range capacity. Typical: 10000 (early), 500000 (Llama-3), 1000000 (Qwen2.5).", | |
| "help.param.T_train": "<strong>T_train</strong>: max context the model was trained on. From <code>max_position_embeddings</code>.", | |
| "help.param.T_eval": "<strong>T_eval</strong>: <em>your target</em> inference context length. The key knob.", | |
| "help.param.gqa": "<strong>n_kv_heads < n_attention_heads</strong>: model uses GQA (Grouped Query Attention). Reduces KV memory but pushes γ toward Hagedorn.", | |
| "help.param.swa": "<strong>has_SWA</strong>: model uses Sliding Window Attention (Mistral, gemma-2).", | |
| "help.param.nparams": "<strong>n_params</strong>: total parameter count. Threshold ~400M for induction-head emergence.", | |
| "help.add_models.title": "Adding new models (3 ways)", | |
| "help.add_models.preset": "<strong>Preset list</strong>: 11 popular models curated. Just select from dropdown.", | |
| "help.add_models.hf": "<strong>HF Hub fetch</strong>: paste any model id (e.g. <code>Qwen/Qwen2.5-32B-Instruct</code>), click 📥 Fetch. Browser downloads <code>config.json</code> directly from HuggingFace, fills the form. Works for any public model.", | |
| "help.add_models.manual": "<strong>Manual</strong>: fill the form fields directly with values from the model card.", | |
| "help.audit.title": "The audit chain", | |
| "help.audit.body": "Every result shows the full <strong>Computation Chain</strong> — each formula step with its inputs, output, and interpretation. Click any step to expand. Cite section numbers (§26.1, §19.1, etc.) refer to the underlying paper for derivation.", | |
| "help.synthesis.title": "The plain-English answer", | |
| "help.synthesis.body": "After the deterministic chain runs, an in-browser LLM (Qwen2.5-0.5B, ~350MB cached after first load) synthesizes a plain-English summary. The numbers above are <em>always correct</em> (deterministic Python); the synthesis is LLM-generated — verify against the chain if in doubt.", | |
| "help.params.title": "Common parameters explained", | |
| "help.verdicts.title": "What to look for in verdicts", | |
| "help.verdict.yes": "<strong style=\"color:#3fb950;\">YES / GO</strong> — proceed with confidence; numbers support the choice.", | |
| "help.verdict.deg": "<strong style=\"color:#d29922;\">DEGRADED / TINY-MODEL</strong> — works but with caveats; read the action.", | |
| "help.verdict.no": "<strong style=\"color:#f85149;\">NO / MEMORY-LIMITED</strong> — don't proceed as-is; mitigation provided.", | |
| "help.privacy.title": "Privacy", | |
| "help.privacy.body": "Everything runs in your browser. No telemetry, no analytics, no data sent anywhere. Even the LLM model runs locally via WebGPU/WebAssembly. Your model_ids and questions never leave this page.", | |
| "help.source.title": "Source & paper", | |
| "help.source.body": "Source code: <a href=\"https://github.com/karlesmarin/tafagent\" target=\"_blank\">github.com/karlesmarin/tafagent</a><br>Paper: <em>Marin 2026 — Predicting How Transformers Attend</em> (<a href=\"https://zenodo.org/records/19826343\" target=\"_blank\">Zenodo</a>; arXiv forthcoming)<br>Dataset: <a href=\"https://huggingface.co/datasets/karlexmarin/taf-attention-decay\" target=\"_blank\">taf-attention-decay</a> — 58 γ-measurements across 32 models (CC-BY-4.0)", | |
| "footer.text": "© 2026 Carles Marin · Apache-2.0 · independent research · the tool that closes the loop of the paper.", | |
| // §33 v0.4 (session 31, 2026-04-30) — new diagnostic functions | |
| "v04.title": "🆕 v0.4 — New diagnostics (sesion 31)", | |
| "v04.section.intro": "Four new diagnostic functions derived sesion 31 (2026-04-30) from cross-of-crosses formula games + Sócratic interrogation. Available in <code>taf_browser.py</code> §33.", | |
| "v04.arch.label": "Architectural Concentration", | |
| "v04.arch.desc": "γ_text ≈ γ_Padé − 0.012·n_kv. Cross-panel correlational law (R²=0.30). Caveat: not per-model predictor.", | |
| "v04.pdi.label": "PDI — Padé Deviation Index", | |
| "v04.pdi.desc": "PDI = d_horizon_obs/T_eval. Traffic light: green (≈1), orange (>>1), yellow (<<1), red (Phase B negative).", | |
| "v04.4bit.label": "4-bit Shift Predictor", | |
| "v04.4bit.desc": "MHA: R²(bf16)<0.9 → γ rises; R²>0.99 → γ drops. GQA: precision-robust regardless.", | |
| "v04.crit.label": "Critical Exponents Bundle", | |
| "v04.crit.desc": "ν_c, β_c, η_c (=γ−1, CORRECTED), α_C, γ_susc with AM-GM minimum at γ=1−1/√2≈0.293.", | |
| // §34 v0.5 (session 32, 2026-05-01) — Machine-verified framework consistency | |
| "v05.title": "🔬 v0.5 — Machine-verified consistency (sesion 32)", | |
| "v05.section.intro": "Sage Groebner basis + Lean Mathlib4 dual-tool verification of <strong>15 algebraic identities</strong> of TAF critical exponents. First transformer-attention framework with formal machine-proof backing.", | |
| "v05.verify.label": "Algebraic Consistency Check", | |
| "v05.verify.desc": "Given measured γ, verifies 12 D-SAGE identities (D-SAGE-1: 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). All passing = framework intact. Failures indicate bf16 outliers / quantization artifacts.", | |
| "v05.dsage1.label": "D-SAGE-1 (★★ core)", | |
| "v05.dsage1.desc": "Quadratic identity 2η² + η·γ_χ + 1 = 0 (Sage Groebner-discovered, Lean-verified). Replaces incorrect 'triple closure' claim. Refutes paper 1's η=2γ algebraically.", | |
| "v05.erratum.label": "Paper 1 erratum — η correction", | |
| "v05.erratum.desc": "Paper 1 originally claimed η = 2γ. Sage Groebner + Lean Mathlib4 proved this fails (residual (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Phase A). Correct value: η = γ−1, satisfying D-SAGE-1.", | |
| "v05.repro.label": "Reproducibility", | |
| "v05.repro.desc": "All 15 theorems machine-proof in Lean Mathlib4 (1973 jobs build success). Sage script: <code>analysis/sage_recursive_sweep_2026-04-30.sage</code>. Lean code: <code>lean_taf/taf/Taf/Identities.lean</code>.", | |
| // v0.5.1 — TAF Card consistency check button | |
| "v05.consistency.title": "🔬 Algebraic consistency check (Sage + Lean v0.5)", | |
| "v05.consistency.desc": "Verifies 12 D-SAGE algebraic identities of TAF critical exponents (machine-proof Sage Groebner basis + Lean Mathlib4). Pass = framework intact. Fail = bf16 outlier / quantization artifact.", | |
| "v05.consistency.btn": "🔬 Verify algebraic consistency", | |
| // v0.5.2 — Anti-Ising universality class badge | |
| "v05.antiising.badge": "🧲 Anti-Ising class (β=γ−1<0, machine-verified)", | |
| // v0.5.2 — Per-identity tooltips (plain English explanations) | |
| "v05.tooltip.D_SAGE_1": "Quadratic algebraic identity connecting anomalous dimension η and susceptibility γ_χ. The CORE identity discovered by Sage Groebner basis (machine-proof). Replaces earlier wrong claim of triple closure.", | |
| "v05.tooltip.D_SAGE_2": "In Phase A, β = γ−1 is negative (anti-Ising). Multiplied by χ = 1/(1−γ) gives exactly −1. Signature of TAF's negative-β regime.", | |
| "v05.tooltip.D_SAGE_4": "The specific heat exponent α and susceptibility χ sum to exactly 2 in TAF. Algebraic consequence of Josephson hyperscaling.", | |
| "v05.tooltip.D_SAGE_5": "Linear sum identity: α + γ_χ = 2(2−γ). Means as γ approaches 1 (Hagedorn), the sum approaches 2; at γ=0 it's 4.", | |
| "v05.tooltip.D_SAGE_6": "Order parameter exponent times susceptibility exponent equals a specific quadratic in γ. Factored algebraic relation.", | |
| "v05.tooltip.Rushbrooke_tautology": "Standard Rushbrooke hyperscaling 2β + γ_χ = ν·d at d=1. In TAF this is a TAUTOLOGY — γ_χ is defined exactly so this holds. Confirmed by Sage Groebner basis.", | |
| "v05.tooltip.Josephson_tautology": "Standard Josephson hyperscaling 2 − α = ν·d at d=1. In TAF this is a TAUTOLOGY — α is defined exactly so this holds.", | |
| "v05.tooltip.Fisher_independent": "Fisher relation γ_χ = (2−η)·ν. In TAF this is INDEPENDENT (does NOT close as identity, contrary to triple-closure claim). Residual is γ(2γ−3)/(1−γ).", | |
| "v05.tooltip.eta_2gamma_REFUTED": "Paper 1 originally claimed η=2γ. This identity refutes it: residual is positive throughout Phase A. Lean Mathlib4 machine-proof refutation.", | |
| "v05.tooltip.D_14_nu_imprint": "The learned imprint slope ν = −1/(2π) times 2π equals −1. Trivial dimensional check from paper 1.", | |
| "v05.tooltip.D_SAGE_7": "The central charge c=3 times |ν_imprint| times 2π equals 3. Dimensional closure connecting CFT and training imprint.", | |
| "v05.tooltip.nu_beta_id": "Correlation length exponent ν times order parameter exponent β equals −1 in Phase A. Variant of D-SAGE-2.", | |
| "v053.calibration.title": "🔬 v0.5.3 — Calibration audit (2026-05-02)", | |
| "v053.calibration.note": "<strong>SWA correction disabled</strong> — original δ_SWA = -0.21 was fit on n=1 model (insufficient data; group-mean +0.355 with single yes-case). <strong>post_IH correction marked exploratory</strong> — group-mean ≈ 0 in re-audit (n=22 panel) does not replicate the OLS fit. <strong>GQA correction replicates</strong> (panel +0.115 vs hardcoded +0.11). <strong>D_f formula corrected for Phase B (γ>1)</strong> — uses discrete cumulative sum instead of continuum approximation. LLaMA-3, Mistral, Gemma now report correct compression values.", | |
| "v053.release.banner": "🔧 v0.5.3 — Audit-driven fixes: KV compression D_f now uses discrete sum (correct for all γ); δ_SWA disabled (n=1 calibration); paper §5.2 C_V coefficient erratum (1/4 → 1/12).", | |
| // §35 v0.6 — γ predicted-vs-observed diagnostic | |
| "gamma_check.title": "🔍 γ predicted vs observed", | |
| "gamma_check.desc": "Enter your empirically measured γ. Tool detects regime: fraud (θ inflated) / compressed / over-Padé / SWA-random / normal.", | |
| "gamma_check.gobs_label": "γ_observed", | |
| "gamma_check.gobs_tip": "Empirically measured γ from your model's attention scores. Use the Diagnose CLI to obtain this from real weights.", | |
| "gamma_check.random_label": "Random corpus?", | |
| "gamma_check.random_tip": "Tick if γ_observed was measured on random/unstructured tokens. Distinguishes SWA signature (γ_obs > 1) from anomaly.", | |
| "gamma_check.regime": "Regime", | |
| "gamma_check.regime.normal": "Normal", | |
| "gamma_check.regime.fraud": "Fraud (θ inflated)", | |
| "gamma_check.regime.compressed": "Compressed context", | |
| "gamma_check.regime.overpade": "Over-Padé", | |
| "gamma_check.regime.swa": "SWA random-corpus signature", | |
| "gamma_check.regime.unknown": "Unknown", | |
| "gamma_check.regime.normal.desc": "η ∈ [0.85, 1.15]: model uses its full nominal context, no anomaly.", | |
| "gamma_check.regime.fraud.desc": "η < 0.01: nominal θ inflated. Model behaves as if θ ≪ advertised. Likely YaRN/marketing inflation without true context extension.", | |
| "gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5): context is compressed (model attends less far than nominal θ predicts). Common in instruction-tuned / RLHF models.", | |
| "gamma_check.regime.overpade.desc": "η > 1.5: model attends farther than Padé predicts. Possible Lerch-corrected regime or undertrained early-checkpoint.", | |
| "gamma_check.regime.swa.desc": "γ_obs > 1.05 on random corpus = sliding-window attention signature (Mistral / Gemma family).", | |
| "gamma_check.regime.unknown.desc": "Inputs out of range or γ_obs > 1 without random-corpus flag. Verify measurement.", | |
| "gamma_check.glossary.title": "ⓘ Glossary — what these variables mean", | |
| "gamma_check.glossary.gamma_pade": "<strong>γ_Padé</strong>: closed-form prediction (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.", | |
| "gamma_check.glossary.gamma_obs": "<strong>γ_observed</strong>: empirically measured from your model's attention scores (run the Diagnose CLI on real weights).", | |
| "gamma_check.glossary.theta_eff_obs":"<strong>θ_eff (observed)</strong>: inverted from γ_obs via T√2 / (1 − γ_obs). Effective θ implied by your measurement.", | |
| "gamma_check.glossary.theta_eff_pade":"<strong>θ_eff (Padé)</strong>: θ + T/√2. Effective θ predicted by closed-form.", | |
| "gamma_check.glossary.efficiency": "<strong>η</strong>: ratio θ_eff_obs / θ_eff_Padé. ≈1 = normal · <0.01 = fraud · <0.5 = compressed · >1.5 = over-Padé.", | |
| "gamma_check.glossary.delta_h": "<strong>ΔH_Cardy</strong>: log(θ_eff_obs / θ_nominal). Cardy entropy shift. Negative = compression entropy. ~0 = nominal match.", | |
| "gamma_check.glossary.regime": "<strong>Regime</strong>: automatic classifier from η + γ_obs + random_corpus flag.", | |
| // §36 v0.6 — Tooltips for inline ⓘ icons (per-variable explanations) | |
| "tooltip.gamma_pade": "<strong>γ_Padé(T_eval)</strong>: closed-form prediction (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.", | |
| "tooltip.gamma_decomposed": "<strong>γ_decomposed</strong>: γ from full architectural decomposition. Padé baseline + GQA shift + post-IH shift (calibrated audit-replicated subset).", | |
| "tooltip.d_horizon": "<strong>d_horizon</strong>: effective attention horizon. Beyond this position, scores fall below noise floor (paper §26).", | |
| "tooltip.L_NIAH": "<strong>L_NIAH ceiling</strong>: predicted ceiling for needle-in-a-haystack retrieval reliability at current d_horizon.", | |
| "tooltip.chi": "<strong>χ susceptibility</strong>: χ = 1/(1−γ). Diverges at the Hagedorn line γ=1.", | |
| "tooltip.kv_memory": "<strong>KV memory @ T_eval (BF16)</strong>: per-request KV cache = 2 · n_layers · n_kv_heads · d_head · T_eval bytes.", | |
| "tooltip.theta_eff_obs": "<strong>θ_eff (observed)</strong>: effective θ implied by your γ_observed: T√2 / (1 − γ_obs).", | |
| "tooltip.theta_eff_pade": "<strong>θ_eff (Padé)</strong>: effective θ predicted by closed-form: θ + T/√2.", | |
| "tooltip.efficiency": "<strong>η = θ_eff_obs / θ_eff_Padé</strong>: efficiency ratio. ≈1 = normal · <0.01 = fraud · <0.5 = compressed · >1.5 = over-Padé.", | |
| "tooltip.delta_h_cardy": "<strong>ΔH_Cardy</strong>: log(θ_eff_obs / θ_nominal). Cardy entropy shift. Negative = compression entropy. ~0 = nominal match.", | |
| "tooltip.verdict_aggregate": "<strong>Verdict</strong>: worst-of across all recipes. ✅ GO = all green · ⚠ DEGRADED = ≥1 yellow · ❌ NO = ≥1 red.", | |
| "tooltip.verdict_breakdown": "<strong>Per-recipe breakdown</strong>: each recipe tests an <em>independent</em> decision axis (long-context · budget · hardware · custom-vs-API · KV-compression). A ❌ on X-1 means \"use the API for your volume\" not \"the model fails\" — open the Recipes section for per-axis context.", | |
| "tooltip.gamma_pill": "<strong>γ headline</strong>: γ_decomposed (or γ_Padé fallback). Range (0,1) = Phase A (anti-Ising). γ ≥ 1 = Hagedorn / Phase B.", | |
| "tooltip.anti_ising": "<strong>Anti-Ising class</strong>: Phase A → β = γ−1 < 0. Machine-verified (Sage + Lean Mathlib4). See §35 v0.5.", | |
| // §37 v0.6 — Lean+Mathlib theorem table | |
| "lean.table.title": "📑 Lean+Mathlib theorem table", | |
| "lean.table.desc": "Every entry below is machine-proven against Lean 4 + Mathlib4. Click any L# link to jump to the source line on GitHub. Grouped by topic — click a header to expand.", | |
| "lean.table.theorem": "Theorem", | |
| "lean.table.claim": "Claim", | |
| "lean.table.tactic": "Tactic", | |
| "lean.table.source": "Source", | |
| "lean.table.lean": "Lean", | |
| "lean.findings.title": "🔎 Substantive findings", | |
| "lean.findings.detected_by": "Detected by", | |
| "lean.findings.fixed_by": "Fixed by", | |
| "lean.findings.recommendation":"Recommendation", | |
| "lean.meta.repo": "Repo", | |
| "lean.meta.build": "Build", | |
| "lean.meta.theorems": "Theorems", | |
| "lean.meta.verified": "verified", | |
| "lean.meta.rejected": "rejected", | |
| "lean.meta.sorry": "sorry", | |
| "lean.meta.findings": "substantive findings", | |
| "lean.manifest.loading": "Loading Lean manifest…", | |
| "lean.manifest.error": "Lean manifest unavailable", | |
| // Help modal — v0.6 section | |
| "help.v06.title": "🆕 v0.6 — γ predicted-vs-observed + Cardy ΔH + Lean badges", | |
| "help.v06.intro": "<em>v0.6 (2026-05-06): three new diagnostics live in the TAF Card under <strong>🔬 Diagnostics</strong>. All run in your browser; γ_observed comes from the Diagnose CLI on real weights.</em>", | |
| "help.v06.layout.title": "TAF Card layout (new in v0.6)", | |
| "help.v06.layout.body": "After clicking <strong>🚀 Generate full profile</strong> the card shows: a <strong>hero strip</strong> on top (architecture class + meta + 3 pills: aggregate verdict ✅/⚠/❌, γ headline, 🧲 Anti-Ising if Phase A) and four <strong>expandable sections</strong>: <strong>📋 Recipes</strong> (open by default — verdict per dimension), <strong>🔬 Diagnostics</strong> (key numbers, γ predicted vs observed, what-if explorer), <strong>✓ Verification</strong> (Sage+Lean algebraic consistency, falsification F1-F23), <strong>📂 Provenance & share</strong> (calibration audit + JSON download / share link / registry submit). Click any header to expand. Every variable has an inline <strong>ⓘ</strong> tooltip.", | |
| "help.v06.gamma_check.title": "γ predicted vs observed", | |
| "help.v06.gamma_check.body": "Enter the empirically-measured γ from your model and the tool computes <strong>η = θ_eff_obs / θ_eff_Padé</strong> and classifies into one of 5 regimes:", | |
| "help.v06.case.normal": "<strong>Normal</strong> (η ∈ [0.85, 1.15]) — model uses its full nominal context. <em>Use case</em>: validate a new release before adopting it.", | |
| "help.v06.case.fraud": "<strong>Fraud</strong> (η < 0.01) — nominal θ inflated; model behaves as if θ ≪ advertised. <em>Use case</em>: detect YaRN/marketing inflation (CodeLlama / Mistral-Nemo pattern).", | |
| "help.v06.case.compressed": "<strong>Compressed</strong> (η < 0.5) — context compressed; model attends shorter than nominal θ. <em>Use case</em>: spot RLHF/instruction-tuning compression (LLaMA-2 pattern).", | |
| "help.v06.case.overpade": "<strong>Over-Padé</strong> (η > 1.5) — model attends farther than Padé predicts. <em>Use case</em>: identify Lerch-corrected regime or undertrained early checkpoints (pythia-1b pattern).", | |
| "help.v06.case.swa": "<strong>SWA random-corpus</strong> (γ_obs > 1.05 with random_corpus=Yes) — sliding-window attention signature. <em>Use case</em>: confirm Mistral / Gemma SWA on random tokens.", | |
| "help.v06.cardy.title": "Cardy ΔH diagnostic", | |
| "help.v06.cardy.body": "<strong>ΔH_Cardy = log(θ_eff_obs / θ_nominal)</strong>. Entropy shift between observed effective θ and nominal θ. Strong negative = compression entropy; near zero = nominal match. Complements η for borderline cases.", | |
| "help.v06.lean.title": "Lean + Mathlib verification badges", | |
| "help.v06.lean.body": "TAF identities are formally machine-proven in Lean Mathlib4: <strong>37 theorems</strong> in 7 groups (Padé, RG flow, Cayley, D-SAGE, audit findings, erratum CV, misc) + <strong>1 substantive finding</strong> (V-derivative factor-2, theorem <code>V_derivative_ne_RG_beta</code>). Source: <a href=\"https://github.com/karlesmarin/lean-taf\" target=\"_blank\">github.com/karlesmarin/lean-taf</a> (commit 25c77fd). Re-verify locally: <code>git clone --depth=1 https://github.com/karlesmarin/lean-taf && cd lean-taf && lake exe cache get && lake env lean Taf/Identities.lean</code>. The 🧲 Anti-Ising pill in the hero strip and the Verification accordion link to specific source lines.", | |
| "help.v06.glossary.title": "Variable glossary (also embedded in TAF Card)", | |
| "help.v06.glossary.body": "Every variable in the TAF Card has an inline ⓘ tooltip. The complete list: γ, γ_Padé, γ_decomposed, γ_observed, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, KV memory, regime. Hover any ⓘ for the definition + paper section.", | |
| }, | |
| // ──────────────────────────────────────────────────────────────────────── | |
| // ES — Español | |
| // ──────────────────────────────────────────────────────────────────────── | |
| es: { | |
| // §33 v0.4 (sesión 31, 2026-04-30) — nuevas funciones diagnósticas | |
| "v04.title": "🆕 v0.4 — Nuevos diagnósticos (sesion 31)", | |
| "v04.section.intro": "Cuatro nuevas funciones diagnósticas derivadas en sesión 31 (2026-04-30) desde juegos de fórmulas cross-of-crosses + interrogación socrática. Disponibles en <code>taf_browser.py</code> §33.", | |
| "v04.arch.label": "Concentración Arquitectural", | |
| "v04.arch.desc": "γ_text ≈ γ_Padé − 0.012·n_kv. Ley correlacional cross-panel (R²=0.30). Caveat: no es predictor per-model.", | |
| "v04.pdi.label": "PDI — Índice de Desviación de Padé", | |
| "v04.pdi.desc": "PDI = d_horizon_obs/T_eval. Semáforo: verde (≈1), naranja (>>1), amarillo (<<1), rojo (Phase B negativo).", | |
| "v04.4bit.label": "Predictor de Shift 4-bit", | |
| "v04.4bit.desc": "MHA: R²(bf16)<0.9 → γ sube; R²>0.99 → γ baja. GQA: precision-robusto.", | |
| "v04.crit.label": "Bundle de Exponentes Críticos", | |
| "v04.crit.desc": "ν_c, β_c, η_c (=γ−1, CORREGIDO), α_C, γ_susc con mínimo AM-GM en γ=1−1/√2≈0.293.", | |
| // §34 v0.5 (sesión 32, 2026-05-01) — Consistencia algebraica verificada por máquina | |
| "v05.title": "🔬 v0.5 — Consistencia verificada por máquina (sesion 32)", | |
| "v05.section.intro": "Verificación dual con Sage Groebner basis + Lean Mathlib4 de <strong>15 identidades algebraicas</strong> de los exponentes críticos TAF. Primer framework transformer-attention con respaldo formal machine-proof.", | |
| "v05.verify.label": "Comprobación de Consistencia Algebraica", | |
| "v05.verify.desc": "Dado γ medido, verifica 12 identidades D-SAGE (D-SAGE-1: 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). Todas pasando = framework intacto. Fallos indican bf16 outliers / artefactos de cuantización.", | |
| "v05.dsage1.label": "D-SAGE-1 (★★ core)", | |
| "v05.dsage1.desc": "Identidad cuadrática 2η² + η·γ_χ + 1 = 0 (descubierta por Sage Groebner, verificada Lean). Reemplaza claim incorrecto de 'cierre triple'. Refuta η=2γ del paper 1 algebraicamente.", | |
| "v05.erratum.label": "Erratum paper 1 — corrección η", | |
| "v05.erratum.desc": "Paper 1 afirmaba η = 2γ. Sage Groebner + Lean Mathlib4 demostraron que falla (residual (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Fase A). Valor correcto: η = γ−1, satisface D-SAGE-1.", | |
| "v05.repro.label": "Reproducibilidad", | |
| "v05.repro.desc": "Los 15 teoremas son machine-proof en Lean Mathlib4 (build exitoso 1973 jobs). Script Sage: <code>analysis/sage_recursive_sweep_2026-04-30.sage</code>. Código Lean: <code>lean_taf/taf/Taf/Identities.lean</code>.", | |
| // v0.5.1 — TAF Card consistency check button | |
| "v05.consistency.title": "🔬 Comprobación de consistencia algebraica (Sage + Lean v0.5)", | |
| "v05.consistency.desc": "Verifica 12 identidades algebraicas D-SAGE de los exponentes críticos TAF (machine-proof Sage Groebner basis + Lean Mathlib4). Pasa = framework intacto. Falla = bf16 outlier / artefacto de cuantización.", | |
| "v05.consistency.btn": "🔬 Verificar consistencia algebraica", | |
| // v0.5.2 — Anti-Ising universality class badge | |
| "v05.antiising.badge": "🧲 Clase Anti-Ising (β=γ−1<0, verificado por máquina)", | |
| // v0.5.2 — Per-identity tooltips (explicaciones en lenguaje claro) | |
| "v05.tooltip.D_SAGE_1": "Identidad algebraica cuadrática que conecta dimensión anómala η con susceptibilidad γ_χ. Identidad CORE descubierta por Sage Groebner basis (machine-proof). Reemplaza claim incorrecto de triple closure.", | |
| "v05.tooltip.D_SAGE_2": "En Fase A, β = γ−1 es negativo (anti-Ising). Multiplicado por χ = 1/(1−γ) da exactamente −1. Signature del régimen negativo-β de TAF.", | |
| "v05.tooltip.D_SAGE_4": "El exponente de calor específico α y la susceptibilidad χ suman exactamente 2 en TAF. Consecuencia algebraica del hiperescalado de Josephson.", | |
| "v05.tooltip.D_SAGE_5": "Identidad lineal: α + γ_χ = 2(2−γ). Significa que cuando γ se acerca a 1 (Hagedorn), la suma se acerca a 2; en γ=0 vale 4.", | |
| "v05.tooltip.D_SAGE_6": "Exponente de parámetro de orden multiplicado por exponente de susceptibilidad da una cuadrática específica en γ. Relación algebraica factorizada.", | |
| "v05.tooltip.Rushbrooke_tautology": "Hiperescalado de Rushbrooke estándar 2β + γ_χ = ν·d en d=1. En TAF es TAUTOLOGÍA — γ_χ se define exactamente para que se cumpla. Confirmado por Sage Groebner basis.", | |
| "v05.tooltip.Josephson_tautology": "Hiperescalado de Josephson estándar 2 − α = ν·d en d=1. En TAF es TAUTOLOGÍA — α se define exactamente para que se cumpla.", | |
| "v05.tooltip.Fisher_independent": "Relación de Fisher γ_χ = (2−η)·ν. En TAF es INDEPENDIENTE (NO cierra como identidad, contrario al claim de triple closure). El residuo es γ(2γ−3)/(1−γ).", | |
| "v05.tooltip.eta_2gamma_REFUTED": "Paper 1 afirmaba η=2γ. Esta identidad lo refuta: el residuo es positivo en toda Fase A. Refutación machine-proof por Lean Mathlib4.", | |
| "v05.tooltip.D_14_nu_imprint": "La pendiente de imprint aprendido ν = −1/(2π) multiplicada por 2π da −1. Verificación dimensional trivial del paper 1.", | |
| "v05.tooltip.D_SAGE_7": "La carga central c=3 multiplicada por |ν_imprint| multiplicada por 2π da 3. Cierre dimensional conectando CFT con imprint de entrenamiento.", | |
| "v05.tooltip.nu_beta_id": "Exponente de longitud de correlación ν multiplicado por exponente de parámetro de orden β da −1 en Fase A. Variante de D-SAGE-2.", | |
| "v053.calibration.title": "🔬 v0.5.3 — Auditoría de calibración (2026-05-02)", | |
| "v053.calibration.note": "<strong>Corrección SWA desactivada</strong> — δ_SWA = -0.21 original se ajustó con n=1 modelo (datos insuficientes; promedio del único caso +0.355). <strong>Corrección post_IH marcada exploratoria</strong> — promedio de grupo ≈ 0 en re-auditoría (panel n=22) no replica el ajuste OLS. <strong>Corrección GQA replica</strong> (panel +0.115 vs hardcoded +0.11). <strong>Fórmula D_f corregida para Fase B (γ>1)</strong> — usa suma cumulativa discreta en lugar de aproximación continua. LLaMA-3, Mistral, Gemma ahora reportan valores correctos de compresión.", | |
| "v053.release.banner": "🔧 v0.5.3 — Correcciones por audit: D_f de compresión KV ahora usa suma discreta (correcto para todo γ); δ_SWA desactivado (calibración n=1); erratum coeficiente C_V paper §5.2 (1/4 → 1/12).", | |
| // §35 v0.6 — Diagnóstico γ predicho vs observado | |
| "gamma_check.title": "🔍 γ predicho vs observado", | |
| "gamma_check.desc": "Introduce tu γ medido empíricamente. La herramienta detecta el régimen: fraude (θ inflado) / comprimido / sobre-Padé / SWA-aleatorio / normal.", | |
| "gamma_check.gobs_label": "γ_observado", | |
| "gamma_check.gobs_tip": "γ medido empíricamente desde los attention scores de tu modelo. Usa la CLI de Diagnose para obtenerlo desde pesos reales.", | |
| "gamma_check.random_label": "¿Corpus aleatorio?", | |
| "gamma_check.random_tip": "Marca sí si γ_observado se midió sobre tokens aleatorios/no estructurados. Distingue la firma SWA (γ_obs > 1) de una anomalía.", | |
| "gamma_check.regime": "Régimen", | |
| "gamma_check.regime.normal": "Normal", | |
| "gamma_check.regime.fraud": "Fraude (θ inflado)", | |
| "gamma_check.regime.compressed": "Contexto comprimido", | |
| "gamma_check.regime.overpade": "Sobre-Padé", | |
| "gamma_check.regime.swa": "Firma SWA (corpus aleatorio)", | |
| "gamma_check.regime.unknown": "Desconocido", | |
| "gamma_check.regime.normal.desc": "η ∈ [0.85, 1.15]: el modelo usa su contexto nominal completo, sin anomalías.", | |
| "gamma_check.regime.fraud.desc": "η < 0.01: θ nominal inflado. El modelo se comporta como si θ ≪ del anunciado. Probable inflación tipo YaRN/marketing sin extensión real de contexto.", | |
| "gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5): contexto comprimido (el modelo atiende menos lejos de lo que predice θ nominal). Común en modelos instruction-tuned / RLHF.", | |
| "gamma_check.regime.overpade.desc": "η > 1.5: el modelo atiende más lejos de lo que Padé predice. Posible régimen Lerch-corregido o checkpoint temprano sub-entrenado.", | |
| "gamma_check.regime.swa.desc": "γ_obs > 1.05 sobre corpus aleatorio = firma de sliding-window attention (familias Mistral / Gemma).", | |
| "gamma_check.regime.unknown.desc": "Entradas fuera de rango o γ_obs > 1 sin flag de corpus aleatorio. Verifica la medición.", | |
| "gamma_check.glossary.title": "ⓘ Glosario — significado de las variables", | |
| "gamma_check.glossary.gamma_pade": "<strong>γ_Padé</strong>: predicción cerrada (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.", | |
| "gamma_check.glossary.gamma_obs": "<strong>γ_observado</strong>: medido empíricamente desde los attention scores (ejecuta Diagnose CLI sobre pesos reales).", | |
| "gamma_check.glossary.theta_eff_obs":"<strong>θ_eff (observado)</strong>: invertido desde γ_obs vía T√2 / (1 − γ_obs). θ efectivo implicado por tu medición.", | |
| "gamma_check.glossary.theta_eff_pade":"<strong>θ_eff (Padé)</strong>: θ + T/√2. θ efectivo predicho por la fórmula cerrada.", | |
| "gamma_check.glossary.efficiency": "<strong>η</strong>: ratio θ_eff_obs / θ_eff_Padé. ≈1 = normal · <0.01 = fraude · <0.5 = comprimido · >1.5 = sobre-Padé.", | |
| "gamma_check.glossary.delta_h": "<strong>ΔH_Cardy</strong>: log(θ_eff_obs / θ_nominal). Cambio de entropía de Cardy. Negativo = entropía de compresión. ~0 = coincide con nominal.", | |
| "gamma_check.glossary.regime": "<strong>Régimen</strong>: clasificador automático a partir de η + γ_obs + flag corpus_aleatorio.", | |
| // §36 v0.6 — Tooltips para iconos ⓘ inline | |
| "tooltip.gamma_pade": "<strong>γ_Padé(T_eval)</strong>: predicción cerrada (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.", | |
| "tooltip.gamma_decomposed": "<strong>γ_descompuesto</strong>: γ desde descomposición arquitectural completa. Línea base Padé + shift GQA + shift post-IH (subconjunto replicado en audit calibrado).", | |
| "tooltip.d_horizon": "<strong>d_horizon</strong>: horizonte efectivo de atención. Más allá los scores caen bajo el suelo de ruido (paper §26).", | |
| "tooltip.L_NIAH": "<strong>Techo L_NIAH</strong>: techo predicho de fiabilidad needle-in-a-haystack al d_horizon actual.", | |
| "tooltip.chi": "<strong>χ susceptibilidad</strong>: χ = 1/(1−γ). Diverge en la línea Hagedorn γ=1.", | |
| "tooltip.kv_memory": "<strong>Memoria KV @ T_eval (BF16)</strong>: caché KV por petición = 2 · n_layers · n_kv_heads · d_head · T_eval bytes.", | |
| "tooltip.theta_eff_obs": "<strong>θ_eff (observado)</strong>: θ efectivo implicado por tu γ_observado: T√2 / (1 − γ_obs).", | |
| "tooltip.theta_eff_pade": "<strong>θ_eff (Padé)</strong>: θ efectivo predicho por la fórmula cerrada: θ + T/√2.", | |
| "tooltip.efficiency": "<strong>η = θ_eff_obs / θ_eff_Padé</strong>: ratio de eficiencia. ≈1 = normal · <0.01 = fraude · <0.5 = comprimido · >1.5 = sobre-Padé.", | |
| "tooltip.delta_h_cardy": "<strong>ΔH_Cardy</strong>: log(θ_eff_obs / θ_nominal). Cambio de entropía de Cardy. Negativo = entropía de compresión. ~0 = coincide con nominal.", | |
| "tooltip.verdict_aggregate": "<strong>Veredicto</strong>: peor-de entre todas las recipes. ✅ ADELANTE = todo verde · ⚠ DEGRADADO = ≥1 amarillo · ❌ NO = ≥1 rojo.", | |
| "tooltip.verdict_breakdown": "<strong>Desglose por recipe</strong>: cada recipe evalúa un eje de decisión <em>independiente</em> (contexto-largo · presupuesto · hardware · custom-vs-API · compresión-KV). Un ❌ en X-1 significa «usa la API para tu volumen» no «el modelo falla» — abre la sección Recipes para contexto por eje.", | |
| "tooltip.gamma_pill": "<strong>γ titular</strong>: γ_descompuesto (o γ_Padé como fallback). Rango (0,1) = Fase A (anti-Ising). γ ≥ 1 = Hagedorn / Fase B.", | |
| "tooltip.anti_ising": "<strong>Clase Anti-Ising</strong>: Fase A → β = γ−1 < 0. Machine-verified (Sage + Lean Mathlib4). Ver §35 v0.5.", | |
| // §37 v0.6 — Tabla de teoremas Lean+Mathlib | |
| "lean.table.title": "📑 Tabla de teoremas Lean+Mathlib", | |
| "lean.table.desc": "Cada entrada está machine-proven contra Lean 4 + Mathlib4. Click en cualquier link L# para saltar a la línea fuente en GitHub. Agrupado por tema — click en cabecera para expandir.", | |
| "lean.table.theorem": "Teorema", | |
| "lean.table.claim": "Afirmación", | |
| "lean.table.tactic": "Táctica", | |
| "lean.table.source": "Fuente", | |
| "lean.table.lean": "Lean", | |
| "lean.findings.title": "🔎 Findings sustantivos", | |
| "lean.findings.detected_by": "Detectado por", | |
| "lean.findings.fixed_by": "Arreglado por", | |
| "lean.findings.recommendation":"Recomendación", | |
| "lean.meta.repo": "Repo", | |
| "lean.meta.build": "Build", | |
| "lean.meta.theorems": "Teoremas", | |
| "lean.meta.verified": "verificados", | |
| "lean.meta.rejected": "rechazados", | |
| "lean.meta.sorry": "sorry", | |
| "lean.meta.findings": "findings sustantivos", | |
| "lean.manifest.loading": "Cargando manifest Lean…", | |
| "lean.manifest.error": "Manifest Lean no disponible", | |
| // Help modal — sección v0.6 | |
| "help.v06.title": "🆕 v0.6 — γ predicho-vs-observado + Cardy ΔH + badges Lean", | |
| "help.v06.intro": "<em>v0.6 (2026-05-06): tres diagnósticos nuevos viven en el TAF Card bajo <strong>🔬 Diagnósticos</strong>. Todo corre en tu navegador; γ_observado lo obtienes con la Diagnose CLI sobre pesos reales.</em>", | |
| "help.v06.layout.title": "Layout del TAF Card (nuevo en v0.6)", | |
| "help.v06.layout.body": "Tras click en <strong>🚀 Generar perfil completo</strong> la tarjeta muestra: una <strong>tira hero</strong> arriba (clase de arquitectura + meta + 3 pills: veredicto agregado ✅/⚠/❌, γ titular, 🧲 Anti-Ising si Fase A) y cuatro <strong>secciones plegables</strong>: <strong>📋 Recipes</strong> (abierto por defecto — veredicto por dimensión), <strong>🔬 Diagnósticos</strong> (números clave, γ predicho vs observado, explorador what-if), <strong>✓ Verificación</strong> (consistencia algebraica Sage+Lean, falsificación F1-F23), <strong>📂 Procedencia y compartir</strong> (auditoría de calibración + descarga JSON / enlace / submit al registro). Click en cualquier cabecera para expandir. Cada variable tiene tooltip <strong>ⓘ</strong> inline.", | |
| "help.v06.gamma_check.title": "γ predicho vs observado", | |
| "help.v06.gamma_check.body": "Introduces el γ medido empíricamente y la herramienta calcula <strong>η = θ_eff_obs / θ_eff_Padé</strong> y clasifica en uno de 5 regímenes:", | |
| "help.v06.case.normal": "<strong>Normal</strong> (η ∈ [0.85, 1.15]) — el modelo usa su contexto nominal completo. <em>Caso de uso</em>: validar un release nuevo antes de adoptarlo.", | |
| "help.v06.case.fraud": "<strong>Fraude</strong> (η < 0.01) — θ nominal inflado; el modelo se comporta como si θ ≪ del anunciado. <em>Caso de uso</em>: detectar inflación YaRN/marketing (patrón CodeLlama / Mistral-Nemo).", | |
| "help.v06.case.compressed": "<strong>Comprimido</strong> (η < 0.5) — contexto comprimido; el modelo atiende menos lejos que θ nominal. <em>Caso de uso</em>: detectar compresión por RLHF/instruction-tuning (patrón LLaMA-2).", | |
| "help.v06.case.overpade": "<strong>Sobre-Padé</strong> (η > 1.5) — el modelo atiende más lejos que Padé predice. <em>Caso de uso</em>: identificar régimen Lerch-corregido o checkpoints tempranos sub-entrenados (patrón pythia-1b).", | |
| "help.v06.case.swa": "<strong>SWA corpus aleatorio</strong> (γ_obs > 1.05 con corpus_aleatorio=Sí) — firma de sliding-window attention. <em>Caso de uso</em>: confirmar SWA en Mistral / Gemma sobre tokens random.", | |
| "help.v06.cardy.title": "Diagnóstico Cardy ΔH", | |
| "help.v06.cardy.body": "<strong>ΔH_Cardy = log(θ_eff_obs / θ_nominal)</strong>. Cambio de entropía entre el θ efectivo observado y el θ nominal. Negativo fuerte = entropía de compresión; cerca de cero = coincide con nominal. Complementa a η para casos borderline.", | |
| "help.v06.lean.title": "Badges de verificación Lean + Mathlib", | |
| "help.v06.lean.body": "Las identidades TAF están formalmente machine-proven en Lean Mathlib4: <strong>37 teoremas</strong> en 7 grupos (Padé, flujo RG, Cayley, D-SAGE, hallazgos de auditoría, erratum CV, misc) + <strong>1 hallazgo sustantivo</strong> (factor 2 en derivada V, teorema <code>V_derivative_ne_RG_beta</code>). Fuente: <a href=\"https://github.com/karlesmarin/lean-taf\" target=\"_blank\">github.com/karlesmarin/lean-taf</a> (commit 25c77fd). Re-verifica localmente: <code>git clone --depth=1 https://github.com/karlesmarin/lean-taf && cd lean-taf && lake exe cache get && lake env lean Taf/Identities.lean</code>. La pill 🧲 Anti-Ising del hero y la sección Verificación enlazan a líneas específicas del código fuente.", | |
| "help.v06.glossary.title": "Glosario de variables (también embebido en TAF Card)", | |
| "help.v06.glossary.body": "Cada variable del TAF Card tiene un tooltip ⓘ inline. Lista completa: γ, γ_Padé, γ_descompuesto, γ_observado, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, memoria KV, régimen. Pasa el ratón sobre cualquier ⓘ para la definición + sección del paper.", | |
| "hero.title": "🔬 TAF Agent", | |
| "hero.tagline": "Diagnostica cualquier LLM transformer en 30 segundos. Gratis. Sin GPU. Sin registro.", | |
| "hero.subtitle": "Predice si un modelo te servirá para tu caso de uso <em>antes</em> de gastar dinero o tiempo. Todo corre en tu navegador — tus datos nunca salen de esta pestaña.", | |
| "hero.help": "📘 Manual y ejemplos", | |
| "hero.quickstart_btn": "⚡ Inicio rápido", | |
| "hero.inventory_btn": "🧰 Qué te ofrece", | |
| "hero.about": "Construido por un investigador independiente. Código abierto. Sin afiliación con ningún proveedor de modelos.", | |
| "modes.title": "🎯 Modo", | |
| "modes.profile": "📇 Perfilar un modelo", | |
| "modes.compare": "🆚 Comparar modelos", | |
| "modes.inspector": "🔍 Inspeccionar config", | |
| "modes.ask": "💬 Pregunta libre", | |
| "modes.recipe": "📋 Elegir receta", | |
| "modes.diagnose": "🩺 Diagnóstico CLI", | |
| "diagnose.title": "🩺 Generador del comando Diagnose CLI", | |
| "diagnose.tip": "El navegador predice γ desde config; el CLI mide γ_obs sobre los pesos reales. Este generador produce el comando exacto para ejecutar localmente.", | |
| "diagnose.desc": "Elige opciones y copia-pega el comando generado en tu máquina local (Python + transformers + numpy). Modo rápido ≈5 min CPU; completo ≈20–60 min GPU.", | |
| "diagnose.model_label": "ID del modelo HF:", | |
| "diagnose.theta_label": "θ (auto si vacío):", | |
| "diagnose.n_label": "Contexto N:", | |
| "diagnose.options_label": "Opciones:", | |
| "diagnose.opt_fast": "--fast (CPU, ~5 min)", | |
| "diagnose.opt_cpu": "--cpu (forzar CPU)", | |
| "diagnose.opt_4bit": "--load_in_4bit (modelos ≥7B)", | |
| "diagnose.local_label": "--local path (opcional):", | |
| "diagnose.build_btn": "📋 Generar comando", | |
| "diagnose.cmd_title": "Comando generado:", | |
| "diagnose.copy_btn": "📋 Copiar al portapapeles", | |
| "diagnose.next_steps": "Siguientes pasos: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Ejecuta el comando (4) JSON resultado → subir vía modo Inspect para análisis TAF completo.", | |
| "modes.phase": "📊 Diagrama de fase", | |
| "phase.title": "📊 Diagrama de fase (γ × θ)", | |
| "phase.tip": "Cada punto es un modelo del panel empírico del paper. x: log θ; y: γ. La línea Hagedorn γ=1 separa Fase A de Fase B. Hover para detalles, click para cargar en el formulario.", | |
| "phase.desc": "23 modelos en el panel; curva Padé a T=2000.", | |
| "modes.desc": "<strong>Inicio rápido</strong>: pega cualquier id de modelo HuggingFace (ej. <code>meta-llama/Meta-Llama-3-8B</code>), click Perfilar. Verás las 5 recetas evaluadas en segundos.", | |
| "profile.title": "📇 Perfilar un modelo", | |
| "profile.desc": "<strong>Para técnicos</strong>: cuando necesitas una foto completa de viabilidad de un modelo candidato. Un click ejecuta las 5 recetas y produce una TAF Card unificada.", | |
| "profile.preset_label": "Preset:", | |
| "profile.preset_default": "— o elige de la lista —", | |
| "profile.hf_label": "ID modelo HF:", | |
| "profile.fetch_btn": "📥 Cargar", | |
| "profile.btn": "🚀 Generar perfil completo", | |
| "profile.quickstart": "💡 Inicio rápido: elige cualquier preset → click Generar. O pega un id desde <a href='https://huggingface.co/models?library=transformers&sort=trending' target='_blank'>HF Hub trending</a> → 📥 Cargar → Generar.", | |
| "compare.title": "🆚 Comparar modelos lado a lado", | |
| "compare.desc": "<strong>Para técnicos</strong>: cuando eliges entre 2-3 modelos candidatos para un escenario de despliegue específico. Misma receta, múltiples modelos, veredictos lado a lado.", | |
| "compare.recipe_label": "Receta:", | |
| "compare.T_eval_label": "T_eval (contexto objetivo):", | |
| "compare.models_title": "Modelos a comparar (hasta 3)", | |
| "compare.btn": "🚀 Comparar", | |
| "compare.example": "💡 Prueba: pega 3 modelos populares de 7-8B (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), receta X-2, T_eval=16000. Mira cuál maneja mejor contexto largo.", | |
| "ask.title": "❓ Tu pregunta", | |
| "ask.placeholder": "ej. ¿Mistral-7B aguanta 16K NIAH retrieval? O: Tengo 5,000$, ¿qué modelo puedo entrenar? O: ¿GPU más barato para servir Llama-70B a 100M tokens/día?", | |
| "ask.btn": "🚀 Analizar", | |
| "ask.example_btn": "💡 Probar ejemplo", | |
| "recipe.title": "📋 Receta", | |
| "recipe.default": "— elige una receta —", | |
| "recipe.input_title": "🎯 Entradas", | |
| "verdict.title": "📊 Veredicto", | |
| "chain.title": "🔍 Cadena de cálculo", | |
| "chain.desc": "Cada número de abajo es Python determinista. Click en un paso para expandir.", | |
| "answer.title": "💬 Respuesta en lenguaje natural", | |
| "share.btn": "🔗 Copiar link", | |
| "share.copied": "✅ ¡Copiado al portapapeles!", | |
| "share.download": "💾 Descargar JSON", | |
| "share.download_md": "📝 Markdown", | |
| "share.download_tex": "📜 LaTeX", | |
| "share.submit": "📤 Enviar al registry", | |
| "share.submit_clip_ok": "↗ GitHub abierto. Cuerpo copiado al portapapeles — pégalo en el cuerpo del issue.", | |
| "share.submit_clip_fail": "↗ GitHub abierto. Portapapeles bloqueado — cuerpo volcado en la consola del navegador (F12).", | |
| "share.import_title": "📂 Importar un resultado TAF compartido", | |
| "a11y.skip": "Saltar al contenido principal", | |
| // v0.6.2 — landing rework: quick start + inventory + architecture tooltips | |
| "qs.title": "⚡ Inicio rápido", | |
| "qs.step1": "Pega un model ID de HuggingFace (ej. <code>meta-llama/Meta-Llama-3-8B</code>)", | |
| "qs.step2": "Click en <strong>📇 Perfilar un modelo</strong>", | |
| "qs.step3": "Lee tu TAF Card — veredicto por caso de uso + números clave + matemáticas verificadas con Lean+Mathlib", | |
| "qs.cta": "↓ Empezar ahora", | |
| "inv.title": "🧰 Qué te ofrece esta herramienta", | |
| "inv.recipes.title": "🎯 8 recetas — ¿sirve este modelo para tu caso?", | |
| "inv.recipes.x1.title": "Entrenar propio vs API", | |
| "inv.recipes.x1.body": "¿cuál sale más barato para tu tráfico?", | |
| "inv.recipes.x2.title": "Contexto largo", | |
| "inv.recipes.x2.body": "¿aguanta 32k / 128k tokens de forma fiable?", | |
| "inv.recipes.x3.title": "Presupuesto", | |
| "inv.recipes.x3.body": "con $X, ¿qué modelo puedes entrenar desde cero?", | |
| "inv.recipes.x5.title": "Hardware", | |
| "inv.recipes.x5.body": "¿qué GPU para servir N tokens/día?", | |
| "inv.recipes.x19.title": "KV cache", | |
| "inv.recipes.x19.body": "¿cómo comprimir sin romper la calidad?", | |
| "inv.recipes.x21.title": "Pureza de imprint", | |
| "inv.recipes.x21.body": "¿cómo de limpia es la codificación posicional del modelo?", | |
| "inv.recipes.x22.title": "Compute-context", | |
| "inv.recipes.x22.body": "¿el modelo entra en la banda empírica?", | |
| "inv.recipes.x23.title": "Fase IH", | |
| "inv.recipes.x23.body": "¿pre- o post-induction-head?", | |
| "inv.diag.title": "🔬 Diagnósticos", | |
| "inv.diag.gamma": "<strong>γ predicho vs observado</strong> — auto-clasifica el modelo en 5 regímenes (normal · fraude / contexto inflado · comprimido · over-Padé · sliding-window)", | |
| "inv.diag.cardy": "<strong>Cardy ΔH</strong> — desplazamiento de entropía entre contexto observado y nominal", | |
| "inv.diag.fals": "<strong>Tabla de falsabilidad</strong> — comprueba 23 predicciones específicas (F1–F23)", | |
| "inv.diag.alg": "<strong>Consistencia algebraica</strong> — 8 identidades matemáticas que el modelo debe cumplir", | |
| "inv.verify.title": "✓ Matemáticas formalmente verificadas", | |
| "inv.verify.count": "<strong>37 teoremas</strong> machine-proven en Lean 4 + Mathlib4", | |
| "inv.verify.click": "Click en cualquier badge → abre la línea fuente en GitHub", | |
| "inv.verify.reverify": "Verifícalo tú: <code>lake build</code> (≈5 s tras cache)", | |
| "inv.export.title": "📤 Exportar y compartir", | |
| "inv.export.formats": "<strong>JSON · Markdown · LaTeX</strong> (listo para paper)", | |
| "inv.export.share": "Link reproducible (estado codificado en URL)", | |
| "inv.export.registry": "Envía al registro comunitario en GitHub", | |
| "arch.summary": "Arquitecturas soportadas", | |
| "arch.anyhf": "✓ Cualquier modelo público de HuggingFace", | |
| "tooltip.mha": "Multi-Head Attention: cada posición atiende mediante varios heads paralelos a la vez.", | |
| "tooltip.gqa": "Grouped Query Attention: las queries comparten menos keys/values que heads (ahorra memoria pero empuja γ hacia Hagedorn).", | |
| "tooltip.alibi": "Attention with Linear Biases: la info de posición es una pendiente aprendida añadida a los scores, sin rotación.", | |
| "tooltip.abspe": "Absolute Position Embeddings: cada posición tiene un vector fijo aprendido sumado al embedding del token.", | |
| "tooltip.swa": "Sliding Window Attention: cada token solo atiende dentro de una ventana local fija (Mistral, gemma-2 lo usan).", | |
| "tooltip.ssm": "State Space Model: capa de secuencia que mantiene estado interno en lugar de atención (Mamba, Jamba lo usan).", | |
| // v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker | |
| "modes.unmask": "🪟 Desenmascarar", | |
| "unmask.title": "🪟 Desenmascarador de contexto", | |
| "unmask.tip": "Pega un id de modelo HuggingFace (o config.json crudo). La herramienta detecta sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), y GQA — todo lo que hace que <code>max_position_embeddings</code> sea mayor que el contexto efectivo real. Mistral-7B-v0.1 es el ejemplo canónico: declara 32k, atiende dentro de ~4-8k.", | |
| "unmask.desc": "<strong>¿Estás a punto de gastar dinero en un modelo que en realidad no atiende tan lejos?</strong> Pega un id y descúbrelo en 1 segundo. Sin GPU, sin inferencia — solo aritmética sobre config.json.", | |
| "unmask.id_label": "ID modelo HF:", | |
| "unmask.fetch_btn": "🔍 Desenmascarar", | |
| "unmask.paste_summary": "O pega config.json crudo (modelos privados / en desarrollo)", | |
| "unmask.paste_btn": "🔍 Desenmascarar config pegado", | |
| "unmask.label.declared": "Contexto declarado", | |
| "unmask.label.effective": "Efectivo (estimado)", | |
| "unmask.label.ratio": "Ratio", | |
| "unmask.section.flags": "Banderas de arquitectura", | |
| "unmask.section.warnings": "Avisos", | |
| "unmask.section.reco": "Recomendación", | |
| "unmask.flag.swa": "SWA", | |
| "unmask.flag.rope": "RoPE scaling", | |
| "unmask.flag.gqa": "GQA", | |
| "unmask.flag.layers": "Capas", | |
| "unmask.flag.dhead": "d_head", | |
| "unmask.flag.theta": "RoPE θ", | |
| "unmask.flag.yes": "sí", | |
| "unmask.flag.no": "no", | |
| "unmask.flag.full_mha": "no (MHA completo, {n} heads)", | |
| "unmask.verdict.honest": "✅ HONESTO", | |
| "unmask.verdict.inflated": "⚠ INFLADO", | |
| "unmask.verdict.severely_inflated": "❌ GRAVEMENTE INFLADO", | |
| "unmask.verdict.yarn_extended": "⚠ YARN-EXTENDIDO", | |
| "unmask.verdict.unknown": "❓ DESCONOCIDO", | |
| "unmask.warn.swa_window": "Ventana SWA: {window} tokens — cada capa solo atiende dentro de esta ventana.", | |
| "unmask.warn.multihop": "Estimación multi-hop: ~{multiHop} tokens (conservador: ventana × {factor}).", | |
| "unmask.warn.yarn": "RoPE scaling ({type}) extiende contexto {factor}× desde ~{original} hasta {declared} tokens.", | |
| "unmask.warn.yarn_advice": "Contexto RoPE-extendido — verifica el comportamiento de γ a la longitud declarada con el diagnóstico γ_check.", | |
| "unmask.warn.gqa_small_dhead": "head dim pequeño ({d_head}) + GQA: probable compresión de KV cache a contexto largo (γ empujado hacia Hagedorn).", | |
| "unmask.reco.honest": "Modelo de atención completa estándar. Contexto efectivo coincide con declarado ({declared} tokens).", | |
| "unmask.reco.inflated": "Efectivo ~{effective} tokens vía SWA. Usa γ_check para verificar el comportamiento a tu longitud objetivo.", | |
| "unmask.reco.severely_inflated": "Trátalo como un modelo de ~{effective} tokens en la práctica. El claim de {declared} tokens solo aplica vía cadenas de atención cross-layer, que empíricamente degradan más allá de ~2× la ventana SWA.", | |
| "unmask.reco.yarn_extended": "Contexto RoPE-extendido. Corre un benchmark long-context (NIAH a 8k / 16k / 32k / full) para confirmar que la extensión se sostiene. Usa γ_check con T_eval = {declared}.", | |
| "unmask.reco.unknown": "No se pudo parsear el config. Verifica que la URL sea un modelo HF válido con config.json público.", | |
| "unmask.status.empty_id": "⚠ Introduce un model id (ej. mistralai/Mistral-7B-v0.1).", | |
| "unmask.status.fetching": "⏳ Obteniendo config.json para {modelId}...", | |
| "unmask.status.success": "✅ Analizado {modelId} (veredicto: {verdict})", | |
| "unmask.status.empty_paste": "⚠ Pega un config.json primero.", | |
| "unmask.status.invalid_json": "❌ JSON inválido: {error}", | |
| "unmask.status.success_paste": "✅ Config pegado analizado (veredicto: {verdict})", | |
| "unmask.pasted_label": "(config pegado)", | |
| "mode_desc.ask": "Escribe una pregunta libre. El LLM en el navegador elige la receta correcta y la ejecuta.", | |
| "mode_desc.recipe": "Selecciona una receta directamente y rellena el formulario. Control manual completo.", | |
| "mode_desc.profile": "Inicio más rápido: pega cualquier model id de HuggingFace, click Profile. Mira las 5 recetas en segundos.", | |
| "mode_desc.compare": "Elige 2-3 modelos candidatos + una receta. Ve veredictos lado a lado en tabla.", | |
| "mode_desc.inspector": "Pega un config.json directamente. Útil para modelos privados / en desarrollo no en HF Hub.", | |
| "mode_desc.diagnose": "Construye el comando CLI diagnose_model.py para MEDIR γ_obs en GPU real. El navegador predice; el CLI mide.", | |
| "mode_desc.phase": "Scatter γ × θ del panel empírico del paper. Hover sobre puntos para detalles, click para cargar en Diagnose / Recipe.", | |
| "mode_desc.unmask": "Detecta si max_position_embeddings es engañoso (SWA / YaRN / RoPE-scaling). Pega un model id, obtén un veredicto en 1 línea.", | |
| "profile.preset_loaded": "✅ Preset cargado para <strong>{id}</strong>. Formulario pre-rellenado. (Click 📥 Cargar para sobreescribir con el último config de HF Hub.)", | |
| // v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer | |
| "modes.template": "📜 Chat-template", | |
| "mode_desc.template": "Detecta qué familia de chat-template usa un modelo (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Da el flag CLI exacto para lm-eval / vLLM / transformers.", | |
| "template.title": "📜 Detector de Chat-template", | |
| "template.tip": "Pega un model id de HF (o tokenizer_config.json crudo). Detecta la familia del chat-template y te da el comando exacto para usarlo bien. lm-eval-harness divide la accuracy entre 2 silenciosamente si te olvidas de aplicarlo (issue #1841).", | |
| "template.desc": "<strong>¿Olvidaste <code>--apply_chat_template</code>?</strong> La mayoría de evals multi-turn fallan ~50% porque el chat template no se aplicó. Pega un model id, obtén el flag CLI exacto para tu stack.", | |
| "template.id_label": "ID modelo HF:", | |
| "template.fetch_btn": "📜 Detectar", | |
| "template.paste_summary": "O pega tokenizer_config.json crudo (modelos privados)", | |
| "template.paste_btn": "📜 Detectar config pegado", | |
| "template.label.family": "Familia detectada", | |
| "template.label.markers": "Marcadores coincidentes", | |
| "template.label.tpl_len": "Longitud template", | |
| "template.section.warnings": "Avisos", | |
| "template.section.commands": "Comandos por framework", | |
| "template.section.raw": "Template crudo (preview)", | |
| "template.family.custom": "custom (familia desconocida)", | |
| "template.family.none": "(sin chat_template)", | |
| "template.verdict.ok": "✅ TEMPLATE DETECTADO", | |
| "template.verdict.custom": "⚠ TEMPLATE CUSTOM", | |
| "template.verdict.missing": "❌ SIN CHAT TEMPLATE", | |
| "template.verdict.base_model": "ℹ MODELO BASE (sin chat)", | |
| "template.verdict.unknown": "❓ DESCONOCIDO", | |
| "template.warn.no_chat_template": "Sin campo <code>chat_template</code> en tokenizer_config.json. Típico de modelos base / pretrained. Si esperabas un modelo instruct-tuned, puede que el archivo cargado sea incorrecto.", | |
| "template.warn.custom_template": "Template no estándar ({length} chars). La herramienta no lo encajó en familias conocidas. Revisa el preview y verifica que tu framework de eval lo soporta.", | |
| "template.warn.lm_eval_apply": "<strong>lm-eval-harness:</strong> añade <code>--apply_chat_template</code> o tu accuracy bajará ~50% silenciosamente en evals multi-turn (issue #1841).", | |
| "template.warn.vllm_apply": "<strong>vLLM serve:</strong> verifica que <code>--chat-template</code> esté puesto (la auto-detección a veces falla en variantes fine-tuned). Sugerido: <code>{name}</code>.", | |
| "template.status.empty_id": "⚠ Introduce un model id (ej. mistralai/Mistral-7B-Instruct-v0.3).", | |
| "template.status.fetching": "⏳ Obteniendo tokenizer_config.json para {modelId}...", | |
| "template.status.success": "✅ Detectado {modelId} (veredicto: {verdict})", | |
| "template.status.empty_paste": "⚠ Pega un tokenizer_config.json primero.", | |
| "template.status.invalid_json":"❌ JSON inválido: {error}", | |
| "template.status.success_paste":"✅ Config pegado detectado (veredicto: {verdict})", | |
| "template.pasted_label": "(tokenizer_config pegado)", | |
| // v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor | |
| "modes.arena": "🎯 Arena CI", | |
| "mode_desc.arena": "Recupera intervalos de confianza desde datos crudos de votos pairwise (MLE Bradley-Terry + bootstrap). Detecta pares estadísticamente empatados que el leaderboard público de Arena oculta.", | |
| "arena.title": "🎯 Reconstructor Arena-Elo CI", | |
| "arena.tip": "Chatbot Arena oculta los intervalos de confianza en el leaderboard público. Una diferencia de 5 Elo puede ser estadísticamente irrelevante. Pega datos crudos de votos (model_a, model_b, winner) — la herramienta calcula MLE Bradley-Terry + bootstrap CIs y lista los empates estadísticos (overlap de CI).", | |
| "arena.desc": "<strong>¿GPT-4 es realmente mejor que Claude — o están empatados?</strong> Pega CSV de votos pairwise (o click <em>Cargar sample</em>). MLE Bradley-Terry + 200 iteraciones de bootstrap → Elos ranked con CIs 95% y detección de empates estadísticos. Todo en el navegador.", | |
| "arena.sample_btn": "📊 Cargar datos sample", | |
| "arena.run_btn": "🎯 Calcular CIs", | |
| "arena.clear_btn": "🗑️ Limpiar", | |
| "arena.csv_summary": "CSV de votos (header: <code>model_a,model_b,winner</code>; winner ∈ a/b/tie)", | |
| "arena.section.ranked": "Elos ranked con CIs 95%", | |
| "arena.section.ties": "Empates estadísticos (overlap CI)", | |
| "arena.section.summary": "Resumen", | |
| "arena.col.rank": "#", | |
| "arena.col.model": "Modelo", | |
| "arena.col.elo": "Elo", | |
| "arena.col.ci": "CI 95%", | |
| "arena.col.ci_width": "± semi-anchura", | |
| "arena.col.matches": "Partidas", | |
| "arena.col.wins": "V / D / E", | |
| "arena.col.tie_pair": "Par", | |
| "arena.col.tie_diff": "Brecha Elo", | |
| "arena.col.tie_overlap": "Overlap CI", | |
| "arena.no_ties": "Sin empates estadísticos — todos los pares distinguibles al CI 95%.", | |
| "arena.summary.votes": "Votos totales", | |
| "arena.summary.models": "Modelos", | |
| "arena.summary.ties": "Empates estadísticos", | |
| "arena.summary.bootstrap": "Iteraciones bootstrap", | |
| "arena.summary.ci_level": "Nivel CI", | |
| "arena.status.empty": "⚠ Pega un CSV de votos o click en Cargar sample.", | |
| "arena.status.too_few": "⚠ Solo {n} votos válidos — se necesitan al menos 10 para ajustar Bradley-Terry de forma fiable.", | |
| "arena.status.computing": "⏳ Calculando MLE Bradley-Terry + bootstrap sobre {n} votos...", | |
| "arena.status.done": "✅ {n} votos · {models} modelos · {ties} empates estadísticos · {ms} ms", | |
| "arena.status.sample_loaded": "✅ Sample cargado (datos sintéticos Arena de 6 modelos). Click en Calcular CIs.", | |
| // v0.7.3 — anti-bullshit pack #4: Contamination Prior | |
| "modes.contam": "🧪 Contaminación", | |
| "mode_desc.contam": "Prior bayesiano-ish sobre si un score de benchmark está contaminado. Introduce la fecha de cutoff de entrenamiento → puntúa 20+ benchmarks populares (MMLU, GSM8K, HumanEval, MMLU-Pro…).", | |
| "contam.title": "🧪 Prior de Contaminación", | |
| "contam.tip": "Calcula un prior bayesiano-ish sobre si un score de benchmark está contaminado, basado en (fecha de cutoff de entrenamiento) × (fecha de release del benchmark) × (inclusión conocida en corpus + historial de leaks). Open LLM Leaderboard v1 fue cancelado en 2024 tras la contaminación de MMLU/HellaSwag.", | |
| "contam.desc": "<strong>¿Deberías confiar en el MMLU de tu modelo?</strong> Introduce la fecha cutoff de entrenamiento — la herramienta puntúa 20+ benchmarks populares (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) y te dice qué scores son probablemente contaminados.", | |
| "contam.cutoff_label": "Cutoff entrenamiento:", | |
| "contam.run_btn": "🧪 Puntuar todos los benchmarks", | |
| "contam.section.ranked": "Priors de contaminación por benchmark", | |
| "contam.section.high": "🔴 Benchmarks de alto riesgo (trata los scores como no fiables)", | |
| "contam.section.medium": "🟡 Riesgo medio (verifica con alternativas)", | |
| "contam.section.low": "🟢 Bajo riesgo (probablemente limpios)", | |
| "contam.col.benchmark": "Benchmark", | |
| "contam.col.released": "Release", | |
| "contam.col.gap": "Gap (meses)", | |
| "contam.col.prior": "P(contam)", | |
| "contam.col.level": "Nivel", | |
| "contam.col.corpora": "En corpus", | |
| "contam.col.category": "Categoría", | |
| "contam.label.high": "Alto riesgo", | |
| "contam.label.medium": "Medio", | |
| "contam.label.low": "Bajo", | |
| "contam.no_entries": "(ninguno en esta categoría)", | |
| "contam.advice.high": "Trata estos scores como no fiables. Sustituye por alternativas más recientes / con test privado (MMLU-Pro, GPQA, MUSR, MATH-500).", | |
| "contam.advice.medium": "Toma con cautela. Busca replicación sobre subset held-out o reproducciones comunitarias.", | |
| "contam.advice.low": "Score probablemente no contaminado, pero ausencia de leak no es prueba — verifica también con test alternativo.", | |
| "contam.summary.headline": "Cutoff <code>{cutoff}</code> · {n} benchmarks puntuados", | |
| "contam.status.empty": "⚠ Introduce una fecha cutoff de entrenamiento (ej. 2023-12).", | |
| "contam.status.bad_date": "⚠ Formato de fecha incorrecto. Usa YYYY-MM o YYYY-MM-DD.", | |
| "contam.status.done": "✅ Cutoff {cutoff} · {n} benchmarks puntuados · {high} de alto riesgo", | |
| // v0.7 — Help modal section | |
| "help.v07.title": "🆕 v0.7 — Pack anti-bullshit (4 modos nuevos)", | |
| "help.v07.intro": "<em>v0.7 (2026-05-06): cuatro modos nuevos que resuelven problemas concretos reportados por la comunidad HuggingFace. Cada uno corre en tu navegador sin inferencia — pura metadata + matemáticas.</em>", | |
| "help.v07.unmask.title": "🪟 Desenmascarador de Contexto", | |
| "help.v07.unmask.body": "Detecta cuándo <code>max_position_embeddings</code> es engañoso. Mistral-7B-v0.1 declara 32k pero atiende dentro de ~4-8k vía SWA. Pega un id HF → veredicto en 1 segundo (HONESTO / INFLADO / GRAVEMENTE INFLADO / YARN-EXTENDIDO). Pilla SWA, RoPE-scaling (YaRN/linear/dynamic NTK), d_head pequeño + GQA. <em>Caso de uso</em>: antes de pagar GPU para 32k de contexto, verifica que el modelo realmente atiende tan lejos.", | |
| "help.v07.template.title": "📜 Detector de Chat-template", | |
| "help.v07.template.body": "Detecta qué familia de chat-template usa un modelo (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) y te da el flag CLI exacto para lm-evaluation-harness, vLLM, y transformers. Resuelve el issue #1841 de lm-eval-harness: olvidar <code>--apply_chat_template</code> divide la accuracy multi-turn por 2 silenciosamente. <em>Caso de uso</em>: antes de reportar un score, confirma que aplicaste el template correctamente.", | |
| "help.v07.arena.title": "🎯 Reconstructor Arena-Elo CI", | |
| "help.v07.arena.body": "Chatbot Arena oculta los intervalos de confianza en su leaderboard público — una diferencia de 5 Elo puede ser estadísticamente irrelevante. Pega datos crudos de votos pairwise (model_a, model_b, winner) → MLE Bradley-Terry + bootstrap de 200 iteraciones → Elos ranked con CIs 95% y un panel de \"empates estadísticos\" listando pares cuyos CIs se solapan. Prueba el botón Cargar sample. <em>Caso de uso</em>: antes de afirmar \"modelo A vence a modelo B\", verifica que sus CIs no se solapen.", | |
| "help.v07.contam.title": "🧪 Prior de Contaminación", | |
| "help.v07.contam.body": "Prior bayesiano-ish sobre si un score de benchmark está contaminado. Introduce la fecha cutoff de entrenamiento de tu modelo → la herramienta puntúa 20+ benchmarks populares (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) por P(contaminación) según gap temporal, inclusión en corpus y historial de leaks conocidos. Open LLM Leaderboard v1 fue cancelado en 2024 tras la contaminación de MMLU/HellaSwag. <em>Caso de uso</em>: decide qué scores te puedes creer al comparar dos modelos.", | |
| "help.v07.quant.title": "⚖️ Clasificador de régimen de cuantización", | |
| "help.v07.quant.body": "Predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización: NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8…). Arch-aware: d_head pequeño + GQA agresivo → más sensible; los esquemas calibrados (AWQ) absorben mejor el shift que los no calibrados (NF4). Recomienda alternativas más seguras si detecta cliff. <em>Caso de uso</em>: antes de cuantizar, predice si tu combo arquitectura × esquema mantendrá la PPL aceptable, con sugerencia concreta de switch si no.", | |
| "help.v07.drift.title": "🔀 Cota de drift entre frameworks", | |
| "help.v07.drift.body": "Mismo modelo, scores distintos en setups distintos. La herramienta predice el drift máximo admisible solo por ruido numérico (dtype, framework, batch). Si el gap observado lo excede → bug real, normalmente chat-template mismatch (issue #1841 de lm-eval-harness) o layout de KV-cache. Prueba el botón \"Cargar sample\" para el bug canónico de chat-template. <em>Caso de uso</em>: antes de reportar una regresión o reclamar reproducibilidad, verifica si el gap entre dos evals es mayor de lo que el ruido numérico puede explicar.", | |
| "inv.v07.drift": "<strong>🔀 Drift</strong> — ¿bug o ruido? Predice el gap máximo admisible entre dos evals", | |
| "help.v07.niah.title": "🔍 Gap NIAH → Reasoning", | |
| "help.v07.niah.body": "El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH (retrieval de needle) pero fallan reasoning multi-hop al mismo contexto. La herramienta predice ambas tasas de pass desde la arquitectura (γ_Padé + d_horizon + presión arq: d_head pequeño, GQA, SWA), reporta el gap, y encuentra el \"contexto seguro de reasoning\" donde reasoning se mantiene ≥65%. Modo barrido muestra la curva a 1k/4k/16k/64k/T_train. <em>Caso de uso</em>: antes de desplegar al contexto declarado, descubre si el modelo realmente razonará ahí o solo encontrará.", | |
| "inv.v07.niah": "<strong>🔍 NIAH→Reason</strong> — ¿tu \"128k\" realmente razona ahí, o solo encuentra?", | |
| // v0.7 — Inventory modal, 5th card | |
| "inv.v07.title": "🆕 Pack anti-bullshit v0.7", | |
| "inv.v07.unmask": "<strong>🪟 Unmask</strong> — ¿config.json declara 32k? Mira si de verdad atiende tan lejos", | |
| "inv.v07.template": "<strong>📜 Chat-template</strong> — flag CLI exacto para que lm-eval no divida tu accuracy entre 2 silenciosamente", | |
| "inv.v07.arena": "<strong>🎯 Arena CI</strong> — recupera los intervalos de confianza que Chatbot Arena oculta", | |
| "inv.v07.contam": "<strong>🧪 Contaminación</strong> — puntúa 20+ benchmarks por probabilidad de contaminación", | |
| "inv.v07.quant": "<strong>⚖️ Quant</strong> — predice γ-shift + ΔPPL para cualquier combo (modelo × esquema de cuantización)", | |
| // v0.7.3 — anti-bullshit pack #5: Quant-regime classifier | |
| "modes.quant": "⚖️ Quant", | |
| "mode_desc.quant": "Predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización). Arch-aware: d_head pequeño + GQA → más sensible. Recomienda alternativas más seguras si detecta cliff.", | |
| "quant.title": "⚖️ Clasificador de régimen de cuantización", | |
| "quant.tip": "Predice γ-shift (y la ΔPPL resultante) para un par (modelo × esquema). Claims genéricos como 'AWQ ~95% retención' son demasiado vagos — TAF usa d_head, ratio GQA, flag SWA y tamaño del modelo para dar veredicto arquitectura-específico. Resuelve: la comunidad HF reporta cliffs de cuantización impredecibles (NF4 -2 PPL en Phi-3 pero bien en Llama-3-8B).", | |
| "quant.desc": "<strong>¿Cuantizar romperá tu modelo?</strong> Pega un id HF, elige esquema de cuantización — obtén γ-shift predicho, banda ΔPPL esperada y alternativa recomendada si es un cliff. Solo navegador, sin GPU, sin set de calibración.", | |
| "quant.id_label": "ID modelo HF:", | |
| "quant.fetch_btn": "📥 Fetch config", | |
| "quant.scheme_label": "Esquema cuant:", | |
| "quant.run_btn": "⚖️ Predecir", | |
| "quant.all_btn": "📊 Comparar todos los esquemas", | |
| "quant.regime.safe": "✅ SEGURO", | |
| "quant.regime.mild": "✅ COMPRESIÓN LEVE", | |
| "quant.regime.significant": "⚠ DEGRADACIÓN SIGNIFICATIVA", | |
| "quant.regime.cliff": "❌ CLIFF FUERTE", | |
| "quant.label.gamma_shift": "γ shift", | |
| "quant.label.delta_ppl": "ΔPPL (est.)", | |
| "quant.label.arch_mult": "Multiplicador arch", | |
| "quant.section.breakdown": "Desglose", | |
| "quant.section.reco": "Recomendación", | |
| "quant.section.compare": "Todos los esquemas (ordenados por seguridad)", | |
| "quant.field.scheme": "Esquema", | |
| "quant.field.calibrated": "calibrado", | |
| "quant.field.uncalibrated": "no calibrado", | |
| "quant.field.base_penalty": "Penalización base", | |
| "quant.field.arch_mult_full": "Multiplicador arquitectónico", | |
| "quant.field.gamma_shift": "γ shift predicho", | |
| "quant.field.ppl_band": "Banda ΔPPL (est.)", | |
| "quant.field.params": "Parámetros", | |
| "quant.col.scheme": "Esquema", | |
| "quant.col.bits": "Bits", | |
| "quant.col.gamma_shift": "γ shift", | |
| "quant.col.ppl_band": "Banda ΔPPL", | |
| "quant.col.regime": "Régimen", | |
| "quant.reco.switch_to_awq": "<strong>Cambia a {scheme}</strong> — el 4-bit calibrado maneja d_head pequeño + GQA mucho mejor que NF4. ΔPPL esperada cae ~2-3×.", | |
| "quant.reco.switch_to_q5_km": "<strong>Cambia a {scheme}</strong> — Q5 mantiene más dimensiones de head intactas a bajo coste (solo ~25% más grande).", | |
| "quant.reco.switch_to_q4_km": "<strong>Cambia a {scheme}</strong> — Q3/Q2 son demasiado agresivos para esta arquitectura.", | |
| "quant.reco.consider_awq": "<strong>Considera {scheme}</strong> — la calibración reduce γ-shift significativamente en esta arquitectura.", | |
| "quant.reco.use_higher_bits": "<strong>Usa alternativa de mayor bit</strong> — esta arquitectura no absorbe 4-bit limpiamente. Prueba 5 u 8-bit.", | |
| "quant.reco.verify_with_eval": "<strong>Verifica con eval real</strong> — el shift predicho está en el límite. Corre NIAH a tu contexto objetivo antes de desplegar.", | |
| "quant.reco.no_action": "No requiere acción — la cuantización es segura para esta arquitectura.", | |
| "quant.summary.headline_all": "Todos los esquemas para <code>{modelId}</code>", | |
| "quant.status.empty_id": "⚠ Introduce un model id (ej. meta-llama/Llama-3.2-1B).", | |
| "quant.status.fetching": "⏳ Obteniendo config.json para {modelId}...", | |
| "quant.status.fetched": "✅ Config obtenido para {modelId}. Elige un esquema y click Predecir (o Comparar todos).", | |
| "quant.status.no_scheme": "⚠ Elige un esquema de cuantización del dropdown.", | |
| "quant.status.done": "✅ Régimen predicho: {regime}", | |
| "quant.status.done_all": "✅ Comparados {n} esquemas — ordenados por seguridad.", | |
| // v0.7.4 — autocomplete HF Hub: privacy + rate-limit | |
| "hf_auto.privacy": "🔒 Queries enviadas a huggingface.co/api · caché local 5 min", | |
| "hf_auto.rate_limited": "⚠ Rate limit de HuggingFace — espera un momento, o teclea el id completo manualmente", | |
| "hf_auto.gated_msg": "es gated. Acepta la licencia aquí:", | |
| // v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound | |
| "modes.drift": "🔀 Drift", | |
| "mode_desc.drift": "Predice el drift máximo permitido entre dos scores de benchmark dados (framework, dtype, batch, chat-template). Distingue bugs reales de ruido numérico.", | |
| "drift.title": "🔀 Cota de drift entre frameworks", | |
| "drift.tip": "Mismo modelo, scores distintos en setups distintos. ¿La diferencia es ruido o un bug real? Introduce dos scores con su (framework, dtype, batch, chat-template) — la herramienta predice el drift máximo permitido por ruido numérico solo. Si el gap observado lo excede → bug real, normalmente chat-template mismatch (issue #1841 de lm-eval) o layout de KV-cache.", | |
| "drift.desc": "<strong>Tu modelo da 67.2 en lm-eval-hf y 65.1 en vLLM-served. ¿Bug o ruido?</strong> Introduce ambos scores con (framework, dtype, batch, ¿chat-template aplicado?). La herramienta predice la banda de ruido y flagea bugs reales. arxiv 2506.09501 documenta esto como problema mayor de reproducibilidad de evals.", | |
| "drift.setup_a": "Setup A", | |
| "drift.setup_b": "Setup B", | |
| "drift.score": "Score", | |
| "drift.framework": "Framework", | |
| "drift.dtype": "Dtype", | |
| "drift.batch": "Batch", | |
| "drift.template": "Chat-template", | |
| "drift.template.applied": "aplicado", | |
| "drift.template.not_applied": "no aplicado", | |
| "drift.template.unknown": "desconocido", | |
| "drift.run_btn": "🔀 Calcular cota de drift", | |
| "drift.sample_btn": "📊 Cargar sample (bug de chat-template)", | |
| "drift.label.observed": "Gap observado", | |
| "drift.label.band": "Banda numérica", | |
| "drift.label.ratio": "Gap / banda", | |
| "drift.section.setups": "Setups", | |
| "drift.section.breakdown": "Contribuyentes al drift (banda numérica)", | |
| "drift.section.verdict": "Veredicto y recomendación", | |
| "drift.contrib.dtype": "Mismatch de dtype", | |
| "drift.contrib.framework": "Framework", | |
| "drift.contrib.batch": "Diferencia de batch", | |
| "drift.contrib.template": "MISMATCH de chat-template", | |
| "drift.dominant_cause": "Causa dominante", | |
| "drift.cause.dtype": "diferencia de precisión dtype", | |
| "drift.cause.framework": "diferencia de framework / kernel", | |
| "drift.cause.batch": "paths de normalización por batch", | |
| "drift.cause.template_mismatch": "chat-template aplicado en un lado pero no en el otro (patrón #1841 de lm-eval-harness — típico -50% en multi-turn)", | |
| "drift.verdict.noise": "✅ RUIDO NUMÉRICO", | |
| "drift.verdict.suspicious": "⚠ SOSPECHOSO — verifica", | |
| "drift.verdict.bug": "❌ BUG REAL — investiga", | |
| "drift.verdict.bug_template": "❌ BUG DE CHAT-TEMPLATE", | |
| "drift.reco.noise": "El gap encaja en la banda esperada de ruido numérico. No requiere acción; la diferencia es consistente con variación de framework/dtype/batch sola.", | |
| "drift.reco.suspicious": "El gap es 1–2× la banda predicha. Borderline — posible bug real. Intenta alinear el contribuyente dominante (ej. iguala framework o dtype) y re-testea.", | |
| "drift.reco.bug": "El gap es > 2× la banda predicha. Es un bug real. Inspecciona el contribuyente dominante — probablemente diferencia de tokenizer / chat-template / layout de KV-cache. Corre lm-eval-harness con <code>--apply_chat_template</code> y confirma.", | |
| "drift.reco.bug_template": "Mismatch de chat-template detectado. Es la causa más común de gaps grandes en evals (issue #1841 de lm-eval-harness). Re-corre el lado "no aplicado" con <code>--apply_chat_template</code> (o pon vLLM <code>--chat-template <name></code>) y re-testea.", | |
| "drift.status.empty_scores": "⚠ Introduce ambos scores.", | |
| "drift.status.done": "✅ Veredicto: {verdict}", | |
| "drift.status.sample_loaded": "✅ Sample cargado (bug canónico de chat-template). Click en Calcular cota de drift.", | |
| // v0.7.6 — anti-bullshit pack #7: NIAH → reasoning-gap predictor | |
| "modes.niah": "🔍 NIAH→Reason", | |
| "mode_desc.niah": "Predice tasas de pass de NIAH (retrieval) y reasoning multi-hop a cualquier contexto. Resuelve: modelos long-context pasan NIAH pero fallan reasoning al mismo contexto (paper RULER).", | |
| "modes.saturation": "📈 Saturación", | |
| "mode_desc.saturation": "Te dice si un benchmark sigue discriminando frontier models o ya está saturado (ej. MMLU 88-94% top, AIME 2025 ya 96-100%). Devuelve top-3 + veredicto + reemplazos recomendados.", | |
| "modes.hub": "🧭 Soluciones", | |
| "mode_desc.hub": "Mapa de cada problema documentado de LLM-eval → mode tafagent (si cubierto) + herramientas externas curadas. Encuentra la solución sin reinventarla. 30+ pains, 7 categorías.", | |
| "niah.title": "🔍 Gap NIAH → Reasoning", | |
| "niah.tip": "NIAH (Needle in a Haystack) testea retrieval: 'encuentra este hecho en texto largo'. Reasoning multi-hop testea inferencia: 'combina hechos X+Y del principio con hecho Z del final'. El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH pero fallan reasoning al mismo contexto. Esta herramienta predice ambas tasas desde la arquitectura sola.", | |
| "niah.desc": "<strong>Tu modelo dice 128k de contexto. ¿Razonará realmente a 64k, o solo encontrará?</strong> Pega un model id HF y un contexto objetivo — la herramienta predice tasas de pass NIAH y reasoning multi-hop, el gap, y un 'contexto seguro' donde reasoning se mantiene ≥65%.", | |
| "niah.id_label": "ID modelo HF:", | |
| "niah.fetch_btn": "📥 Fetch config", | |
| "niah.teval_label": "Contexto objetivo (T_eval):", | |
| "niah.run_btn": "🔍 Predecir", | |
| "niah.sweep_btn": "📊 Barrer contextos", | |
| "niah.label.niah": "Tasa pass NIAH", | |
| "niah.label.reasoning": "Tasa pass Reasoning", | |
| "niah.label.gap": "Gap", | |
| "niah.label.safe_ctx": "Contexto seguro de reasoning", | |
| "niah.section.breakdown": "Desglose arquitectónico", | |
| "niah.section.reco": "Recomendación", | |
| "niah.section.sweep": "Barrido de tasas pass por longitud de contexto", | |
| "niah.field.dhorizon": "d_horizon (efectivo)", | |
| "niah.field.ratio": "T_eval / d_horizon", | |
| "niah.field.arch_pressure": "Presión arq (d_head pequeño + GQA + SWA)", | |
| "niah.field.theta": "RoPE θ", | |
| "niah.field.t_train": "T_train (declarado)", | |
| "niah.col.context": "T_eval", | |
| "niah.col.niah": "NIAH", | |
| "niah.col.reasoning": "Reasoning", | |
| "niah.col.gap": "Gap", | |
| "niah.col.verdict": "Veredicto", | |
| "niah.verdict.robust": "✅ ROBUSTO", | |
| "niah.verdict.marginal": "⚠ MARGINAL", | |
| "niah.verdict.degraded": "⚠ DEGRADADO", | |
| "niah.verdict.retrieval_only": "❌ SOLO RETRIEVAL", | |
| "niah.verdict.broken": "❌ ROTO", | |
| "niah.reco.robust": "Tanto retrieval como reasoning aguantan a este contexto. Seguro para desplegar tareas de lookup e inferencia.", | |
| "niah.reco.marginal": "Borderline. Retrieval funciona pero reasoning está flojo. Úsalo para lookup, no para inferencia multi-paso.", | |
| "niah.reco.degraded": "Caída significativa de reasoning. El modelo encuentra hechos pero le cuesta combinarlos. Evita tareas multi-hop a esta longitud.", | |
| "niah.reco.retrieval_only": "Hallazgo canónico de RULER: el modelo pasa NIAH pero falla reasoning. Útil para setups RAG (donde el LLM solo localiza hechos) pero NO para inferencia encadenada. Reduce tu contexto al valor 'seguro' de abajo.", | |
| "niah.reco.broken": "El modelo falla incluso retrieval básico a este contexto. Trátalo como out-of-distribution — re-testea a contexto más corto.", | |
| "niah.safe_context": "≤ {ctx} tokens (reasoning ≥ 65%)", | |
| "niah.safe_context_none": "No se encontró contexto seguro bajo tu objetivo — el modelo falla reasoning incluso a contextos pequeños.", | |
| "niah.summary.sweep": "<code>{modelId}</code> — tasas pass por contexto", | |
| "niah.status.empty_id": "⚠ Introduce un model id (ej. meta-llama/Llama-3.1-8B-Instruct).", | |
| "niah.status.bad_teval": "⚠ Introduce un contexto objetivo (≥ 512 tokens).", | |
| "niah.status.fetching": "⏳ Obteniendo config.json para {modelId}...", | |
| "niah.status.fetched": "✅ Config obtenido para {modelId}. Pon T_eval y click Predecir (o Barrer contextos).", | |
| "niah.status.done": "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%", | |
| "niah.status.sweep_done": "✅ Barridos {n} largos de contexto.", | |
| "saturation.title": "📈 Detector de saturación de benchmarks", | |
| "saturation.tip": "MMLU está saturado (88-94% en todos los frontier). Reportar '92% en MMLU' ya no significa nada. Esta herramienta te dice qué benchmarks aún discriminan frontier models, cuáles están saturados, y qué usar en su lugar. Datos: DemandSphere AI Frontier Tracker (CC BY-NC 4.0) refrescado 2026-05.", | |
| "saturation.desc": "<strong>¿Sigue siendo útil tu benchmark?</strong> Elige un benchmark para ver top-3 frontier scores, spread, y un veredicto (saturated / near-saturated / discriminative) + reemplazos recomendados.", | |
| "saturation.select_label": "Benchmark:", | |
| "saturation.select.all": "— mostrar todos los benchmarks —", | |
| "saturation.run_btn": "📈 Clasificar", | |
| "saturation.all_btn": "📊 Mostrar todos", | |
| "saturation.col.spread": "Spread top-3", | |
| "saturation.col.mean": "Media top-3", | |
| "saturation.col.n": "Modelos", | |
| "saturation.col.bench": "Benchmark", | |
| "saturation.col.verdict": "Veredicto", | |
| "saturation.col.reco": "Mejor reco", | |
| "saturation.col.model": "Modelo", | |
| "saturation.col.score": "Score", | |
| "saturation.section.top3": "Top-3 frontier scores", | |
| "saturation.section.recommendations": "Alternativas recomendadas", | |
| "saturation.section.note": "Notas", | |
| "saturation.section.all": "Todos los benchmarks rastreados", | |
| "saturation.verdict.saturated": "🚨 SATURADO", | |
| "saturation.verdict.near_saturated": "⚠ CASI SATURADO", | |
| "saturation.verdict.discriminative": "✅ DISCRIMINATIVO", | |
| "saturation.verdict.sparse_data": "ℹ DATOS ESCASOS", | |
| "saturation.borderline": "Borderline — dentro de ±1pp de un umbral. Trata el veredicto como 'verifica con cuidado'.", | |
| "saturation.unknown": "Benchmark desconocido.", | |
| "saturation.attribution": "Datos: DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (histórico open-weight) · último fetch 2026-05-05.", | |
| "saturation.status.live": "✅ Datos en vivo cargados — {count} modelos.", | |
| "saturation.status.baked": "ℹ Usando snapshot baked (fetch en vivo no disponible).", | |
| "saturation.status.kb_fail": "⚠ No se pudo cargar el KB de saturación.", | |
| "saturation.status.done": "✅ {name} — {verdict}", | |
| "saturation.status.all_done": "✅ Clasificados {n} benchmarks.", | |
| "help.v08.saturation.title": "📈 Detector de saturación de benchmarks", | |
| "help.v08.saturation.body": "MMLU está saturado (top 88-94%), AIME 2025 saturó a los pocos meses de salir, HumanEval near-saturated. Elige cualquier benchmark y la herramienta retorna top-3 frontier scores, spread, media, y un veredicto — saturated / near-saturated / discriminative — más un reemplazo recomendado (ej. MMLU → MMLU-Pro / GPQA / HLE). Fetch en vivo desde DemandSphere AI Frontier Tracker (CC BY-NC 4.0) cuando llega; snapshot baked 2026-05-05 cuando no. <em>Caso de uso</em>: antes de citar '92% en MMLU' o diseñar una eval, verifica si el benchmark aún discrimina algo.", | |
| "inv.v08.saturation": "<strong>📈 Saturation</strong> — ¿sigue siendo útil tu benchmark, o están todos los frontiers empatados arriba?", | |
| "inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — cada pain documentado mapeado a un mode tafagent o herramienta externa curada. No reinventes — encuentra.", | |
| "help.v081.hub.title": "🧭 Solutions Hub", | |
| "help.v081.hub.body": "tafagent como integrador, no silo. 30+ pains en 7 categorías (eval reliability · diagnósticos · setup · training · retrieval · multimodal · observability), cada uno mapeado a (a) el mode tafagent que lo resuelve, si existe, y (b) las herramientas externas best-of-breed que la comunidad ya usa (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Caja de búsqueda matchea pain, scenario, y nombre de herramienta. <em>Caso de uso</em>: 'tengo problema X — ¿lo resuelve tafagent, y si no, quién?'", | |
| "hub.title": "🧭 Solutions Hub", | |
| "hub.tip": "Mapa de cada pain de LLM-eval documentado: qué mode tafagent lo resuelve (si alguno), y las herramientas externas best-of-breed que la comunidad ya usa. Objetivo: cobertura total. Si la herramienta canónica existe en otra parte, enlazamos en vez de rebuildear.", | |
| "hub.desc": "<strong>No reinventes — encuentra.</strong> 30+ pains mapeados a modes tafagent + herramientas externas curadas. Navega por categoría, busca por keyword, o ve los huecos donde nuevos modes ayudarían más.", | |
| "hub.clear_btn": "✕ Limpiar", | |
| "hub.no_mode": "externo", | |
| "hub.planned": "planeado:", | |
| "hub.best_for": "Mejor para", | |
| "hub.not_for": "No para", | |
| "hub.tools": "Herramientas externas", | |
| "hub.status.loaded": "✅ Cargados {total} pains en {categories} categorías — {covered} cubiertos por modes tafagent, {externalLinks} enlaces externos curados. Compilado {compiled}.", | |
| "hub.status.fail": "⚠ No se pudo cargar Solutions Hub.", | |
| "hub.search.empty": "Sin coincidencias para '{query}'. Prueba términos más amplios (ej. 'eval', 'rag', 'tokenizer').", | |
| "hub.search.results": "Encontradas {n} coincidencia(s) para '{query}'.", | |
| // v0.7.7 — Task tiles (UX restructure: 14 modes grouped by intent) | |
| "tiles.title": "🎯 ¿Qué quieres hacer?", | |
| "tiles.subtitle": "Elige una tarea. Cada una abre la herramienta adecuada debajo. O baja para la lista completa de 14 modos.", | |
| "tile.diagnose.title": "🔬 Diagnosticar un modelo", | |
| "tile.diagnose.desc": "¿Servirá este modelo concreto para mi caso de uso?", | |
| "tile.trust.title": "✓ Confiar en un score de benchmark", | |
| "tile.trust.desc": "¿Me creo este número? ¿Es bug o ruido?", | |
| "tile.eval.title": "⚙️ Configurar bien una eval", | |
| "tile.eval.desc": "Obtén el flag CLI exacto para lm-eval / vLLM / transformers.", | |
| "tile.compare.title": "🆚 Comparar modelos", | |
| "tile.compare.desc": "Lado a lado, o explora el panel empírico de modelos.", | |
| "tile.manual.title": "📋 Manual / libre", | |
| "tile.manual.desc": "Elige una receta concreta a mano, o pregunta en inglés llano.", | |
| "tile.diagnose.tip": "Empieza aquí cuando tengas un id de modelo concreto y quieras diagnóstico completo: <strong>Profile</strong> corre las 5 recetas a la vez. <strong>Unmask</strong> comprueba si max_position_embeddings es honesto. <strong>NIAH→Reason</strong> predice el gap retrieval-vs-reasoning. <strong>Quant</strong> predice si cuantizar lo romperá. <strong>Inspect</strong> permite pegar config.json crudo para modelos privados / en desarrollo.", | |
| "tile.trust.tip": "Cuando ves un score y quieres saber si es real. <strong>Contamination</strong> puntúa 20+ benchmarks por probabilidad de que el modelo los viera en entrenamiento. <strong>Drift</strong> te dice si el gap entre dos evals es ruido numérico o bug real (chat-template mismatch, layout KV-cache, etc.). <strong>Arena CI</strong> reconstruye los intervalos de confianza que Chatbot Arena oculta — muchas "victorias" top-Elo están estadísticamente empatadas.", | |
| "tile.eval.tip": "Antes de correr lm-eval-harness o vLLM serve, obtén el flag CLI correcto. <strong>Chat-template Sniffer</strong> detecta la familia de template (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) y emite la invocación exacta <code>--apply_chat_template</code> / <code>--chat-template</code>. Resuelve el issue #1841 de lm-eval-harness (÷2 accuracy silencioso). <strong>Diagnose CLI</strong> genera el comando Python para medir γ_obs en tu GPU local.", | |
| "tile.compare.tip": "<strong>Compare</strong>: elige 2-3 modelos candidatos + una receta, ve veredictos en tabla lado a lado (ej. Llama-3-8B vs Mistral-7B a 32k). <strong>Phase diagram</strong>: scatter de 23 modelos empíricos en el plano (log θ, γ), con la curva Padé superpuesta. Hover puntos para detalles, click para cargar ese modelo en la Recipe form.", | |
| "tile.manual.tip": "<strong>Recipe</strong>: elige una receta X-N específica (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 compresión KV, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) y rellena la form a mano para control total. <strong>Ask</strong>: escribe una pregunta libre; un LLM 0.5B (Qwen2.5) en tu navegador elige la receta correcta y la ejecuta. Ideal para exploración "qué pasaría si...".", | |
| "share.import_desc": "¿Tienes un fichero JSON del análisis TAF de alguien? Cárgalo aquí para ver el veredicto + cadena localmente. La misma vista que si lo hubieras ejecutado tú.", | |
| "share.import_btn": "📂 Cargar JSON compartido", | |
| "synthesis.system": "Eres un asistente de diagnóstico preciso para LLMs transformer. Dados resultados de fórmulas TAF pre-calculados, escribe un resumen claro en español de 4-6 frases. Cita el número de sección (§X.Y) para cada número que menciones. Da siempre una recomendación concreta. NO inventes números.", | |
| // INSPECTOR mode | |
| "inspector.title": "🔍 Inspector de Arquitectura", | |
| "inspector.desc": "Pega el contenido crudo de <code>config.json</code>. La herramienta extrae los parámetros arquitectónicos y ejecuta el Profile completo de 5 recetas.", | |
| "inspector.tip": "<strong>Pega cualquier config.json directamente</strong>. La herramienta lo parsea y ejecuta el Profile completo. Útil para: modelos privados, configs en desarrollo, modelos aún no en HuggingFace, o comparar qué haría tu arquitectura custom.", | |
| "inspector.quickstart": "💡 Caso de uso: tienes un modelo privado no en HF Hub, o una config que estás diseñando. Pega el JSON crudo abajo y obtén un perfil TAF completo.", | |
| "inspector.placeholder": "{\n \"model_type\": \"llama\",\n \"rope_theta\": 500000,\n \"max_position_embeddings\": 8192,\n \"num_attention_heads\": 32,\n \"num_key_value_heads\": 8,\n \"hidden_size\": 4096,\n \"num_hidden_layers\": 32\n}", | |
| "inspector.T_eval": "T_eval (tu contexto objetivo):", | |
| "inspector.btn": "🚀 Inspeccionar y perfilar", | |
| // WHAT-IF slider | |
| "whatif.title": "🎚 What-if: arrastra T_eval para ver γ cambiar en vivo", | |
| "whatif.desc": "Recálculo puro JS (sin llamada Pyodide). Muestra γ_Padé y d_horizon geométricos mientras deslizas. Click en el botón para re-ejecutar la cadena completa.", | |
| "whatif.T_eval": "<strong>T_eval</strong>", | |
| "whatif.gamma_pade": "<strong>γ_Padé</strong>", | |
| "whatif.d_horizon": "<strong>d_horizon</strong>", | |
| "whatif.l_niah": "<strong>Techo L_NIAH</strong>", | |
| "whatif.predicted": "<strong>Veredicto geométrico predicho</strong>", | |
| "whatif.rerun": "↻ Re-calcular cadena completa con este T_eval", | |
| // COMMUNITY feed | |
| "community.title": "🌐 Envíos recientes de la comunidad", | |
| "community.desc": "Feed en vivo del registry público. Click en cualquier envío para ver análisis completo.", | |
| "community.browse_all": "Ver todo →", | |
| "community.loading": "Cargando...", | |
| "community.no_repo": "El repo del registry aún no está creado. Cuando exista con envíos, aparecerán aquí en vivo.", | |
| "community.no_submissions": "Sin envíos aún. Sé el primero — genera un Profile y click 📤 Enviar al registry.", | |
| // FALSIFICATION dashboard | |
| "falsification.title": "🔬 Predicciones del paper — estado de falsificación", | |
| "falsification.desc": "El framework TAF se basa en predicciones falsificables (F1-F23). Cada una está empíricamente testada. Aquí está el estado en vivo de cada predicción del paper.", | |
| "falsification.summary": "{confirmed} confirmadas · {partial} parciales · {refuted} refutadas · {untested} sin testear (de {total} predicciones totales)", | |
| "falsification.col.id": "ID", | |
| "falsification.col.claim": "Claim", | |
| "falsification.col.status": "Estado", | |
| "falsification.col.evidence": "Evidencia", | |
| "tafcard.title": "📇 TAF Card — perfil completo del modelo", | |
| "tafcard.recipes_title": "📋 Recetas — veredicto por dimensión", | |
| "tafcard.recipes_count_label": "dimensiones", | |
| "tafcard.numbers_title": "🔢 Números clave (paper §26)", | |
| "tafcard.fals_title": "🔬 Estado de falsificación (F1-F23)", | |
| "tafcard.fals_none": "Sin falsificaciones aplicables.", | |
| "tafcard.diag_title": "🔬 Diagnósticos — números · γ check · what-if", | |
| "tafcard.verify_title": "✓ Verificación — Lean + Sage + falsificación", | |
| "tafcard.share_title": "📂 Procedencia y compartir", | |
| "tafcard.whatif_title": "🎚️ Explorador what-if", | |
| "verdict.go": "ADELANTE", | |
| "verdict.no": "NO", | |
| "verdict.degraded": "DEGRADADO", | |
| "compare.title_out": "🆚 Tabla comparativa", | |
| "status.loading_pyodide": "⏳ Cargando runtime Python (~10MB, solo primera vez)...", | |
| "status.loading_taf": "⏳ Cargando fórmulas TAF + recetas...", | |
| "status.ready": "✅ Listo. Elige un modelo y click Perfilar para empezar.", | |
| "status.computing": "🧮 Calculando cadena TAF...", | |
| "status.done": "✅ Hecho.", | |
| "profile.hf_placeholder": "ej. meta-llama/Meta-Llama-3-8B o Qwen/Qwen2.5-7B", | |
| "compare.hf_placeholder": "ID modelo HF (ej. meta-llama/Meta-Llama-3-8B)", | |
| "compare.slot1_placeholder": "ID modelo HF (ej. meta-llama/Meta-Llama-3-8B)", | |
| "compare.slot2_placeholder": "ID modelo HF #2", | |
| "compare.slot3_placeholder": "ID modelo HF #3 (opcional)", | |
| "compare.preset_default": "— o preset —", | |
| // Form parameters | |
| "param.theta": "θ (rope_theta)", | |
| "param.theta.tip": "<strong>Frecuencia base RoPE</strong> de <code>config.rope_theta</code>. Mayor = más capacidad de largo alcance.", | |
| "param.T_train": "T_train", | |
| "param.T_train.tip": "<strong>Contexto máximo de entrenamiento</strong>. De <code>max_position_embeddings</code>. Más allá es extrapolación.", | |
| "param.T_eval": "T_eval (tu objetivo)", | |
| "param.T_eval.tip": "<strong>Tu contexto de inferencia objetivo</strong>. La pregunta clave: ¿se comportará bien el modelo a ESTA longitud?", | |
| "param.n_attn": "n_attention_heads", | |
| "param.n_attn.tip": "<strong>Número de attention heads</strong> por capa. De <code>num_attention_heads</code>.", | |
| "param.n_kv": "n_kv_heads", | |
| "param.n_kv.tip": "<strong>KV heads</strong>. Si < n_attention_heads → GQA (Grouped Query Attention). Reduce memoria KV pero empuja γ hacia Hagedorn.", | |
| "param.d_head": "head_dim", | |
| "param.d_head.tip": "<strong>Dimensión por head</strong>. Típico 64, 96, 128. De <code>head_dim</code> o <code>hidden_size / num_attention_heads</code>.", | |
| "param.n_layers": "n_layers", | |
| "param.n_layers.tip": "<strong>Número de bloques transformer</strong>. De <code>num_hidden_layers</code>.", | |
| "param.n_params": "n_params (ej. 8e9)", | |
| "param.n_params.tip": "<strong>Número total de parámetros</strong>. Umbral ~400M para emergencia de induction heads. Afecta memoria KV y recipes de presupuesto.", | |
| "param.has_swa": "¿Tiene SWA?", | |
| "param.has_swa.tip": "<strong>Sliding Window Attention</strong>. <code>true</code> para Mistral, gemma-2, phi-3. El audit de calibración v0.5.3 desactivó la corrección histórica δ_SWA (ajuste n=1).", | |
| "common.yes": "Sí", | |
| "common.no": "No", | |
| // Mode tooltips | |
| "modes.tip": "<strong>Catorce formas de usar la herramienta</strong>.<br><strong>📇 Perfil</strong>: pega un id → TAF Card de 5 recetas.<br><strong>🆚 Comparar</strong>: 2-3 modelos lado a lado en una receta.<br><strong>🔍 Inspeccionar config</strong>: pega config.json crudo → Perfil completo.<br><strong>💬 Pregunta</strong>: pregunta libre, el LLM del navegador elige la receta.<br><strong>📋 Receta</strong>: selección manual con control total del formulario.<br><strong>🩺 Diagnóstico CLI</strong>: genera comando Python para medir γ localmente.<br><strong>📊 Diagrama de fase</strong>: panel de 23 modelos en plano (log θ, γ).<br><strong>🪟 Desenmascarar</strong>: detecta max_position_embeddings engañoso (SWA / YaRN / RoPE-scaling).<br><strong>📜 Chat-template</strong>: detecta familia + da el flag CLI exacto para lm-eval / vLLM / transformers.<br><strong>🎯 Arena CI</strong>: reconstruye intervalos de confianza desde votos pairwise crudos; detecta empates estadísticos que Arena oculta.<br><strong>🧪 Contaminación</strong>: puntúa 20+ benchmarks por probabilidad de contaminación según cutoff de entrenamiento vs fecha de release.<br><strong>⚖️ Quant</strong>: predice γ-shift y ΔPPL para cualquier (modelo × esquema de cuantización); recomienda alternativa segura si hay cliff.<br><strong>🔀 Drift</strong>: mismo modelo, scores distintos en dos setups — ¿bug o ruido? Predice banda de ruido numérico y flagea bugs reales.<br><strong>🔍 NIAH→Reason</strong>: predice tasas pass NIAH y reasoning multi-hop desde arquitectura; encuentra el contexto seguro de reasoning.", | |
| "profile.tip": "<strong>Diagnóstico completo en un click</strong>. Pega cualquier id de modelo HF (o elige preset). La herramienta ejecuta las 5 recetas (contexto largo, compresión KV, custom vs API, presupuesto, hardware) y produce una única <strong>TAF Card</strong> con veredicto por dimensión + números clave + clasificación arquitectónica.<br><br><strong>Caso de uso</strong>: \"Estoy evaluando Qwen2.5-32B para producción — ¿cuál es su perfil completo de viabilidad?\" → pega id → Perfilar → listo.", | |
| "compare.tip": "<strong>Misma receta, múltiples modelos</strong>. Elige 2-3 modelos candidatos y una receta. Ve los veredictos en una única tabla comparativa.<br><br><strong>Caso de uso</strong>: \"Necesito recuperación de contexto largo a 16K — ¿cuál es mejor: Llama-3-8B, Mistral-7B o Qwen-7B?\" → elige 3 + X-2 + 16K → ve el ganador.", | |
| // Help modal | |
| "help.title": "📘 TAF Agent — Manual de Usuario", | |
| "help.what.title": "¿Qué hace?", | |
| "help.what.body": "Predice la <strong>viabilidad práctica</strong> de cualquier LLM transformer <em>antes de gastar GPU/€</em>. Responde preguntas como \"¿funcionará este modelo a L=32K?\" o \"¿debería entrenar custom o usar API?\" usando fórmulas Python deterministas (TAF — Thermodynamic Attention Framework).", | |
| "help.modes.title": "Cómo usar — 7 modos", | |
| "help.modes.profile": "<strong>📇 Perfilar</strong>: pega id de modelo → todas las recetas a la vez = TAF Card. <strong>Mejor punto de inicio</strong>.", | |
| "help.modes.compare": "<strong>🆚 Comparar</strong>: 2-3 modelos lado a lado en la misma receta. Mejor al elegir entre candidatos.", | |
| "help.modes.inspector": "<strong>🔍 Inspeccionar config</strong>: pega <code>config.json</code> crudo → la herramienta lo parsea y ejecuta el Perfil completo. Para modelos privados, configs en desarrollo, o modelos aún no en HF Hub.", | |
| "help.modes.ask": "<strong>💬 Pregunta libre</strong>: pregunta en lenguaje natural, el LLM del navegador elige la receta. Mejor para exploración casual.", | |
| "help.modes.recipe": "<strong>📋 Receta + formulario</strong>: selección manual, control total de parámetros. Mejor cuando quieres control exacto.", | |
| "help.modes.diagnose": "<strong>🩺 Diagnóstico CLI</strong>: genera comando Python para medir γ en tu máquina local (transformers + numpy). Rápido ≈5 min CPU; completo ≈20–60 min GPU. JSON resultado re-subible por Inspect.", | |
| "help.modes.phase": "<strong>📊 Diagrama de fase</strong>: scatter de 23 modelos del panel en plano (log θ, γ). Línea Hagedorn γ=1 separa Fase A de Fase B. Click en un punto para cargar ese modelo en el formulario de Receta.", | |
| "help.recipes.title": "Las 8 recetas disponibles", | |
| "help.recipe.x1.title": "<strong>X-1 Entrenamiento custom vs API</strong> — compara coste de entrenar tu propio modelo vs pagar API.", | |
| "help.recipe.x1.example": "Prueba: <em>\"¿Entrenar 8B custom o usar GPT-4o para 50M tokens/mes?\"</em><br>Respuestas: SÍ (custom) / NO (API) con meses para break-even.", | |
| "help.recipe.x2.title": "<strong>X-2 Viabilidad contexto largo</strong> — predice si un modelo sirve longitud objetivo de manera fiable.", | |
| "help.recipe.x2.example": "Prueba: <em>\"¿Meta-Llama-3-8B maneja 32000 tokens para retrieval?\"</em><br>Cadena: γ_Padé → descomposición → d_horizon → techo NIAH → alucinación → memoria KV.<br>Veredicto: SÍ / DEGRADADO / NO con mitigación si hace falta.", | |
| "help.recipe.x3.title": "<strong>X-3 Pre-flight presupuesto</strong> — dado un presupuesto $, ¿qué modelo es viable entrenar?", | |
| "help.recipe.x3.example": "Prueba: <em>\"Tengo $5000, ¿qué modelo puedo entrenar?\"</em><br>Respuesta: GO / TINY-MODEL / MEMORY-LIMITED con N (params) y D (tokens) concretos.", | |
| "help.recipe.x5.title": "<strong>X-5 Selección hardware</strong> — ¿qué GPU usar para servir al throughput objetivo?", | |
| "help.recipe.x5.example": "Prueba: <em>\"Hardware más barato para servir Llama-3-8B a 10M tokens/día\"</em><br>Respuesta: mejor GPU + $/Mtok + capacidad vs objetivo.", | |
| "help.recipe.x19.title": "<strong>X-19 Decisión compresión KV</strong> — ¿usar soft decay, hard cutoff, o métodos de literatura?", | |
| "help.recipe.x21.title": "<strong>X-21 Diagnóstico Pureza Imprint</strong> — predice γ sobre tokens RANDOM via ν=−1/(2π); ¿cuán limpia es la predicción RoPE del modelo?", | |
| "help.recipe.x22.title": "<strong>X-22 Invariante Compute-Context</strong> — ¿γ × log(N²·D) está en banda 51.2 ± 16.8? Detecta anomalías de scaling/training.", | |
| "help.recipe.x23.title": "<strong>X-23 Detector Fase IH</strong> — ¿pre- o post-induction-head? Probe barato via sign(γ_text − γ_random).", | |
| "help.recipe.x21.example": "Prueba: <em>«¿Cuán limpia es la predicción RoPE en Llama-3-8B?»</em><br>Respuesta: γ_random predicho + diagnóstico (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).", | |
| "help.recipe.x22.example": "Prueba: <em>«¿Mistral-7B entra en el invariante compute-context?»</em><br>Respuesta: K = γ·log(N²·D), z-score, IN-BAND u OUTLIER.", | |
| "help.recipe.x23.example": "Prueba: <em>«¿Qwen2.5-7B es post-induction-head?»</em><br>Respuesta: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY (chequeo consistencia tamaño vs Δγ).", | |
| "help.section.v04": "<strong>Novedades v0.4</strong> (hallazgos sesión 29 del 2026-04-28): tres recipes diagnósticas derivadas del análisis panel cross-model (n=22 LLMs).", | |
| "help.divider.v04_s29": "— v0.4 (hallazgos sesión 29) —", | |
| "footer.tech_stack": "Cómputo: Pyodide · Síntesis: WebLLM (Qwen2.5-0.5B local) · Hosting: GitHub Pages · Coste: $0", | |
| "help.v04.imprint": "<strong>Slope imprint aprendido ν = −1/(2π)</strong>: el periodo de rotación RoPE 2π provoca un sesgo posicional en los pesos, proporcional a log(N_params). Incluso tokens random muestran este scaling. ν es DERIVADO — no ajustado (err empírico 0.3%).", | |
| "help.v04.invariant": "<strong>Invariante Chinchilla-atención K</strong>: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Conecta compute scaling y exponente de atención en un solo número adimensional.", | |
| "help.v04.ih_probe": "<strong>Δγ como probe IH</strong>: sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Más barato que correr un benchmark in-context-learning.", | |
| "help.v04.constants": "<strong>γ-cluster en constantes famosas</strong> (intrigante, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (conjugado áureo, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat: podría ser coincidencia.", | |
| "help.recipe.x19.example": "Prueba: <em>\"¿Cómo comprimir caché KV para Qwen2.5-7B a 32K?\"</em><br>Respuesta: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.", | |
| "help.param.theta": "<strong>θ (rope_theta)</strong>: frecuencia base RoPE. Mayor = más capacidad de largo alcance. Típico: 10000 (modelos antiguos), 500000 (Llama-3), 1000000 (Qwen2.5).", | |
| "help.param.T_train": "<strong>T_train</strong>: contexto máximo que vio el modelo durante entrenamiento. De <code>max_position_embeddings</code>.", | |
| "help.param.T_eval": "<strong>T_eval</strong>: <em>tu</em> longitud de contexto objetivo en inferencia. La perilla clave.", | |
| "help.param.gqa": "<strong>n_kv_heads < n_attention_heads</strong>: el modelo usa GQA (Grouped Query Attention). Reduce memoria KV pero empuja γ hacia Hagedorn.", | |
| "help.param.swa": "<strong>has_SWA</strong>: el modelo usa Sliding Window Attention (Mistral, gemma-2).", | |
| "help.param.nparams": "<strong>n_params</strong>: número total de parámetros. Umbral ~400M para emergencia de induction heads.", | |
| "help.add_models.title": "Añadir nuevos modelos (3 maneras)", | |
| "help.add_models.preset": "<strong>Lista de presets</strong>: 11 modelos populares curados. Selecciona del dropdown.", | |
| "help.add_models.hf": "<strong>HF Hub fetch</strong>: pega cualquier id (ej. <code>Qwen/Qwen2.5-32B-Instruct</code>), click 📥 Cargar. El navegador descarga <code>config.json</code> directamente de HuggingFace, llena el formulario. Funciona con cualquier modelo público.", | |
| "help.add_models.manual": "<strong>Manual</strong>: rellena los campos directamente con valores de la model card.", | |
| "help.audit.title": "La cadena auditable", | |
| "help.audit.body": "Cada resultado muestra la <strong>Cadena de Cálculo</strong> completa — cada paso de fórmula con sus entradas, salida e interpretación. Click en cualquier paso para expandir. Las referencias de sección (§26.1, §19.1, etc.) apuntan al paper para la derivación.", | |
| "help.synthesis.title": "La respuesta en lenguaje natural", | |
| "help.synthesis.body": "Tras ejecutar la cadena determinista, un LLM en el navegador (Qwen2.5-0.5B, ~350MB cacheado tras primera carga) sintetiza un resumen en lenguaje natural. Los números arriba son <em>siempre correctos</em> (Python determinista); la síntesis la genera el LLM — verifica contra la cadena si dudas.", | |
| "help.params.title": "Parámetros comunes explicados", | |
| "help.verdicts.title": "Qué mirar en los veredictos", | |
| "help.verdict.yes": "<strong style=\"color:#3fb950;\">SÍ / GO</strong> — procede con confianza; los números apoyan la elección.", | |
| "help.verdict.deg": "<strong style=\"color:#d29922;\">DEGRADADO / TINY-MODEL</strong> — funciona con caveats; lee la acción.", | |
| "help.verdict.no": "<strong style=\"color:#f85149;\">NO / MEMORY-LIMITED</strong> — no procedas tal cual; se da mitigación.", | |
| "help.privacy.title": "Privacidad", | |
| "help.privacy.body": "Todo corre en tu navegador. Sin telemetría, sin analytics, sin datos enviados a ningún sitio. Incluso el modelo LLM corre localmente vía WebGPU/WebAssembly. Tus model_ids y preguntas nunca abandonan esta página.", | |
| "help.source.title": "Código fuente y paper", | |
| "help.source.body": "Código: <a href=\"https://github.com/karlesmarin/tafagent\" target=\"_blank\">github.com/karlesmarin/tafagent</a><br>Paper: <em>Marin 2026 — Predicting How Transformers Attend</em> (<a href=\"https://zenodo.org/records/19826343\" target=\"_blank\">Zenodo</a>; arXiv próximamente)<br>Dataset: <a href=\"https://huggingface.co/datasets/karlexmarin/taf-attention-decay\" target=\"_blank\">taf-attention-decay</a> — 58 mediciones γ sobre 32 modelos (CC-BY-4.0)", | |
| "footer.text": "© 2026 Carles Marin · Apache-2.0 · investigación independiente · la herramienta que cierra el círculo del paper.", | |
| }, | |
| // ──────────────────────────────────────────────────────────────────────── | |
| // FR — Français | |
| // ──────────────────────────────────────────────────────────────────────── | |
| fr: { | |
| // §33 v0.4 (session 31, 2026-04-30) — new diagnostic functions | |
| "v04.title": "🆕 v0.4 — Nouveaux diagnostics (session 31)", | |
| "v04.section.intro": "Quatre nouvelles fonctions diagnostiques dérivées en session 31 (2026-04-30) depuis jeux de formules cross-of-crosses + interrogation socratique. Disponibles dans <code>taf_browser.py</code> §33.", | |
| "v04.arch.label": "Concentration Architecturale", | |
| "v04.arch.desc": "γ_text ≈ γ_Padé − 0.012·n_kv. Loi corrélationnelle cross-panel (R²=0.30). Caveat : pas un prédicteur par-modèle.", | |
| "v04.pdi.label": "PDI — Indice de Déviation de Padé", | |
| "v04.pdi.desc": "PDI = d_horizon_obs/T_eval. Feu : vert (≈1), orange (>>1), jaune (<<1), rouge (Phase B négatif).", | |
| "v04.4bit.label": "Prédicteur de Décalage 4-bit", | |
| "v04.4bit.desc": "MHA : R²(bf16)<0.9 → γ monte ; R²>0.99 → γ descend. GQA : précision-robuste.", | |
| "v04.crit.label": "Ensemble d'Exposants Critiques", | |
| "v04.crit.desc": "ν_c, β_c, η_c (=γ−1, CORRIGÉ), α_C, γ_susc avec minimum AM-GM à γ=1−1/√2≈0.293.", | |
| // §34 v0.5 (session 32, 2026-05-01) — Cohérence algébrique vérifiée par machine | |
| "v05.title": "🔬 v0.5 — Cohérence vérifiée par machine (session 32)", | |
| "v05.section.intro": "Vérification duale par Sage Groebner basis + Lean Mathlib4 de <strong>15 identités algébriques</strong> des exposants critiques TAF. Premier framework transformer-attention avec preuve formelle machine.", | |
| "v05.verify.label": "Vérification de Cohérence Algébrique", | |
| "v05.verify.desc": "Étant donné γ mesuré, vérifie 12 identités D-SAGE (D-SAGE-1 : 2η²+η·γ_χ+1=0, β·χ=−1, α+χ=2, etc.). Toutes passantes = framework intact. Échecs = outliers bf16 / artefacts de quantification.", | |
| "v05.dsage1.label": "D-SAGE-1 (★★ core)", | |
| "v05.dsage1.desc": "Identité quadratique 2η² + η·γ_χ + 1 = 0 (découverte par Sage Groebner, vérifiée Lean). Remplace l'affirmation incorrecte de 'fermeture triple'. Réfute η=2γ du paper 1 algébriquement.", | |
| "v05.erratum.label": "Erratum paper 1 — correction η", | |
| "v05.erratum.desc": "Paper 1 affirmait η = 2γ. Sage Groebner + Lean Mathlib4 ont prouvé l'échec (résidu (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ Phase A). Valeur correcte : η = γ−1, satisfaisant D-SAGE-1.", | |
| "v05.repro.label": "Reproductibilité", | |
| "v05.repro.desc": "Les 15 théorèmes sont machine-proof en Lean Mathlib4 (build réussi 1973 jobs). Script Sage : <code>analysis/sage_recursive_sweep_2026-04-30.sage</code>. Code Lean : <code>lean_taf/taf/Taf/Identities.lean</code>.", | |
| // v0.5.1 — TAF Card consistency check button | |
| "v05.consistency.title": "🔬 Vérification de cohérence algébrique (Sage + Lean v0.5)", | |
| "v05.consistency.desc": "Vérifie 12 identités algébriques D-SAGE des exposants critiques TAF (machine-proof Sage Groebner basis + Lean Mathlib4). Passe = framework intact. Échec = outlier bf16 / artefact de quantification.", | |
| "v05.consistency.btn": "🔬 Vérifier cohérence algébrique", | |
| // v0.5.2 — Anti-Ising universality class badge | |
| "v05.antiising.badge": "🧲 Classe Anti-Ising (β=γ−1<0, vérifié par machine)", | |
| // v0.5.2 — Tooltips par identité (explications en langage clair) | |
| "v05.tooltip.D_SAGE_1": "Identité algébrique quadratique reliant la dimension anormale η et la susceptibilité γ_χ. Identité CENTRALE découverte par Sage Groebner basis (machine-proof). Remplace l'ancienne affirmation incorrecte de triple closure.", | |
| "v05.tooltip.D_SAGE_2": "En Phase A, β = γ−1 est négatif (anti-Ising). Multiplié par χ = 1/(1−γ) donne exactement −1. Signature du régime négatif-β de TAF.", | |
| "v05.tooltip.D_SAGE_4": "L'exposant de chaleur spécifique α et la susceptibilité χ se somment exactement à 2 en TAF. Conséquence algébrique de l'hyperscaling de Josephson.", | |
| "v05.tooltip.D_SAGE_5": "Identité linéaire : α + γ_χ = 2(2−γ). Signifie que quand γ s'approche de 1 (Hagedorn), la somme s'approche de 2 ; à γ=0 elle vaut 4.", | |
| "v05.tooltip.D_SAGE_6": "Exposant de paramètre d'ordre multiplié par exposant de susceptibilité donne une quadratique spécifique en γ. Relation algébrique factorisée.", | |
| "v05.tooltip.Rushbrooke_tautology": "Hyperscaling de Rushbrooke standard 2β + γ_χ = ν·d à d=1. En TAF c'est une TAUTOLOGIE — γ_χ est défini exactement pour que cela soit vrai. Confirmé par Sage Groebner basis.", | |
| "v05.tooltip.Josephson_tautology": "Hyperscaling de Josephson standard 2 − α = ν·d à d=1. En TAF c'est une TAUTOLOGIE — α est défini exactement pour que cela soit vrai.", | |
| "v05.tooltip.Fisher_independent": "Relation de Fisher γ_χ = (2−η)·ν. En TAF est INDÉPENDANTE (ne ferme PAS comme identité, contrairement à l'affirmation de triple closure). Le résidu est γ(2γ−3)/(1−γ).", | |
| "v05.tooltip.eta_2gamma_REFUTED": "Paper 1 affirmait η=2γ. Cette identité le réfute : le résidu est positif dans toute la Phase A. Réfutation machine-proof par Lean Mathlib4.", | |
| "v05.tooltip.D_14_nu_imprint": "La pente d'empreinte apprise ν = −1/(2π) multipliée par 2π donne −1. Vérification dimensionnelle triviale du paper 1.", | |
| "v05.tooltip.D_SAGE_7": "La charge centrale c=3 multipliée par |ν_imprint| multipliée par 2π donne 3. Fermeture dimensionnelle reliant CFT à l'empreinte d'entraînement.", | |
| "v05.tooltip.nu_beta_id": "Exposant de longueur de corrélation ν multiplié par exposant de paramètre d'ordre β donne −1 en Phase A. Variante de D-SAGE-2.", | |
| "v053.calibration.title": "🔬 v0.5.3 — Audit de calibrage (2026-05-02)", | |
| "v053.calibration.note": "<strong>Correction SWA désactivée</strong> — δ_SWA = -0.21 d'origine était calibrée sur n=1 modèle (données insuffisantes ; moyenne du cas unique +0.355). <strong>Correction post_IH marquée exploratoire</strong> — moyenne de groupe ≈ 0 en ré-audit (panel n=22) ne réplique pas l'ajustement OLS. <strong>Correction GQA réplique</strong> (panel +0.115 vs hardcoded +0.11). <strong>Formule D_f corrigée pour Phase B (γ>1)</strong> — utilise une somme cumulative discrète au lieu d'une approximation continue. LLaMA-3, Mistral, Gemma rapportent maintenant des valeurs de compression correctes.", | |
| "v053.release.banner": "🔧 v0.5.3 — Corrections issues d'audit : D_f de compression KV utilise maintenant la somme discrète (correct pour tout γ) ; δ_SWA désactivé (calibrage n=1) ; erratum du coefficient C_V paper §5.2 (1/4 → 1/12).", | |
| // §35 v0.6 — Diagnostic γ prédit vs observé | |
| "gamma_check.title": "🔍 γ prédit vs observé", | |
| "gamma_check.desc": "Saisissez votre γ mesuré empiriquement. L'outil détecte le régime : fraude (θ gonflé) / comprimé / sur-Padé / SWA-aléatoire / normal.", | |
| "gamma_check.gobs_label": "γ_observé", | |
| "gamma_check.gobs_tip": "γ mesuré empiriquement à partir des attention scores de votre modèle. Utilisez la CLI Diagnose pour l'obtenir depuis les poids réels.", | |
| "gamma_check.random_label": "Corpus aléatoire ?", | |
| "gamma_check.random_tip": "Cochez si γ_observé a été mesuré sur des tokens aléatoires/non structurés. Distingue la signature SWA (γ_obs > 1) d'une anomalie.", | |
| "gamma_check.regime": "Régime", | |
| "gamma_check.regime.normal": "Normal", | |
| "gamma_check.regime.fraud": "Fraude (θ gonflé)", | |
| "gamma_check.regime.compressed": "Contexte comprimé", | |
| "gamma_check.regime.overpade": "Sur-Padé", | |
| "gamma_check.regime.swa": "Signature SWA (corpus aléatoire)", | |
| "gamma_check.regime.unknown": "Inconnu", | |
| "gamma_check.regime.normal.desc": "η ∈ [0.85, 1.15] : le modèle utilise son contexte nominal complet, sans anomalie.", | |
| "gamma_check.regime.fraud.desc": "η < 0.01 : θ nominal gonflé. Le modèle se comporte comme si θ ≪ annoncé. Probable inflation YaRN/marketing sans vraie extension de contexte.", | |
| "gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5) : contexte comprimé (le modèle attend moins loin que ne le prédit θ nominal). Fréquent en instruction-tuned / RLHF.", | |
| "gamma_check.regime.overpade.desc": "η > 1.5 : le modèle attend plus loin que Padé ne le prédit. Régime Lerch-corrigé possible ou checkpoint précoce sous-entraîné.", | |
| "gamma_check.regime.swa.desc": "γ_obs > 1.05 sur corpus aléatoire = signature de sliding-window attention (familles Mistral / Gemma).", | |
| "gamma_check.regime.unknown.desc": "Entrées hors plage ou γ_obs > 1 sans flag corpus_aléatoire. Vérifiez la mesure.", | |
| "gamma_check.glossary.title": "ⓘ Glossaire — signification des variables", | |
| "gamma_check.glossary.gamma_pade": "<strong>γ_Padé</strong> : prédiction fermée (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.", | |
| "gamma_check.glossary.gamma_obs": "<strong>γ_observé</strong> : mesuré empiriquement à partir des attention scores (exécutez Diagnose CLI sur poids réels).", | |
| "gamma_check.glossary.theta_eff_obs":"<strong>θ_eff (observé)</strong> : inversé depuis γ_obs via T√2 / (1 − γ_obs). θ effectif impliqué par votre mesure.", | |
| "gamma_check.glossary.theta_eff_pade":"<strong>θ_eff (Padé)</strong> : θ + T/√2. θ effectif prédit par la formule fermée.", | |
| "gamma_check.glossary.efficiency": "<strong>η</strong> : rapport θ_eff_obs / θ_eff_Padé. ≈1 = normal · <0.01 = fraude · <0.5 = comprimé · >1.5 = sur-Padé.", | |
| "gamma_check.glossary.delta_h": "<strong>ΔH_Cardy</strong> : log(θ_eff_obs / θ_nominal). Variation d'entropie de Cardy. Négatif = entropie de compression. ~0 = correspondance nominale.", | |
| "gamma_check.glossary.regime": "<strong>Régime</strong> : classifieur automatique à partir de η + γ_obs + flag corpus_aléatoire.", | |
| // §36 v0.6 — Tooltips pour icônes ⓘ inline | |
| "tooltip.gamma_pade": "<strong>γ_Padé(T_eval)</strong> : prédiction fermée (2−z)/(2+z), z = T√2/θ. Paper §sec:gamma_decomposition.", | |
| "tooltip.gamma_decomposed": "<strong>γ_décomposé</strong> : γ depuis la décomposition architecturale complète. Ligne de base Padé + shift GQA + shift post-IH (sous-ensemble répliqué dans audit calibré).", | |
| "tooltip.d_horizon": "<strong>d_horizon</strong> : horizon d'attention effectif. Au-delà, les scores tombent sous le plancher de bruit (paper §26).", | |
| "tooltip.L_NIAH": "<strong>Plafond L_NIAH</strong> : plafond prédit de fiabilité needle-in-a-haystack au d_horizon courant.", | |
| "tooltip.chi": "<strong>χ susceptibilité</strong> : χ = 1/(1−γ). Diverge à la ligne Hagedorn γ=1.", | |
| "tooltip.kv_memory": "<strong>Mémoire KV @ T_eval (BF16)</strong> : cache KV par requête = 2 · n_layers · n_kv_heads · d_head · T_eval octets.", | |
| "tooltip.theta_eff_obs": "<strong>θ_eff (observé)</strong> : θ effectif impliqué par votre γ_observé : T√2 / (1 − γ_obs).", | |
| "tooltip.theta_eff_pade": "<strong>θ_eff (Padé)</strong> : θ effectif prédit par la formule fermée : θ + T/√2.", | |
| "tooltip.efficiency": "<strong>η = θ_eff_obs / θ_eff_Padé</strong> : ratio d'efficacité. ≈1 = normal · <0.01 = fraude · <0.5 = comprimé · >1.5 = sur-Padé.", | |
| "tooltip.delta_h_cardy": "<strong>ΔH_Cardy</strong> : log(θ_eff_obs / θ_nominal). Variation d'entropie de Cardy. Négatif = entropie de compression. ~0 = correspondance nominale.", | |
| "tooltip.verdict_aggregate": "<strong>Verdict</strong> : pire-de toutes les recettes. ✅ GO = tout vert · ⚠ DÉGRADÉ = ≥1 jaune · ❌ NON = ≥1 rouge.", | |
| "tooltip.verdict_breakdown": "<strong>Décomposition par recette</strong> : chaque recette teste un axe de décision <em>indépendant</em> (contexte-long · budget · matériel · custom-vs-API · compression-KV). Un ❌ en X-1 signifie « utilisez l'API pour votre volume » et non « le modèle échoue » — ouvrez la section Recettes pour le contexte par axe.", | |
| "tooltip.gamma_pill": "<strong>γ vedette</strong> : γ_décomposé (ou γ_Padé en fallback). Plage (0,1) = Phase A (anti-Ising). γ ≥ 1 = Hagedorn / Phase B.", | |
| "tooltip.anti_ising": "<strong>Classe Anti-Ising</strong> : Phase A → β = γ−1 < 0. Machine-verified (Sage + Lean Mathlib4). Voir §35 v0.5.", | |
| // §37 v0.6 — Table des théorèmes Lean+Mathlib | |
| "lean.table.title": "📑 Table des théorèmes Lean+Mathlib", | |
| "lean.table.desc": "Chaque entrée ci-dessous est machine-proven contre Lean 4 + Mathlib4. Cliquez sur un lien L# pour aller à la ligne source sur GitHub. Groupé par thème — cliquez sur un en-tête pour déplier.", | |
| "lean.table.theorem": "Théorème", | |
| "lean.table.claim": "Énoncé", | |
| "lean.table.tactic": "Tactique", | |
| "lean.table.source": "Source", | |
| "lean.table.lean": "Lean", | |
| "lean.findings.title": "🔎 Findings substantiels", | |
| "lean.findings.detected_by": "Détecté par", | |
| "lean.findings.fixed_by": "Corrigé par", | |
| "lean.findings.recommendation":"Recommandation", | |
| "lean.meta.repo": "Repo", | |
| "lean.meta.build": "Build", | |
| "lean.meta.theorems": "Théorèmes", | |
| "lean.meta.verified": "vérifiés", | |
| "lean.meta.rejected": "rejetés", | |
| "lean.meta.sorry": "sorry", | |
| "lean.meta.findings": "findings substantiels", | |
| "lean.manifest.loading": "Chargement du manifeste Lean…", | |
| "lean.manifest.error": "Manifeste Lean indisponible", | |
| // Help modal — section v0.6 | |
| "help.v06.title": "🆕 v0.6 — γ prédit-vs-observé + Cardy ΔH + badges Lean", | |
| "help.v06.intro": "<em>v0.6 (2026-05-06) : trois nouveaux diagnostics vivent dans la TAF Card sous <strong>🔬 Diagnostics</strong>. Tout tourne dans votre navigateur ; γ_observé provient de la Diagnose CLI sur poids réels.</em>", | |
| "help.v06.layout.title": "Disposition de la TAF Card (nouveau en v0.6)", | |
| "help.v06.layout.body": "Après avoir cliqué <strong>🚀 Générer profil complet</strong>, la carte affiche : une <strong>bande hero</strong> en haut (classe d'architecture + méta + 3 pills : verdict agrégé ✅/⚠/❌, γ vedette, 🧲 Anti-Ising si Phase A) et quatre <strong>sections pliables</strong> : <strong>📋 Recettes</strong> (ouverte par défaut — verdict par dimension), <strong>🔬 Diagnostics</strong> (nombres clés, γ prédit vs observé, explorateur what-if), <strong>✓ Vérification</strong> (cohérence algébrique Sage+Lean, falsification F1-F23), <strong>📂 Provenance & partage</strong> (audit de calibration + téléchargement JSON / lien / soumission registre). Cliquez sur n'importe quel en-tête pour déplier. Chaque variable a un tooltip <strong>ⓘ</strong> inline.", | |
| "help.v06.gamma_check.title": "γ prédit vs observé", | |
| "help.v06.gamma_check.body": "Saisissez le γ mesuré empiriquement et l'outil calcule <strong>η = θ_eff_obs / θ_eff_Padé</strong> et classe en l'un de 5 régimes :", | |
| "help.v06.case.normal": "<strong>Normal</strong> (η ∈ [0.85, 1.15]) — le modèle utilise son contexte nominal complet. <em>Cas d'usage</em> : valider une nouvelle release avant adoption.", | |
| "help.v06.case.fraud": "<strong>Fraude</strong> (η < 0.01) — θ nominal gonflé ; le modèle se comporte comme si θ ≪ annoncé. <em>Cas d'usage</em> : détecter inflation YaRN/marketing (motif CodeLlama / Mistral-Nemo).", | |
| "help.v06.case.compressed": "<strong>Comprimé</strong> (η < 0.5) — contexte comprimé ; le modèle attend moins loin que θ nominal. <em>Cas d'usage</em> : repérer compression par RLHF/instruction-tuning (motif LLaMA-2).", | |
| "help.v06.case.overpade": "<strong>Sur-Padé</strong> (η > 1.5) — le modèle attend plus loin que Padé ne le prédit. <em>Cas d'usage</em> : identifier régime Lerch-corrigé ou checkpoints précoces sous-entraînés (motif pythia-1b).", | |
| "help.v06.case.swa": "<strong>SWA corpus aléatoire</strong> (γ_obs > 1.05 avec corpus_aléatoire=Oui) — signature de sliding-window attention. <em>Cas d'usage</em> : confirmer SWA Mistral / Gemma sur tokens aléatoires.", | |
| "help.v06.cardy.title": "Diagnostic Cardy ΔH", | |
| "help.v06.cardy.body": "<strong>ΔH_Cardy = log(θ_eff_obs / θ_nominal)</strong>. Variation d'entropie entre le θ effectif observé et le θ nominal. Fortement négatif = entropie de compression ; proche de zéro = correspondance nominale. Complète η pour les cas borderline.", | |
| "help.v06.lean.title": "Badges de vérification Lean + Mathlib", | |
| "help.v06.lean.body": "Les identités TAF sont formellement machine-proven en Lean Mathlib4 : <strong>37 théorèmes</strong> en 7 groupes (Padé, flot RG, Cayley, D-SAGE, résultats d'audit, erratum CV, divers) + <strong>1 résultat substantiel</strong> (facteur 2 dans la dérivée V, théorème <code>V_derivative_ne_RG_beta</code>). Source : <a href=\"https://github.com/karlesmarin/lean-taf\" target=\"_blank\">github.com/karlesmarin/lean-taf</a> (commit 25c77fd). Re-vérifier localement : <code>git clone --depth=1 https://github.com/karlesmarin/lean-taf && cd lean-taf && lake exe cache get && lake env lean Taf/Identities.lean</code>. La pill 🧲 Anti-Ising du hero et la section Vérification renvoient à des lignes sources spécifiques.", | |
| "help.v06.glossary.title": "Glossaire des variables (également intégré dans la TAF Card)", | |
| "help.v06.glossary.body": "Chaque variable de la TAF Card a un tooltip ⓘ inline. Liste complète : γ, γ_Padé, γ_décomposé, γ_observé, θ, θ_eff_obs, θ_eff_Padé, η, ΔH_Cardy, χ, d_horizon, L_NIAH, mémoire KV, régime. Survolez n'importe quel ⓘ pour la définition + section du paper.", | |
| "hero.title": "🔬 TAF Agent", | |
| "hero.tagline": "Diagnostiquez n'importe quel LLM transformer en 30 secondes. Gratuit. Sans GPU. Sans inscription.", | |
| "hero.subtitle": "Prédit si un modèle conviendra à votre cas d'usage <em>avant</em> que vous ne dépensiez argent ou temps. Tout tourne dans votre navigateur — vos données ne quittent jamais cet onglet.", | |
| "hero.help": "📘 Manuel et exemples", | |
| "hero.quickstart_btn": "⚡ Démarrage rapide", | |
| "hero.inventory_btn": "🧰 Ce que ça offre", | |
| "hero.about": "Conçu par un chercheur indépendant. Open source. Non affilié à un fournisseur de modèles.", | |
| "modes.title": "🎯 Mode", | |
| "modes.profile": "📇 Profiler un modèle", | |
| "modes.compare": "🆚 Comparer des modèles", | |
| "modes.inspector": "🔍 Inspecter config", | |
| "modes.ask": "💬 Question libre", | |
| "modes.recipe": "📋 Choisir une recette", | |
| "modes.diagnose": "🩺 Diagnose CLI", | |
| "diagnose.title": "🩺 Générateur de commande Diagnose CLI", | |
| "diagnose.tip": "Le navigateur prédit γ à partir de la config; le CLI mesure γ_obs sur les poids réels. Ce générateur produit la commande exacte à exécuter localement.", | |
| "diagnose.desc": "Choisissez les options et copiez-collez la commande générée sur votre machine locale (Python + transformers + numpy). Mode rapide ≈5 min CPU; complet ≈20–60 min GPU.", | |
| "diagnose.model_label": "ID du modèle HF:", | |
| "diagnose.theta_label": "θ (auto si vide):", | |
| "diagnose.n_label": "Contexte N:", | |
| "diagnose.options_label": "Options:", | |
| "diagnose.opt_fast": "--fast (CPU, ~5 min)", | |
| "diagnose.opt_cpu": "--cpu (forcer CPU)", | |
| "diagnose.opt_4bit": "--load_in_4bit (modèles ≥7B)", | |
| "diagnose.local_label": "--local path (optionnel):", | |
| "diagnose.build_btn": "📋 Générer la commande", | |
| "diagnose.cmd_title": "Commande générée :", | |
| "diagnose.copy_btn": "📋 Copier dans le presse-papiers", | |
| "diagnose.next_steps": "Prochaines étapes: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) Exécutez la commande (4) JSON résultat → upload via mode Inspect pour analyse TAF complète.", | |
| "modes.phase": "📊 Diagramme de phase", | |
| "phase.title": "📊 Diagramme de phase (γ × θ)", | |
| "phase.tip": "Chaque point est un modèle du panel empirique du paper. x: log θ; y: γ. La ligne Hagedorn γ=1 sépare Phase A de Phase B. Hover pour détails, click pour charger dans le formulaire.", | |
| "phase.desc": "23 modèles dans le panel; courbe Padé à T=2000.", | |
| "modes.desc": "<strong>Démarrage rapide</strong>: collez n'importe quel id de modèle HuggingFace (ex. <code>meta-llama/Meta-Llama-3-8B</code>), cliquez Profiler. Voyez les 5 recettes évaluées en quelques secondes.", | |
| "profile.title": "📇 Profiler un modèle", | |
| "profile.desc": "<strong>Pour techniciens</strong>: quand vous avez besoin d'un instantané complet de viabilité d'un modèle candidat. Un clic exécute les 5 recettes et produit une TAF Card unifiée.", | |
| "profile.preset_label": "Préréglage:", | |
| "profile.preset_default": "— ou choisir dans la liste —", | |
| "profile.hf_label": "ID modèle HF:", | |
| "profile.fetch_btn": "📥 Charger", | |
| "profile.btn": "🚀 Générer profil complet", | |
| "profile.quickstart": "💡 Démarrage rapide: choisissez un préréglage → cliquez Générer. Ou collez un id depuis <a href='https://huggingface.co/models?library=transformers&sort=trending' target='_blank'>HF Hub tendances</a> → 📥 Charger → Générer.", | |
| "compare.title": "🆚 Comparer côte à côte", | |
| "compare.desc": "<strong>Pour techniciens</strong>: quand vous choisissez entre 2-3 modèles candidats pour un scénario de déploiement spécifique. Même recette, plusieurs modèles, verdicts côte à côte.", | |
| "compare.recipe_label": "Recette:", | |
| "compare.T_eval_label": "T_eval (contexte cible):", | |
| "compare.models_title": "Modèles à comparer (jusqu'à 3)", | |
| "compare.btn": "🚀 Comparer", | |
| "compare.example": "💡 Essayez: collez 3 modèles populaires de 7-8B (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B), recette X-2, T_eval=16000. Voyez lequel gère le mieux le contexte long.", | |
| "ask.title": "❓ Votre question", | |
| "ask.placeholder": "ex. Mistral-7B gérera-t-il 16K NIAH? Ou: J'ai 5,000$, quel modèle puis-je entraîner? Ou: GPU le moins cher pour servir Llama-70B à 100M tokens/jour?", | |
| "ask.btn": "🚀 Analyser", | |
| "ask.example_btn": "💡 Essayer un exemple", | |
| "recipe.title": "📋 Recette", | |
| "recipe.default": "— choisir une recette —", | |
| "recipe.input_title": "🎯 Entrées", | |
| "verdict.title": "📊 Verdict", | |
| "chain.title": "🔍 Chaîne de calcul", | |
| "chain.desc": "Chaque nombre ci-dessous est du Python déterministe. Cliquez sur une étape pour développer.", | |
| "answer.title": "💬 Réponse en langage naturel", | |
| "share.btn": "🔗 Copier le lien", | |
| "share.copied": "✅ Copié dans le presse-papiers!", | |
| "share.download": "💾 Télécharger JSON", | |
| "share.download_md": "📝 Markdown", | |
| "share.download_tex": "📜 LaTeX", | |
| "share.submit": "📤 Soumettre au registry", | |
| "share.submit_clip_ok": "↗ GitHub ouvert. Corps copié dans le presse-papiers — collez-le dans le corps de l'issue.", | |
| "share.submit_clip_fail": "↗ GitHub ouvert. Presse-papiers bloqué — corps dans la console du navigateur (F12).", | |
| "share.import_title": "📂 Importer un résultat TAF partagé", | |
| "a11y.skip": "Aller au contenu principal", | |
| // v0.6.2 — refonte de la landing : démarrage rapide + inventaire + tooltips d'architecture | |
| "qs.title": "⚡ Démarrage rapide", | |
| "qs.step1": "Collez un model ID HuggingFace (ex. <code>meta-llama/Meta-Llama-3-8B</code>)", | |
| "qs.step2": "Cliquez sur <strong>📇 Profiler un modèle</strong>", | |
| "qs.step3": "Lisez votre TAF Card — verdict par cas d'usage + chiffres clés + maths vérifiées par Lean+Mathlib", | |
| "qs.cta": "↓ Commencer", | |
| "inv.title": "🧰 Ce que cet outil vous offre", | |
| "inv.recipes.title": "🎯 8 recettes — ce modèle convient-il à votre usage ?", | |
| "inv.recipes.x1.title": "Entraînement propre vs API", | |
| "inv.recipes.x1.body": "lequel coûte moins cher pour votre trafic ?", | |
| "inv.recipes.x2.title": "Contexte long", | |
| "inv.recipes.x2.body": "tient-il 32k / 128k tokens de manière fiable ?", | |
| "inv.recipes.x3.title": "Budget", | |
| "inv.recipes.x3.body": "avec $X, quel modèle pouvez-vous entraîner ?", | |
| "inv.recipes.x5.title": "Matériel", | |
| "inv.recipes.x5.body": "quel GPU pour servir N tokens/jour ?", | |
| "inv.recipes.x19.title": "KV cache", | |
| "inv.recipes.x19.body": "comment compresser sans casser la qualité ?", | |
| "inv.recipes.x21.title": "Pureté d'imprint", | |
| "inv.recipes.x21.body": "à quel point l'encodage positionnel est-il propre ?", | |
| "inv.recipes.x22.title": "Compute-contexte", | |
| "inv.recipes.x22.body": "le modèle entre-t-il dans la bande empirique ?", | |
| "inv.recipes.x23.title": "Phase IH", | |
| "inv.recipes.x23.body": "pré- ou post-induction-head ?", | |
| "inv.diag.title": "🔬 Diagnostics", | |
| "inv.diag.gamma": "<strong>γ prédit vs observé</strong> — auto-classe le modèle en 5 régimes (normal · fraude / contexte gonflé · compressé · over-Padé · sliding-window)", | |
| "inv.diag.cardy": "<strong>Cardy ΔH</strong> — décalage d'entropie entre contexte observé et nominal", | |
| "inv.diag.fals": "<strong>Tableau de falsifiabilité</strong> — vérifie 23 prédictions spécifiques (F1–F23)", | |
| "inv.diag.alg": "<strong>Cohérence algébrique</strong> — 8 identités mathématiques que le modèle doit satisfaire", | |
| "inv.verify.title": "✓ Maths formellement vérifiées", | |
| "inv.verify.count": "<strong>37 théorèmes</strong> machine-proven en Lean 4 + Mathlib4", | |
| "inv.verify.click": "Cliquez sur un badge → ouvre la ligne source sur GitHub", | |
| "inv.verify.reverify": "Vérifiez vous-même : <code>lake build</code> (≈5 s après cache)", | |
| "inv.export.title": "📤 Export et partage", | |
| "inv.export.formats": "<strong>JSON · Markdown · LaTeX</strong> (prêt pour papier)", | |
| "inv.export.share": "Lien reproductible (état encodé dans l'URL)", | |
| "inv.export.registry": "Soumettre au registre communautaire sur GitHub", | |
| "arch.summary": "Architectures prises en charge", | |
| "arch.anyhf": "✓ Tout modèle public HuggingFace", | |
| "tooltip.mha": "Multi-Head Attention : chaque position attend via plusieurs têtes parallèles à la fois.", | |
| "tooltip.gqa": "Grouped Query Attention : les queries partagent moins de keys/values que de heads (économise mémoire mais pousse γ vers Hagedorn).", | |
| "tooltip.alibi": "Attention with Linear Biases : l'info de position est une pente apprise ajoutée aux scores, sans rotation.", | |
| "tooltip.abspe": "Absolute Position Embeddings : chaque position a un vecteur fixe appris ajouté au token.", | |
| "tooltip.swa": "Sliding Window Attention : chaque token n'attend que dans une fenêtre locale fixe (Mistral, gemma-2 l'utilisent).", | |
| "tooltip.ssm": "State Space Model : couche de séquence qui maintient un état interne au lieu d'attention (Mamba, Jamba l'utilisent).", | |
| // v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling unmasker | |
| "modes.unmask": "🪟 Démasquer", | |
| "unmask.title": "🪟 Démasqueur de contexte", | |
| "unmask.tip": "Collez un id de modèle HuggingFace (ou config.json brut). L'outil détecte sliding-window attention, RoPE scaling (YaRN/linear/dynamic NTK), et GQA — tout ce qui rend <code>max_position_embeddings</code> plus grand que le contexte effectif réel. Mistral-7B-v0.1 est l'exemple canonique : déclare 32k, attend dans ~4-8k.", | |
| "unmask.desc": "<strong>Êtes-vous sur le point de dépenser de l'argent sur un modèle qui n'attend pas vraiment aussi loin ?</strong> Collez un id et découvrez-le en 1 seconde. Sans GPU, sans inférence — juste de l'arithmétique sur config.json.", | |
| "unmask.id_label": "ID modèle HF :", | |
| "unmask.fetch_btn": "🔍 Démasquer", | |
| "unmask.paste_summary": "Ou collez config.json brut (modèles privés / en dev)", | |
| "unmask.paste_btn": "🔍 Démasquer config collé", | |
| "unmask.label.declared": "Contexte déclaré", | |
| "unmask.label.effective": "Effectif (estimé)", | |
| "unmask.label.ratio": "Ratio", | |
| "unmask.section.flags": "Drapeaux d'architecture", | |
| "unmask.section.warnings": "Avertissements", | |
| "unmask.section.reco": "Recommandation", | |
| "unmask.flag.swa": "SWA", | |
| "unmask.flag.rope": "RoPE scaling", | |
| "unmask.flag.gqa": "GQA", | |
| "unmask.flag.layers": "Couches", | |
| "unmask.flag.dhead": "d_head", | |
| "unmask.flag.theta": "RoPE θ", | |
| "unmask.flag.yes": "oui", | |
| "unmask.flag.no": "non", | |
| "unmask.flag.full_mha": "non (MHA complet, {n} heads)", | |
| "unmask.verdict.honest": "✅ HONNÊTE", | |
| "unmask.verdict.inflated": "⚠ GONFLÉ", | |
| "unmask.verdict.severely_inflated": "❌ GRAVEMENT GONFLÉ", | |
| "unmask.verdict.yarn_extended": "⚠ YARN-ÉTENDU", | |
| "unmask.verdict.unknown": "❓ INCONNU", | |
| "unmask.warn.swa_window": "Fenêtre SWA : {window} tokens — chaque couche n'attend que dans cette fenêtre.", | |
| "unmask.warn.multihop": "Estimation multi-hop : ~{multiHop} tokens (conservateur : fenêtre × {factor}).", | |
| "unmask.warn.yarn": "RoPE scaling ({type}) étend le contexte {factor}× de ~{original} à {declared} tokens.", | |
| "unmask.warn.yarn_advice": "Contexte RoPE-étendu — vérifiez le comportement de γ à la longueur déclarée avec le diagnostic γ_check.", | |
| "unmask.warn.gqa_small_dhead": "Petite head dim ({d_head}) + GQA : compression de KV cache probable en contexte long (γ poussé vers Hagedorn).", | |
| "unmask.reco.honest": "Modèle d'attention complète standard. Contexte effectif correspond au déclaré ({declared} tokens).", | |
| "unmask.reco.inflated": "Effectif ~{effective} tokens via SWA. Utilisez γ_check pour vérifier le comportement à votre longueur cible.", | |
| "unmask.reco.severely_inflated": "Traitez-le comme un modèle de ~{effective} tokens en pratique. Le claim de {declared} tokens ne s'applique que via des chaînes d'attention cross-layer, qui dégradent empiriquement au-delà de ~2× la fenêtre SWA.", | |
| "unmask.reco.yarn_extended": "Contexte RoPE-étendu. Lancez un benchmark long-context (NIAH à 8k / 16k / 32k / full) pour confirmer que l'extension tient. Utilisez γ_check avec T_eval = {declared}.", | |
| "unmask.reco.unknown": "Impossible de parser le config. Vérifiez que l'URL est un modèle HF valide avec config.json public.", | |
| "unmask.status.empty_id": "⚠ Saisissez un model id (ex. mistralai/Mistral-7B-v0.1).", | |
| "unmask.status.fetching": "⏳ Récupération config.json pour {modelId}...", | |
| "unmask.status.success": "✅ {modelId} analysé (verdict : {verdict})", | |
| "unmask.status.empty_paste": "⚠ Collez d'abord un config.json.", | |
| "unmask.status.invalid_json": "❌ JSON invalide : {error}", | |
| "unmask.status.success_paste": "✅ Config collé analysé (verdict : {verdict})", | |
| "unmask.pasted_label": "(config collé)", | |
| "mode_desc.ask": "Tapez une question libre. Le LLM dans le navigateur choisit la recette et l'exécute.", | |
| "mode_desc.recipe": "Sélectionnez une recette directement et remplissez le formulaire. Contrôle manuel complet.", | |
| "mode_desc.profile": "Démarrage le plus rapide : collez n'importe quel model id HuggingFace, cliquez Profile. Voyez les 5 recettes en quelques secondes.", | |
| "mode_desc.compare": "Choisissez 2-3 modèles candidats + une recette. Verdicts côte à côte dans un tableau.", | |
| "mode_desc.inspector": "Collez un config.json directement. Utile pour modèles privés / en dev non publiés sur HF Hub.", | |
| "mode_desc.diagnose": "Construit la commande CLI diagnose_model.py pour MESURER γ_obs sur GPU réel. Le navigateur prédit ; le CLI mesure.", | |
| "mode_desc.phase": "Scatter γ × θ du panel empirique du papier. Survolez les points pour détails, cliquez pour charger dans Diagnose / Recipe.", | |
| "mode_desc.unmask": "Détecte si max_position_embeddings est trompeur (SWA / YaRN / RoPE-scaling). Collez un model id, obtenez un verdict en 1 ligne.", | |
| "profile.preset_loaded": "✅ Préréglage chargé pour <strong>{id}</strong>. Formulaire pré-rempli. (Cliquez 📥 Fetch pour écraser avec le dernier config depuis HF Hub.)", | |
| // v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer | |
| "modes.template": "📜 Chat-template", | |
| "mode_desc.template": "Détecte la famille de chat-template d'un modèle (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek). Donne le flag CLI exact pour lm-eval / vLLM / transformers.", | |
| "template.title": "📜 Détecteur de Chat-template", | |
| "template.tip": "Collez un model id HF (ou tokenizer_config.json brut). Détecte la famille du chat-template et donne la commande exacte pour l'utiliser correctement. lm-eval-harness divise l'accuracy par 2 silencieusement si vous oubliez de l'appliquer (issue #1841).", | |
| "template.desc": "<strong>Avez-vous oublié <code>--apply_chat_template</code> ?</strong> La plupart des évals multi-tours échouent à ~50% parce que le chat template n'a pas été appliqué. Collez un model id, obtenez le flag CLI exact pour votre stack.", | |
| "template.id_label": "ID modèle HF :", | |
| "template.fetch_btn": "📜 Détecter", | |
| "template.paste_summary": "Ou collez tokenizer_config.json brut (modèles privés)", | |
| "template.paste_btn": "📜 Détecter config collé", | |
| "template.label.family": "Famille détectée", | |
| "template.label.markers": "Marqueurs correspondants", | |
| "template.label.tpl_len": "Longueur du template", | |
| "template.section.warnings": "Avertissements", | |
| "template.section.commands": "Commandes par framework", | |
| "template.section.raw": "Template brut (preview)", | |
| "template.family.custom": "custom (famille inconnue)", | |
| "template.family.none": "(pas de chat_template)", | |
| "template.verdict.ok": "✅ TEMPLATE DÉTECTÉ", | |
| "template.verdict.custom": "⚠ TEMPLATE CUSTOM", | |
| "template.verdict.missing": "❌ PAS DE CHAT TEMPLATE", | |
| "template.verdict.base_model": "ℹ MODÈLE DE BASE (sans chat)", | |
| "template.verdict.unknown": "❓ INCONNU", | |
| "template.warn.no_chat_template": "Pas de champ <code>chat_template</code> dans tokenizer_config.json. Typique des modèles base / pré-entraînés. Si vous attendiez un modèle instruct-tuned, le mauvais fichier peut être chargé.", | |
| "template.warn.custom_template": "Template non standard ({length} chars). L'outil n'a pas pu le faire correspondre aux familles connues. Inspectez le preview et vérifiez que votre framework d'éval le supporte.", | |
| "template.warn.lm_eval_apply": "<strong>lm-eval-harness :</strong> ajoutez <code>--apply_chat_template</code> ou votre accuracy chutera silencieusement de ~50% sur les évals multi-tours (issue #1841).", | |
| "template.warn.vllm_apply": "<strong>vLLM serve :</strong> vérifiez que <code>--chat-template</code> est défini (l'auto-détection échoue parfois sur les variantes fine-tunées). Suggéré : <code>{name}</code>.", | |
| "template.status.empty_id": "⚠ Saisissez un model id (ex. mistralai/Mistral-7B-Instruct-v0.3).", | |
| "template.status.fetching": "⏳ Récupération tokenizer_config.json pour {modelId}...", | |
| "template.status.success": "✅ {modelId} détecté (verdict : {verdict})", | |
| "template.status.empty_paste": "⚠ Collez d'abord un tokenizer_config.json.", | |
| "template.status.invalid_json":"❌ JSON invalide : {error}", | |
| "template.status.success_paste":"✅ Config collé détecté (verdict : {verdict})", | |
| "template.pasted_label": "(tokenizer_config collé)", | |
| // v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor | |
| "modes.arena": "🎯 Arena CI", | |
| "mode_desc.arena": "Récupère les intervalles de confiance à partir des données brutes de votes pairwise (MLE Bradley-Terry + bootstrap). Détecte les paires statistiquement à égalité que le leaderboard public d'Arena cache.", | |
| "arena.title": "🎯 Reconstructeur Arena-Elo CI", | |
| "arena.tip": "Chatbot Arena masque les intervalles de confiance dans le leaderboard public. Un écart de 5 Elo peut être statistiquement insignifiant. Collez les données brutes de votes (model_a, model_b, winner) — l'outil calcule le MLE Bradley-Terry + bootstrap CIs et liste les égalités statistiques (overlap CI).", | |
| "arena.desc": "<strong>GPT-4 est-il vraiment meilleur que Claude — ou sont-ils à égalité ?</strong> Collez le CSV de votes pairwise (ou cliquez <em>Charger un échantillon</em>). MLE Bradley-Terry + 200 itérations de bootstrap → Elos classés avec CIs 95% et détection d'égalités statistiques. Tout dans le navigateur.", | |
| "arena.sample_btn": "📊 Charger échantillon", | |
| "arena.run_btn": "🎯 Calculer CIs", | |
| "arena.clear_btn": "🗑️ Effacer", | |
| "arena.csv_summary": "CSV de votes (header : <code>model_a,model_b,winner</code> ; winner ∈ a/b/tie)", | |
| "arena.section.ranked": "Elos classés avec CIs 95%", | |
| "arena.section.ties": "Égalités statistiques (overlap CI)", | |
| "arena.section.summary": "Résumé", | |
| "arena.col.rank": "#", | |
| "arena.col.model": "Modèle", | |
| "arena.col.elo": "Elo", | |
| "arena.col.ci": "CI 95%", | |
| "arena.col.ci_width": "± demi-largeur", | |
| "arena.col.matches": "Matchs", | |
| "arena.col.wins": "V / D / E", | |
| "arena.col.tie_pair": "Paire", | |
| "arena.col.tie_diff": "Écart Elo", | |
| "arena.col.tie_overlap": "Overlap CI", | |
| "arena.no_ties": "Aucune égalité statistique — toutes les paires sont distinguables à 95% CI.", | |
| "arena.summary.votes": "Total des votes", | |
| "arena.summary.models": "Modèles", | |
| "arena.summary.ties": "Égalités statistiques", | |
| "arena.summary.bootstrap": "Itérations bootstrap", | |
| "arena.summary.ci_level": "Niveau CI", | |
| "arena.status.empty": "⚠ Collez un CSV de votes ou cliquez sur Charger échantillon.", | |
| "arena.status.too_few": "⚠ Seulement {n} votes valides — il en faut au moins 10 pour ajuster Bradley-Terry de manière fiable.", | |
| "arena.status.computing": "⏳ Calcul MLE Bradley-Terry + bootstrap sur {n} votes...", | |
| "arena.status.done": "✅ {n} votes · {models} modèles · {ties} égalités statistiques · {ms} ms", | |
| "arena.status.sample_loaded": "✅ Échantillon chargé (données Arena synthétiques 6 modèles). Cliquez sur Calculer CIs.", | |
| // v0.7.3 — anti-bullshit pack #4: Contamination Prior | |
| "modes.contam": "🧪 Contamination", | |
| "mode_desc.contam": "Prior bayésien-ish sur la contamination d'un score de benchmark. Saisissez le cutoff d'entraînement → note 20+ benchmarks populaires (MMLU, GSM8K, HumanEval, MMLU-Pro…).", | |
| "contam.title": "🧪 Prior de Contamination", | |
| "contam.tip": "Calcule un prior bayésien-ish indiquant si un score de benchmark est contaminé, basé sur (date de cutoff d'entraînement) × (date de sortie du benchmark) × (inclusion connue dans corpus + historique de leaks). Open LLM Leaderboard v1 a été tué en 2024 après la contamination de MMLU/HellaSwag.", | |
| "contam.desc": "<strong>Devez-vous faire confiance au score MMLU de votre modèle ?</strong> Saisissez la date de cutoff d'entraînement — l'outil note 20+ benchmarks populaires (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA…) et vous dit quels scores sont probablement contaminés.", | |
| "contam.cutoff_label": "Cutoff entraînement :", | |
| "contam.run_btn": "🧪 Noter tous les benchmarks", | |
| "contam.section.ranked": "Priors de contamination par benchmark", | |
| "contam.section.high": "🔴 Benchmarks à haut risque (traitez les scores comme non fiables)", | |
| "contam.section.medium": "🟡 Risque moyen (vérifiez avec des alternatives)", | |
| "contam.section.low": "🟢 Faible risque (probablement propres)", | |
| "contam.col.benchmark": "Benchmark", | |
| "contam.col.released": "Sorti", | |
| "contam.col.gap": "Écart (mois)", | |
| "contam.col.prior": "P(contam)", | |
| "contam.col.level": "Niveau", | |
| "contam.col.corpora": "Dans corpus", | |
| "contam.col.category": "Catégorie", | |
| "contam.label.high": "Haut risque", | |
| "contam.label.medium": "Moyen", | |
| "contam.label.low": "Faible", | |
| "contam.no_entries": "(aucun dans cette catégorie)", | |
| "contam.advice.high": "Traitez ces scores comme non fiables. Remplacez par des alternatives plus récentes / à test privé (MMLU-Pro, GPQA, MUSR, MATH-500).", | |
| "contam.advice.medium": "À prendre avec précaution. Cherchez une réplication sur un subset held-out ou des reproductions communautaires.", | |
| "contam.advice.low": "Score probablement non contaminé, mais absence de leak n'est pas une preuve — vérifiez avec un test alternatif.", | |
| "contam.summary.headline": "Cutoff <code>{cutoff}</code> · {n} benchmarks notés", | |
| "contam.status.empty": "⚠ Saisissez une date de cutoff d'entraînement (ex. 2023-12).", | |
| "contam.status.bad_date": "⚠ Format de date incorrect. Utilisez YYYY-MM ou YYYY-MM-DD.", | |
| "contam.status.done": "✅ Cutoff {cutoff} · {n} benchmarks notés · {high} à haut risque", | |
| // v0.7 — Help modal section | |
| "help.v07.title": "🆕 v0.7 — Pack anti-bullshit (4 nouveaux modes)", | |
| "help.v07.intro": "<em>v0.7 (2026-05-06) : quatre nouveaux modes qui résolvent des problèmes concrets remontés par la communauté HuggingFace. Chacun tourne dans votre navigateur sans inférence — pure métadonnée + maths.</em>", | |
| "help.v07.unmask.title": "🪟 Démasqueur de Contexte", | |
| "help.v07.unmask.body": "Détecte quand <code>max_position_embeddings</code> est trompeur. Mistral-7B-v0.1 déclare 32k mais attend dans ~4-8k via SWA. Collez un id HF → verdict en 1 seconde (HONNÊTE / GONFLÉ / GRAVEMENT GONFLÉ / YARN-ÉTENDU). Détecte SWA, RoPE-scaling (YaRN/linear/dynamic NTK), petit d_head + GQA. <em>Cas d'usage</em> : avant de payer un GPU pour 32k de contexte, vérifiez que le modèle attend vraiment aussi loin.", | |
| "help.v07.template.title": "📜 Détecteur de Chat-template", | |
| "help.v07.template.body": "Détecte la famille de chat-template d'un modèle (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / custom / none) et donne le flag CLI exact pour lm-evaluation-harness, vLLM, et transformers. Résout l'issue #1841 de lm-eval-harness : oublier <code>--apply_chat_template</code> divise l'accuracy multi-tours par 2 silencieusement. <em>Cas d'usage</em> : avant de reporter un score, confirmez avoir appliqué le template correctement.", | |
| "help.v07.arena.title": "🎯 Reconstructeur Arena-Elo CI", | |
| "help.v07.arena.body": "Chatbot Arena masque les intervalles de confiance de son leaderboard public — un écart de 5 Elo peut être statistiquement insignifiant. Collez des données brutes de votes pairwise (model_a, model_b, winner) → MLE Bradley-Terry + bootstrap 200 itérations → Elos classés avec CIs 95% et un panneau \"égalités statistiques\" listant les paires dont les CIs se chevauchent. Essayez le bouton Charger échantillon. <em>Cas d'usage</em> : avant de déclarer \"modèle A bat modèle B\", vérifiez que leurs CIs ne se chevauchent pas.", | |
| "help.v07.contam.title": "🧪 Prior de Contamination", | |
| "help.v07.contam.body": "Prior bayésien-ish sur la contamination d'un score de benchmark. Saisissez la date de cutoff d'entraînement de votre modèle → l'outil note 20+ benchmarks populaires (MMLU, HellaSwag, GSM8K, HumanEval, IFEval, MMLU-Pro, GPQA, AIME, MATH-500, BBH, MUSR…) par P(contamination) selon l'écart temporel, l'inclusion dans corpus et l'historique de leaks connus. Open LLM Leaderboard v1 a été tué en 2024 après la contamination de MMLU/HellaSwag. <em>Cas d'usage</em> : décidez quels scores croire en comparant deux modèles.", | |
| "help.v07.quant.title": "⚖️ Classificateur de régime de quantification", | |
| "help.v07.quant.body": "Prédit le γ-shift et ΔPPL pour tout (modèle × schéma de quantification : NF4, AWQ, GPTQ, GGUF Q4_K_M / Q5_K_M / Q8_0, int8, FP8…). Arch-aware : petit d_head + GQA agressif → plus sensible ; les schémas calibrés (AWQ) absorbent mieux le shift que les non calibrés (NF4). Recommande des alternatives plus sûres si un cliff est détecté. <em>Cas d'usage</em> : avant de quantifier, prédisez si votre combo architecture × schéma maintiendra la PPL acceptable, avec une suggestion concrète de switch sinon.", | |
| "help.v07.drift.title": "🔀 Borne de drift inter-frameworks", | |
| "help.v07.drift.body": "Même modèle, scores différents sur setups différents. L'outil prédit le drift max admissible dû au seul bruit numérique (dtype, framework, batch). Si l'écart observé le dépasse → vrai bug, généralement chat-template mismatch (issue #1841 lm-eval-harness) ou layout KV-cache. Essayez le bouton \"Charger échantillon\" pour le bug chat-template canonique. <em>Cas d'usage</em> : avant de reporter une régression ou de revendiquer la reproductibilité, vérifiez si l'écart entre deux évals est plus grand que ce que le bruit numérique peut expliquer.", | |
| "inv.v07.drift": "<strong>🔀 Drift</strong> — bug ou bruit ? Prédit l'écart max admissible entre deux évals", | |
| "help.v07.niah.title": "🔍 Gap NIAH → Reasoning", | |
| "help.v07.niah.body": "Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH (retrieval de needle) mais échouent au reasoning multi-hop au même contexte. L'outil prédit les deux taux de réussite à partir de l'architecture (γ_Padé + d_horizon + pression arch : petit d_head, GQA, SWA), reporte le gap, et trouve le \"contexte sûr pour reasoning\" où le reasoning reste ≥65%. Mode balayage montre la courbe à 1k/4k/16k/64k/T_train. <em>Cas d'usage</em> : avant de déployer au contexte revendiqué, découvrez si le modèle va vraiment raisonner là ou seulement retrouver.", | |
| "inv.v07.niah": "<strong>🔍 NIAH→Reason</strong> — votre \"128k\" raisonne-t-il vraiment là, ou seulement retrouve ?", | |
| // v0.7 — Inventory modal, 5th card | |
| "inv.v07.title": "🆕 Pack anti-bullshit v0.7", | |
| "inv.v07.unmask": "<strong>🪟 Unmask</strong> — config.json annonce 32k ? Voyez s'il attend vraiment aussi loin", | |
| "inv.v07.template": "<strong>📜 Chat-template</strong> — flag CLI exact pour que lm-eval ne divise pas votre accuracy par 2 en silence", | |
| "inv.v07.arena": "<strong>🎯 Arena CI</strong> — récupère les intervalles de confiance que Chatbot Arena cache", | |
| "inv.v07.contam": "<strong>🧪 Contamination</strong> — note 20+ benchmarks par probabilité de contamination", | |
| "inv.v07.quant": "<strong>⚖️ Quant</strong> — prédit le γ-shift + ΔPPL pour tout combo (modèle × schéma de quantification)", | |
| // v0.7.3 — anti-bullshit pack #5: Quant-regime classifier | |
| "modes.quant": "⚖️ Quant", | |
| "mode_desc.quant": "Prédit le γ-shift et ΔPPL pour tout (modèle × schéma de quantification). Arch-aware : petit d_head + GQA → plus sensible. Recommande des alternatives plus sûres si un cliff est détecté.", | |
| "quant.title": "⚖️ Classificateur de régime de quantification", | |
| "quant.tip": "Prédit le γ-shift (et la ΔPPL résultante) pour une paire (modèle × schéma). Les claims génériques comme 'AWQ ~95% retention' sont trop vagues — TAF utilise d_head, ratio GQA, flag SWA et taille du modèle pour donner un verdict arch-spécifique. Résout : la communauté HF rapporte des cliffs de quantification imprédictibles (NF4 -2 PPL sur Phi-3 mais OK sur Llama-3-8B).", | |
| "quant.desc": "<strong>La quantification cassera-t-elle votre modèle ?</strong> Collez un id HF, choisissez un schéma — obtenez le γ-shift prédit, la bande ΔPPL attendue et une alternative recommandée si c'est un cliff. Navigateur uniquement, sans GPU, sans set de calibration.", | |
| "quant.id_label": "ID modèle HF :", | |
| "quant.fetch_btn": "📥 Récupérer config", | |
| "quant.scheme_label": "Schéma quant :", | |
| "quant.run_btn": "⚖️ Prédire", | |
| "quant.all_btn": "📊 Comparer tous les schémas", | |
| "quant.regime.safe": "✅ SÛR", | |
| "quant.regime.mild": "✅ COMPRESSION LÉGÈRE", | |
| "quant.regime.significant": "⚠ DÉGRADATION SIGNIFICATIVE", | |
| "quant.regime.cliff": "❌ CLIFF SÉVÈRE", | |
| "quant.label.gamma_shift": "γ shift", | |
| "quant.label.delta_ppl": "ΔPPL (est.)", | |
| "quant.label.arch_mult": "Multiplicateur arch", | |
| "quant.section.breakdown": "Détail", | |
| "quant.section.reco": "Recommandation", | |
| "quant.section.compare": "Tous les schémas (triés par sécurité)", | |
| "quant.field.scheme": "Schéma", | |
| "quant.field.calibrated": "calibré", | |
| "quant.field.uncalibrated": "non calibré", | |
| "quant.field.base_penalty": "Pénalité de base", | |
| "quant.field.arch_mult_full": "Multiplicateur architectural", | |
| "quant.field.gamma_shift": "γ shift prédit", | |
| "quant.field.ppl_band": "Bande ΔPPL (est.)", | |
| "quant.field.params": "Paramètres", | |
| "quant.col.scheme": "Schéma", | |
| "quant.col.bits": "Bits", | |
| "quant.col.gamma_shift": "γ shift", | |
| "quant.col.ppl_band": "Bande ΔPPL", | |
| "quant.col.regime": "Régime", | |
| "quant.reco.switch_to_awq": "<strong>Passez à {scheme}</strong> — le 4-bit calibré gère bien mieux les petits d_head + GQA que NF4. ΔPPL attendue chute ~2-3×.", | |
| "quant.reco.switch_to_q5_km": "<strong>Passez à {scheme}</strong> — Q5 garde plus de dimensions de head intactes à faible coût (~25% plus grand seulement).", | |
| "quant.reco.switch_to_q4_km": "<strong>Passez à {scheme}</strong> — Q3/Q2 sont trop agressifs pour cette architecture.", | |
| "quant.reco.consider_awq": "<strong>Considérez {scheme}</strong> — la calibration réduit significativement le γ-shift sur cette architecture.", | |
| "quant.reco.use_higher_bits": "<strong>Utilisez une alternative à plus de bits</strong> — cette architecture n'absorbe pas le 4-bit proprement. Essayez 5 ou 8-bit.", | |
| "quant.reco.verify_with_eval": "<strong>Vérifiez avec une vraie éval</strong> — le shift prédit est borderline. Lancez NIAH à votre contexte cible avant de déployer.", | |
| "quant.reco.no_action": "Pas d'action requise — la quantification est sûre pour cette architecture.", | |
| "quant.summary.headline_all": "Tous les schémas pour <code>{modelId}</code>", | |
| "quant.status.empty_id": "⚠ Saisissez un model id (ex. meta-llama/Llama-3.2-1B).", | |
| "quant.status.fetching": "⏳ Récupération config.json pour {modelId}...", | |
| "quant.status.fetched": "✅ Config récupéré pour {modelId}. Choisissez un schéma et cliquez Prédire (ou Comparer tous).", | |
| "quant.status.no_scheme": "⚠ Choisissez un schéma de quantification dans le dropdown.", | |
| "quant.status.done": "✅ Régime prédit : {regime}", | |
| "quant.status.done_all": "✅ Comparé {n} schémas — triés par sécurité.", | |
| // v0.7.4 — HF Hub autocomplete: privacy + rate-limit | |
| "hf_auto.privacy": "🔒 Requêtes envoyées à huggingface.co/api · cache local 5 min", | |
| "hf_auto.rate_limited": "⚠ Rate limit HuggingFace — réessayez dans un moment, ou tapez l'id complet manuellement", | |
| "hf_auto.gated_msg": "est gated. Acceptez la licence ici :", | |
| // v0.7.5 — anti-bullshit pack #6: Cross-framework drift bound | |
| "modes.drift": "🔀 Drift", | |
| "mode_desc.drift": "Prédit le drift max admissible entre deux scores de benchmark donnés (framework, dtype, batch, chat-template). Distingue les vrais bugs du bruit numérique.", | |
| "drift.title": "🔀 Borne de drift inter-frameworks", | |
| "drift.tip": "Même modèle, scores différents sur des setups différents. L'écart est-il du bruit ou un vrai bug ? Saisissez deux scores avec leur (framework, dtype, batch, chat-template) — l'outil prédit le drift max admissible dû au seul bruit numérique. Si l'écart observé le dépasse → vrai bug, généralement chat-template mismatch (issue #1841 lm-eval) ou layout KV-cache.", | |
| "drift.desc": "<strong>Votre modèle donne 67.2 sur lm-eval-hf et 65.1 sur vLLM-served. Bug ou bruit ?</strong> Saisissez les deux scores avec (framework, dtype, batch, chat-template appliqué ?). L'outil prédit la bande de bruit et signale les vrais bugs. arxiv 2506.09501 documente cela comme un problème majeur de reproductibilité d'évals.", | |
| "drift.setup_a": "Setup A", | |
| "drift.setup_b": "Setup B", | |
| "drift.score": "Score", | |
| "drift.framework": "Framework", | |
| "drift.dtype": "Dtype", | |
| "drift.batch": "Batch", | |
| "drift.template": "Chat-template", | |
| "drift.template.applied": "appliqué", | |
| "drift.template.not_applied": "non appliqué", | |
| "drift.template.unknown": "inconnu", | |
| "drift.run_btn": "🔀 Calculer la borne de drift", | |
| "drift.sample_btn": "📊 Charger échantillon (bug chat-template)", | |
| "drift.label.observed": "Écart observé", | |
| "drift.label.band": "Bande numérique", | |
| "drift.label.ratio": "Écart / bande", | |
| "drift.section.setups": "Setups", | |
| "drift.section.breakdown": "Contributeurs au drift (bande numérique)", | |
| "drift.section.verdict": "Verdict et recommandation", | |
| "drift.contrib.dtype": "Mismatch de dtype", | |
| "drift.contrib.framework": "Framework", | |
| "drift.contrib.batch": "Différence de batch", | |
| "drift.contrib.template": "MISMATCH de chat-template", | |
| "drift.dominant_cause": "Cause dominante", | |
| "drift.cause.dtype": "différence de précision dtype", | |
| "drift.cause.framework": "différence de framework / kernel", | |
| "drift.cause.batch": "chemins de normalisation par batch", | |
| "drift.cause.template_mismatch": "chat-template appliqué d'un côté mais pas de l'autre (motif #1841 lm-eval-harness — typiquement -50% sur multi-tours)", | |
| "drift.verdict.noise": "✅ BRUIT NUMÉRIQUE", | |
| "drift.verdict.suspicious": "⚠ SUSPECT — vérifiez", | |
| "drift.verdict.bug": "❌ VRAI BUG — investiguez", | |
| "drift.verdict.bug_template": "❌ BUG DE CHAT-TEMPLATE", | |
| "drift.reco.noise": "L'écart entre dans la bande de bruit numérique attendue. Pas d'action requise ; la différence est cohérente avec la seule variation framework/dtype/batch.", | |
| "drift.reco.suspicious": "L'écart est 1–2× la bande prédite. Borderline — possible vrai bug. Essayez d'aligner le contributeur dominant (ex. égalisez framework ou dtype) et re-testez.", | |
| "drift.reco.bug": "L'écart est > 2× la bande prédite. C'est un vrai bug. Inspectez le contributeur dominant — probablement une différence de tokenizer / chat-template / layout KV-cache. Lancez lm-eval-harness avec <code>--apply_chat_template</code> et confirmez.", | |
| "drift.reco.bug_template": "Mismatch de chat-template détecté. C'est la cause la plus commune des grands écarts d'évals (issue #1841 lm-eval-harness). Relancez le côté \"non appliqué\" avec <code>--apply_chat_template</code> (ou réglez vLLM <code>--chat-template &lt;name&gt;</code>) et re-testez.", | |
| "drift.status.empty_scores": "⚠ Saisissez les deux scores.", | |
| "drift.status.done": "✅ Verdict : {verdict}", | |
| "drift.status.sample_loaded": "✅ Échantillon chargé (bug chat-template canonique). Cliquez sur Calculer la borne de drift.", | |
| // v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap predictor | |
| "modes.niah": "🔍 NIAH→Reason", | |
| "mode_desc.niah": "Prédit les taux de réussite NIAH (retrieval) et reasoning multi-hop à n'importe quel contexte. Résout : les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte (paper RULER).", | |
| "modes.saturation": "📈 Saturation", | |
| "mode_desc.saturation": "Indique si un benchmark discrimine encore les frontier models ou s'il est saturé (ex. MMLU 88-94% top, AIME 2025 déjà 96-100%). Retourne top-3 + verdict + remplacements recommandés.", | |
| "modes.hub": "🧭 Solutions", | |
| "mode_desc.hub": "Carte de chaque problème documenté de LLM-eval → mode tafagent (si couvert) + outils externes curés. Trouvez la solution sans la réinventer. 30+ pains, 7 catégories.", | |
| "niah.title": "🔍 Gap NIAH → Reasoning", | |
| "niah.tip": "NIAH (Needle in a Haystack) teste le retrieval : 'trouve ce fait dans un long texte'. Le reasoning multi-hop teste l'inférence : 'combine les faits X+Y au début avec le fait Z à la fin'. Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte. Cet outil prédit les deux taux à partir de la seule architecture.", | |
| "niah.desc": "<strong>Votre modèle revendique 128k de contexte. Va-t-il vraiment raisonner à 64k, ou seulement retrouver ?</strong> Collez un model id HF et un contexte cible — l'outil prédit les taux de réussite NIAH et reasoning multi-hop, le gap, et un 'contexte sûr' où le reasoning reste ≥65%.", | |
| "niah.id_label": "ID modèle HF :", | |
| "niah.fetch_btn": "📥 Récupérer config", | |
| "niah.teval_label": "Contexte cible (T_eval) :", | |
| "niah.run_btn": "🔍 Prédire", | |
| "niah.sweep_btn": "📊 Balayer les contextes", | |
| "niah.label.niah": "Taux NIAH", | |
| "niah.label.reasoning": "Taux Reasoning", | |
| "niah.label.gap": "Gap", | |
| "niah.label.safe_ctx": "Contexte sûr pour reasoning", | |
| "niah.section.breakdown": "Détail architectural", | |
| "niah.section.reco": "Recommandation", | |
| "niah.section.sweep": "Balayage des taux par longueur de contexte", | |
| "niah.field.dhorizon": "d_horizon (effectif)", | |
| "niah.field.ratio": "T_eval / d_horizon", | |
| "niah.field.arch_pressure": "Pression arch (petit d_head + GQA + SWA)", | |
| "niah.field.theta": "RoPE θ", | |
| "niah.field.t_train": "T_train (revendiqué)", | |
| "niah.col.context": "T_eval", | |
| "niah.col.niah": "NIAH", | |
| "niah.col.reasoning": "Reasoning", | |
| "niah.col.gap": "Gap", | |
| "niah.col.verdict": "Verdict", | |
| "niah.verdict.robust": "✅ ROBUSTE", | |
| "niah.verdict.marginal": "⚠ MARGINAL", | |
| "niah.verdict.degraded": "⚠ DÉGRADÉ", | |
| "niah.verdict.retrieval_only": "❌ RETRIEVAL UNIQUEMENT", | |
| "niah.verdict.broken": "❌ CASSÉ", | |
| "niah.reco.robust": "Retrieval et reasoning tiennent tous deux à ce contexte. Sûr de déployer pour les tâches de lookup et d'inférence.", | |
| "niah.reco.marginal": "Borderline. Le retrieval fonctionne mais le reasoning est fragile. À utiliser pour le lookup, pas pour l'inférence multi-étapes.", | |
| "niah.reco.degraded": "Chute significative du reasoning. Le modèle trouve des faits mais peine à les combiner. Évitez les tâches multi-hop à cette longueur.", | |
| "niah.reco.retrieval_only": "Constat canonique de RULER : le modèle passe NIAH mais échoue au reasoning. Utile pour les setups RAG (où le LLM ne fait que localiser les faits) mais PAS pour l'inférence chaînée. Réduisez votre contexte à la valeur 'sûre' ci-dessous.", | |
| "niah.reco.broken": "Le modèle échoue même au retrieval basique à ce contexte. Traitez-le comme hors-distribution — re-testez à un contexte plus court.", | |
| "niah.safe_context": "≤ {ctx} tokens (reasoning ≥ 65%)", | |
| "niah.safe_context_none": "Aucun contexte sûr trouvé sous votre cible — le modèle échoue au reasoning même à de petits contextes.", | |
| "niah.summary.sweep": "<code>{modelId}</code> — taux par contexte", | |
| "niah.status.empty_id": "⚠ Saisissez un model id (ex. meta-llama/Llama-3.1-8B-Instruct).", | |
| "niah.status.bad_teval": "⚠ Saisissez un contexte cible (≥ 512 tokens).", | |
| "niah.status.fetching": "⏳ Récupération config.json pour {modelId}...", | |
| "niah.status.fetched": "✅ Config récupéré pour {modelId}. Réglez T_eval et cliquez Prédire (ou Balayer les contextes).", | |
| "niah.status.done": "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%", | |
| "niah.status.sweep_done": "✅ Balayé {n} longueurs de contexte.", | |
| "saturation.title": "📈 Détecteur de saturation des benchmarks", | |
| "saturation.tip": "MMLU est saturé (88-94% sur tous les frontier models). Annoncer '92% sur MMLU' n'a plus de sens. Cet outil vous dit quels benchmarks discriminent encore les frontier models, lesquels sont saturés, et quoi utiliser à la place. Données : DemandSphere AI Frontier Tracker (CC BY-NC 4.0) rafraîchi 2026-05.", | |
| "saturation.desc": "<strong>Votre benchmark est-il encore utile ?</strong> Choisissez un benchmark pour voir top-3 frontier scores, spread, et un verdict (saturated / near-saturated / discriminative) + remplacements recommandés.", | |
| "saturation.select_label": "Benchmark :", | |
| "saturation.select.all": "— afficher tous les benchmarks —", | |
| "saturation.run_btn": "📈 Classer", | |
| "saturation.all_btn": "📊 Afficher tout", | |
| "saturation.col.spread": "Écart top-3", | |
| "saturation.col.mean": "Moyenne top-3", | |
| "saturation.col.n": "Modèles", | |
| "saturation.col.bench": "Benchmark", | |
| "saturation.col.verdict": "Verdict", | |
| "saturation.col.reco": "Reco principale", | |
| "saturation.col.model": "Modèle", | |
| "saturation.col.score": "Score", | |
| "saturation.section.top3": "Top-3 frontier scores", | |
| "saturation.section.recommendations": "Alternatives recommandées", | |
| "saturation.section.note": "Notes", | |
| "saturation.section.all": "Tous les benchmarks suivis", | |
| "saturation.verdict.saturated": "🚨 SATURÉ", | |
| "saturation.verdict.near_saturated": "⚠ PRESQUE SATURÉ", | |
| "saturation.verdict.discriminative": "✅ DISCRIMINATIF", | |
| "saturation.verdict.sparse_data": "ℹ DONNÉES RARES", | |
| "saturation.borderline": "Borderline — à ±1pp d'un seuil de coupure. Traitez le verdict comme 'à vérifier soigneusement'.", | |
| "saturation.unknown": "Benchmark inconnu.", | |
| "saturation.attribution": "Données : DemandSphere AI Frontier Model Tracker (CC BY-NC 4.0) · HF Open LLM Leaderboard v3 (historique open-weight) · dernier fetch 2026-05-05.", | |
| "saturation.status.live": "✅ Données en direct chargées — {count} modèles.", | |
| "saturation.status.baked": "ℹ Utilisation du snapshot baked (fetch en direct indisponible).", | |
| "saturation.status.kb_fail": "⚠ Impossible de charger le KB de saturation.", | |
| "saturation.status.done": "✅ {name} — {verdict}", | |
| "saturation.status.all_done": "✅ {n} benchmarks classés.", | |
| "help.v08.saturation.title": "📈 Détecteur de saturation des benchmarks", | |
| "help.v08.saturation.body": "MMLU est saturé (top 88-94%), AIME 2025 saturé en quelques mois après sa sortie, HumanEval presque saturé. Choisissez un benchmark et l'outil retourne top-3 frontier scores, spread, moyenne, et un verdict — saturated / near-saturated / discriminative — plus un remplacement recommandé (ex. MMLU → MMLU-Pro / GPQA / HLE). Fetch en direct depuis DemandSphere AI Frontier Tracker (CC BY-NC 4.0) si accessible ; snapshot baked 2026-05-05 sinon. <em>Cas d'usage</em> : avant de citer '92% sur MMLU' ou de concevoir une eval, vérifiez si le benchmark discrimine encore quelque chose.", | |
| "inv.v08.saturation": "<strong>📈 Saturation</strong> — votre benchmark est-il encore utile, ou tous les frontiers sont-ils à égalité au sommet ?", | |
| "inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — chaque pain documenté mappé à un mode tafagent ou outil externe curé. Ne réinventez pas — trouvez.", | |
| "help.v081.hub.title": "🧭 Solutions Hub", | |
| "help.v081.hub.body": "tafagent comme intégrateur, pas silo. 30+ pains à travers 7 catégories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), chacun mappé à (a) le mode tafagent qui le résout, s'il existe, et (b) les outils externes best-of-breed que la communauté utilise déjà (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). La barre de recherche matche pain, scénario, et nom d'outil. <em>Cas d'usage</em> : 'j'ai le problème X — tafagent le résout-il, et sinon, qui ?'", | |
| "hub.title": "🧭 Solutions Hub", | |
| "hub.tip": "Carte de chaque pain de LLM-eval documenté : quel mode tafagent l'adresse (si applicable), et les outils externes best-of-breed que la communauté utilise déjà. Objectif : couverture totale. Si l'outil canonique existe ailleurs, nous lions plutôt que de reconstruire.", | |
| "hub.desc": "<strong>Ne réinventez pas — trouvez.</strong> 30+ pains mappés à des modes tafagent + outils externes curés. Naviguez par catégorie, recherchez par mot-clé, ou voyez les lacunes où de nouveaux modes aideraient le plus.", | |
| "hub.clear_btn": "✕ Effacer", | |
| "hub.no_mode": "externe", | |
| "hub.planned": "prévu :", | |
| "hub.best_for": "Idéal pour", | |
| "hub.not_for": "Pas pour", | |
| "hub.tools": "Outils externes", | |
| "hub.status.loaded": "✅ Chargés {total} pains dans {categories} catégories — {covered} couverts par des modes tafagent, {externalLinks} liens externes curés. Compilé {compiled}.", | |
| "hub.status.fail": "⚠ Impossible de charger Solutions Hub.", | |
| "hub.search.empty": "Aucune correspondance pour '{query}'. Essayez des termes plus larges (ex. 'eval', 'rag', 'tokenizer').", | |
| "hub.search.results": "{n} correspondance(s) trouvée(s) pour '{query}'.", | |
| // v0.7.7 — Tuiles de tâches (refonte UX : 14 modes regroupés par intention) | |
| "tiles.title": "🎯 Que voulez-vous faire ?", | |
| "tiles.subtitle": "Choisissez une tâche. Chacune ouvre l'outil adéquat ci-dessous. Ou faites défiler pour la liste complète des 14 modes.", | |
| "tile.diagnose.title": "🔬 Diagnostiquer un modèle", | |
| "tile.diagnose.desc": "Ce modèle conviendra-t-il à mon cas d'usage ?", | |
| "tile.trust.title": "✓ Faire confiance à un score", | |
| "tile.trust.desc": "Dois-je croire ce nombre ? Bug ou bruit ?", | |
| "tile.eval.title": "⚙️ Configurer une éval correctement", | |
| "tile.eval.desc": "Obtenez le flag CLI exact pour lm-eval / vLLM / transformers.", | |
| "tile.compare.title": "🆚 Comparer des modèles", | |
| "tile.compare.desc": "Côte à côte, ou explorez le panel empirique de modèles.", | |
| "tile.manual.title": "📋 Manuel / libre", | |
| "tile.manual.desc": "Choisissez une recette à la main, ou demandez en langage naturel.", | |
| "tile.diagnose.tip": "Commencez ici quand vous avez un id de modèle spécifique et voulez un diagnostic complet : <strong>Profile</strong> lance les 5 recettes d'un coup. <strong>Unmask</strong> vérifie si max_position_embeddings est honnête. <strong>NIAH→Reason</strong> prédit le gap retrieval-vs-reasoning. <strong>Quant</strong> prédit si quantifier va le casser. <strong>Inspect</strong> permet de coller un config.json brut pour modèles privés / en dev.", | |
| "tile.trust.tip": "Quand vous voyez un score et voulez savoir s'il est réel. <strong>Contamination</strong> note 20+ benchmarks selon la probabilité que le modèle les ait vus en entraînement. <strong>Drift</strong> vous dit si l'écart entre deux évals est du bruit numérique ou un vrai bug (chat-template mismatch, layout KV-cache, etc.). <strong>Arena CI</strong> reconstruit les intervalles de confiance que Chatbot Arena cache — beaucoup de "victoires" top-Elo sont statistiquement à égalité.", | |
| "tile.eval.tip": "Avant de lancer lm-eval-harness ou vLLM serve, obtenez le bon flag CLI. <strong>Chat-template Sniffer</strong> détecte la famille de template (Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none) et émet l'invocation exacte <code>--apply_chat_template</code> / <code>--chat-template</code>. Résout l'issue #1841 de lm-eval-harness (÷2 accuracy silencieux). <strong>Diagnose CLI</strong> génère la commande Python pour mesurer γ_obs sur votre GPU local.", | |
| "tile.compare.tip": "<strong>Compare</strong> : choisissez 2-3 modèles candidats + une recette, voyez les verdicts dans un tableau côte à côte (ex. Llama-3-8B vs Mistral-7B à 32k). <strong>Phase diagram</strong> : nuage de 23 modèles empiriques dans le plan (log θ, γ), avec la courbe Padé superposée. Survolez les points pour détails, cliquez pour charger ce modèle dans le formulaire Recipe.", | |
| "tile.manual.tip": "<strong>Recipe</strong> : choisissez une recette X-N spécifique (X-1 custom-vs-API, X-2 long context, X-3 budget, X-5 hardware, X-19 compression KV, X-21 imprint, X-22 compute-context invariant, X-23 IH-phase) et remplissez le formulaire à la main pour contrôle total. <strong>Ask</strong> : tapez une question libre ; un LLM 0.5B (Qwen2.5) dans votre navigateur choisit la bonne recette et la lance. Idéal pour explorer "que se passerait-il si...".", | |
| "share.import_desc": "Vous avez un fichier JSON de l'analyse TAF de quelqu'un ? Chargez-le ici pour voir le verdict + la chaîne localement. La même vue que si vous l'aviez exécuté vous-même.", | |
| "share.import_btn": "📂 Charger JSON partagé", | |
| "synthesis.system": "Vous êtes un assistant de diagnostic précis pour LLMs transformer. Étant donné des résultats de formules TAF pré-calculés, écrivez un résumé clair en français de 4-6 phrases. Citez le numéro de section (§X.Y) pour chaque nombre mentionné. Donnez toujours une recommandation concrète. N'INVENTEZ PAS de nombres.", | |
| // INSPECTOR mode | |
| "inspector.title": "🔍 Inspecteur d'Architecture", | |
| "inspector.desc": "Collez le contenu brut de <code>config.json</code>. L'outil extrait les paramètres architecturaux et exécute le Profil complet à 5 recettes.", | |
| "inspector.tip": "<strong>Collez n'importe quel config.json directement</strong>. L'outil le parse et exécute le Profil complet. Utile pour : modèles privés, configs en développement, modèles pas encore sur HuggingFace, ou comparer ce que ferait votre architecture custom.", | |
| "inspector.quickstart": "💡 Cas d'usage : vous avez un modèle privé pas sur HF Hub, ou une config que vous concevez. Collez le JSON brut ci-dessous et obtenez un profil TAF complet.", | |
| "inspector.placeholder": "{\n \"model_type\": \"llama\",\n \"rope_theta\": 500000,\n \"max_position_embeddings\": 8192,\n \"num_attention_heads\": 32,\n \"num_key_value_heads\": 8,\n \"hidden_size\": 4096,\n \"num_hidden_layers\": 32\n}", | |
| "inspector.T_eval": "T_eval (votre contexte cible) :", | |
| "inspector.btn": "🚀 Inspecter et profiler", | |
| // WHAT-IF slider | |
| "whatif.title": "🎚 What-if : faites glisser T_eval pour voir γ changer en direct", | |
| "whatif.desc": "Recalcul pur JS (sans appel Pyodide). Montre γ_Padé et d_horizon géométriques pendant que vous glissez. Cliquez pour ré-exécuter la chaîne complète.", | |
| "whatif.T_eval": "<strong>T_eval</strong>", | |
| "whatif.gamma_pade": "<strong>γ_Padé</strong>", | |
| "whatif.d_horizon": "<strong>d_horizon</strong>", | |
| "whatif.l_niah": "<strong>Plafond L_NIAH</strong>", | |
| "whatif.predicted": "<strong>Verdict géométrique prédit</strong>", | |
| "whatif.rerun": "↻ Recalculer la chaîne complète à ce T_eval", | |
| // COMMUNITY feed | |
| "community.title": "🌐 Soumissions récentes de la communauté", | |
| "community.desc": "Flux en direct du registre public. Cliquez sur n'importe quelle soumission pour voir l'analyse complète.", | |
| "community.browse_all": "Voir tout →", | |
| "community.loading": "Chargement...", | |
| "community.no_repo": "Le repo du registre n'est pas encore créé. Une fois qu'il existe avec des soumissions, elles apparaîtront ici en direct.", | |
| "community.no_submissions": "Aucune soumission. Soyez le premier — générez un Profil et cliquez 📤 Soumettre au registry.", | |
| // FALSIFICATION dashboard | |
| "falsification.title": "🔬 Prédictions du paper — statut de falsification", | |
| "falsification.desc": "Le framework TAF repose sur des prédictions falsifiables (F1-F23). Chacune est empiriquement testée. Voici le statut en direct de chaque prédiction du paper.", | |
| "falsification.summary": "{confirmed} confirmées · {partial} partielles · {refuted} réfutées · {untested} non testées (sur {total} prédictions au total)", | |
| "falsification.col.id": "ID", | |
| "falsification.col.claim": "Claim", | |
| "falsification.col.status": "Statut", | |
| "falsification.col.evidence": "Preuve", | |
| "tafcard.title": "📇 TAF Card — profil complet du modèle", | |
| "tafcard.recipes_title": "📋 Recettes — verdict par dimension", | |
| "tafcard.recipes_count_label": "dimensions", | |
| "tafcard.numbers_title": "🔢 Nombres clés (paper §26)", | |
| "tafcard.fals_title": "🔬 État de falsification (F1-F23)", | |
| "tafcard.fals_none": "Aucune falsification applicable.", | |
| "tafcard.diag_title": "🔬 Diagnostics — nombres · contrôle γ · what-if", | |
| "tafcard.verify_title": "✓ Vérification — Lean + Sage + falsification", | |
| "tafcard.share_title": "📂 Provenance & partage", | |
| "tafcard.whatif_title": "🎚️ Explorateur what-if", | |
| "verdict.go": "GO", | |
| "verdict.no": "NON", | |
| "verdict.degraded": "DÉGRADÉ", | |
| "compare.title_out": "🆚 Tableau comparatif", | |
| "status.loading_pyodide": "⏳ Chargement du runtime Python (~10MB, première fois)...", | |
| "status.loading_taf": "⏳ Chargement des formules TAF + recettes...", | |
| "status.ready": "✅ Prêt. Choisissez un modèle et cliquez Profiler pour commencer.", | |
| "status.computing": "🧮 Calcul de la chaîne TAF...", | |
| "status.done": "✅ Terminé.", | |
| "profile.hf_placeholder": "ex. meta-llama/Meta-Llama-3-8B ou Qwen/Qwen2.5-7B", | |
| "compare.hf_placeholder": "ID modèle HF (ex. meta-llama/Meta-Llama-3-8B)", | |
| "compare.slot1_placeholder": "ID modèle HF (ex. meta-llama/Meta-Llama-3-8B)", | |
| "compare.slot2_placeholder": "ID modèle HF #2", | |
| "compare.slot3_placeholder": "ID modèle HF #3 (optionnel)", | |
| "compare.preset_default": "— ou préréglage —", | |
| // Paramètres du formulaire | |
| "param.theta": "θ (rope_theta)", | |
| "param.theta.tip": "<strong>Fréquence de base RoPE</strong> de <code>config.rope_theta</code>. Plus haut = plus de capacité longue portée.", | |
| "param.T_train": "T_train", | |
| "param.T_train.tip": "<strong>Contexte max d'entraînement</strong>. De <code>max_position_embeddings</code>. Au-delà c'est de l'extrapolation.", | |
| "param.T_eval": "T_eval (votre cible)", | |
| "param.T_eval.tip": "<strong>Votre contexte d'inférence cible</strong>. La question clé : le modèle se comportera-t-il bien à CETTE longueur ?", | |
| "param.n_attn": "n_attention_heads", | |
| "param.n_attn.tip": "<strong>Nombre d'attention heads</strong> par couche. De <code>num_attention_heads</code>.", | |
| "param.n_kv": "n_kv_heads", | |
| "param.n_kv.tip": "<strong>KV heads</strong>. Si < n_attention_heads → GQA (Grouped Query Attention). Réduit la mémoire KV mais pousse γ vers Hagedorn.", | |
| "param.d_head": "head_dim", | |
| "param.d_head.tip": "<strong>Dimension par head</strong>. Typique 64, 96, 128. De <code>head_dim</code> ou <code>hidden_size / num_attention_heads</code>.", | |
| "param.n_layers": "n_layers", | |
| "param.n_layers.tip": "<strong>Nombre de blocs transformer</strong>. De <code>num_hidden_layers</code>.", | |
| "param.n_params": "n_params (ex. 8e9)", | |
| "param.n_params.tip": "<strong>Nombre total de paramètres</strong>. Seuil ~400M pour l'émergence d'induction heads. Affecte la mémoire KV et les recettes de budget.", | |
| "param.has_swa": "A SWA ?", | |
| "param.has_swa.tip": "<strong>Sliding Window Attention</strong>. <code>true</code> pour Mistral, gemma-2, phi-3. L'audit de calibration v0.5.3 a désactivé la correction historique δ_SWA (ajustement n=1).", | |
| "common.yes": "Oui", | |
| "common.no": "Non", | |
| // Tooltips des modes | |
| "modes.tip": "<strong>Quatorze façons d'utiliser l'outil</strong>.<br><strong>📇 Profil</strong>: collez un id → TAF Card avec 5 recettes.<br><strong>🆚 Comparer</strong>: 2-3 modèles côte à côte sur une recette.<br><strong>🔍 Inspecter config</strong>: collez config.json brut → Profil complet.<br><strong>💬 Question</strong>: question libre, le LLM du navigateur choisit la recette.<br><strong>📋 Recette</strong>: sélection manuelle avec contrôle total du formulaire.<br><strong>🩺 Diagnostic CLI</strong>: génère commande Python pour mesurer γ localement.<br><strong>📊 Diagramme de phase</strong>: panel de 23 modèles dans le plan (log θ, γ).<br><strong>🪟 Démasquer</strong>: détecte un max_position_embeddings trompeur (SWA / YaRN / RoPE-scaling).<br><strong>📜 Chat-template</strong>: détecte la famille + donne le flag CLI exact pour lm-eval / vLLM / transformers.<br><strong>🎯 Arena CI</strong>: reconstruit les intervalles de confiance depuis les votes pairwise bruts ; détecte les égalités statistiques qu'Arena cache.<br><strong>🧪 Contamination</strong>: note 20+ benchmarks pour leur probabilité de contamination selon le cutoff d'entraînement vs la date de sortie.<br><strong>⚖️ Quant</strong>: prédit γ-shift et ΔPPL pour tout (modèle × schéma de quantification) ; recommande une alternative sûre en cas de cliff.<br><strong>🔀 Drift</strong>: même modèle, scores différents sur deux setups — bug ou bruit ? Prédit la bande de bruit numérique et signale les vrais bugs.<br><strong>🔍 NIAH→Reason</strong>: prédit les taux NIAH et reasoning multi-hop depuis l'architecture ; trouve le contexte sûr pour reasoning.", | |
| "profile.tip": "<strong>Diagnostic complet en un clic</strong>. Collez n'importe quel id de modèle HF (ou choisissez préréglage). L'outil exécute les 5 recettes (contexte long, compression KV, custom vs API, budget, hardware) et produit une <strong>TAF Card</strong> unique avec verdict par dimension + nombres clés + classification architecturale.<br><br><strong>Cas d'usage</strong>: « J'évalue Qwen2.5-32B pour la production — quel est son profil complet de viabilité ? » → collez id → Profiler → fait.", | |
| "compare.tip": "<strong>Même recette, plusieurs modèles</strong>. Choisissez 2-3 modèles candidats et une recette. Voyez les verdicts dans un seul tableau comparatif.<br><br><strong>Cas d'usage</strong>: « J'ai besoin de récupération longue contexte à 16K — quel est le meilleur : Llama-3-8B, Mistral-7B ou Qwen-7B ? » → choisissez 3 + X-2 + 16K → voyez le gagnant.", | |
| // Modal d'aide | |
| "help.title": "📘 TAF Agent — Manuel d'utilisation", | |
| "help.what.title": "Que fait-il ?", | |
| "help.what.body": "Prédit la <strong>viabilité pratique</strong> de tout LLM transformer <em>avant de dépenser du GPU/€</em>. Répond à des questions comme « ce modèle fonctionnera-t-il à L=32K ? » ou « dois-je entraîner sur mesure ou utiliser une API ? » via des formules Python déterministes (TAF — Thermodynamic Attention Framework).", | |
| "help.modes.title": "Comment l'utiliser — 7 modes", | |
| "help.modes.profile": "<strong>📇 Profiler</strong>: collez id de modèle → toutes les recettes à la fois = TAF Card. <strong>Meilleur point de départ</strong>.", | |
| "help.modes.compare": "<strong>🆚 Comparer</strong>: 2-3 modèles côte à côte sur la même recette. Mieux pour choisir entre candidats.", | |
| "help.modes.inspector": "<strong>🔍 Inspecter config</strong>: collez <code>config.json</code> brut → l'outil le parse et lance le Profil complet. Pour modèles privés, configs en développement, ou modèles pas encore sur HF Hub.", | |
| "help.modes.ask": "<strong>💬 Question libre</strong>: question en langage naturel, le LLM du navigateur choisit la recette. Mieux pour exploration casuelle.", | |
| "help.modes.recipe": "<strong>📋 Recette + formulaire</strong>: sélection manuelle, contrôle total des paramètres. Mieux quand vous voulez un contrôle exact.", | |
| "help.modes.diagnose": "<strong>🩺 Diagnostic CLI</strong>: génère commande Python pour mesurer γ sur votre machine locale (transformers + numpy). Rapide ≈5 min CPU; complet ≈20–60 min GPU. JSON résultat ré-uploadable via Inspect.", | |
| "help.modes.phase": "<strong>📊 Diagramme de phase</strong>: nuage de 23 modèles du panel dans le plan (log θ, γ). Ligne Hagedorn γ=1 sépare Phase A de Phase B. Cliquer un point pour charger ce modèle dans le formulaire Recette.", | |
| "help.recipes.title": "Les 8 recettes disponibles", | |
| "help.recipe.x1.title": "<strong>X-1 Entraînement custom vs API</strong> — compare le coût d'entraîner votre propre modèle vs payer l'accès API.", | |
| "help.recipe.x1.example": "Essayez: <em>« Dois-je entraîner un 8B custom ou utiliser GPT-4o pour 50M tokens/mois ? »</em><br>Réponses: OUI (custom) / NON (API) avec mois pour break-even.", | |
| "help.recipe.x2.title": "<strong>X-2 Viabilité contexte long</strong> — prédit si un modèle sert une longueur cible de manière fiable.", | |
| "help.recipe.x2.example": "Essayez: <em>« Meta-Llama-3-8B gérera-t-il 32000 tokens pour récupération ? »</em><br>Chaîne: γ_Padé → décomposition → d_horizon → plafond NIAH → hallucination → mémoire KV.<br>Verdict: OUI / DÉGRADÉ / NON avec mitigation si nécessaire.", | |
| "help.recipe.x3.title": "<strong>X-3 Pre-flight budget</strong> — étant donné un budget $, quel modèle est faisable à entraîner ?", | |
| "help.recipe.x3.example": "Essayez: <em>« J'ai $5000, quel modèle puis-je entraîner ? »</em><br>Réponse: GO / TINY-MODEL / MEMORY-LIMITED avec N (params) et D (tokens) concrets.", | |
| "help.recipe.x5.title": "<strong>X-5 Sélection hardware</strong> — quel GPU utiliser pour servir au throughput cible ?", | |
| "help.recipe.x5.example": "Essayez: <em>« Hardware le moins cher pour servir Llama-3-8B à 10M tokens/jour »</em><br>Réponse: meilleur GPU + $/Mtok + capacité vs cible.", | |
| "help.recipe.x19.title": "<strong>X-19 Décision compression KV</strong> — utiliser soft decay, hard cutoff, ou méthodes de littérature ?", | |
| "help.recipe.x21.title": "<strong>X-21 Diagnostic Pureté Imprint</strong> — prédit γ sur tokens RANDOM via ν=−1/(2π); à quel point la prédiction RoPE du modèle est-elle propre ?", | |
| "help.recipe.x22.title": "<strong>X-22 Invariant Compute-Context</strong> — γ × log(N²·D) est-il dans la bande 51.2 ± 16.8 ? Détecte anomalies de scaling/training.", | |
| "help.recipe.x23.title": "<strong>X-23 Détecteur Phase IH</strong> — pré- ou post-induction-head ? Probe peu coûteux via sign(γ_text − γ_random).", | |
| "help.recipe.x19.example": "Essayez: <em>« Comment compresser le cache KV pour Qwen2.5-7B à 32K ? »</em><br>Réponse: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.", | |
| "help.recipe.x21.example": "Essayez: <em>« Quelle est la pureté de la prédiction RoPE sur Llama-3-8B ? »</em><br>Réponse: γ_random prédit + diagnostic (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED).", | |
| "help.recipe.x22.example": "Essayez: <em>« Mistral-7B entre-t-il dans l'invariant compute-context ? »</em><br>Réponse: K = γ·log(N²·D), z-score, IN-BAND ou OUTLIER.", | |
| "help.recipe.x23.example": "Essayez: <em>« Qwen2.5-7B est-il post-induction-head ? »</em><br>Réponse: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY.", | |
| "help.section.v04": "<strong>Nouveautés v0.4</strong> (résultats session 29, 2026-04-28) : trois recettes de diagnostic dérivées de l'analyse panel cross-model (n=22 LLMs).", | |
| "help.divider.v04_s29": "— v0.4 (résultats session 29) —", | |
| "footer.tech_stack": "Calcul : Pyodide · Synthèse : WebLLM (Qwen2.5-0.5B local) · Hébergement : GitHub Pages · Coût : 0 $", | |
| "help.v04.imprint": "<strong>Pente d'imprint apprise ν = −1/(2π)</strong> : la période de rotation RoPE 2π entraîne un biais positionnel dans les poids, proportionnel à log(N_params). Même les tokens aléatoires montrent ce scaling. ν est DÉRIVÉ — non ajusté (erreur empirique 0,3 %).", | |
| "help.v04.invariant": "<strong>Invariant Chinchilla-attention K</strong> : γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329). Connecte le scaling de compute et l'exposant d'attention en un seul nombre sans dimension.", | |
| "help.v04.ih_probe": "<strong>Δγ comme probe IH</strong> : sign(γ_text − γ_random) > 0 ⟺ post-induction-head. Moins coûteux que de lancer un benchmark in-context-learning.", | |
| "help.v04.constants": "<strong>γ-cluster sur constantes célèbres</strong> (intriguant, n=4) : CodeLlama-13b γ=0.382 ≈ 1−1/φ (conjugué doré, err 0,0003) ; pythia-1.4b γ=0.705 ≈ 1/√2 ; Llama-2-7b γ=0.287 ≈ 1−1/√2 ; Mistral-Nemo γ=0.428 ≈ log_10(e). Caveat : peut être coïncidence.", | |
| "help.param.theta": "<strong>θ (rope_theta)</strong>: fréquence de base RoPE. Plus haut = plus de capacité longue portée. Typique: 10000 (anciens), 500000 (Llama-3), 1000000 (Qwen2.5).", | |
| "help.param.T_train": "<strong>T_train</strong>: contexte max vu par le modèle pendant l'entraînement. De <code>max_position_embeddings</code>.", | |
| "help.param.T_eval": "<strong>T_eval</strong>: <em>votre</em> longueur de contexte cible en inférence. Le bouton clé.", | |
| "help.param.gqa": "<strong>n_kv_heads < n_attention_heads</strong>: le modèle utilise GQA (Grouped Query Attention). Réduit la mémoire KV mais pousse γ vers Hagedorn.", | |
| "help.param.swa": "<strong>has_SWA</strong>: le modèle utilise Sliding Window Attention (Mistral, gemma-2).", | |
| "help.param.nparams": "<strong>n_params</strong>: nombre total de paramètres. Seuil ~400M pour l'émergence des induction heads.", | |
| "help.add_models.title": "Ajouter de nouveaux modèles (3 façons)", | |
| "help.add_models.preset": "<strong>Liste de préréglages</strong>: 11 modèles populaires curés. Sélectionnez dans le dropdown.", | |
| "help.add_models.hf": "<strong>HF Hub fetch</strong>: collez n'importe quel id (ex. <code>Qwen/Qwen2.5-32B-Instruct</code>), cliquez 📥 Charger. Le navigateur télécharge <code>config.json</code> directement de HuggingFace, remplit le formulaire. Fonctionne avec tout modèle public.", | |
| "help.add_models.manual": "<strong>Manuel</strong>: remplissez les champs directement avec les valeurs de la model card.", | |
| "help.audit.title": "La chaîne auditable", | |
| "help.audit.body": "Chaque résultat montre la <strong>Chaîne de Calcul</strong> complète — chaque étape de formule avec ses entrées, sortie et interprétation. Cliquez sur n'importe quelle étape pour développer. Les références de section (§26.1, §19.1, etc.) renvoient au paper pour la dérivation.", | |
| "help.synthesis.title": "La réponse en langage naturel", | |
| "help.synthesis.body": "Après exécution de la chaîne déterministe, un LLM dans le navigateur (Qwen2.5-0.5B, ~350MB cachés après premier chargement) synthétise un résumé en langage naturel. Les nombres ci-dessus sont <em>toujours corrects</em> (Python déterministe) ; la synthèse est générée par LLM — vérifiez contre la chaîne en cas de doute.", | |
| "help.params.title": "Paramètres communs expliqués", | |
| "help.verdicts.title": "Quoi regarder dans les verdicts", | |
| "help.verdict.yes": "<strong style=\"color:#3fb950;\">OUI / GO</strong> — procédez avec confiance ; les nombres soutiennent le choix.", | |
| "help.verdict.deg": "<strong style=\"color:#d29922;\">DÉGRADÉ / TINY-MODEL</strong> — fonctionne avec caveats ; lisez l'action.", | |
| "help.verdict.no": "<strong style=\"color:#f85149;\">NON / MEMORY-LIMITED</strong> — ne procédez pas tel quel ; mitigation fournie.", | |
| "help.privacy.title": "Confidentialité", | |
| "help.privacy.body": "Tout s'exécute dans votre navigateur. Pas de télémétrie, pas d'analytique, pas de données envoyées ailleurs. Même le modèle LLM s'exécute localement via WebGPU/WebAssembly. Vos model_ids et questions ne quittent jamais cette page.", | |
| "help.source.title": "Code source et paper", | |
| "help.source.body": "Code : <a href=\"https://github.com/karlesmarin/tafagent\" target=\"_blank\">github.com/karlesmarin/tafagent</a><br>Paper : <em>Marin 2026 — Predicting How Transformers Attend</em> (<a href=\"https://zenodo.org/records/19826343\" target=\"_blank\">Zenodo</a> ; arXiv à venir)<br>Dataset : <a href=\"https://huggingface.co/datasets/karlexmarin/taf-attention-decay\" target=\"_blank\">taf-attention-decay</a> — 58 mesures γ sur 32 modèles (CC-BY-4.0)", | |
| "footer.text": "© 2026 Carles Marin · Apache-2.0 · recherche indépendante · l'outil qui ferme la boucle du paper.", | |
| }, | |
| // ──────────────────────────────────────────────────────────────────────── | |
| // ZH — 中文 | |
| // ──────────────────────────────────────────────────────────────────────── | |
| zh: { | |
| // §33 v0.4 (sesion 31, 2026-04-30) — 新诊断功能 | |
| "v04.title": "🆕 v0.4 — 新诊断 (会话 31)", | |
| "v04.section.intro": "会话 31 (2026-04-30) 从公式 cross-of-crosses 游戏 + 苏格拉底质询中得出的四个新诊断函数。在 <code>taf_browser.py</code> §33 中可用。", | |
| "v04.arch.label": "架构集中度", | |
| "v04.arch.desc": "γ_text ≈ γ_Padé − 0.012·n_kv。跨面板相关性定律(R²=0.30)。警告:不是逐模型预测器。", | |
| "v04.pdi.label": "PDI — Padé 偏差指数", | |
| "v04.pdi.desc": "PDI = d_horizon_obs/T_eval。交通灯:绿色(≈1)、橙色(>>1)、黄色(<<1)、红色(B 阶段负值)。", | |
| "v04.4bit.label": "4 位精度移位预测器", | |
| "v04.4bit.desc": "MHA: R²(bf16)<0.9 → γ 上升;R²>0.99 → γ 下降。GQA: 精度稳健。", | |
| "v04.crit.label": "临界指数捆绑", | |
| "v04.crit.desc": "ν_c、β_c、η_c (=γ−1, 已修正)、α_C、γ_susc,AM-GM 最小值在 γ=1−1/√2≈0.293。", | |
| // §34 v0.5 (会话 32, 2026-05-01) — 机器验证的代数一致性 | |
| "v05.title": "🔬 v0.5 — 机器验证一致性 (会话 32)", | |
| "v05.section.intro": "Sage Groebner basis + Lean Mathlib4 双工具验证 TAF 临界指数的<strong>15 个代数恒等式</strong>。首个具有形式化机器证明支持的 transformer-attention 框架。", | |
| "v05.verify.label": "代数一致性检查", | |
| "v05.verify.desc": "给定测得的 γ,验证 12 个 D-SAGE 恒等式(D-SAGE-1:2η²+η·γ_χ+1=0、β·χ=−1、α+χ=2 等)。全部通过 = 框架完整。失败表明 bf16 异常值 / 量化伪影。", | |
| "v05.dsage1.label": "D-SAGE-1 (★★ 核心)", | |
| "v05.dsage1.desc": "二次恒等式 2η² + η·γ_χ + 1 = 0(Sage Groebner 发现, Lean 验证)。取代错误的 '三重闭合' 主张。从代数上反驳 paper 1 的 η=2γ。", | |
| "v05.erratum.label": "Paper 1 勘误 — η 修正", | |
| "v05.erratum.desc": "Paper 1 原本声明 η = 2γ。Sage Groebner + Lean Mathlib4 证明此为失败(残差 (-4γ³+5γ+1)/(1-γ) > 0 ∀γ ∈ A 相)。正确值:η = γ−1,满足 D-SAGE-1。", | |
| "v05.repro.label": "可重现性", | |
| "v05.repro.desc": "全部 15 个定理在 Lean Mathlib4 中机器证明(build 成功 1973 jobs)。Sage 脚本:<code>analysis/sage_recursive_sweep_2026-04-30.sage</code>。Lean 代码:<code>lean_taf/taf/Taf/Identities.lean</code>。", | |
| // v0.5.1 — TAF Card consistency check button | |
| "v05.consistency.title": "🔬 代数一致性检查 (Sage + Lean v0.5)", | |
| "v05.consistency.desc": "验证 TAF 临界指数的 12 个 D-SAGE 代数恒等式(Sage Groebner basis + Lean Mathlib4 机器证明)。通过 = 框架完整。失败 = bf16 异常值 / 量化伪影。", | |
| "v05.consistency.btn": "🔬 验证代数一致性", | |
| // v0.5.2 — Anti-Ising universality class badge | |
| "v05.antiising.badge": "🧲 反 Ising 类 (β=γ−1<0,机器验证)", | |
| // v0.5.2 — 每个恒等式的工具提示(通俗解释) | |
| "v05.tooltip.D_SAGE_1": "二次代数恒等式,连接异常维度 η 和磁化率 γ_χ。Sage Groebner basis 发现的核心恒等式(机器证明)。取代了之前关于三重闭合的错误声明。", | |
| "v05.tooltip.D_SAGE_2": "在 A 相中,β = γ−1 为负(反 Ising)。乘以 χ = 1/(1−γ) 恰好等于 −1。TAF 负 β 体制的标志。", | |
| "v05.tooltip.D_SAGE_4": "比热指数 α 和磁化率 χ 在 TAF 中精确加和为 2。Josephson 超标度的代数推论。", | |
| "v05.tooltip.D_SAGE_5": "线性恒等式:α + γ_χ = 2(2−γ)。意味着当 γ 接近 1(Hagedorn)时,总和接近 2;在 γ=0 时为 4。", | |
| "v05.tooltip.D_SAGE_6": "序参量指数乘以磁化率指数等于 γ 的特定二次式。因式分解的代数关系。", | |
| "v05.tooltip.Rushbrooke_tautology": "标准 Rushbrooke 超标度 2β + γ_χ = ν·d 在 d=1。在 TAF 中这是一个重言式 — γ_χ 的定义就是为了使其成立。Sage Groebner basis 确认。", | |
| "v05.tooltip.Josephson_tautology": "标准 Josephson 超标度 2 − α = ν·d 在 d=1。在 TAF 中这是一个重言式 — α 的定义就是为了使其成立。", | |
| "v05.tooltip.Fisher_independent": "Fisher 关系 γ_χ = (2−η)·ν。在 TAF 中是独立的(不作为恒等式闭合,与三重闭合声明相反)。残差为 γ(2γ−3)/(1−γ)。", | |
| "v05.tooltip.eta_2gamma_REFUTED": "Paper 1 声称 η=2γ。这个恒等式驳斥了它:残差在整个 A 相中为正。Lean Mathlib4 的机器证明驳斥。", | |
| "v05.tooltip.D_14_nu_imprint": "学习到的印记斜率 ν = −1/(2π) 乘以 2π 得 −1。来自 paper 1 的简单维度检查。", | |
| "v05.tooltip.D_SAGE_7": "中心电荷 c=3 乘以 |ν_imprint| 乘以 2π 得 3。连接 CFT 和训练印记的维度闭合。", | |
| "v05.tooltip.nu_beta_id": "关联长度指数 ν 乘以序参量指数 β 在 A 相中得 −1。D-SAGE-2 的变体。", | |
| "v053.calibration.title": "🔬 v0.5.3 — 校准审计 (2026-05-02)", | |
| "v053.calibration.note": "<strong>SWA 修正已禁用</strong> — 原 δ_SWA = -0.21 基于 n=1 模型拟合(数据不足;唯一案例的均值为 +0.355)。<strong>post_IH 修正标记为探索性</strong> — 重审中组均值 ≈ 0(n=22 面板)未能复现 OLS 拟合。<strong>GQA 修正可复现</strong>(面板 +0.115 vs 硬编码 +0.11)。<strong>D_f 公式修正 Phase B (γ>1)</strong> — 使用离散累积和代替连续近似。LLaMA-3、Mistral、Gemma 现在报告正确的压缩值。", | |
| "v053.release.banner": "🔧 v0.5.3 — 审计驱动的修复:KV 压缩 D_f 现使用离散和(适用于所有 γ);δ_SWA 禁用(n=1 校准);论文 §5.2 C_V 系数勘误 (1/4 → 1/12)。", | |
| // §35 v0.6 — γ 预测 vs 观测 诊断 | |
| "gamma_check.title": "🔍 γ 预测 vs 观测", | |
| "gamma_check.desc": "输入你经验测量的 γ。工具自动检测体制:欺诈 (θ 虚高) / 压缩 / 超 Padé / SWA-随机 / 正常。", | |
| "gamma_check.gobs_label": "γ_观测", | |
| "gamma_check.gobs_tip": "从模型注意力分数经验测量的 γ。使用 Diagnose CLI 从真实权重获取。", | |
| "gamma_check.random_label": "随机语料?", | |
| "gamma_check.random_tip": "若 γ_观测在随机/无结构 token 上测得请勾选。区分 SWA 签名 (γ_obs > 1) 与异常。", | |
| "gamma_check.regime": "体制", | |
| "gamma_check.regime.normal": "正常", | |
| "gamma_check.regime.fraud": "欺诈 (θ 虚高)", | |
| "gamma_check.regime.compressed": "上下文压缩", | |
| "gamma_check.regime.overpade": "超 Padé", | |
| "gamma_check.regime.swa": "SWA 签名 (随机语料)", | |
| "gamma_check.regime.unknown": "未知", | |
| "gamma_check.regime.normal.desc": "η ∈ [0.85, 1.15]:模型完全利用名义上下文,无异常。", | |
| "gamma_check.regime.fraud.desc": "η < 0.01:名义 θ 虚高。模型表现如同 θ 远小于宣称值。可能是 YaRN/营销虚标,无真实上下文扩展。", | |
| "gamma_check.regime.compressed.desc":"η ∈ [0.01, 0.5):上下文压缩 (模型注意距离比名义 θ 预测更短)。常见于 instruction-tuned / RLHF 模型。", | |
| "gamma_check.regime.overpade.desc": "η > 1.5:模型注意距离超过 Padé 预测。可能是 Lerch 修正体制或欠训练早期 checkpoint。", | |
| "gamma_check.regime.swa.desc": "随机语料上 γ_obs > 1.05 = 滑动窗口注意力签名 (Mistral / Gemma 系列)。", | |
| "gamma_check.regime.unknown.desc": "输入超范围或 γ_obs > 1 但未标记随机语料。请核验测量。", | |
| "gamma_check.glossary.title": "ⓘ 词汇表 — 变量含义", | |
| "gamma_check.glossary.gamma_pade": "<strong>γ_Padé</strong>:闭式预测 (2−z)/(2+z), z = T√2/θ。论文 §sec:gamma_decomposition。", | |
| "gamma_check.glossary.gamma_obs": "<strong>γ_观测</strong>:从注意力分数经验测得 (在真实权重上运行 Diagnose CLI)。", | |
| "gamma_check.glossary.theta_eff_obs":"<strong>θ_eff (观测)</strong>:由 γ_obs 反演 T√2 / (1 − γ_obs)。测量隐含的有效 θ。", | |
| "gamma_check.glossary.theta_eff_pade":"<strong>θ_eff (Padé)</strong>:θ + T/√2。闭式公式预测的有效 θ。", | |
| "gamma_check.glossary.efficiency": "<strong>η</strong>:θ_eff_obs / θ_eff_Padé 比值。≈1 = 正常 · <0.01 = 欺诈 · <0.5 = 压缩 · >1.5 = 超 Padé。", | |
| "gamma_check.glossary.delta_h": "<strong>ΔH_Cardy</strong>:log(θ_eff_obs / θ_nominal)。Cardy 熵变。负值 = 压缩熵。~0 = 与名义匹配。", | |
| "gamma_check.glossary.regime": "<strong>体制</strong>:基于 η + γ_obs + 随机语料标志的自动分类器。", | |
| // §36 v0.6 — 内联 ⓘ 图标提示 | |
| "tooltip.gamma_pade": "<strong>γ_Padé(T_eval)</strong>:闭式预测 (2−z)/(2+z), z = T√2/θ。论文 §sec:gamma_decomposition。", | |
| "tooltip.gamma_decomposed": "<strong>γ_分解</strong>:基于完整架构分解的 γ。Padé 基线 + GQA 偏移 + post-IH 偏移 (校准审计已复制子集)。", | |
| "tooltip.d_horizon": "<strong>d_horizon</strong>:有效注意力视野。超过此位置分数低于噪声底 (论文 §26)。", | |
| "tooltip.L_NIAH": "<strong>L_NIAH 上限</strong>:当前 d_horizon 下针-在-干草堆检索可靠性的预测上限。", | |
| "tooltip.chi": "<strong>χ 易感性</strong>:χ = 1/(1−γ)。在 Hagedorn 线 γ=1 处发散。", | |
| "tooltip.kv_memory": "<strong>KV 内存 @ T_eval (BF16)</strong>:每请求 KV 缓存 = 2 · n_layers · n_kv_heads · d_head · T_eval 字节。", | |
| "tooltip.theta_eff_obs": "<strong>θ_eff (观测)</strong>:由 γ_观测 隐含的有效 θ:T√2 / (1 − γ_obs)。", | |
| "tooltip.theta_eff_pade": "<strong>θ_eff (Padé)</strong>:闭式公式预测的有效 θ:θ + T/√2。", | |
| "tooltip.efficiency": "<strong>η = θ_eff_obs / θ_eff_Padé</strong>:效率比。≈1 = 正常 · <0.01 = 欺诈 · <0.5 = 压缩 · >1.5 = 超 Padé。", | |
| "tooltip.delta_h_cardy": "<strong>ΔH_Cardy</strong>:log(θ_eff_obs / θ_nominal)。Cardy 熵变。负值 = 压缩熵。~0 = 与名义匹配。", | |
| "tooltip.verdict_aggregate": "<strong>判定</strong>:所有配方中最差。✅ 通过 = 全绿 · ⚠ 降级 = ≥1 黄 · ❌ 否 = ≥1 红。", | |
| "tooltip.verdict_breakdown": "<strong>各配方分解</strong>:每个配方测试一个<em>独立</em>的决策轴 (长上下文 · 预算 · 硬件 · 自训 vs API · KV 压缩)。X-1 上的 ❌ 表示「按你的量级用 API」而非「模型失败」——展开 Recipes 节查看各轴上下文。", | |
| "tooltip.gamma_pill": "<strong>γ 头条</strong>:γ_分解 (或 γ_Padé 回退)。范围 (0,1) = 相位 A (反伊辛)。γ ≥ 1 = Hagedorn / 相位 B。", | |
| "tooltip.anti_ising": "<strong>反伊辛类</strong>:相位 A → β = γ−1 < 0。机器证明 (Sage + Lean Mathlib4)。见 §35 v0.5。", | |
| // §37 v0.6 — Lean+Mathlib 定理表 | |
| "lean.table.title": "📑 Lean+Mathlib 定理表", | |
| "lean.table.desc": "下方每一项都已机器证明对 Lean 4 + Mathlib4。点击任意 L# 链接跳转到 GitHub 源码行。按主题分组——点击标题展开。", | |
| "lean.table.theorem": "定理", | |
| "lean.table.claim": "陈述", | |
| "lean.table.tactic": "策略", | |
| "lean.table.source": "出处", | |
| "lean.table.lean": "Lean", | |
| "lean.findings.title": "🔎 实质性发现", | |
| "lean.findings.detected_by": "检测于", | |
| "lean.findings.fixed_by": "修正于", | |
| "lean.findings.recommendation":"建议", | |
| "lean.meta.repo": "仓库", | |
| "lean.meta.build": "构建", | |
| "lean.meta.theorems": "定理", | |
| "lean.meta.verified": "已验证", | |
| "lean.meta.rejected": "已拒绝", | |
| "lean.meta.sorry": "sorry", | |
| "lean.meta.findings": "项实质性发现", | |
| "lean.manifest.loading": "正在加载 Lean 清单…", | |
| "lean.manifest.error": "Lean 清单不可用", | |
| // 帮助弹窗 — v0.6 节 | |
| "help.v06.title": "🆕 v0.6 — γ 预测-vs-观测 + Cardy ΔH + Lean 徽章", | |
| "help.v06.intro": "<em>v0.6 (2026-05-06):三个新诊断位于 TAF 卡的 <strong>🔬 诊断</strong> 下。全部在浏览器运行;γ_观测来自在真实权重上运行 Diagnose CLI。</em>", | |
| "help.v06.layout.title": "TAF 卡布局 (v0.6 新增)", | |
| "help.v06.layout.body": "点击 <strong>🚀 生成完整画像</strong> 后,卡片展示:顶部一条 <strong>hero 条</strong> (架构类 + 元信息 + 3 个 pill:聚合判定 ✅/⚠/❌、γ 头条、🧲 反伊辛若处于相位 A) 和四个 <strong>可展开节</strong>:<strong>📋 配方</strong> (默认展开 — 各维度判定)、<strong>🔬 诊断</strong> (关键数字、γ 预测 vs 观测、what-if 浏览器)、<strong>✓ 验证</strong> (Sage+Lean 代数一致性、可证伪 F1-F23)、<strong>📂 来源与分享</strong> (校准审计 + JSON 下载 / 链接 / 注册表提交)。点击任意标题展开。每个变量都有内联 <strong>ⓘ</strong> 提示。", | |
| "help.v06.gamma_check.title": "γ 预测 vs 观测", | |
| "help.v06.gamma_check.body": "输入经验测量的 γ,工具计算 <strong>η = θ_eff_obs / θ_eff_Padé</strong> 并分类到 5 种体制之一:", | |
| "help.v06.case.normal": "<strong>正常</strong> (η ∈ [0.85, 1.15]) — 模型完整使用名义上下文。<em>用例</em>:在采用前验证新发布。", | |
| "help.v06.case.fraud": "<strong>欺诈</strong> (η < 0.01) — 名义 θ 虚高;模型表现如同 θ ≪ 宣称值。<em>用例</em>:检测 YaRN/营销虚标 (CodeLlama / Mistral-Nemo 模式)。", | |
| "help.v06.case.compressed": "<strong>压缩</strong> (η < 0.5) — 上下文压缩;模型注意距离比名义 θ 短。<em>用例</em>:识别 RLHF/指令调优引起的压缩 (LLaMA-2 模式)。", | |
| "help.v06.case.overpade": "<strong>超 Padé</strong> (η > 1.5) — 模型注意距离超过 Padé 预测。<em>用例</em>:识别 Lerch 修正体制或欠训练早期 checkpoint (pythia-1b 模式)。", | |
| "help.v06.case.swa": "<strong>SWA 随机语料</strong> (γ_obs > 1.05 且 随机语料=是) — 滑动窗口注意力签名。<em>用例</em>:在随机 token 上确认 Mistral / Gemma SWA。", | |
| "help.v06.cardy.title": "Cardy ΔH 诊断", | |
| "help.v06.cardy.body": "<strong>ΔH_Cardy = log(θ_eff_obs / θ_nominal)</strong>。观测有效 θ 与名义 θ 之间的熵变。强负值 = 压缩熵;接近零 = 与名义匹配。在边界情况下补充 η。", | |
| "help.v06.lean.title": "Lean + Mathlib 验证徽章", | |
| "help.v06.lean.body": "TAF 恒等式在 Lean Mathlib4 中形式化机器证明:<strong>37 个定理</strong>分布于 7 组(Padé、RG 流、Cayley、D-SAGE、审计发现、CV 勘误、杂项)+ <strong>1 项实质性发现</strong>(V 导数 2 倍因子,定理 <code>V_derivative_ne_RG_beta</code>)。源:<a href=\"https://github.com/karlesmarin/lean-taf\" target=\"_blank\">github.com/karlesmarin/lean-taf</a>(commit 25c77fd)。本地重新验证:<code>git clone --depth=1 https://github.com/karlesmarin/lean-taf && cd lean-taf && lake exe cache get && lake env lean Taf/Identities.lean</code>。Hero 中的 🧲 反伊辛 pill 与验证手风琴链接到具体源码行。", | |
| "help.v06.glossary.title": "变量词汇表 (亦嵌入 TAF 卡)", | |
| "help.v06.glossary.body": "TAF 卡中每个变量都有内联 ⓘ 提示。完整列表:γ、γ_Padé、γ_分解、γ_观测、θ、θ_eff_obs、θ_eff_Padé、η、ΔH_Cardy、χ、d_horizon、L_NIAH、KV 内存、体制。鼠标悬停任意 ⓘ 查看定义 + 论文章节。", | |
| "hero.title": "🔬 TAF Agent", | |
| "hero.tagline": "30 秒诊断任意 transformer LLM。免费。无需 GPU。无需注册。", | |
| "hero.subtitle": "在你花钱或花时间<em>之前</em>,预测某个模型是否适合你的用例。所有计算在浏览器本地运行 — 你的输入永远不会离开此标签页。", | |
| "hero.help": "📘 手册与示例", | |
| "hero.quickstart_btn": "⚡ 快速开始", | |
| "hero.inventory_btn": "🧰 它能给你什么", | |
| "hero.about": "由独立研究员构建。开源。不隶属于任何模型供应商。", | |
| "modes.title": "🎯 模式", | |
| "modes.profile": "📇 模型画像", | |
| "modes.compare": "🆚 比较模型", | |
| "modes.inspector": "🔍 检查 config", | |
| "modes.ask": "💬 自由提问", | |
| "modes.recipe": "📋 选择配方", | |
| "modes.diagnose": "🩺 诊断 CLI", | |
| "diagnose.title": "🩺 诊断 CLI 命令生成器", | |
| "diagnose.tip": "浏览器从 config 预测 γ;CLI 在真实权重上测量 γ_obs。此生成器产生在本地运行的精确命令。", | |
| "diagnose.desc": "选择选项并将生成的命令复制粘贴到本地机器(Python + transformers + numpy)。快速模式 ≈5 分钟 CPU;完整 ≈20–60 分钟 GPU。", | |
| "diagnose.model_label": "HF 模型 id:", | |
| "diagnose.theta_label": "θ(留空自动):", | |
| "diagnose.n_label": "上下文 N:", | |
| "diagnose.options_label": "选项:", | |
| "diagnose.opt_fast": "--fast(CPU,≈5 分钟)", | |
| "diagnose.opt_cpu": "--cpu(强制 CPU)", | |
| "diagnose.opt_4bit": "--load_in_4bit(≥7B 模型)", | |
| "diagnose.local_label": "--local 路径(可选):", | |
| "diagnose.build_btn": "📋 生成命令", | |
| "diagnose.cmd_title": "生成的命令:", | |
| "diagnose.copy_btn": "📋 复制到剪贴板", | |
| "diagnose.next_steps": "下一步: (1) git clone https://github.com/karlesmarin/tafagent (2) cd tafagent && pip install torch transformers numpy (3) 运行命令 (4) JSON 结果 → 通过 Inspect 模式上传以进行完整 TAF 分析。", | |
| "modes.phase": "📊 相图", | |
| "phase.title": "📊 相图(γ × θ)", | |
| "phase.tip": "每个点是论文经验数据集中的一个模型。x 轴: log θ; y 轴: γ。Hagedorn 线 γ=1 分隔 A 相和 B 相。悬停查看详情,点击加载到表单。", | |
| "phase.desc": "数据集中 23 个模型;Padé 曲线在 T=2000。", | |
| "modes.desc": "<strong>最快开始</strong>: 粘贴任意 HuggingFace 模型 id (例如 <code>meta-llama/Meta-Llama-3-8B</code>),点击 画像。秒内看到所有 5 个配方的评分。", | |
| "profile.title": "📇 模型画像", | |
| "profile.desc": "<strong>面向技术人员</strong>: 当您需要候选模型的完整可行性快照时。一键运行所有 5 个配方,生成统一的 TAF 卡。", | |
| "profile.preset_label": "预设:", | |
| "profile.preset_default": "— 或从列表中选择 —", | |
| "profile.hf_label": "HF 模型 id:", | |
| "profile.fetch_btn": "📥 获取", | |
| "profile.btn": "🚀 生成完整画像", | |
| "profile.quickstart": "💡 快速开始: 选择任意预设 → 点击生成。或从 <a href='https://huggingface.co/models?library=transformers&sort=trending' target='_blank'>HF Hub 热门</a> 粘贴一个 id → 📥 获取 → 生成。", | |
| "compare.title": "🆚 模型并排比较", | |
| "compare.desc": "<strong>面向技术人员</strong>: 当为特定部署场景在 2-3 个候选模型之间选择时。同一配方,多个模型,并排判定。", | |
| "compare.recipe_label": "配方:", | |
| "compare.T_eval_label": "T_eval (目标上下文):", | |
| "compare.models_title": "要比较的模型(最多 3 个)", | |
| "compare.btn": "🚀 比较", | |
| "compare.example": "💡 尝试: 粘贴 3 个流行的 7-8B 模型 (Meta-Llama-3-8B, Mistral-7B-v0.1, Qwen/Qwen2.5-7B),配方 X-2, T_eval=16000。查看哪个最适合长上下文。", | |
| "ask.title": "❓ 您的问题", | |
| "ask.placeholder": "例如: Mistral-7B 能处理 16K NIAH 检索吗?或: 我有 5,000 美元,可以训练什么模型?或: 以每天 1 亿 tokens 提供 Llama-70B 的最便宜 GPU?", | |
| "ask.btn": "🚀 分析", | |
| "ask.example_btn": "💡 尝试示例", | |
| "recipe.title": "📋 配方", | |
| "recipe.default": "— 选择一个配方 —", | |
| "recipe.input_title": "🎯 输入", | |
| "verdict.title": "📊 判定", | |
| "chain.title": "🔍 计算链", | |
| "chain.desc": "下面每个数字都是确定性 Python。点击步骤展开。", | |
| "answer.title": "💬 自然语言回答", | |
| "share.btn": "🔗 复制分享链接", | |
| "share.copied": "✅ 已复制到剪贴板!", | |
| "share.download": "💾 下载 JSON", | |
| "share.download_md": "📝 Markdown", | |
| "share.download_tex": "📜 LaTeX", | |
| "share.submit": "📤 提交到 registry", | |
| "share.submit_clip_ok": "↗ 已打开 GitHub。正文已复制到剪贴板——粘贴到 issue 正文。", | |
| "share.submit_clip_fail": "↗ 已打开 GitHub。剪贴板被阻止——正文已写入浏览器控制台 (F12)。", | |
| "share.import_title": "📂 导入共享的 TAF 结果", | |
| "a11y.skip": "跳到主要内容", | |
| // v0.6.2 — landing 重构:快速开始 + 功能清单 + 架构提示 | |
| "qs.title": "⚡ 快速开始", | |
| "qs.step1": "粘贴 HuggingFace 模型 ID(例如 <code>meta-llama/Meta-Llama-3-8B</code>)", | |
| "qs.step2": "点击 <strong>📇 Profile a model</strong>", | |
| "qs.step3": "查看你的 TAF Card — 各用例的判定 + 关键数值 + 经 Lean+Mathlib 验证的数学", | |
| "qs.cta": "↓ 立即开始", | |
| "inv.title": "🧰 这个工具能给你什么", | |
| "inv.recipes.title": "🎯 8 个 recipe — 这个模型符合你的用例吗?", | |
| "inv.recipes.x1.title": "自训练 vs API", | |
| "inv.recipes.x1.body": "对你的流量哪个更便宜?", | |
| "inv.recipes.x2.title": "长上下文", | |
| "inv.recipes.x2.body": "能可靠处理 32k / 128k tokens 吗?", | |
| "inv.recipes.x3.title": "预算", | |
| "inv.recipes.x3.body": "用 $X,你能从零训练什么模型?", | |
| "inv.recipes.x5.title": "硬件", | |
| "inv.recipes.x5.body": "用什么 GPU 服务 N tokens/天?", | |
| "inv.recipes.x19.title": "KV 缓存", | |
| "inv.recipes.x19.body": "如何压缩而不破坏质量?", | |
| "inv.recipes.x21.title": "Imprint 纯度", | |
| "inv.recipes.x21.body": "模型的位置编码有多干净?", | |
| "inv.recipes.x22.title": "Compute-context", | |
| "inv.recipes.x22.body": "模型是否落入经验带?", | |
| "inv.recipes.x23.title": "IH 相位", | |
| "inv.recipes.x23.body": "induction-head 之前还是之后?", | |
| "inv.diag.title": "🔬 诊断", | |
| "inv.diag.gamma": "<strong>γ 预测 vs 观测</strong> — 自动分入 5 种状态(正常 · 欺诈/夸大上下文 · 压缩 · over-Padé · sliding-window)", | |
| "inv.diag.cardy": "<strong>Cardy ΔH</strong> — 观测上下文与名义上下文之间的熵偏移", | |
| "inv.diag.fals": "<strong>可证伪面板</strong> — 检查 23 个具体预测(F1–F23)", | |
| "inv.diag.alg": "<strong>代数一致性</strong> — 模型必须满足的 8 条数学恒等式", | |
| "inv.verify.title": "✓ 形式化验证的数学", | |
| "inv.verify.count": "<strong>37 个定理</strong>已在 Lean 4 + Mathlib4 机器证明", | |
| "inv.verify.click": "点击任意徽章 → 在 GitHub 打开源码行", | |
| "inv.verify.reverify": "自行验证:<code>lake build</code>(缓存后 ≈5 秒)", | |
| "inv.export.title": "📤 导出与分享", | |
| "inv.export.formats": "<strong>JSON · Markdown · LaTeX</strong>(论文级)", | |
| "inv.export.share": "可复现的分享链接(状态编入 URL)", | |
| "inv.export.registry": "提交到 GitHub 上的社区登记", | |
| "arch.summary": "支持的架构", | |
| "arch.anyhf": "✓ 任意 HuggingFace 公开模型", | |
| "tooltip.mha": "Multi-Head Attention:每个 token 位置同时通过多个并行 head 进行注意力计算。", | |
| "tooltip.gqa": "Grouped Query Attention:queries 共享比 heads 更少的 keys/values(节省内存但把 γ 推向 Hagedorn)。", | |
| "tooltip.alibi": "Attention with Linear Biases:位置信息以学习斜率加到注意力分数,无旋转。", | |
| "tooltip.abspe": "Absolute Position Embeddings:每个位置有一个固定的学习向量加到 token embedding。", | |
| "tooltip.swa": "Sliding Window Attention:每个 token 仅在固定局部窗口内做注意力(Mistral、gemma-2 使用此机制)。", | |
| "tooltip.ssm": "State Space Model:维护内部状态的序列层(取代注意力,Mamba、Jamba 使用此机制)。", | |
| // v0.7.0 — anti-bullshit pack #1: SWA / RoPE-scaling 揭示器 | |
| "modes.unmask": "🪟 揭示", | |
| "unmask.title": "🪟 上下文揭示器", | |
| "unmask.tip": "粘贴 HuggingFace 模型 id(或原始 config.json)。工具检测 sliding-window attention、RoPE 缩放(YaRN/linear/dynamic NTK)和 GQA — 所有使 <code>max_position_embeddings</code> 大于实际有效上下文的因素。Mistral-7B-v0.1 是经典例子:声称 32k,实际只在 ~4-8k 范围内做注意力。", | |
| "unmask.desc": "<strong>你即将为一个实际上注意力不到那么远的模型花钱吗?</strong> 粘贴 id,1 秒内得知。无需 GPU,无需推理 — 只是对 config.json 做算术。", | |
| "unmask.id_label": "HF 模型 id:", | |
| "unmask.fetch_btn": "🔍 揭示", | |
| "unmask.paste_summary": "或粘贴原始 config.json(私有 / 在研模型)", | |
| "unmask.paste_btn": "🔍 揭示已粘贴的 config", | |
| "unmask.label.declared": "声明上下文", | |
| "unmask.label.effective": "有效(估计)", | |
| "unmask.label.ratio": "比率", | |
| "unmask.section.flags": "架构标志", | |
| "unmask.section.warnings": "警告", | |
| "unmask.section.reco": "建议", | |
| "unmask.flag.swa": "SWA", | |
| "unmask.flag.rope": "RoPE 缩放", | |
| "unmask.flag.gqa": "GQA", | |
| "unmask.flag.layers": "层数", | |
| "unmask.flag.dhead": "d_head", | |
| "unmask.flag.theta": "RoPE θ", | |
| "unmask.flag.yes": "是", | |
| "unmask.flag.no": "否", | |
| "unmask.flag.full_mha": "否(完整 MHA,{n} heads)", | |
| "unmask.verdict.honest": "✅ 诚实", | |
| "unmask.verdict.inflated": "⚠ 夸大", | |
| "unmask.verdict.severely_inflated": "❌ 严重夸大", | |
| "unmask.verdict.yarn_extended": "⚠ YARN 扩展", | |
| "unmask.verdict.unknown": "❓ 未知", | |
| "unmask.warn.swa_window": "SWA 窗口:{window} tokens — 每层仅在此窗口内做注意力。", | |
| "unmask.warn.multihop": "多跳估计:~{multiHop} tokens(保守:窗口 × {factor})。", | |
| "unmask.warn.yarn": "RoPE 缩放({type})将上下文从 ~{original} 扩展 {factor}× 到 {declared} tokens。", | |
| "unmask.warn.yarn_advice": "RoPE 扩展的上下文 — 用 γ_check 诊断在声称的全长度验证 γ 行为。", | |
| "unmask.warn.gqa_small_dhead": "小 head dim({d_head})+ GQA:长上下文下 KV 缓存压缩很可能(γ 推向 Hagedorn)。", | |
| "unmask.reco.honest": "标准全注意力模型。有效上下文与声明一致({declared} tokens)。", | |
| "unmask.reco.inflated": "通过 SWA 有效 ~{effective} tokens。用 γ_check 验证你目标长度的行为。", | |
| "unmask.reco.severely_inflated": "实际把它当作 ~{effective} tokens 上下文模型。{declared} tokens 的声明仅通过跨层注意力链生效,经验上超过 ~2× SWA 窗口后会退化。", | |
| "unmask.reco.yarn_extended": "RoPE 扩展上下文。运行长上下文 benchmark(NIAH 在 8k / 16k / 32k / 全长度)以确认扩展是否成立。用 γ_check 设 T_eval = {declared}。", | |
| "unmask.reco.unknown": "无法解析 config。验证 URL 是带公开 config.json 的有效 HF 模型。", | |
| "unmask.status.empty_id": "⚠ 输入一个 model id(例如 mistralai/Mistral-7B-v0.1)。", | |
| "unmask.status.fetching": "⏳ 正在获取 {modelId} 的 config.json...", | |
| "unmask.status.success": "✅ 已分析 {modelId}(判定:{verdict})", | |
| "unmask.status.empty_paste": "⚠ 请先粘贴 config.json。", | |
| "unmask.status.invalid_json": "❌ JSON 无效:{error}", | |
| "unmask.status.success_paste": "✅ 已分析粘贴的 config(判定:{verdict})", | |
| "unmask.pasted_label": "(已粘贴 config)", | |
| "mode_desc.ask": "输入自由问题。浏览器内的 LLM 选择正确的 recipe 并运行。", | |
| "mode_desc.recipe": "直接选择一个 recipe 并填表。完整手动控制。", | |
| "mode_desc.profile": "最快开始:粘贴任意 HuggingFace model id,点击 Profile。几秒内看到 5 个 recipe。", | |
| "mode_desc.compare": "选择 2-3 个候选模型 + 一个 recipe。在表格中并排查看判定。", | |
| "mode_desc.inspector": "直接粘贴 config.json。适用于未发布 HF Hub 的私有 / 在研模型。", | |
| "mode_desc.diagnose": "构建 diagnose_model.py 的 CLI 命令,在真实 GPU 上测量 γ_obs。浏览器预测;CLI 测量。", | |
| "mode_desc.phase": "论文经验面板的 γ × θ 散点图。悬停点查看详情,点击加载到 Diagnose / Recipe 表单。", | |
| "mode_desc.unmask": "检测 max_position_embeddings 是否误导(SWA / YaRN / RoPE 缩放)。粘贴 model id,1 行判定。", | |
| "profile.preset_loaded": "✅ 已为 <strong>{id}</strong> 加载预设。表单已预填。(点击 📥 Fetch 用 HF Hub 最新 config 覆盖。)", | |
| // v0.7.1 — anti-bullshit pack #2: Chat-template Sniffer | |
| "modes.template": "📜 Chat-template", | |
| "mode_desc.template": "检测模型使用的 chat-template 系列(Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek)。给出 lm-eval / vLLM / transformers 的精确 CLI flag。", | |
| "template.title": "📜 Chat-template 检测器", | |
| "template.tip": "粘贴 HF 模型 id(或原始 tokenizer_config.json)。检测 chat-template 系列并给出正确使用的精确框架命令。如果忘记应用,lm-eval-harness 会让 accuracy 静默对半(issue #1841)。", | |
| "template.desc": "<strong>忘了 <code>--apply_chat_template</code> 吗?</strong> 大多数 multi-turn eval 因为 chat template 未应用而失败 ~50%。粘贴 model id,获取你 stack 的精确 CLI flag。", | |
| "template.id_label": "HF 模型 id:", | |
| "template.fetch_btn": "📜 检测", | |
| "template.paste_summary": "或粘贴原始 tokenizer_config.json(私有模型)", | |
| "template.paste_btn": "📜 检测已粘贴 config", | |
| "template.label.family": "检测到的系列", | |
| "template.label.markers": "匹配的标记", | |
| "template.label.tpl_len": "Template 长度", | |
| "template.section.warnings": "警告", | |
| "template.section.commands": "各框架命令", | |
| "template.section.raw": "原始 template(预览)", | |
| "template.family.custom": "自定义(未知系列)", | |
| "template.family.none": "(无 chat_template)", | |
| "template.verdict.ok": "✅ 已检测到 TEMPLATE", | |
| "template.verdict.custom": "⚠ 自定义 TEMPLATE", | |
| "template.verdict.missing": "❌ 无 CHAT TEMPLATE", | |
| "template.verdict.base_model": "ℹ 基础模型(无 chat)", | |
| "template.verdict.unknown": "❓ 未知", | |
| "template.warn.no_chat_template": "tokenizer_config.json 中无 <code>chat_template</code> 字段。基础 / 仅预训练模型的典型情况。如果你期待 instruct-tuned 模型,可能加载了错误的文件。", | |
| "template.warn.custom_template": "非标准 template({length} 字符)。工具无法将其匹配到已知系列。检查下方预览并验证你的 eval 框架是否支持。", | |
| "template.warn.lm_eval_apply": "<strong>lm-eval-harness:</strong>添加 <code>--apply_chat_template</code>,否则 multi-turn eval 上 accuracy 会静默下降 ~50%(issue #1841)。", | |
| "template.warn.vllm_apply": "<strong>vLLM serve:</strong>验证 <code>--chat-template</code> 已设置(fine-tuned 变体的自动检测有时失败)。建议:<code>{name}</code>。", | |
| "template.status.empty_id": "⚠ 输入 model id(例如 mistralai/Mistral-7B-Instruct-v0.3)。", | |
| "template.status.fetching": "⏳ 正在获取 {modelId} 的 tokenizer_config.json...", | |
| "template.status.success": "✅ 已检测 {modelId}(判定:{verdict})", | |
| "template.status.empty_paste": "⚠ 请先粘贴 tokenizer_config.json。", | |
| "template.status.invalid_json":"❌ JSON 无效:{error}", | |
| "template.status.success_paste":"✅ 已检测粘贴的 config(判定:{verdict})", | |
| "template.pasted_label": "(已粘贴 tokenizer_config)", | |
| // v0.7.2 — anti-bullshit pack #3: Arena-Elo CI reconstructor | |
| "modes.arena": "🎯 Arena CI", | |
| "mode_desc.arena": "从原始 pairwise 投票数据中恢复置信区间(Bradley-Terry MLE + bootstrap)。检测公开 Arena 排行榜隐藏的统计上并列对。", | |
| "arena.title": "🎯 Arena-Elo CI 重建器", | |
| "arena.tip": "Chatbot Arena 在公开排行榜中删除了置信区间。5 Elo 的差距在统计上可能毫无意义。粘贴原始投票数据(model_a, model_b, winner) — 工具计算 Bradley-Terry MLE + bootstrap CI 并列出统计上的并列(CI 重叠)。", | |
| "arena.desc": "<strong>GPT-4 真的比 Claude 强吗 — 还是它们打平?</strong> 粘贴 pairwise 投票 CSV(或点击 <em>加载样本</em>)。Bradley-Terry MLE + 200 次 bootstrap → 排序 Elo + 95% CI + 统计并列检测。全部在浏览器中。", | |
| "arena.sample_btn": "📊 加载样本数据", | |
| "arena.run_btn": "🎯 计算 CIs", | |
| "arena.clear_btn": "🗑️ 清空", | |
| "arena.csv_summary": "投票 CSV(header:<code>model_a,model_b,winner</code>;winner ∈ a/b/tie)", | |
| "arena.section.ranked": "排序 Elo 与 95% CI", | |
| "arena.section.ties": "统计并列(CI 重叠)", | |
| "arena.section.summary": "摘要", | |
| "arena.col.rank": "#", | |
| "arena.col.model": "模型", | |
| "arena.col.elo": "Elo", | |
| "arena.col.ci": "95% CI", | |
| "arena.col.ci_width": "± 半宽", | |
| "arena.col.matches": "对局", | |
| "arena.col.wins": "胜 / 负 / 平", | |
| "arena.col.tie_pair": "配对", | |
| "arena.col.tie_diff": "Elo 差距", | |
| "arena.col.tie_overlap": "CI 重叠", | |
| "arena.no_ties": "无统计并列 — 所有配对在 95% CI 下可区分。", | |
| "arena.summary.votes": "总投票数", | |
| "arena.summary.models": "模型数", | |
| "arena.summary.ties": "统计并列", | |
| "arena.summary.bootstrap": "Bootstrap 迭代", | |
| "arena.summary.ci_level": "CI 水平", | |
| "arena.status.empty": "⚠ 粘贴投票 CSV 或点击加载样本。", | |
| "arena.status.too_few": "⚠ 仅 {n} 个有效投票 — 需要至少 10 个才能可靠拟合 Bradley-Terry。", | |
| "arena.status.computing": "⏳ 在 {n} 个投票上计算 Bradley-Terry MLE + bootstrap...", | |
| "arena.status.done": "✅ {n} 投票 · {models} 模型 · {ties} 统计并列 · {ms} ms", | |
| "arena.status.sample_loaded": "✅ 样本已加载(合成 6 模型 Arena 数据)。点击计算 CIs。", | |
| // v0.7.3 — anti-bullshit pack #4: Contamination Prior | |
| "modes.contam": "🧪 污染", | |
| "mode_desc.contam": "对 benchmark 分数是否被污染做贝叶斯式的先验估计。输入模型训练 cutoff → 评估 20+ 主流 benchmark(MMLU、GSM8K、HumanEval、MMLU-Pro…)。", | |
| "contam.title": "🧪 污染先验", | |
| "contam.tip": "基于 (模型训练 cutoff 日期) × (benchmark 发布日期) × (已知语料库纳入 + 泄漏历史),对 benchmark 分数是否被污染做贝叶斯式的先验估计。Open LLM Leaderboard v1 在 2024 年因 MMLU/HellaSwag 分数被污染而停用。", | |
| "contam.desc": "<strong>你应该相信你模型的 MMLU 分数吗?</strong> 输入模型训练 cutoff 日期 — 工具评估 20+ 主流 benchmark(MMLU、HellaSwag、GSM8K、HumanEval、IFEval、MMLU-Pro、GPQA…)并告诉你哪些分数可能被污染。", | |
| "contam.cutoff_label": "训练 cutoff:", | |
| "contam.run_btn": "🧪 评估所有 benchmark", | |
| "contam.section.ranked": "Benchmark 污染先验", | |
| "contam.section.high": "🔴 高风险 benchmark(视分数为不可信)", | |
| "contam.section.medium": "🟡 中等风险(用替代品验证)", | |
| "contam.section.low": "🟢 低风险(可能干净)", | |
| "contam.col.benchmark": "Benchmark", | |
| "contam.col.released": "发布", | |
| "contam.col.gap": "差距(月)", | |
| "contam.col.prior": "P(污染)", | |
| "contam.col.level": "等级", | |
| "contam.col.corpora": "在语料库", | |
| "contam.col.category": "类别", | |
| "contam.label.high": "高风险", | |
| "contam.label.medium": "中", | |
| "contam.label.low": "低", | |
| "contam.no_entries": "(此类别中无)", | |
| "contam.advice.high": "视这些分数为不可信。用更新 / 私有测试的替代品替换(MMLU-Pro、GPQA、MUSR、MATH-500)。", | |
| "contam.advice.medium": "谨慎对待。在 held-out 子集或社区复现上寻找复制。", | |
| "contam.advice.low": "分数可能未被污染,但没有泄漏不等于证明 — 仍要用替代测试交叉验证。", | |
| "contam.summary.headline": "Cutoff <code>{cutoff}</code> · {n} 个 benchmark 已评估", | |
| "contam.status.empty": "⚠ 输入模型训练 cutoff 日期(例如 2023-12)。", | |
| "contam.status.bad_date": "⚠ 日期格式错误。使用 YYYY-MM 或 YYYY-MM-DD。", | |
| "contam.status.done": "✅ Cutoff {cutoff} · {n} benchmarks 已评估 · {high} 个高风险", | |
| // v0.7 — Help 模态部分 | |
| "help.v07.title": "🆕 v0.7 — Anti-bullshit 套件(4 个新模式)", | |
| "help.v07.intro": "<em>v0.7(2026-05-06):四个新模式,解决 HuggingFace 社区报告的具体痛点。每个都在浏览器中运行,无推理 — 纯元数据 + 数学。</em>", | |
| "help.v07.unmask.title": "🪟 上下文揭示器", | |
| "help.v07.unmask.body": "检测 <code>max_position_embeddings</code> 何时具有误导性。Mistral-7B-v0.1 声称 32k 但通过 SWA 实际只在 ~4-8k 内做注意力。粘贴 HF 模型 id → 1 秒判定(诚实 / 夸大 / 严重夸大 / YARN 扩展)。捕获 SWA、RoPE-scaling(YaRN/linear/dynamic NTK)、小 d_head + GQA。<em>用例</em>:在为 32k 上下文付 GPU 钱之前,验证模型是否真的注意那么远。", | |
| "help.v07.template.title": "📜 Chat-template 检测器", | |
| "help.v07.template.body": "检测模型使用的 chat-template 系列(Llama-3 / ChatML / Mistral / Gemma / Phi-3 / Alpaca / DeepSeek / 自定义 / 无)并给出 lm-evaluation-harness、vLLM、transformers 的精确 CLI flag。解决 lm-eval-harness 的 issue #1841:忘记 <code>--apply_chat_template</code> 会让 multi-turn accuracy 静默对半。<em>用例</em>:报告 benchmark 分数前,确认你正确应用了 template。", | |
| "help.v07.arena.title": "🎯 Arena-Elo CI 重建器", | |
| "help.v07.arena.body": "Chatbot Arena 在公开排行榜中删除了置信区间 — 5 Elo 的差距在统计上可能毫无意义。粘贴原始 pairwise 投票数据(model_a, model_b, winner)→ Bradley-Terry MLE + 200 次 bootstrap → 排序 Elo + 95% CI + \"统计并列\" 面板,列出 CI 重叠的配对。尝试加载样本按钮。<em>用例</em>:宣称 \"模型 A 胜过模型 B\" 之前,验证它们的 CI 不重叠。", | |
| "help.v07.contam.title": "🧪 污染先验", | |
| "help.v07.contam.body": "对 benchmark 分数是否被污染做贝叶斯式的先验估计。输入模型训练 cutoff 日期 → 工具按 P(污染) 评估 20+ 主流 benchmark(MMLU、HellaSwag、GSM8K、HumanEval、IFEval、MMLU-Pro、GPQA、AIME、MATH-500、BBH、MUSR…),基于时间差距、语料库纳入和已知泄漏历史。Open LLM Leaderboard v1 在 2024 年因 MMLU/HellaSwag 分数被污染而停用。<em>用例</em>:比较两个模型时决定相信哪些分数。", | |
| "help.v07.quant.title": "⚖️ 量化机制分类器", | |
| "help.v07.quant.body": "预测任意(模型 × 量化方案:NF4、AWQ、GPTQ、GGUF Q4_K_M / Q5_K_M / Q8_0、int8、FP8…)的 γ-shift 与 ΔPPL。架构感知:小 d_head + 激进 GQA → 更敏感;校准方案(AWQ)比未校准方案(NF4)更好地吸收偏移。检测到 cliff 时推荐更安全的替代方案。<em>用例</em>:量化之前,预测你的特定架构 × 方案组合是否能保持 PPL 可接受,否则给出具体的切换建议。", | |
| "help.v07.drift.title": "🔀 跨框架 Drift 界", | |
| "help.v07.drift.body": "同一模型,不同 setup 下分数不同。工具预测仅由数值噪声(dtype、framework、batch)允许的最大 drift。若观测差距超过它 → 真实 bug,通常是 chat-template mismatch(lm-eval-harness issue #1841)或 KV-cache 布局。试试 \"加载样本\" 按钮看典型的 chat-template bug。<em>用例</em>:在报告回归或声称可复现性之前,验证两个评估之间的差距是否大于数值噪声能解释的范围。", | |
| "inv.v07.drift": "<strong>🔀 Drift</strong> — bug 还是噪声?预测两个评估间的最大可允许差距", | |
| "help.v07.niah.title": "🔍 NIAH → Reasoning Gap", | |
| "help.v07.niah.body": "RULER 论文(NVIDIA 2024)显示长上下文模型经常通过 NIAH(needle 检索)但在相同上下文上多跳 reasoning 失败。工具仅根据架构(γ_Padé + d_horizon + 架构压力:小 d_head、GQA、SWA)预测两种通过率,报告 gap,并找到模型 reasoning 保持 ≥65% 的\"安全 reasoning 上下文\"。扫描模式显示在 1k/4k/16k/64k/T_train 的曲线。<em>用例</em>:在声称的上下文部署之前,搞清楚模型是真的能在那里 reasoning 还是只能检索。", | |
| "inv.v07.niah": "<strong>🔍 NIAH→Reason</strong> — 你的\"128k 上下文\"真的能在那里 reasoning,还是只能检索?", | |
| // v0.7 — Inventory 模态第 5 卡 | |
| "inv.v07.title": "🆕 v0.7 anti-bullshit 套件", | |
| "inv.v07.unmask": "<strong>🪟 Unmask</strong> — config.json 声称 32k?看它是否真的注意那么远", | |
| "inv.v07.template": "<strong>📜 Chat-template</strong> — 精确 CLI flag,让 lm-eval 不会静默对半你的 accuracy", | |
| "inv.v07.arena": "<strong>🎯 Arena CI</strong> — 恢复 Chatbot Arena 隐藏的置信区间", | |
| "inv.v07.contam": "<strong>🧪 污染</strong> — 按污染概率对 20+ benchmark 评级", | |
| "inv.v07.quant": "<strong>⚖️ Quant</strong> — 预测任意(模型 × 量化方案)组合的 γ-shift + ΔPPL", | |
| // v0.7.3 — anti-bullshit pack #5: Quant-regime classifier | |
| "modes.quant": "⚖️ Quant", | |
| "mode_desc.quant": "预测任意(模型 × 量化方案)的 γ-shift 与 ΔPPL。架构感知:小 d_head + GQA → 更敏感。检测到 cliff 时推荐更安全的替代方案。", | |
| "quant.title": "⚖️ 量化机制分类器", | |
| "quant.tip": "预测给定(模型 × 量化方案)的 γ-shift(及由此产生的 ΔPPL)。\"AWQ 保留 ~95%\" 这类通用说法太模糊 — TAF 利用 d_head、GQA 比、SWA 标志和模型大小给出特定于架构的判定。解决:HF 社区普遍报告不可预测的量化 cliff(NF4 在 Phi-3 上 -2 PPL,但在 Llama-3-8B 上没问题)。", | |
| "quant.desc": "<strong>量化会破坏你的模型吗?</strong>粘贴 HF 模型 id,选择量化方案 — 获取预测的 γ-shift、预期 ΔPPL 区间,以及在 cliff 情况下的推荐替代方案。仅浏览器,无 GPU,无需校准集。", | |
| "quant.id_label": "HF 模型 id:", | |
| "quant.fetch_btn": "📥 获取 config", | |
| "quant.scheme_label": "量化方案:", | |
| "quant.run_btn": "⚖️ 预测", | |
| "quant.all_btn": "📊 比较所有方案", | |
| "quant.regime.safe": "✅ 安全", | |
| "quant.regime.mild": "✅ 轻度压缩", | |
| "quant.regime.significant": "⚠ 显著退化", | |
| "quant.regime.cliff": "❌ 重大 CLIFF", | |
| "quant.label.gamma_shift": "γ 偏移", | |
| "quant.label.delta_ppl": "ΔPPL(估)", | |
| "quant.label.arch_mult": "架构乘数", | |
| "quant.section.breakdown": "细节分解", | |
| "quant.section.reco": "建议", | |
| "quant.section.compare": "所有方案(按安全性排序)", | |
| "quant.field.scheme": "方案", | |
| "quant.field.calibrated": "已校准", | |
| "quant.field.uncalibrated": "未校准", | |
| "quant.field.base_penalty": "基础惩罚", | |
| "quant.field.arch_mult_full": "架构乘数", | |
| "quant.field.gamma_shift": "预测 γ 偏移", | |
| "quant.field.ppl_band": "ΔPPL 区间(估)", | |
| "quant.field.params": "参数量", | |
| "quant.col.scheme": "方案", | |
| "quant.col.bits": "比特", | |
| "quant.col.gamma_shift": "γ 偏移", | |
| "quant.col.ppl_band": "ΔPPL 区间", | |
| "quant.col.regime": "机制", | |
| "quant.reco.switch_to_awq": "<strong>切换到 {scheme}</strong> — 校准的 4-bit 处理小 d_head + GQA 比 NF4 好得多。预期 ΔPPL 下降 ~2-3 倍。", | |
| "quant.reco.switch_to_q5_km": "<strong>切换到 {scheme}</strong> — Q5 以低成本保留更多 head 维度(仅大约 25% 文件更大)。", | |
| "quant.reco.switch_to_q4_km": "<strong>切换到 {scheme}</strong> — Q3/Q2 对此架构过于激进。", | |
| "quant.reco.consider_awq": "<strong>考虑 {scheme}</strong> — 在此架构上校准能显著降低 γ-shift。", | |
| "quant.reco.use_higher_bits": "<strong>使用更高比特的替代</strong> — 此架构无法干净吸收 4-bit。尝试 5 或 8-bit。", | |
| "quant.reco.verify_with_eval": "<strong>用真实 eval 验证</strong> — 预测偏移在边缘。部署前在目标上下文运行 NIAH。", | |
| "quant.reco.no_action": "无需操作 — 此架构下量化是安全的。", | |
| "quant.summary.headline_all": "<code>{modelId}</code> 的所有方案", | |
| "quant.status.empty_id": "⚠ 输入 model id(例如 meta-llama/Llama-3.2-1B)。", | |
| "quant.status.fetching": "⏳ 正在获取 {modelId} 的 config.json...", | |
| "quant.status.fetched": "✅ 已获取 {modelId} 的 config。选择方案并点击预测(或比较所有)。", | |
| "quant.status.no_scheme": "⚠ 从下拉中选择一个量化方案。", | |
| "quant.status.done": "✅ 预测机制:{regime}", | |
| "quant.status.done_all": "✅ 已比较 {n} 个方案 — 按安全性排序。", | |
| // v0.7.4 — HF Hub 自动完成:隐私 + rate-limit | |
| "hf_auto.privacy": "🔒 查询发送到 huggingface.co/api · 本地缓存 5 分钟", | |
| "hf_auto.rate_limited": "⚠ HuggingFace 速率限制 — 稍后再试,或手动键入完整 model id", | |
| "hf_auto.gated_msg": "是 gated 模型。在此接受许可证:", | |
| // v0.7.5 — anti-bullshit pack #6: 跨框架 drift 界 | |
| "modes.drift": "🔀 Drift", | |
| "mode_desc.drift": "在给定(framework、dtype、batch、chat-template)下预测两个 benchmark 分数之间的最大允许 drift。区分真实 bug 与数值噪声。", | |
| "drift.title": "🔀 跨框架 Drift 界", | |
| "drift.tip": "同一模型,不同 setup 下分数不同。差距是噪声还是真实 bug?输入两个分数及其(framework、dtype、batch、chat-template)— 工具预测仅由数值噪声允许的最大 drift。若观测差距超过它 → 真实 bug,通常是 chat-template mismatch(lm-eval issue #1841)或 KV-cache 布局。", | |
| "drift.desc": "<strong>你的模型在 lm-eval-hf 给 67.2,在 vLLM-served 给 65.1。Bug 还是噪声?</strong> 输入两个分数及(framework、dtype、batch、是否应用 chat-template)。工具预测噪声区间并标记真实 bug。arxiv 2506.09501 将此记录为评估再现性的主要问题。", | |
| "drift.setup_a": "Setup A", | |
| "drift.setup_b": "Setup B", | |
| "drift.score": "分数", | |
| "drift.framework": "框架", | |
| "drift.dtype": "Dtype", | |
| "drift.batch": "Batch", | |
| "drift.template": "Chat-template", | |
| "drift.template.applied": "已应用", | |
| "drift.template.not_applied": "未应用", | |
| "drift.template.unknown": "未知", | |
| "drift.run_btn": "🔀 计算 drift 界", | |
| "drift.sample_btn": "📊 加载样本(chat-template bug)", | |
| "drift.label.observed": "观测差距", | |
| "drift.label.band": "数值区间", | |
| "drift.label.ratio": "差距 / 区间", | |
| "drift.section.setups": "Setups", | |
| "drift.section.breakdown": "Drift 贡献者(数值区间)", | |
| "drift.section.verdict": "判定与建议", | |
| "drift.contrib.dtype": "Dtype 不匹配", | |
| "drift.contrib.framework": "框架", | |
| "drift.contrib.batch": "Batch 差异", | |
| "drift.contrib.template": "Chat-template 不匹配", | |
| "drift.dominant_cause": "主导原因", | |
| "drift.cause.dtype": "dtype 精度差异", | |
| "drift.cause.framework": "框架 / 内核差异", | |
| "drift.cause.batch": "按 batch 的归一化路径", | |
| "drift.cause.template_mismatch": "一侧应用了 chat-template 而另一侧没有(lm-eval-harness #1841 模式 — 多轮通常 -50%)", | |
| "drift.verdict.noise": "✅ 数值噪声", | |
| "drift.verdict.suspicious": "⚠ 可疑 — 验证", | |
| "drift.verdict.bug": "❌ 真实 BUG — 调查", | |
| "drift.verdict.bug_template": "❌ CHAT-TEMPLATE BUG", | |
| "drift.reco.noise": "差距落在预期的数值噪声区间内。无需操作;差异与单独的 framework/dtype/batch 变化一致。", | |
| "drift.reco.suspicious": "差距是预测区间的 1–2×。边缘——可能是真实 bug。尝试对齐主导贡献者(例如匹配框架或 dtype)并重新测试。", | |
| "drift.reco.bug": "差距 > 预测区间的 2×。这是真实 bug。检查主导贡献者 — 很可能是 tokenizer / chat-template / KV-cache 布局差异。用 <code>--apply_chat_template</code> 运行 lm-eval-harness 并确认。", | |
| "drift.reco.bug_template": "检测到 chat-template 不匹配。这是评估差距大的最常见原因(lm-eval-harness issue #1841)。用 <code>--apply_chat_template</code> 重跑 "未应用" 一侧(或设置 vLLM <code>--chat-template <name></code>)并重测。", | |
| "drift.status.empty_scores": "⚠ 输入两个分数。", | |
| "drift.status.done": "✅ 判定:{verdict}", | |
| "drift.status.sample_loaded": "✅ 样本已加载(典型 chat-template bug)。点击计算 drift 界。", | |
| // v0.7.6 — anti-bullshit pack #7: NIAH → reasoning gap 预测器 | |
| "modes.niah": "🔍 NIAH→Reason", | |
| "mode_desc.niah": "在任意上下文下预测 NIAH(检索)与多跳 reasoning 通过率。解决:长上下文模型常常通过 NIAH 但在同一上下文上 reasoning 失败(RULER 论文)。", | |
| "modes.saturation": "📈 饱和度", | |
| "mode_desc.saturation": "告诉你某个 benchmark 是否仍能区分 frontier 模型,或者已经饱和(例如 MMLU 88-94% 顶部,AIME 2025 已经 96-100%)。返回 top-3 + 判定 + 推荐替代品。", | |
| "modes.hub": "🧭 方案", | |
| "mode_desc.hub": "每个 LLM-eval 问题的地图 → tafagent 模式(若覆盖)+ 精选外部工具。找到方案而非重新发明。30+ 问题,7 类别。", | |
| "niah.title": "🔍 NIAH → Reasoning Gap", | |
| "niah.tip": "NIAH(Needle in a Haystack)测试检索:\"在长文本中找到这个事实\"。多跳 reasoning 测试推理:\"把开头的事实 X+Y 与结尾的事实 Z 结合\"。RULER 论文(NVIDIA 2024)显示长上下文模型经常通过 NIAH 但在相同上下文上 reasoning 失败。本工具仅根据架构预测两种通过率。", | |
| "niah.desc": "<strong>你的模型声称 128k 上下文。它在 64k 是真的能 reasoning,还是只能检索?</strong>粘贴 HF 模型 id 和目标 eval 上下文 — 工具预测 NIAH 与多跳 reasoning 通过率、gap,以及 reasoning 保持 ≥65% 的 \"安全上下文\"。", | |
| "niah.id_label": "HF 模型 id:", | |
| "niah.fetch_btn": "📥 获取 config", | |
| "niah.teval_label": "目标上下文 (T_eval):", | |
| "niah.run_btn": "🔍 预测", | |
| "niah.sweep_btn": "📊 扫描上下文", | |
| "niah.label.niah": "NIAH 通过率", | |
| "niah.label.reasoning": "Reasoning 通过率", | |
| "niah.label.gap": "Gap", | |
| "niah.label.safe_ctx": "Reasoning 安全上下文", | |
| "niah.section.breakdown": "架构细节", | |
| "niah.section.reco": "建议", | |
| "niah.section.sweep": "按上下文长度扫描通过率", | |
| "niah.field.dhorizon": "d_horizon(有效)", | |
| "niah.field.ratio": "T_eval / d_horizon", | |
| "niah.field.arch_pressure": "架构压力(小 d_head + GQA + SWA)", | |
| "niah.field.theta": "RoPE θ", | |
| "niah.field.t_train": "T_train(声称)", | |
| "niah.col.context": "T_eval", | |
| "niah.col.niah": "NIAH", | |
| "niah.col.reasoning": "Reasoning", | |
| "niah.col.gap": "Gap", | |
| "niah.col.verdict": "判定", | |
| "niah.verdict.robust": "✅ 稳健", | |
| "niah.verdict.marginal": "⚠ 边缘", | |
| "niah.verdict.degraded": "⚠ 退化", | |
| "niah.verdict.retrieval_only": "❌ 仅检索", | |
| "niah.verdict.broken": "❌ 失效", | |
| "niah.reco.robust": "在此上下文下检索与 reasoning 都稳定。可安全部署用于查询和推理任务。", | |
| "niah.reco.marginal": "边缘。检索可用但 reasoning 不稳。用于事实查询,不要用于多步推理。", | |
| "niah.reco.degraded": "Reasoning 显著下降。模型能找到事实但难以组合它们。在此长度下避免多跳任务。", | |
| "niah.reco.retrieval_only": "RULER 的典型发现:模型通过 NIAH 但 reasoning 失败。适用于 RAG 设置(LLM 仅定位事实),不适用于链式推理。把上下文降到下方的 \"安全\" 值。", | |
| "niah.reco.broken": "在此上下文下模型连基本检索都失败。视为 out-of-distribution — 在更短上下文重测。", | |
| "niah.safe_context": "≤ {ctx} tokens(reasoning ≥ 65%)", | |
| "niah.safe_context_none": "在你的目标以下没找到安全上下文 — 模型即使在小上下文也 reasoning 失败。", | |
| "niah.summary.sweep": "<code>{modelId}</code> — 按上下文的通过率", | |
| "niah.status.empty_id": "⚠ 输入 model id(例如 meta-llama/Llama-3.1-8B-Instruct)。", | |
| "niah.status.bad_teval": "⚠ 输入目标上下文(≥ 512 tokens)。", | |
| "niah.status.fetching": "⏳ 正在获取 {modelId} 的 config.json...", | |
| "niah.status.fetched": "✅ 已获取 {modelId} 的 config。设置 T_eval 并点击预测(或扫描上下文)。", | |
| "niah.status.done": "✅ {verdict} — NIAH {niah}% · reasoning {reasoning}%", | |
| "niah.status.sweep_done": "✅ 已扫描 {n} 个上下文长度。", | |
| "saturation.title": "📈 Benchmark 饱和度检测器", | |
| "saturation.tip": "MMLU 已饱和(所有 frontier 模型 88-94%)。报告\"92% on MMLU\"现在毫无意义。本工具告诉你哪些 benchmark 仍能区分 frontier 模型,哪些已饱和,以及替代方案。数据:DemandSphere AI Frontier Tracker(CC BY-NC 4.0),2026-05 刷新。", | |
| "saturation.desc": "<strong>你的 benchmark 还有用吗?</strong>选一个 benchmark 查看 top-3 frontier 分数、spread 与判定(saturated / near-saturated / discriminative),并给出推荐替代品。", | |
| "saturation.select_label": "Benchmark:", | |
| "saturation.select.all": "— 显示所有 benchmark —", | |
| "saturation.run_btn": "📈 分类", | |
| "saturation.all_btn": "📊 显示全部", | |
| "saturation.col.spread": "Top-3 spread", | |
| "saturation.col.mean": "Top-3 平均", | |
| "saturation.col.n": "模型数", | |
| "saturation.col.bench": "Benchmark", | |
| "saturation.col.verdict": "判定", | |
| "saturation.col.reco": "首选替代", | |
| "saturation.col.model": "模型", | |
| "saturation.col.score": "分数", | |
| "saturation.section.top3": "Top-3 frontier 分数", | |
| "saturation.section.recommendations": "推荐替代品", | |
| "saturation.section.note": "备注", | |
| "saturation.section.all": "所有跟踪的 benchmark", | |
| "saturation.verdict.saturated": "🚨 已饱和", | |
| "saturation.verdict.near_saturated": "⚠ 接近饱和", | |
| "saturation.verdict.discriminative": "✅ 仍可区分", | |
| "saturation.verdict.sparse_data": "ℹ 数据稀疏", | |
| "saturation.borderline": "边缘 — 在阈值切点的 ±1pp 内。判定视为\"需仔细核对\"。", | |
| "saturation.unknown": "未知 benchmark。", | |
| "saturation.attribution": "数据:DemandSphere AI Frontier Model Tracker(CC BY-NC 4.0)· HF Open LLM Leaderboard v3(开源权重历史)· 最近一次 fetch 2026-05-05。", | |
| "saturation.status.live": "✅ 实时数据已加载 — {count} 个模型。", | |
| "saturation.status.baked": "ℹ 使用 baked 快照(实时 fetch 不可用)。", | |
| "saturation.status.kb_fail": "⚠ 无法加载饱和度 KB。", | |
| "saturation.status.done": "✅ {name} — {verdict}", | |
| "saturation.status.all_done": "✅ 已分类 {n} 个 benchmark。", | |
| "help.v08.saturation.title": "📈 Benchmark 饱和度检测器", | |
| "help.v08.saturation.body": "MMLU 已饱和(top 88-94%),AIME 2025 上线几个月就饱和,HumanEval 接近饱和。选任何 benchmark,工具返回 top-3 frontier 分数、spread、平均,以及判定 — saturated / near-saturated / discriminative — 加上推荐替代品(例如 MMLU → MMLU-Pro / GPQA / HLE)。可达时从 DemandSphere AI Frontier Tracker(CC BY-NC 4.0)实时 fetch;不可达时使用 2026-05-05 的 baked 快照。<em>用例</em>:在引用\"92% on MMLU\"或设计 eval 之前,检查 benchmark 是否仍能区分任何东西。", | |
| "inv.v08.saturation": "<strong>📈 Saturation</strong> — 你的 benchmark 还有用吗,还是所有 frontier 都在顶部并列?", | |
| "inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — 每个文档化的问题都映射到一个 tafagent 模式或精选外部工具。别重复发明 — 去找。", | |
| "help.v081.hub.title": "🧭 Solutions Hub", | |
| "help.v081.hub.body": "tafagent 作为集成者而非孤岛。30+ 问题跨 7 类别(评估可靠性 · 诊断 · 设置 · 训练 · 检索 · 多模态 · 可观测性),每个映射到(a)解决它的 tafagent 模式(若存在),以及(b)社区已信任的最佳外部工具(RAGAS、MTEB、HELM、MCP Schema Validator、llm-stats、llguidance、GlitchMiner 等)。搜索框匹配 pain、场景和工具名称。<em>用例</em>:'我有问题 X — tafagent 解决它吗,如果不,谁解决?'", | |
| "hub.title": "🧭 Solutions Hub", | |
| "hub.tip": "我们已知的每个 LLM-eval 问题的地图:哪个 tafagent 模式能解决它(若有),以及社区已信任的最佳外部工具。目标:全覆盖。如果规范工具已在别处,我们链接而非重建。", | |
| "hub.desc": "<strong>别重新发明 — 去找。</strong>30+ 问题映射到 tafagent 模式 + 精选外部工具。按类别浏览、按关键字搜索,或查看新模式最有帮助的空缺。", | |
| "hub.clear_btn": "✕ 清空", | |
| "hub.no_mode": "外部", | |
| "hub.planned": "计划:", | |
| "hub.best_for": "适合", | |
| "hub.not_for": "不适合", | |
| "hub.tools": "外部工具", | |
| "hub.status.loaded": "✅ 已加载 {total} 个问题,跨 {categories} 类别 — {covered} 个由 tafagent 模式覆盖,精选 {externalLinks} 个外部链接。编译于 {compiled}。", | |
| "hub.status.fail": "⚠ 无法加载 Solutions Hub。", | |
| "hub.search.empty": "无 '{query}' 的匹配。尝试更宽泛的词(如 'eval'、'rag'、'tokenizer')。", | |
| "hub.search.results": "为 '{query}' 找到 {n} 个匹配。", | |
| // v0.7.7 — 任务卡片(UX 重构:按用户意图分组的 14 个模式) | |
| "tiles.title": "🎯 你想做什么?", | |
| "tiles.subtitle": "选择一项任务。每一项会打开下方对应的工具。或往下滚动查看完整的 14 个模式列表。", | |
| "tile.diagnose.title": "🔬 诊断一个模型", | |
| "tile.diagnose.desc": "这个具体模型符合我的用例吗?", | |
| "tile.trust.title": "✓ 相信 benchmark 分数", | |
| "tile.trust.desc": "我该相信这个数字吗?是 bug 还是噪声?", | |
| "tile.eval.title": "⚙️ 正确设置 eval", | |
| "tile.eval.desc": "获取 lm-eval / vLLM / transformers 的精确 CLI flag。", | |
| "tile.compare.title": "🆚 比较模型", | |
| "tile.compare.desc": "并排,或浏览经验模型面板。", | |
| "tile.manual.title": "📋 手动 / 自由", | |
| "tile.manual.desc": "手动挑一个具体 recipe,或用自然语言提问。", | |
| "tile.diagnose.tip": "当你有具体的 model id 并想要完整诊断时从这里开始:<strong>Profile</strong> 一次运行所有 5 个 recipe。<strong>Unmask</strong> 检查 max_position_embeddings 是否诚实。<strong>NIAH→Reason</strong> 预测 retrieval-vs-reasoning 的 gap。<strong>Quant</strong> 预测量化是否会破坏它。<strong>Inspect</strong> 允许粘贴原始 config.json,适用于私有 / 在研模型。", | |
| "tile.trust.tip": "当你看到一个分数想知道它是否可靠。<strong>Contamination</strong> 按模型在训练时看到 benchmark 的可能性给 20+ 个 benchmark 评级。<strong>Drift</strong> 告诉你两个 eval 之间的 gap 是数值噪声还是真实 bug(chat-template 不匹配、KV-cache 布局等)。<strong>Arena CI</strong> 重建 Chatbot Arena 隐藏的置信区间——很多 top-Elo 的 "胜利" 在统计上是并列。", | |
| "tile.eval.tip": "在运行 lm-eval-harness 或 vLLM serve 之前,获取正确的 CLI flag。<strong>Chat-template Sniffer</strong> 检测 template 系列(Llama-3 / ChatML / Mistral / Phi-3 / DeepSeek / Alpaca / custom / none)并输出精确的 <code>--apply_chat_template</code> / <code>--chat-template</code> 调用。解决 lm-eval-harness 的 issue #1841(accuracy 静默对半)。<strong>Diagnose CLI</strong> 生成 Python 命令在你的本地 GPU 上测量 γ_obs。", | |
| "tile.compare.tip": "<strong>Compare</strong>:选择 2-3 个候选模型 + 一个 recipe,在并排表格中看判定(例如 Llama-3-8B vs Mistral-7B 在 32k 上下文)。<strong>Phase diagram</strong>:23 个经验模型在 (log θ, γ) 平面上的散点图,叠加 Padé 曲线。悬停点查看详情,点击将该模型加载到 Recipe 表单。", | |
| "tile.manual.tip": "<strong>Recipe</strong>:挑选具体的 X-N recipe(X-1 自训 vs API、X-2 长上下文、X-3 预算、X-5 硬件、X-19 KV 压缩、X-21 imprint、X-22 compute-context 不变量、X-23 IH 相位)并手动填表,完全控制。<strong>Ask</strong>:输入自由问题;浏览器内的 0.5B LLM(Qwen2.5)选择合适的 recipe 并运行。最适合 "如果……会怎样" 的探索。", | |
| "share.import_desc": "有他人 TAF 分析的 JSON 文件? 在这里加载以本地查看判定 + 链。与您自己运行的视图相同。", | |
| "share.import_btn": "📂 加载共享的 JSON", | |
| "synthesis.system": "您是 transformer LLM 的精确诊断助手。给定预先计算的 TAF 公式结果,用 4-6 句中文写出清晰的摘要。为每个提到的数字引用章节号 (§X.Y)。始终给出具体建议。不要编造数字。", | |
| // INSPECTOR 模式 | |
| "inspector.title": "🔍 架构检查器", | |
| "inspector.desc": "粘贴 <code>config.json</code> 的原始内容。工具提取架构参数并运行完整的 5 配方 Profile。", | |
| "inspector.tip": "<strong>直接粘贴任意 config.json</strong>。工具解析它并运行完整 Profile。适用于:私有模型、开发中的 configs、尚未在 HuggingFace 的模型,或比较自定义架构的行为。", | |
| "inspector.quickstart": "💡 用例:您有未在 HF Hub 上的私有模型,或正在设计的 config。粘贴下面的原始 JSON,获取完整 TAF 画像。", | |
| "inspector.placeholder": "{\n \"model_type\": \"llama\",\n \"rope_theta\": 500000,\n \"max_position_embeddings\": 8192,\n \"num_attention_heads\": 32,\n \"num_key_value_heads\": 8,\n \"hidden_size\": 4096,\n \"num_hidden_layers\": 32\n}", | |
| "inspector.T_eval": "T_eval (您的目标上下文):", | |
| "inspector.btn": "🚀 检查并画像", | |
| // WHAT-IF 滑块 | |
| "whatif.title": "🎚 What-if: 拖动 T_eval 实时查看 γ 变化", | |
| "whatif.desc": "纯 JS 重新计算 (不调用 Pyodide)。滑动时显示几何 γ_Padé 和 d_horizon。点击按钮重新运行完整链。", | |
| "whatif.T_eval": "<strong>T_eval</strong>", | |
| "whatif.gamma_pade": "<strong>γ_Padé</strong>", | |
| "whatif.d_horizon": "<strong>d_horizon</strong>", | |
| "whatif.l_niah": "<strong>L_NIAH 上限</strong>", | |
| "whatif.predicted": "<strong>预测几何判定</strong>", | |
| "whatif.rerun": "↻ 在此 T_eval 重新计算完整链", | |
| // COMMUNITY 反馈 | |
| "community.title": "🌐 社区最近提交", | |
| "community.desc": "公共 registry 的实时反馈。点击任意提交查看完整分析。", | |
| "community.browse_all": "浏览全部 →", | |
| "community.loading": "加载中...", | |
| "community.no_repo": "Registry 仓库尚未创建。一旦它存在并有提交,它们将在此处实时显示。", | |
| "community.no_submissions": "暂无提交。成为第一个 — 生成一个 Profile 并点击 📤 提交到 registry。", | |
| // FALSIFICATION 仪表板 | |
| "falsification.title": "🔬 论文预测 — 可证伪状态", | |
| "falsification.desc": "TAF 框架基于可证伪的预测 (F1-F23)。每一个都经过经验测试。这是论文中每个预测的实时状态。", | |
| "falsification.summary": "{confirmed} 已确认 · {partial} 部分 · {refuted} 已反驳 · {untested} 未测试 (共 {total} 个预测)", | |
| "falsification.col.id": "ID", | |
| "falsification.col.claim": "Claim", | |
| "falsification.col.status": "状态", | |
| "falsification.col.evidence": "证据", | |
| "tafcard.title": "📇 TAF 卡 — 完整模型画像", | |
| "tafcard.recipes_title": "📋 配方 — 各维度判定", | |
| "tafcard.recipes_count_label": "维度", | |
| "tafcard.numbers_title": "🔢 关键数字 (paper §26)", | |
| "tafcard.fals_title": "🔬 可证伪状态 (F1-F23)", | |
| "tafcard.fals_none": "无适用的可证伪。", | |
| "tafcard.diag_title": "🔬 诊断 — 数字 · γ 检验 · what-if", | |
| "tafcard.verify_title": "✓ 验证 — Lean + Sage + 可证伪", | |
| "tafcard.share_title": "📂 来源与分享", | |
| "tafcard.whatif_title": "🎚️ What-if 浏览器", | |
| "verdict.go": "通过", | |
| "verdict.no": "否", | |
| "verdict.degraded": "降级", | |
| "compare.title_out": "🆚 比较表", | |
| "status.loading_pyodide": "⏳ 加载 Python 运行时 (~10MB,首次加载)...", | |
| "status.loading_taf": "⏳ 加载 TAF 公式 + 配方...", | |
| "status.ready": "✅ 就绪。选择一个模型并点击画像开始。", | |
| "status.computing": "🧮 计算 TAF 链...", | |
| "status.done": "✅ 完成。", | |
| "profile.hf_placeholder": "例如: meta-llama/Meta-Llama-3-8B 或 Qwen/Qwen2.5-7B", | |
| "compare.hf_placeholder": "HF 模型 id (例如: meta-llama/Meta-Llama-3-8B)", | |
| "compare.slot1_placeholder": "HF 模型 id (例如: meta-llama/Meta-Llama-3-8B)", | |
| "compare.slot2_placeholder": "HF 模型 id #2", | |
| "compare.slot3_placeholder": "HF 模型 id #3 (可选)", | |
| "compare.preset_default": "— 或预设 —", | |
| // 表单参数 | |
| "param.theta": "θ (rope_theta)", | |
| "param.theta.tip": "<strong>RoPE 基础频率</strong> 来自 <code>config.rope_theta</code>。越高 = 长程能力越强。", | |
| "param.T_train": "T_train", | |
| "param.T_train.tip": "<strong>训练最大上下文</strong>。来自 <code>max_position_embeddings</code>。超出此范围属于外推。", | |
| "param.T_eval": "T_eval (您的目标)", | |
| "param.T_eval.tip": "<strong>您的目标推理上下文</strong>。关键问题: 模型在 <em>这个</em> 长度下表现是否良好?", | |
| "param.n_attn": "n_attention_heads", | |
| "param.n_attn.tip": "<strong>每层 attention heads 数</strong>。来自 <code>num_attention_heads</code>。", | |
| "param.n_kv": "n_kv_heads", | |
| "param.n_kv.tip": "<strong>KV heads</strong>。若 < n_attention_heads → GQA (Grouped Query Attention)。降低 KV 内存但将 γ 推向 Hagedorn。", | |
| "param.d_head": "head_dim", | |
| "param.d_head.tip": "<strong>每 head 维度</strong>。典型 64、96、128。来自 <code>head_dim</code> 或 <code>hidden_size / num_attention_heads</code>。", | |
| "param.n_layers": "n_layers", | |
| "param.n_layers.tip": "<strong>Transformer 块数</strong>。来自 <code>num_hidden_layers</code>。", | |
| "param.n_params": "n_params (例如 8e9)", | |
| "param.n_params.tip": "<strong>总参数量</strong>。约 400M 阈值出现 induction heads。影响 KV 内存和预算配方。", | |
| "param.has_swa": "有 SWA 吗?", | |
| "param.has_swa.tip": "<strong>Sliding Window Attention</strong>。Mistral、gemma-2、phi-3 为 <code>true</code>。v0.5.3 校准审计禁用了历史 δ_SWA 校正 (n=1 拟合)。", | |
| "common.yes": "是", | |
| "common.no": "否", | |
| // 模式提示 | |
| "modes.tip": "<strong>十四种使用方式</strong>。<br><strong>📇 画像</strong>: 粘贴模型 id → 5 个配方的 TAF 卡。<br><strong>🆚 比较</strong>: 2-3 个模型在一个配方上并排比较。<br><strong>🔍 检查 config</strong>: 粘贴原始 config.json → 完整画像。<br><strong>💬 提问</strong>: 自由形式问题,浏览器 LLM 选择配方。<br><strong>📋 配方</strong>: 手动选择,完全控制表单。<br><strong>🩺 CLI 诊断</strong>: 生成 Python 命令在本地测量 γ。<br><strong>📊 相图</strong>: 23 个面板模型在 (log θ, γ) 平面上。<br><strong>🪟 揭示</strong>: 检测误导的 max_position_embeddings(SWA / YaRN / RoPE 缩放)。<br><strong>📜 Chat-template</strong>: 检测系列 + 给出 lm-eval / vLLM / transformers 的精确 CLI flag。<br><strong>🎯 Arena CI</strong>: 从原始 pairwise 投票数据重建置信区间;检测 Arena 隐藏的统计并列。<br><strong>🧪 污染</strong>: 根据训练 cutoff 与发布日期,对 20+ benchmark 进行污染概率评估。<br><strong>⚖️ Quant</strong>: 预测任意(模型 × 量化方案)的 γ-shift 与 ΔPPL;cliff 时推荐更安全替代方案。<br><strong>🔀 Drift</strong>: 同一模型,两 setup 下分数不同 — bug 还是噪声?预测数值噪声区间并标记真实 bug。<br><strong>🔍 NIAH→Reason</strong>: 从架构预测 NIAH 与多跳 reasoning 通过率;找到模型的安全 reasoning 上下文。", | |
| "profile.tip": "<strong>一键完整诊断</strong>。粘贴任意 HF 模型 id (或选择预设)。工具运行所有 5 个配方 (长上下文、KV 压缩、自定义 vs API、预算、硬件),生成单个 <strong>TAF 卡</strong>,显示每个维度的判定 + 关键数字 + 架构分类。<br><br><strong>用例</strong>: \"我正在为生产评估 Qwen2.5-32B — 它的完整可行性概况是什么?\" → 粘贴 id → 画像 → 完成。", | |
| "compare.tip": "<strong>同一配方,多个模型</strong>。选择 2-3 个候选模型和一个配方。在单个比较表中查看判定。<br><br><strong>用例</strong>: \"我需要在 16K 进行长上下文检索 — 哪个最好: Llama-3-8B、Mistral-7B 或 Qwen-7B?\" → 选择 3 个 + X-2 + 16K → 看赢家。", | |
| // 帮助模态框 | |
| "help.title": "📘 TAF Agent — 用户手册", | |
| "help.what.title": "它做什么?", | |
| "help.what.body": "在<em>花费 GPU/$ 之前</em>,预测任意 transformer LLM 的<strong>实际可行性</strong>。回答诸如 \"这个模型能在 L=32K 工作吗?\" 或 \"我应该自定义训练还是使用 API?\" 等问题,使用确定性 Python 公式 (TAF — Thermodynamic Attention Framework)。", | |
| "help.modes.title": "如何使用 — 7 种模式", | |
| "help.modes.profile": "<strong>📇 画像</strong>: 粘贴模型 id → 同时运行所有配方 = TAF 卡。<strong>最佳起点</strong>。", | |
| "help.modes.compare": "<strong>🆚 比较</strong>: 2-3 个模型在同一配方上并排。最适合在候选者之间选择。", | |
| "help.modes.inspector": "<strong>🔍 检查 config</strong>: 粘贴原始 <code>config.json</code> → 工具解析并运行完整画像。适用于私有模型、开发中的配置、或尚未在 HF Hub 上的模型。", | |
| "help.modes.ask": "<strong>💬 自由提问</strong>: 自然语言问题,浏览器 LLM 选择配方。最适合随意探索。", | |
| "help.modes.recipe": "<strong>📋 配方 + 表单</strong>: 手动选择,完全控制参数。最适合需要精确控制时。", | |
| "help.modes.diagnose": "<strong>🩺 CLI 诊断</strong>: 生成 Python 命令在你的本地机器上测量 γ (transformers + numpy)。快速 ≈5 分钟 CPU;完整 ≈20–60 分钟 GPU。结果 JSON 可通过 Inspect 重新上传。", | |
| "help.modes.phase": "<strong>📊 相图</strong>: 23 个面板模型在 (log θ, γ) 平面上的散点图。Hagedorn 线 γ=1 分隔 A 相和 B 相。点击点将该模型加载到配方表单。", | |
| "help.recipes.title": "可用的 8 个配方", | |
| "help.recipe.x1.title": "<strong>X-1 自定义训练 vs API</strong> — 比较训练自己模型的成本与付费使用 API 的成本。", | |
| "help.recipe.x1.example": "尝试: <em>\"我应该训练 8B 自定义模型还是使用 GPT-4o 处理每月 50M tokens?\"</em><br>答案: 是 (自定义) / 否 (API),含损益平衡月数。", | |
| "help.recipe.x2.title": "<strong>X-2 长上下文可行性</strong> — 预测模型是否能可靠地服务目标上下文长度。", | |
| "help.recipe.x2.example": "尝试: <em>\"Meta-Llama-3-8B 能处理 32000 tokens 检索吗?\"</em><br>链: γ_Padé → 分解 → d_horizon → NIAH 上限 → 幻觉 → KV 内存。<br>判定: 是 / 降级 / 否,如需则提供缓解措施。", | |
| "help.recipe.x3.title": "<strong>X-3 预算预飞行</strong> — 给定 $ 预算,可行训练什么模型?", | |
| "help.recipe.x3.example": "尝试: <em>\"我有 $5000,可以训练什么模型?\"</em><br>答案: GO / TINY-MODEL / MEMORY-LIMITED 含具体的 N (参数) 和 D (tokens)。", | |
| "help.recipe.x5.title": "<strong>X-5 硬件选择</strong> — 应该使用哪个 GPU 以达到目标吞吐量?", | |
| "help.recipe.x5.example": "尝试: <em>\"以每天 1000 万 tokens 提供 Llama-3-8B 的最便宜硬件\"</em><br>答案: 最佳 GPU + $/Mtok + 容量 vs 目标。", | |
| "help.recipe.x19.title": "<strong>X-19 KV 压缩决策</strong> — 应该使用 soft decay、hard cutoff 还是文献方法?", | |
| "help.recipe.x21.title": "<strong>X-21 Imprint 纯度诊断</strong> — 通过 ν=−1/(2π) 预测 RANDOM token 上的 γ;模型的 RoPE 预测有多干净?", | |
| "help.recipe.x22.title": "<strong>X-22 Compute-Context 不变量</strong> — γ × log(N²·D) 是否落在 51.2 ± 16.8 区间内?检测 scaling/training 异常。", | |
| "help.recipe.x23.title": "<strong>X-23 IH-Phase 检测器</strong> — 前- 还是后-induction-head?通过 sign(γ_text − γ_random) 进行廉价探测。", | |
| "help.recipe.x19.example": "尝试: <em>\"如何为 Qwen2.5-7B 在 32K 压缩 KV 缓存?\"</em><br>答案: USE SOFT DECAY / USE D_f CUTOFF / USE LITERATURE METHODS / USE HARD T_train.", | |
| "help.recipe.x21.example": "尝试: <em>\"Llama-3-8B 上的 RoPE 预测有多干净?\"</em><br>答案: 预测的 γ_random + 诊断 (CLEAN / OVER-IMPRINTED / UNDER-IMPRINTED)。", | |
| "help.recipe.x22.example": "尝试: <em>\"Mistral-7B 是否符合 compute-context 不变量?\"</em><br>答案: K = γ·log(N²·D)、z-score、IN-BAND 或 OUTLIER。", | |
| "help.recipe.x23.example": "尝试: <em>\"Qwen2.5-7B 是后-induction-head 吗?\"</em><br>答案: CONFIRMED PRE-IH / CONFIRMED POST-IH / ANOMALY。", | |
| "help.section.v04": "<strong>v0.4 新增</strong> (第 29 次研究会话, 2026-04-28): 来自 cross-model panel 分析 (n=22 LLMs) 的三个诊断 recipes。", | |
| "help.divider.v04_s29": "— v0.4 (第 29 次会话发现) —", | |
| "footer.tech_stack": "计算:Pyodide · 综合:WebLLM (Qwen2.5-0.5B 本地) · 托管:GitHub Pages · 成本:$0", | |
| "help.v04.imprint": "<strong>学习印记斜率 ν = −1/(2π)</strong>: RoPE 旋转周期 2π 在权重上引发位置偏置, 与 log(N_params) 成正比。即使 random token 也显示此 scaling。ν 是 DERIVED — 非拟合 (经验误差 0.3%)。", | |
| "help.v04.invariant": "<strong>Chinchilla-attention 不变量 K</strong>: γ × log(N²·D) ≈ 51.2 ± 16.8 (CV=0.329)。将 compute scaling 和 attention 指数连接为单一无量纲数。", | |
| "help.v04.ih_probe": "<strong>Δγ 作为 IH 探测</strong>: sign(γ_text − γ_random) > 0 ⟺ post-induction-head。比运行 in-context-learning 基准更便宜。", | |
| "help.v04.constants": "<strong>γ 簇落在著名常数上</strong> (有趣, n=4): CodeLlama-13b γ=0.382 ≈ 1−1/φ (黄金共轭, err 0.0003); pythia-1.4b γ=0.705 ≈ 1/√2; Llama-2-7b γ=0.287 ≈ 1−1/√2; Mistral-Nemo γ=0.428 ≈ log_10(e)。Caveat: 可能是巧合。", | |
| "help.param.theta": "<strong>θ (rope_theta)</strong>: RoPE 基础频率。越高 = 长程能力越强。典型: 10000 (早期),500000 (Llama-3),1000000 (Qwen2.5)。", | |
| "help.param.T_train": "<strong>T_train</strong>: 模型训练时的最大上下文。来自 <code>max_position_embeddings</code>。", | |
| "help.param.T_eval": "<strong>T_eval</strong>: <em>您的</em> 目标推理上下文长度。关键旋钮。", | |
| "help.param.gqa": "<strong>n_kv_heads < n_attention_heads</strong>: 模型使用 GQA (Grouped Query Attention)。减少 KV 内存但将 γ 推向 Hagedorn。", | |
| "help.param.swa": "<strong>has_SWA</strong>: 模型使用 Sliding Window Attention (Mistral、gemma-2)。", | |
| "help.param.nparams": "<strong>n_params</strong>: 总参数数量。诱导头出现的阈值约 400M。", | |
| "help.add_models.title": "添加新模型 (3 种方式)", | |
| "help.add_models.preset": "<strong>预设列表</strong>: 11 个流行模型已策划。从下拉菜单选择。", | |
| "help.add_models.hf": "<strong>HF Hub 获取</strong>: 粘贴任意 id (例如 <code>Qwen/Qwen2.5-32B-Instruct</code>),点击 📥 获取。浏览器直接从 HuggingFace 下载 <code>config.json</code>,填充表单。适用于任何公共模型。", | |
| "help.add_models.manual": "<strong>手动</strong>: 用模型卡的值直接填充表单字段。", | |
| "help.audit.title": "可审计链", | |
| "help.audit.body": "每个结果都显示完整的<strong>计算链</strong> — 每个公式步骤及其输入、输出和解释。点击任意步骤展开。引用的章节号 (§26.1、§19.1 等) 指向论文中的推导。", | |
| "help.synthesis.title": "自然语言回答", | |
| "help.synthesis.body": "在确定性链运行后,浏览器中的 LLM (Qwen2.5-0.5B,首次加载后约 350MB 缓存) 综合自然语言摘要。上面的数字<em>始终正确</em> (确定性 Python);综合由 LLM 生成 — 如有疑问,请对照链验证。", | |
| "help.params.title": "常见参数解释", | |
| "help.verdicts.title": "判定中要看什么", | |
| "help.verdict.yes": "<strong style=\"color:#3fb950;\">是 / GO</strong> — 自信地继续;数字支持选择。", | |
| "help.verdict.deg": "<strong style=\"color:#d29922;\">降级 / TINY-MODEL</strong> — 有警告地工作;阅读操作。", | |
| "help.verdict.no": "<strong style=\"color:#f85149;\">否 / MEMORY-LIMITED</strong> — 不要按原样进行;提供缓解措施。", | |
| "help.privacy.title": "隐私", | |
| "help.privacy.body": "一切都在您的浏览器中运行。无遥测,无分析,无数据发送到任何地方。即使是 LLM 模型也通过 WebGPU/WebAssembly 在本地运行。您的 model_ids 和问题永不离开此页面。", | |
| "help.source.title": "源代码和论文", | |
| "help.source.body": "源代码: <a href=\"https://github.com/karlesmarin/tafagent\" target=\"_blank\">github.com/karlesmarin/tafagent</a><br>论文: <em>Marin 2026 — Predicting How Transformers Attend</em> (<a href=\"https://zenodo.org/records/19826343\" target=\"_blank\">Zenodo</a>; arXiv 即将)<br>数据集: <a href=\"https://huggingface.co/datasets/karlexmarin/taf-attention-decay\" target=\"_blank\">taf-attention-decay</a> — 32个模型上的58次γ测量 (CC-BY-4.0)", | |
| "footer.text": "© 2026 Carles Marin · Apache-2.0 · 独立研究 · 闭合论文回路的工具。", | |
| }, | |
| }; | |
// Code of the language currently in effect; starts out as English.
let currentLang = "en";

/**
 * Report which language is active right now.
 *
 * @returns {string} The language code currently held in `currentLang`.
 */
export function getLang() {
  return currentLang;
}
/**
 * Switch the active UI language, persist the choice and refresh the page text.
 *
 * Unknown codes are ignored and leave the current language untouched.
 *
 * @param {string} code - Language code; must be a key of `TRANSLATIONS`.
 * @returns {void}
 */
export function setLang(code) {
  // Own-property check: a bare truthiness test would also accept inherited
  // names such as "constructor" or "toString" and make them the active language.
  const isKnown =
    Object.prototype.hasOwnProperty.call(TRANSLATIONS, code) && Boolean(TRANSLATIONS[code]);
  if (!isKnown) return;

  currentLang = code;

  try {
    localStorage.setItem("tafagent_lang", code);
  } catch {
    // Best effort only: storage can be unavailable or full; the language
    // still switches for this page view.
  }

  applyTranslations();

  // Highlight the flag that matches the new language, clear the others.
  for (const el of document.querySelectorAll("[data-lang]")) {
    el.classList.toggle("lang-active", el.dataset.lang === code);
  }
}
/**
 * Look up the translation for a key.
 *
 * Resolution order: active language, then English, then the key itself.
 *
 * @param {string} key - Translation key, e.g. "hero.title".
 * @returns {string} The translated string, or `key` when no table defines it.
 */
export function t(key) {
  // Read own properties only, so keys like "constructor" or "toString" do not
  // resolve to Object.prototype members; a missing table reads as undefined
  // instead of throwing.
  const own = (obj, prop) =>
    obj != null && Object.prototype.hasOwnProperty.call(obj, prop) ? obj[prop] : undefined;

  const activeTable = own(TRANSLATIONS, currentLang);
  const englishTable = own(TRANSLATIONS, "en");
  return own(activeTable, key) ?? own(englishTable, key) ?? key;
}
/**
 * Re-render every translatable element in the document.
 *
 * Elements with `data-i18n` get their markup replaced by the translated
 * string; elements with `data-i18n-placeholder` get their `placeholder` set.
 *
 * @returns {void}
 */
export function applyTranslations() {
  const textNodes = document.querySelectorAll("[data-i18n]");
  for (const node of textNodes) {
    // Translations are authored in this project and may contain markup,
    // so they are written as HTML rather than plain text.
    node.innerHTML = t(node.dataset.i18n);
  }

  const placeholderNodes = document.querySelectorAll("[data-i18n-placeholder]");
  for (const node of placeholderNodes) {
    node.placeholder = t(node.dataset.i18nPlaceholder);
  }
}
// Publish two hooks on `window` (skipped when no `window` global exists):
//   __taf_applyTranslations — lets dynamically-inserted DOM (renderProfile,
//     renderCompare) be re-translated after it is added;
//   __taf_t — the lookup itself, so modules that do not import this file
//     (e.g. hf_autocomplete, which runs outside main.js context) can localize
//     without a circular import.
if (typeof window !== "undefined") {
  Object.assign(window, {
    __taf_applyTranslations: applyTranslations,
    __taf_t: t,
  });
}
/**
 * Pick the initial language and render the page in it.
 *
 * Priority: a stored preference (localStorage key "tafagent_lang"), then the
 * first browser-preferred language that has a translation table; otherwise
 * the current language is left as it is.
 *
 * @returns {void}
 */
export function initI18n() {
  // Own-property check so inherited names ("constructor", "toString", …)
  // coming from storage or the browser are never treated as languages.
  const isSupported = (code) =>
    typeof code === "string" &&
    Object.prototype.hasOwnProperty.call(TRANSLATIONS, code) &&
    Boolean(TRANSLATIONS[code]);

  let stored = null;
  try {
    stored = localStorage.getItem("tafagent_lang");
  } catch {
    // Best effort only: storage can be unavailable; fall through to detection.
  }

  if (isSupported(stored)) {
    currentLang = stored;
  } else {
    // Candidate tags in priority order: the primary language first, then the
    // rest of the user's preference list, so a later supported language still
    // wins over the default when the first one has no translation.
    const candidates = [];
    if (typeof navigator !== "undefined") {
      if (navigator.language) candidates.push(navigator.language);
      if (Array.isArray(navigator.languages)) candidates.push(...navigator.languages);
    }
    if (candidates.length === 0) candidates.push("en");

    const detected = candidates
      .map((tag) => String(tag).slice(0, 2).toLowerCase())
      .find(isSupported);
    if (detected) currentLang = detected;
  }

  applyTranslations();

  // Mark the flag of the active language, clear the others.
  for (const el of document.querySelectorAll("[data-lang]")) {
    el.classList.toggle("lang-active", el.dataset.lang === currentLang);
  }
}