Spaces:
Running
v0.8.1 Solutions Hub — integrator portal (30 pains × 65 external tools)
Browse files
🧭 Solutions Hub mode: every documented LLM-eval pain mapped to
(a) the tafagent mode that addresses it (16 of 30 covered) and
(b) the best-of-breed external tools the community already trusts
(65 curated links across RAGAS, MTEB, HELM, MCP Schema Validator,
llm-stats, llguidance, GlitchMiner, RULER, JSONLint, FastMCP,
LangSmith, TruLens, DeepEval, etc.).
Strategy shift: tafagent as integrator, not silo. If a canonical
solution exists publicly we link, not rebuild. Round-3 + round-4
research (2026-05-07) validated this — 6 of 10 candidate pains
had production-grade tools already (skip build). Hub closes the
loop: users land here, find the right tool, regardless of who
shipped it.
Coverage: 7 categories — eval reliability · diagnostic · setup ·
training · retrieval · multimodal · observability. Each pain entry
has: tafagent_mode (or null/planned), external_tools[]
(name+url+type), best_for, not_for. Tool types: tool / leaderboard /
paper / article / docs / issue / spec / benchmark.
UI: live search across pain+scenario+tool name, accordion per
category, badges for coverage status. i18n × 4 langs (EN/ES/FR/ZH).
Help modal entry, inventory card entry, task-tile button.
Also surfaces 2 planned tafagent gaps: 🔧 PEFT Anti-Pattern Checker
(v0.8.2 candidate, peft #2115 silent fail) and JSON CoT-aware Linter
(answer-before-reasoning bug). Both browser-feasible, no current tool.
URL validation 2026-05-07: top critical URLs fetched + confirmed alive
(HF PEFT troubleshooting docs, MCP Schema Validator, RAGAS v0.4.3
13.8k★, MTEB leaderboard).
Files: data/solutions_hub.json + js/solutions_hub.js (new);
index.html + js/main.js + js/i18n.js (modified).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- data/solutions_hub.json +407 -0
- index.html +24 -0
- js/i18n.js +72 -0
- js/main.js +108 -1
- js/solutions_hub.js +69 -0
|
@@ -0,0 +1,407 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"version": "0.8.1",
|
| 3 |
+
"compiled": "2026-05-07",
|
| 4 |
+
"philosophy": "tafagent as integrator, not silo. For each documented LLM-eval pain we surface: (a) the tafagent mode that addresses it, if any; (b) the best-of-breed external tools the community already trusts; (c) when to use which. Goal: complete coverage, not feature lock-in. If the canonical tool exists elsewhere we link, not rebuild.",
|
| 5 |
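+
"verification_note": "The most critical external URLs (e.g. HF PEFT troubleshooting docs, MCP Schema Validator, RAGAS, MTEB leaderboard) were fetched and confirmed alive on the compiled date; other links may not have been re-checked. Treat older entries with skepticism — link rot is real. Report dead links via the GitHub issue tracker.",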
|
| 6 |
+
"categories": {
|
| 7 |
+
"eval_reliability": {
|
| 8 |
+
"label": "Trust a benchmark score",
|
| 9 |
+
"icon": "✓",
|
| 10 |
+
"description": "Should I believe this number?"
|
| 11 |
+
},
|
| 12 |
+
"diagnostic": {
|
| 13 |
+
"label": "Diagnose a model",
|
| 14 |
+
"icon": "🔬",
|
| 15 |
+
"description": "Will this model work for my use case?"
|
| 16 |
+
},
|
| 17 |
+
"setup": {
|
| 18 |
+
"label": "Set up an eval correctly",
|
| 19 |
+
"icon": "⚙️",
|
| 20 |
+
"description": "Avoid silent failures before running."
|
| 21 |
+
},
|
| 22 |
+
"training": {
|
| 23 |
+
"label": "Train / fine-tune safely",
|
| 24 |
+
"icon": "🛠️",
|
| 25 |
+
"description": "Don't waste GPU time on broken setups."
|
| 26 |
+
},
|
| 27 |
+
"retrieval": {
|
| 28 |
+
"label": "RAG & retrieval quality",
|
| 29 |
+
"icon": "📚",
|
| 30 |
+
"description": "Is my retrieval actually retrieving?"
|
| 31 |
+
},
|
| 32 |
+
"multimodal": {
|
| 33 |
+
"label": "Multimodal models",
|
| 34 |
+
"icon": "🖼️",
|
| 35 |
+
"description": "Vision-language and beyond."
|
| 36 |
+
},
|
| 37 |
+
"observability": {
|
| 38 |
+
"label": "Observe & debug agents",
|
| 39 |
+
"icon": "🔭",
|
| 40 |
+
"description": "What is my agent actually doing?"
|
| 41 |
+
}
|
| 42 |
+
},
|
| 43 |
+
"entries": [
|
| 44 |
+
{
|
| 45 |
+
"id": "saturation",
|
| 46 |
+
"category": "eval_reliability",
|
| 47 |
+
"pain": "Benchmark saturation — top models all tied at 90%+, score no longer informative.",
|
| 48 |
+
"tafagent_mode": "📈 Saturation",
|
| 49 |
+
"external_tools": [
|
| 50 |
+
{"name": "DemandSphere AI Frontier Tracker", "url": "https://www.demandsphere.com/research/demandsphere-radar/ai-frontier-model-tracker/", "type": "leaderboard"},
|
| 51 |
+
{"name": "BenchLM.ai", "url": "https://benchlm.ai/", "type": "leaderboard"},
|
| 52 |
+
{"name": "LLM Stats", "url": "https://llm-stats.com/", "type": "leaderboard"}
|
| 53 |
+
],
|
| 54 |
+
"best_for": "Quick check whether MMLU / AIME / HumanEval still discriminate frontier models in 2026.",
|
| 55 |
+
"not_for": "Predicting which model will win on a non-standard benchmark."
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"id": "contamination",
|
| 59 |
+
"category": "eval_reliability",
|
| 60 |
+
"pain": "Benchmark contamination — model trained on the test set.",
|
| 61 |
+
"tafagent_mode": "🧪 Contamination",
|
| 62 |
+
"external_tools": [
|
| 63 |
+
{"name": "LiveBench (contamination-resistant)", "url": "https://livebench.ai/", "type": "leaderboard"},
|
| 64 |
+
{"name": "GSM8K-Platinum / contamination studies", "url": "https://thegrigorian.medium.com/when-benchmarks-lie-why-contamination-breaks-llm-evaluation-1fa335706f32", "type": "article"}
|
| 65 |
+
],
|
| 66 |
+
"best_for": "Estimating contamination probability across 20+ public benchmarks per architecture.",
|
| 67 |
+
"not_for": "Definitive proof — needs trace inspection. Treat as prior, not certainty."
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"id": "vendor_self_reported",
|
| 71 |
+
"category": "eval_reliability",
|
| 72 |
+
"pain": "Vendor-reported scores untrustworthy (Llama 4 mixed-quality reports).",
|
| 73 |
+
"tafagent_mode": null,
|
| 74 |
+
"external_tools": [
|
| 75 |
+
{"name": "llm-stats verified vs self-reported tags", "url": "https://llm-stats.com/benchmarks/swe-bench-verified", "type": "leaderboard"},
|
| 76 |
+
{"name": "BenchLM.ai confidence indicator", "url": "https://benchlm.ai/", "type": "leaderboard"},
|
| 77 |
+
{"name": "Vellum independent leaderboard", "url": "https://www.vellum.ai/llm-leaderboard", "type": "leaderboard"}
|
| 78 |
+
],
|
| 79 |
+
"best_for": "Cross-checking vendor blog claims against community-verified runs before quoting.",
|
| 80 |
+
"not_for": "Models that have never been independently verified — assume vendor optimism."
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"id": "arena_ci",
|
| 84 |
+
"category": "eval_reliability",
|
| 85 |
+
"pain": "Chatbot Arena hides confidence intervals — many top-Elo wins are statistically tied.",
|
| 86 |
+
"tafagent_mode": "🎯 Arena CI",
|
| 87 |
+
"external_tools": [
|
| 88 |
+
{"name": "LMArena leaderboard (raw)", "url": "https://lmarena.ai/", "type": "leaderboard"},
|
| 89 |
+
{"name": "Bradley-Terry methodology paper", "url": "https://arxiv.org/abs/2403.04132", "type": "paper"}
|
| 90 |
+
],
|
| 91 |
+
"best_for": "Reconstructing 95% CIs from raw vote CSVs to flag statistical ties.",
|
| 92 |
+
"not_for": "Inferring true skill — Arena measures preference, not capability."
|
| 93 |
+
},
|
| 94 |
+
{
|
| 95 |
+
"id": "cross_drift",
|
| 96 |
+
"category": "eval_reliability",
|
| 97 |
+
"pain": "Same model, different scores on different setups — bug or noise?",
|
| 98 |
+
"tafagent_mode": "🔀 Drift",
|
| 99 |
+
"external_tools": [
|
| 100 |
+
{"name": "vLLM vs HF transformers consistency study", "url": "https://github.com/vllm-project/vllm/issues/12343", "type": "issue"}
|
| 101 |
+
],
|
| 102 |
+
"best_for": "Predicting maximum admissible numerical gap between two evaluation frameworks.",
|
| 103 |
+
"not_for": "Identifying the exact root cause — narrows down candidates only."
|
| 104 |
+
},
|
| 105 |
+
{
|
| 106 |
+
"id": "sandbagging",
|
| 107 |
+
"category": "eval_reliability",
|
| 108 |
+
"pain": "Models can strategically underperform on capability evaluations.",
|
| 109 |
+
"tafagent_mode": null,
|
| 110 |
+
"external_tools": [
|
| 111 |
+
{"name": "AI Sandbagging paper", "url": "https://arxiv.org/abs/2406.07358", "type": "paper"},
|
| 112 |
+
{"name": "Covert sandbagging vs CoT monitoring", "url": "https://www.alphaxiv.org/overview/2508.00943", "type": "paper"}
|
| 113 |
+
],
|
| 114 |
+
"best_for": "Awareness — knowing CoT monitoring can have a false-negative rate of up to 36%.",
|
| 115 |
+
"not_for": "Live detection — requires running the model and adversarial probes."
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"id": "max_pos_embeddings_unmask",
|
| 119 |
+
"category": "diagnostic",
|
| 120 |
+
"pain": "Config claims 32k/128k context but model attends way less (SWA, YaRN).",
|
| 121 |
+
"tafagent_mode": "🪟 Unmask",
|
| 122 |
+
"external_tools": [
|
| 123 |
+
{"name": "vLLM long-context handling thread", "url": "https://github.com/vllm-project/vllm/issues/16757", "type": "issue"}
|
| 124 |
+
],
|
| 125 |
+
"best_for": "1-second verdict (HONEST / INFLATED / SEVERELY INFLATED / YARN-EXTENDED) before paying for GPU time.",
|
| 126 |
+
"not_for": "Validating that the model reasons (vs. just retrieves) at the effective context — use NIAH→Reason."
|
| 127 |
+
},
|
| 128 |
+
{
|
| 129 |
+
"id": "niah_reasoning",
|
| 130 |
+
"category": "diagnostic",
|
| 131 |
+
"pain": "Long-context models pass NIAH but fail multi-hop reasoning.",
|
| 132 |
+
"tafagent_mode": "🔍 NIAH→Reason",
|
| 133 |
+
"external_tools": [
|
| 134 |
+
{"name": "NVIDIA RULER benchmark", "url": "https://github.com/NVIDIA/RULER", "type": "tool"},
|
| 135 |
+
{"name": "RULER paper / leaderboard", "url": "https://llm-stats.com/benchmarks/ruler", "type": "leaderboard"}
|
| 136 |
+
],
|
| 137 |
+
"best_for": "Predicting NIAH and reasoning pass rates from architecture alone — no inference needed.",
|
| 138 |
+
"not_for": "Final go/no-go decision — re-test on your domain after architectural screening passes."
|
| 139 |
+
},
|
| 140 |
+
{
|
| 141 |
+
"id": "tokenizer_glitch",
|
| 142 |
+
"category": "diagnostic",
|
| 143 |
+
"pain": "Glitch tokens / merge residues break inference silently.",
|
| 144 |
+
"tafagent_mode": null,
|
| 145 |
+
"external_tools": [
|
| 146 |
+
{"name": "GlitchMiner (AAAI 2026)", "url": "https://arxiv.org/html/2601.14658v1", "type": "paper"},
|
| 147 |
+
{"name": "Tiktokenizer (browser visualization)", "url": "https://tiktokenizer.vercel.app/", "type": "tool"}
|
| 148 |
+
],
|
| 149 |
+
"best_for": "Spotting weird tokens. ~4.3% of vocab in Llama-2 / Mistral / DeepSeek-V3 are glitches.",
|
| 150 |
+
"not_for": "Fixing them — requires finetuning or vocab patching."
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"id": "phase_diagram",
|
| 154 |
+
"category": "diagnostic",
|
| 155 |
+
"pain": "Where does my model sit in the architecture phase space (γ × θ)?",
|
| 156 |
+
"tafagent_mode": "📊 Phase diagram",
|
| 157 |
+
"external_tools": [],
|
| 158 |
+
"best_for": "Visualizing 23 reference models and locating yours by Hagedorn line / Padé curve.",
|
| 159 |
+
"not_for": "Quantitative recipe scoring — use Profile mode instead."
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"id": "profile",
|
| 163 |
+
"category": "diagnostic",
|
| 164 |
+
"pain": "Will this model fit my use case across all 5 recipes?",
|
| 165 |
+
"tafagent_mode": "📇 Profile",
|
| 166 |
+
"external_tools": [],
|
| 167 |
+
"best_for": "Scoring all 5 recipes (custom train vs API · long context · budget · hardware · KV cache · etc.) in one pass.",
|
| 168 |
+
"not_for": "Production deployment readiness — Profile is screening, not certification."
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"id": "chat_template",
|
| 172 |
+
"category": "setup",
|
| 173 |
+
"pain": "Forgetting `--apply_chat_template` silently halves multi-turn accuracy.",
|
| 174 |
+
"tafagent_mode": "📜 Chat-template",
|
| 175 |
+
"external_tools": [
|
| 176 |
+
{"name": "lm-eval-harness #1841 (canonical issue)", "url": "https://github.com/EleutherAI/lm-evaluation-harness/issues/1841", "type": "issue"},
|
| 177 |
+
{"name": "HF chat-template docs", "url": "https://huggingface.co/docs/transformers/main/en/chat_templating", "type": "docs"}
|
| 178 |
+
],
|
| 179 |
+
"best_for": "Detecting which family (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / DeepSeek / Alpaca) and getting the exact CLI flag.",
|
| 180 |
+
"not_for": "Custom templates outside the 7 detected families — verify manually."
|
| 181 |
+
},
|
| 182 |
+
{
|
| 183 |
+
"id": "structured_outputs",
|
| 184 |
+
"category": "setup",
|
| 185 |
+
"pain": "JSON schema engines fail silently; CoT models commit to answer before reasoning.",
|
| 186 |
+
"tafagent_mode": null,
|
| 187 |
+
"external_tools": [
|
| 188 |
+
{"name": "llguidance (constrained decoding)", "url": "https://github.com/guidance-ai/llguidance", "type": "tool"},
|
| 189 |
+
{"name": "Outlines", "url": "https://github.com/dottxt-ai/outlines", "type": "tool"},
|
| 190 |
+
{"name": "JSONLint validator (browser)", "url": "https://jsonlint.com/json-schema", "type": "tool"},
|
| 191 |
+
{"name": "JSONSchemaBench (10K real schemas)", "url": "https://github.com/guidance-ai/jsonschemabench", "type": "benchmark"},
|
| 192 |
+
{"name": "Schema field-ordering anti-patterns explained", "url": "https://collinwilkins.com/articles/structured-output", "type": "article"}
|
| 193 |
+
],
|
| 194 |
+
"best_for": "Constrained decoding for production. Use llguidance / Outlines / SGLang grammars for 100% schema-valid output.",
|
| 195 |
+
"not_for": "Quick prototypes — function calling is sufficient (95-99% reliable)."
|
| 196 |
+
},
|
| 197 |
+
{
|
| 198 |
+
"id": "mcp_conformance",
|
| 199 |
+
"category": "setup",
|
| 200 |
+
"pain": "MCP server schema doesn't conform to spec — clients silently break.",
|
| 201 |
+
"tafagent_mode": null,
|
| 202 |
+
"external_tools": [
|
| 203 |
+
{"name": "MCP Schema Validator (free, browser-based)", "url": "https://www.mcpserverspot.com/tools/validator", "type": "tool"},
|
| 204 |
+
{"name": "Official MCP spec", "url": "https://github.com/modelcontextprotocol/modelcontextprotocol", "type": "spec"},
|
| 205 |
+
{"name": "FastMCP 3.0 (Jan 2026)", "url": "https://github.com/jlowin/fastmcp", "type": "tool"}
|
| 206 |
+
],
|
| 207 |
+
"best_for": "One-shot validation of tool/resource/prompt schemas before publishing an MCP server.",
|
| 208 |
+
"not_for": "Runtime testing — use the official inspector for live calls."
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"id": "diagnose_cli",
|
| 212 |
+
"category": "setup",
|
| 213 |
+
"pain": "Need to measure γ_obs on real weights, not just predict from config.",
|
| 214 |
+
"tafagent_mode": "🩺 Diagnose CLI",
|
| 215 |
+
"external_tools": [
|
| 216 |
+
{"name": "TAF paper (Triangulum/karlesmarin)", "url": "https://github.com/karlesmarin/NeurIPS", "type": "paper"}
|
| 217 |
+
],
|
| 218 |
+
"best_for": "Generating the exact `python cli/diagnose_model.py` command for your model.",
|
| 219 |
+
"not_for": "Browser-only diagnosis — this mode is a builder, not an executor."
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"id": "peft_loading",
|
| 223 |
+
"category": "training",
|
| 224 |
+
"pain": "`get_peft_model()` before `PeftModel.from_pretrained()` silently loads base model — LoRA weights ignored.",
|
| 225 |
+
"tafagent_mode": null,
|
| 226 |
+
"external_tools": [
|
| 227 |
+
{"name": "HF PEFT troubleshooting (canonical)", "url": "https://huggingface.co/docs/peft/main/en/developer_guides/troubleshooting", "type": "docs"},
|
| 228 |
+
{"name": "peft #2115 — original bug report", "url": "https://github.com/huggingface/peft/issues/2115", "type": "issue"},
|
| 229 |
+
{"name": "PEFT get_layer_status() / get_model_status()", "url": "https://huggingface.co/docs/peft/main/en/package_reference/peft_model", "type": "docs"}
|
| 230 |
+
],
|
| 231 |
+
"best_for": "If you suspect your LoRA isn't being applied, call `model.get_layer_status()` and check `active_adapters` is non-empty.",
|
| 232 |
+
"not_for": null,
|
| 233 |
+
"tafagent_planned_mode": "🔧 PEFT Anti-Pattern Checker (v0.8.2)"
|
| 234 |
+
},
|
| 235 |
+
{
|
| 236 |
+
"id": "intruder_dimensions",
|
| 237 |
+
"category": "training",
|
| 238 |
+
"pain": "LoRA introduces 'intruder dimensions' that contribute to forgetting.",
|
| 239 |
+
"tafagent_mode": null,
|
| 240 |
+
"external_tools": [
|
| 241 |
+
{"name": "PEFT reduce_intruder_dimension utility", "url": "https://huggingface.co/docs/peft/main/en/developer_guides/troubleshooting", "type": "docs"}
|
| 242 |
+
],
|
| 243 |
+
"best_for": "Post-training cleanup if forgetting metrics regress after LoRA finetune.",
|
| 244 |
+
"not_for": "Heavy domain shift — intruder dim removal won't fix structural forgetting."
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"id": "quant_regime",
|
| 248 |
+
"category": "training",
|
| 249 |
+
"pain": "Will quantization break my model? Which scheme for which arch?",
|
| 250 |
+
"tafagent_mode": "⚖️ Quant",
|
| 251 |
+
"external_tools": [
|
| 252 |
+
{"name": "Maarten Grootendorst quantization newsletter", "url": "https://newsletter.maartengrootendorst.com/p/which-quantization-method-is-right", "type": "article"},
|
| 253 |
+
{"name": "Jarvis Labs vLLM quantization benchmarks", "url": "https://jarvislabs.ai/blog/vllm-quantization-complete-guide-benchmarks", "type": "article"},
|
| 254 |
+
{"name": "oobabooga quant comparison (GPTQ/AWQ/EXL2/GGUF)", "url": "https://oobabooga.github.io/blog/posts/gptq-awq-exl2-llamacpp/", "type": "article"},
|
| 255 |
+
{"name": "Which Quantization (arxiv)", "url": "https://arxiv.org/pdf/2601.14277", "type": "paper"}
|
| 256 |
+
],
|
| 257 |
+
"best_for": "Predict γ shift + ΔPPL for any (model × scheme) combo. AWQ ~95% / GGUF ~92% / GPTQ ~90% retention.",
|
| 258 |
+
"not_for": "Production quality cert — run a 10-prompt holdout eval after quantization."
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"id": "forgetting",
|
| 262 |
+
"category": "training",
|
| 263 |
+
"pain": "Will my LoRA fine-tune destroy MMLU performance?",
|
| 264 |
+
"tafagent_mode": null,
|
| 265 |
+
"external_tools": [
|
| 266 |
+
{"name": "Scaling Laws for Forgetting (Kleiman et al.)", "url": "https://arxiv.org/html/2401.05605v1", "type": "paper"},
|
| 267 |
+
{"name": "LoRA Learns Less and Forgets Less (Biderman et al., TMLR)", "url": "https://arxiv.org/abs/2405.09673", "type": "paper"},
|
| 268 |
+
{"name": "How Much is Too Much? (LoRA Rank Trade-offs)", "url": "https://arxiv.org/html/2512.15634v1", "type": "paper"}
|
| 269 |
+
],
|
| 270 |
+
"best_for": "Reading before any new fine-tune. Same (arch, rank) yields Δ from -10pp to +35pp on MMLU.",
|
| 271 |
+
"not_for": "A predictor — variance is too high for a closed-form heuristic. Measure your own holdout."
|
| 272 |
+
},
|
| 273 |
+
{
|
| 274 |
+
"id": "rag_eval",
|
| 275 |
+
"category": "retrieval",
|
| 276 |
+
"pain": "Is my RAG retrieval actually retrieving?",
|
| 277 |
+
"tafagent_mode": null,
|
| 278 |
+
"external_tools": [
|
| 279 |
+
{"name": "RAGAS — automated RAG eval (13.8k★)", "url": "https://github.com/explodinggradients/ragas", "type": "tool"},
|
| 280 |
+
{"name": "TruLens — feedback functions + tracing", "url": "https://www.trulens.org/", "type": "tool"},
|
| 281 |
+
{"name": "DeepEval — 50+ metrics, CI/CD ready", "url": "https://github.com/confident-ai/deepeval", "type": "tool"},
|
| 282 |
+
{"name": "RAG eval frameworks comparison", "url": "https://atlan.com/know/llm-evaluation-frameworks-compared/", "type": "article"}
|
| 283 |
+
],
|
| 284 |
+
"best_for": "Production RAG monitoring. RAGAS for metric exploration, DeepEval for CI/CD gates, TruLens for dashboards.",
|
| 285 |
+
"not_for": "Browser-only — all three need Python + your retrieval pipeline."
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"id": "embeddings",
|
| 289 |
+
"category": "retrieval",
|
| 290 |
+
"pain": "Which embedding model for my corpus?",
|
| 291 |
+
"tafagent_mode": null,
|
| 292 |
+
"external_tools": [
|
| 293 |
+
{"name": "MTEB Leaderboard (HF official)", "url": "https://huggingface.co/spaces/mteb/leaderboard", "type": "leaderboard"},
|
| 294 |
+
{"name": "MMTEB — 250+ langs", "url": "https://github.com/embeddings-benchmark/mteb", "type": "tool"},
|
| 295 |
+
{"name": "Best embedding models for RAG (2026)", "url": "https://blog.premai.io/best-embedding-models-for-rag-2026-ranked-by-mteb-score-cost-and-self-hosting/", "type": "article"}
|
| 296 |
+
],
|
| 297 |
+
"best_for": "Cross-comparison of 100+ embedding models on 56 English tasks (MTEB) or across 250+ languages (MMTEB).",
|
| 298 |
+
"not_for": "Predicting performance on your specific corpus — 'leaderboard ≠ your data'."
|
| 299 |
+
},
|
| 300 |
+
{
|
| 301 |
+
"id": "vlm_eval",
|
| 302 |
+
"category": "multimodal",
|
| 303 |
+
"pain": "Which VLM benchmark, and is my VLM actually seeing?",
|
| 304 |
+
"tafagent_mode": "📈 Saturation (covers MMMU/MMMU-Pro/VisScience)",
|
| 305 |
+
"external_tools": [
|
| 306 |
+
{"name": "MMMU benchmark", "url": "https://mmmu-benchmark.github.io/", "type": "leaderboard"},
|
| 307 |
+
{"name": "VisScience (K-12 science)", "url": "https://arxiv.org/abs/2409.13730", "type": "paper"},
|
| 308 |
+
{"name": "VLM survey 2025", "url": "https://arxiv.org/abs/2501.02189", "type": "paper"}
|
| 309 |
+
],
|
| 310 |
+
"best_for": "MMMU near-saturated (top-3 ~85.6%); VisScience still discriminative (~46% mean) — pick the harder one.",
|
| 311 |
+
"not_for": "Visual hallucination detection — needs running the VLM with your images."
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"id": "agent_observability",
|
| 315 |
+
"category": "observability",
|
| 316 |
+
"pain": "Why did my agent fail / loop? Can't tell from logs.",
|
| 317 |
+
"tafagent_mode": null,
|
| 318 |
+
"external_tools": [
|
| 319 |
+
{"name": "LangSmith (LangChain ecosystem)", "url": "https://www.langchain.com/langsmith/observability", "type": "tool"},
|
| 320 |
+
{"name": "LangGraph Studio v2 (May 2025)", "url": "https://www.langchain.com/", "type": "tool"},
|
| 321 |
+
{"name": "TruLens (RAG + agent traces)", "url": "https://www.trulens.org/", "type": "tool"},
|
| 322 |
+
{"name": "OpenLLMetry — OTLP-based tracing", "url": "https://github.com/traceloop/openllmetry", "type": "tool"}
|
| 323 |
+
],
|
| 324 |
+
"best_for": "Visual trace viewer per LLM call / tool invocation / retrieval step. Token + cost tracking.",
|
| 325 |
+
"not_for": "Browser-only — all need integration into your stack."
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"id": "instruction_following",
|
| 329 |
+
"category": "observability",
|
| 330 |
+
"pain": "Best agentic models follow <30% of instructions perfectly on real-world tasks.",
|
| 331 |
+
"tafagent_mode": null,
|
| 332 |
+
"external_tools": [
|
| 333 |
+
{"name": "AGENTIF benchmark", "url": "https://keg.cs.tsinghua.edu.cn/persons/xubin/papers/AgentIF.pdf", "type": "paper"},
|
| 334 |
+
{"name": "Tool output processing benchmark", "url": "https://arxiv.org/html/2510.15955v1", "type": "paper"}
|
| 335 |
+
],
|
| 336 |
+
"best_for": "Calibrating expectations — performance falls with instruction length and tool constraints.",
|
| 337 |
+
"not_for": "Live testing — needs running the agent on your task suite."
|
| 338 |
+
},
|
| 339 |
+
{
|
| 340 |
+
"id": "saturation_meta_resources",
|
| 341 |
+
"category": "eval_reliability",
|
| 342 |
+
"pain": "I want to read the full state of LLM evaluation 2026.",
|
| 343 |
+
"tafagent_mode": null,
|
| 344 |
+
"external_tools": [
|
| 345 |
+
{"name": "Survey: A Survey on LLM Benchmarks (2508.15361)", "url": "https://arxiv.org/abs/2508.15361", "type": "paper"},
|
| 346 |
+
{"name": "Survey: LLMs-as-Judges (2412.05579)", "url": "https://arxiv.org/abs/2412.05579", "type": "paper"},
|
| 347 |
+
{"name": "Holistic Evaluation of Language Models (HELM)", "url": "https://crfm.stanford.edu/helm/latest/", "type": "tool"},
|
| 348 |
+
{"name": "Open LLM Leaderboard v3", "url": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard", "type": "leaderboard"}
|
| 349 |
+
],
|
| 350 |
+
"best_for": "Comprehensive context on contamination, judge bias, saturation, methodology open problems.",
|
| 351 |
+
"not_for": "Quick decisions — these are surveys, not tools."
|
| 352 |
+
},
|
| 353 |
+
{
|
| 354 |
+
"id": "config_inspector",
|
| 355 |
+
"category": "diagnostic",
|
| 356 |
+
"pain": "What's actually in this model's config.json?",
|
| 357 |
+
"tafagent_mode": "🔍 Inspect config",
|
| 358 |
+
"external_tools": [
|
| 359 |
+
{"name": "LLM Config Comparer", "url": "https://huggingface.co/spaces/gojiteji/LLM-Comparer", "type": "tool"},
|
| 360 |
+
{"name": "HF Hub model card / config viewer", "url": "https://huggingface.co/", "type": "tool"}
|
| 361 |
+
],
|
| 362 |
+
"best_for": "Paste config JSON → full TAF analysis without re-fetching.",
|
| 363 |
+
"not_for": "Comparing across N models — use 🆚 Compare or open-llm-leaderboard/comparator."
|
| 364 |
+
},
|
| 365 |
+
{
|
| 366 |
+
"id": "compare_models",
|
| 367 |
+
"category": "diagnostic",
|
| 368 |
+
"pain": "Side-by-side comparison of multiple models on multiple recipes.",
|
| 369 |
+
"tafagent_mode": "🆚 Compare models",
|
| 370 |
+
"external_tools": [
|
| 371 |
+
{"name": "Open LLM Leaderboard Comparator (HF official)", "url": "https://huggingface.co/spaces/open-llm-leaderboard/comparator", "type": "tool"}
|
| 372 |
+
],
|
| 373 |
+
"best_for": "Quick recipe-by-recipe comparison up to 5 models.",
|
| 374 |
+
"not_for": "Production benchmark scores — use the HF comparator for benchmark results."
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"id": "ask_plain_english",
|
| 378 |
+
"category": "diagnostic",
|
| 379 |
+
"pain": "I just want to ask a question in plain English.",
|
| 380 |
+
"tafagent_mode": "💬 Ask plain English",
|
| 381 |
+
"external_tools": [],
|
| 382 |
+
"best_for": "'Will Mistral-7B handle 16K NIAH retrieval?' → answer with the right recipe + chain.",
|
| 383 |
+
"not_for": "Open-ended chat — this is a routing front-end, not a chatbot."
|
| 384 |
+
},
|
| 385 |
+
{
|
| 386 |
+
"id": "recipe_picker",
|
| 387 |
+
"category": "diagnostic",
|
| 388 |
+
"pain": "I know my use case but not which recipe to apply.",
|
| 389 |
+
"tafagent_mode": "📋 Pick recipe",
|
| 390 |
+
"external_tools": [],
|
| 391 |
+
"best_for": "Browsing the 8 recipes (custom train vs API · long context · budget · hardware · etc.) when you don't know which fits.",
|
| 392 |
+
"not_for": "Running all of them at once — use Profile mode."
|
| 393 |
+
},
|
| 394 |
+
{
|
| 395 |
+
"id": "verified_math",
|
| 396 |
+
"category": "diagnostic",
|
| 397 |
+
"pain": "Can I trust the math behind the diagnostic?",
|
| 398 |
+
"tafagent_mode": null,
|
| 399 |
+
"external_tools": [
|
| 400 |
+
{"name": "Lean theorems (Triangulum/karlesmarin/lean-taf)", "url": "https://github.com/karlesmarin/lean-taf", "type": "spec"},
|
| 401 |
+
{"name": "TAF paper (NeurIPS)", "url": "https://github.com/karlesmarin/NeurIPS", "type": "paper"}
|
| 402 |
+
],
|
| 403 |
+
"best_for": "37 theorems machine-proven in Lean 4 + Mathlib. Click any badge in the UI to open the source line.",
|
| 404 |
+
"not_for": "Empirical claims — Lean covers algebraic identities, not measurement protocols."
|
| 405 |
+
}
|
| 406 |
+
]
|
| 407 |
+
}
|
|
@@ -216,6 +216,9 @@
|
|
| 216 |
<p><strong data-i18n="help.v08.saturation.title">📈 Benchmark Saturation Detector</strong></p>
|
| 217 |
<p data-i18n="help.v08.saturation.body">MMLU is saturated (top 88-94%), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.</p>
|
| 218 |
|
|
|
|
|
|
|
|
|
|
| 219 |
<h3 data-i18n="help.audit.title">The audit chain</h3>
|
| 220 |
<p data-i18n="help.audit.body">Every result shows the full <strong>Computation Chain</strong> — each formula step with its inputs,
|
| 221 |
output, and interpretation. Click any step to expand. Cited section numbers (§26.1, §19.1, etc.) refer
|
|
@@ -325,6 +328,7 @@
|
|
| 325 |
<li data-i18n="inv.v07.drift"><strong>🔀 Drift</strong> — bug or noise? Predict max admissible gap between two evals</li>
|
| 326 |
<li data-i18n="inv.v07.niah"><strong>🔍 NIAH→Reason</strong> — does your "128k context" actually reason there, or just retrieve?</li>
|
| 327 |
<li data-i18n="inv.v08.saturation"><strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?</li>
|
|
|
|
| 328 |
</ul>
|
| 329 |
</details>
|
| 330 |
</div>
|
|
@@ -383,6 +387,7 @@
|
|
| 383 |
<button data-mode-link="drift" data-i18n="modes.drift">🔀 Drift</button>
|
| 384 |
<button data-mode-link="arena" data-i18n="modes.arena">🎯 Arena CI</button>
|
| 385 |
<button data-mode-link="saturation" data-i18n="modes.saturation">📈 Saturation</button>
|
|
|
|
| 386 |
</div>
|
| 387 |
</div>
|
| 388 |
<div class="task-tile">
|
|
@@ -450,6 +455,7 @@
|
|
| 450 |
<button class="mode-btn" data-mode="drift" role="tab" aria-selected="false" data-i18n="modes.drift">🔀 Drift</button>
|
| 451 |
<button class="mode-btn" data-mode="niah" role="tab" aria-selected="false" data-i18n="modes.niah">🔍 NIAH→Reason</button>
|
| 452 |
<button class="mode-btn" data-mode="saturation" role="tab" aria-selected="false" data-i18n="modes.saturation">📈 Saturation</button>
|
|
|
|
| 453 |
</div>
|
| 454 |
<p id="mode-desc" class="recipe-desc" data-i18n="modes.desc">
|
| 455 |
<strong>Quickest start</strong>: paste any HuggingFace model id (e.g. <code>meta-llama/Meta-Llama-3-8B</code>),
|
|
@@ -997,6 +1003,24 @@
|
|
| 997 |
</p>
|
| 998 |
</section>
|
| 999 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1000 |
<!-- Recipe selector (mode=recipe) -->
|
| 1001 |
<section id="recipe-section" style="display:none;">
|
| 1002 |
<h2 data-i18n="recipe.title">📋 Recipe</h2>
|
|
|
|
| 216 |
<p><strong data-i18n="help.v08.saturation.title">📈 Benchmark Saturation Detector</strong></p>
|
| 217 |
<p data-i18n="help.v08.saturation.body">MMLU is saturated (top 88-94%), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.</p>
|
| 218 |
|
| 219 |
+
<p><strong data-i18n="help.v081.hub.title">🧭 Solutions Hub</strong></p>
|
| 220 |
+
<p data-i18n="help.v081.hub.body">tafagent as integrator, not silo. 30+ pains across 7 categories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), each mapped to (a) the tafagent mode that addresses it, if any, and (b) the best-of-breed external tools the community already trusts (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Search box matches across pain, scenario, and tool name. <em>Use case</em>: 'I have problem X — does tafagent solve it, and if not, who does?'</p>
|
| 221 |
+
|
| 222 |
<h3 data-i18n="help.audit.title">The audit chain</h3>
|
| 223 |
<p data-i18n="help.audit.body">Every result shows the full <strong>Computation Chain</strong> — each formula step with its inputs,
|
| 224 |
output, and interpretation. Click any step to expand. Cited section numbers (§26.1, §19.1, etc.) refer
|
|
|
|
| 328 |
<li data-i18n="inv.v07.drift"><strong>🔀 Drift</strong> — bug or noise? Predict max admissible gap between two evals</li>
|
| 329 |
<li data-i18n="inv.v07.niah"><strong>🔍 NIAH→Reason</strong> — does your "128k context" actually reason there, or just retrieve?</li>
|
| 330 |
<li data-i18n="inv.v08.saturation"><strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?</li>
|
| 331 |
+
<li data-i18n="inv.v081.hub"><strong>🧭 Solutions Hub</strong> — every documented pain mapped to a tafagent mode or curated external tool. Don't reinvent — find.</li>
|
| 332 |
</ul>
|
| 333 |
</details>
|
| 334 |
</div>
|
|
|
|
| 387 |
<button data-mode-link="drift" data-i18n="modes.drift">🔀 Drift</button>
|
| 388 |
<button data-mode-link="arena" data-i18n="modes.arena">🎯 Arena CI</button>
|
| 389 |
<button data-mode-link="saturation" data-i18n="modes.saturation">📈 Saturation</button>
|
| 390 |
+
<button data-mode-link="hub" data-i18n="modes.hub">🧭 Solutions</button>
|
| 391 |
</div>
|
| 392 |
</div>
|
| 393 |
<div class="task-tile">
|
|
|
|
| 455 |
<button class="mode-btn" data-mode="drift" role="tab" aria-selected="false" data-i18n="modes.drift">🔀 Drift</button>
|
| 456 |
<button class="mode-btn" data-mode="niah" role="tab" aria-selected="false" data-i18n="modes.niah">🔍 NIAH→Reason</button>
|
| 457 |
<button class="mode-btn" data-mode="saturation" role="tab" aria-selected="false" data-i18n="modes.saturation">📈 Saturation</button>
|
| 458 |
+
<button class="mode-btn" data-mode="hub" role="tab" aria-selected="false" data-i18n="modes.hub">🧭 Solutions</button>
|
| 459 |
</div>
|
| 460 |
<p id="mode-desc" class="recipe-desc" data-i18n="modes.desc">
|
| 461 |
<strong>Quickest start</strong>: paste any HuggingFace model id (e.g. <code>meta-llama/Meta-Llama-3-8B</code>),
|
|
|
|
| 1003 |
</p>
|
| 1004 |
</section>
|
| 1005 |
|
| 1006 |
+
<!-- Solutions Hub — integrator portal (v0.8.1) -->
|
| 1007 |
+
<section id="hub-section" style="display:none;">
|
| 1008 |
+
<h2><span data-i18n="hub.title">🧭 Solutions Hub</span>
|
| 1009 |
+
<span class="info"><span class="tooltip" data-i18n="hub.tip">
|
| 1010 |
+
Map of every documented LLM-eval pain we know about: which tafagent mode addresses it (if any), and the best-of-breed external tools the community already trusts. Goal: full coverage. If a canonical tool exists elsewhere, we link rather than rebuild.
|
| 1011 |
+
</span></span>
|
| 1012 |
+
</h2>
|
| 1013 |
+
<p class="recipe-desc" data-i18n="hub.desc">
|
| 1014 |
+
<strong>Don't reinvent — find.</strong> 30+ pains mapped to tafagent modes + curated external tools. Browse by category, search by keyword, or see the gaps where new modes would help most.
|
| 1015 |
+
</p>
|
| 1016 |
+
<div class="form-row">
|
| 1017 |
+
<input type="text" id="hub-search" placeholder="search: e.g. 'forgetting' or 'vendor' or 'RAG'…" style="flex:1;" />
|
| 1018 |
+
<button type="button" id="hub-clear-btn" class="secondary" data-i18n="hub.clear_btn">✕ Clear</button>
|
| 1019 |
+
</div>
|
| 1020 |
+
<p id="hub-status" class="recipe-desc" style="font-size:0.92em;"></p>
|
| 1021 |
+
<div id="hub-output" style="margin-top: 1em;"></div>
|
| 1022 |
+
</section>
|
| 1023 |
+
|
| 1024 |
<!-- Recipe selector (mode=recipe) -->
|
| 1025 |
<section id="recipe-section" style="display:none;">
|
| 1026 |
<h2 data-i18n="recipe.title">📋 Recipe</h2>
|
|
@@ -423,6 +423,8 @@ export const TRANSLATIONS = {
|
|
| 423 |
"mode_desc.niah": "Predicts NIAH (retrieval) and multi-hop reasoning pass rates at any context. Solves: long-context models often pass NIAH but fail reasoning at the same context (RULER paper).",
|
| 424 |
"modes.saturation": "📈 Saturation",
|
| 425 |
"mode_desc.saturation": "Tells you whether a benchmark still discriminates frontier models or has saturated (e.g. MMLU 88-94% top, AIME 2025 already 96-100%). Returns top-3 + verdict + recommended replacements.",
|
|
|
|
|
|
|
| 426 |
"niah.title": "🔍 NIAH → Reasoning Gap",
|
| 427 |
"niah.tip": "NIAH (Needle in a Haystack) tests retrieval: 'find this fact in long text'. Multi-hop reasoning tests inference: 'combine facts X+Y at the start with fact Z at the end'. RULER paper (NVIDIA 2024) shows long-context models often pass NIAH but fail reasoning at the same context. This tool predicts both pass rates from architecture alone.",
|
| 428 |
"niah.desc": "<strong>Your model claims 128k context. Will it actually reason at 64k, or just retrieve?</strong> Paste an HF model id and a target eval context — tool predicts NIAH and multi-hop reasoning pass rates, the gap, and a 'safe context' where reasoning stays ≥65%.",
|
|
@@ -501,6 +503,22 @@ export const TRANSLATIONS = {
|
|
| 501 |
"help.v08.saturation.title": "📈 Benchmark Saturation Detector",
|
| 502 |
"help.v08.saturation.body": "MMLU is saturated (88-94% top), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.",
|
| 503 |
"inv.v08.saturation": "<strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
|
| 505 |
// v0.7.7 — Task tiles (UX restructure: 14 modes grouped by user intent)
|
| 506 |
"tiles.title": "🎯 What do you want to do?",
|
|
@@ -1367,6 +1385,8 @@ export const TRANSLATIONS = {
|
|
| 1367 |
"mode_desc.niah": "Predice tasas de pass de NIAH (retrieval) y reasoning multi-hop a cualquier contexto. Resuelve: modelos long-context pasan NIAH pero fallan reasoning al mismo contexto (paper RULER).",
|
| 1368 |
"modes.saturation": "📈 Saturación",
|
| 1369 |
"mode_desc.saturation": "Te dice si un benchmark sigue discriminando frontier models o ya está saturado (ej. MMLU 88-94% top, AIME 2025 ya 96-100%). Devuelve top-3 + veredicto + reemplazos recomendados.",
|
|
|
|
|
|
|
| 1370 |
"niah.title": "🔍 Gap NIAH → Reasoning",
|
| 1371 |
"niah.tip": "NIAH (Needle in a Haystack) testea retrieval: 'encuentra este hecho en texto largo'. Reasoning multi-hop testea inferencia: 'combina hechos X+Y del principio con hecho Z del final'. El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH pero fallan reasoning al mismo contexto. Esta herramienta predice ambas tasas desde la arquitectura sola.",
|
| 1372 |
"niah.desc": "<strong>Tu modelo dice 128k de contexto. ¿Razonará realmente a 64k, o solo encontrará?</strong> Pega un model id HF y un contexto objetivo — la herramienta predice tasas de pass NIAH y reasoning multi-hop, el gap, y un 'contexto seguro' donde reasoning se mantiene ≥65%.",
|
|
@@ -1445,6 +1465,22 @@ export const TRANSLATIONS = {
|
|
| 1445 |
"help.v08.saturation.title": "📈 Detector de saturación de benchmarks",
|
| 1446 |
"help.v08.saturation.body": "MMLU está saturado (top 88-94%), AIME 2025 saturó a los pocos meses de salir, HumanEval near-saturated. Elige cualquier benchmark y la herramienta retorna top-3 frontier scores, spread, media, y un veredicto — saturated / near-saturated / discriminative — más un reemplazo recomendado (ej. MMLU → MMLU-Pro / GPQA / HLE). Fetch en vivo desde DemandSphere AI Frontier Tracker (CC BY-NC 4.0) cuando llega; snapshot baked 2026-05-05 cuando no. <em>Caso de uso</em>: antes de citar '92% en MMLU' o diseñar una eval, verifica si el benchmark aún discrimina algo.",
|
| 1447 |
"inv.v08.saturation": "<strong>📈 Saturation</strong> — ¿sigue siendo útil tu benchmark, o están todos los frontiers empatados arriba?",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1448 |
|
| 1449 |
// v0.7.7 — Tiles de tareas (UX restructure: 14 modos agrupados por intención)
|
| 1450 |
"tiles.title": "🎯 ¿Qué quieres hacer?",
|
|
@@ -2175,6 +2211,8 @@ export const TRANSLATIONS = {
|
|
| 2175 |
"mode_desc.niah": "Prédit les taux de réussite NIAH (retrieval) et reasoning multi-hop à n'importe quel contexte. Résout : les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte (paper RULER).",
|
| 2176 |
"modes.saturation": "📈 Saturation",
|
| 2177 |
"mode_desc.saturation": "Indique si un benchmark discrimine encore les frontier models ou s'il est saturé (ex. MMLU 88-94% top, AIME 2025 déjà 96-100%). Retourne top-3 + verdict + remplacements recommandés.",
|
|
|
|
|
|
|
| 2178 |
"niah.title": "🔍 Gap NIAH → Reasoning",
|
| 2179 |
"niah.tip": "NIAH (Needle in a Haystack) teste le retrieval : 'trouve ce fait dans un long texte'. Le reasoning multi-hop teste l'inférence : 'combine les faits X+Y au début avec le fait Z à la fin'. Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte. Cet outil prédit les deux taux à partir de la seule architecture.",
|
| 2180 |
"niah.desc": "<strong>Votre modèle revendique 128k de contexte. Va-t-il vraiment raisonner à 64k, ou seulement retrouver ?</strong> Collez un model id HF et un contexte cible — l'outil prédit les taux de réussite NIAH et reasoning multi-hop, le gap, et un 'contexte sûr' où le reasoning reste ≥65%.",
|
|
@@ -2253,6 +2291,22 @@ export const TRANSLATIONS = {
|
|
| 2253 |
"help.v08.saturation.title": "📈 Détecteur de saturation des benchmarks",
|
| 2254 |
"help.v08.saturation.body": "MMLU est saturé (top 88-94%), AIME 2025 saturé en quelques mois après sa sortie, HumanEval presque saturé. Choisissez un benchmark et l'outil retourne top-3 frontier scores, spread, moyenne, et un verdict — saturated / near-saturated / discriminative — plus un remplacement recommandé (ex. MMLU → MMLU-Pro / GPQA / HLE). Fetch en direct depuis DemandSphere AI Frontier Tracker (CC BY-NC 4.0) si accessible ; snapshot baked 2026-05-05 sinon. <em>Cas d'usage</em> : avant de citer '92% sur MMLU' ou de concevoir une eval, vérifiez si le benchmark discrimine encore quelque chose.",
|
| 2255 |
"inv.v08.saturation": "<strong>📈 Saturation</strong> — votre benchmark est-il encore utile, ou tous les frontiers sont-ils à égalité au sommet ?",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2256 |
|
| 2257 |
// v0.7.7 — Tuiles de tâches (refonte UX : 14 modes regroupés par intention)
|
| 2258 |
"tiles.title": "🎯 Que voulez-vous faire ?",
|
|
@@ -2983,6 +3037,8 @@ export const TRANSLATIONS = {
|
|
| 2983 |
"mode_desc.niah": "在任意上下文下预测 NIAH(检索)与多跳 reasoning 通过率。解决:长上下文模型常常通过 NIAH 但在同一上下文上 reasoning 失败(RULER 论文)。",
|
| 2984 |
"modes.saturation": "📈 饱和度",
|
| 2985 |
"mode_desc.saturation": "告诉你某个 benchmark 是否仍能区分 frontier 模型,或者已经饱和(例如 MMLU 88-94% 顶部,AIME 2025 已经 96-100%)。返回 top-3 + 判定 + 推荐替代品。",
|
|
|
|
|
|
|
| 2986 |
"niah.title": "🔍 NIAH → Reasoning Gap",
|
| 2987 |
"niah.tip": "NIAH(Needle in a Haystack)测试检索:\"在长文本中找到这个事实\"。多跳 reasoning 测试推理:\"把开头的事实 X+Y 与结尾的事实 Z 结合\"。RULER 论文(NVIDIA 2024)显示长上下文模型经常通过 NIAH 但在相同上下文上 reasoning 失败。本工具仅根据架构预测两种通过率。",
|
| 2988 |
"niah.desc": "<strong>你的模型声称 128k 上下文。它在 64k 是真的能 reasoning,还是只能检索?</strong>粘贴 HF 模型 id 和目标 eval 上下文 — 工具预测 NIAH 与多跳 reasoning 通过率、gap,以及 reasoning 保持 ≥65% 的 \"安全上下文\"。",
|
|
@@ -3061,6 +3117,22 @@ export const TRANSLATIONS = {
|
|
| 3061 |
"help.v08.saturation.title": "📈 Benchmark 饱和度检测器",
|
| 3062 |
"help.v08.saturation.body": "MMLU 已饱和(top 88-94%),AIME 2025 上线几个月就饱和,HumanEval 接近饱和。选任何 benchmark,工具返回 top-3 frontier 分数、spread、平均,以及判定 — saturated / near-saturated / discriminative — 加上推荐替代品(例如 MMLU → MMLU-Pro / GPQA / HLE)。可达时从 DemandSphere AI Frontier Tracker(CC BY-NC 4.0)实时 fetch;不可达时使用 2026-05-05 的 baked 快照。<em>用例</em>:在引用\"92% on MMLU\"或设计 eval 之前,检查 benchmark 是否仍能区分任何东西。",
|
| 3063 |
"inv.v08.saturation": "<strong>📈 Saturation</strong> — 你的 benchmark 还有用吗,还是所有 frontier 都在顶部并列?",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3064 |
|
| 3065 |
// v0.7.7 — 任务卡片(UX 重构:按用户意图分组的 14 个模式)
|
| 3066 |
"tiles.title": "🎯 你想做什么?",
|
|
|
|
| 423 |
"mode_desc.niah": "Predicts NIAH (retrieval) and multi-hop reasoning pass rates at any context. Solves: long-context models often pass NIAH but fail reasoning at the same context (RULER paper).",
|
| 424 |
"modes.saturation": "📈 Saturation",
|
| 425 |
"mode_desc.saturation": "Tells you whether a benchmark still discriminates frontier models or has saturated (e.g. MMLU 88-94% top, AIME 2025 already 96-100%). Returns top-3 + verdict + recommended replacements.",
|
| 426 |
+
"modes.hub": "🧭 Solutions",
|
| 427 |
+
"mode_desc.hub": "Map of every documented LLM-eval pain → tafagent mode (if covered) + curated external tools. Find the right solution without rebuilding it. 30+ pains, 7 categories.",
|
| 428 |
"niah.title": "🔍 NIAH → Reasoning Gap",
|
| 429 |
"niah.tip": "NIAH (Needle in a Haystack) tests retrieval: 'find this fact in long text'. Multi-hop reasoning tests inference: 'combine facts X+Y at the start with fact Z at the end'. RULER paper (NVIDIA 2024) shows long-context models often pass NIAH but fail reasoning at the same context. This tool predicts both pass rates from architecture alone.",
|
| 430 |
"niah.desc": "<strong>Your model claims 128k context. Will it actually reason at 64k, or just retrieve?</strong> Paste an HF model id and a target eval context — tool predicts NIAH and multi-hop reasoning pass rates, the gap, and a 'safe context' where reasoning stays ≥65%.",
|
|
|
|
| 503 |
"help.v08.saturation.title": "📈 Benchmark Saturation Detector",
|
| 504 |
"help.v08.saturation.body": "MMLU is saturated (88-94% top), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.",
|
| 505 |
"inv.v08.saturation": "<strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?",
|
| 506 |
+
"inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — every documented pain mapped to a tafagent mode or curated external tool. Don't reinvent — find.",
|
| 507 |
+
"help.v081.hub.title": "🧭 Solutions Hub",
|
| 508 |
+
"help.v081.hub.body": "tafagent as integrator, not silo. 30+ pains across 7 categories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), each mapped to (a) the tafagent mode that addresses it, if any, and (b) the best-of-breed external tools the community already trusts (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Search box matches across pain, scenario, and tool name. <em>Use case</em>: 'I have problem X — does tafagent solve it, and if not, who does?'",
|
| 509 |
+
"hub.title": "🧭 Solutions Hub",
|
| 510 |
+
"hub.tip": "Map of every documented LLM-eval pain we know about: which tafagent mode addresses it (if any), and the best-of-breed external tools the community already trusts. Goal: full coverage. If a canonical tool exists elsewhere, we link rather than rebuild.",
|
| 511 |
+
"hub.desc": "<strong>Don't reinvent — find.</strong> 30+ pains mapped to tafagent modes + curated external tools. Browse by category, search by keyword, or see the gaps where new modes would help most.",
|
| 512 |
+
"hub.clear_btn": "✕ Clear",
|
| 513 |
+
"hub.no_mode": "external",
|
| 514 |
+
"hub.planned": "planned:",
|
| 515 |
+
"hub.best_for": "Best for",
|
| 516 |
+
"hub.not_for": "Not for",
|
| 517 |
+
"hub.tools": "External tools",
|
| 518 |
+
"hub.status.loaded": "✅ Loaded {total} pains across {categories} categories — {covered} covered by tafagent modes, {externalLinks} external links curated. Compiled {compiled}.",
|
| 519 |
+
"hub.status.fail": "⚠ Could not load Solutions Hub.",
|
| 520 |
+
"hub.search.empty": "No matches for '{query}'. Try broader terms (e.g. 'eval', 'rag', 'tokenizer').",
|
| 521 |
+
"hub.search.results": "Found {n} match(es) for '{query}'.",
|
| 522 |
|
| 523 |
// v0.7.7 — Task tiles (UX restructure: 14 modes grouped by user intent)
|
| 524 |
"tiles.title": "🎯 What do you want to do?",
|
|
|
|
| 1385 |
"mode_desc.niah": "Predice tasas de pass de NIAH (retrieval) y reasoning multi-hop a cualquier contexto. Resuelve: modelos long-context pasan NIAH pero fallan reasoning al mismo contexto (paper RULER).",
|
| 1386 |
"modes.saturation": "📈 Saturación",
|
| 1387 |
"mode_desc.saturation": "Te dice si un benchmark sigue discriminando frontier models o ya está saturado (ej. MMLU 88-94% top, AIME 2025 ya 96-100%). Devuelve top-3 + veredicto + reemplazos recomendados.",
|
| 1388 |
+
"modes.hub": "🧭 Soluciones",
|
| 1389 |
+
"mode_desc.hub": "Mapa de cada problema documentado de LLM-eval → mode tafagent (si cubierto) + herramientas externas curadas. Encuentra la solución sin reinventarla. 30+ pains, 7 categorías.",
|
| 1390 |
"niah.title": "🔍 Gap NIAH → Reasoning",
|
| 1391 |
"niah.tip": "NIAH (Needle in a Haystack) testea retrieval: 'encuentra este hecho en texto largo'. Reasoning multi-hop testea inferencia: 'combina hechos X+Y del principio con hecho Z del final'. El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH pero fallan reasoning al mismo contexto. Esta herramienta predice ambas tasas desde la arquitectura sola.",
|
| 1392 |
"niah.desc": "<strong>Tu modelo dice 128k de contexto. ¿Razonará realmente a 64k, o solo encontrará?</strong> Pega un model id HF y un contexto objetivo — la herramienta predice tasas de pass NIAH y reasoning multi-hop, el gap, y un 'contexto seguro' donde reasoning se mantiene ≥65%.",
|
|
|
|
| 1465 |
"help.v08.saturation.title": "📈 Detector de saturación de benchmarks",
|
| 1466 |
"help.v08.saturation.body": "MMLU está saturado (top 88-94%), AIME 2025 saturó a los pocos meses de salir, HumanEval near-saturated. Elige cualquier benchmark y la herramienta retorna top-3 frontier scores, spread, media, y un veredicto — saturated / near-saturated / discriminative — más un reemplazo recomendado (ej. MMLU → MMLU-Pro / GPQA / HLE). Fetch en vivo desde DemandSphere AI Frontier Tracker (CC BY-NC 4.0) cuando llega; snapshot baked 2026-05-05 cuando no. <em>Caso de uso</em>: antes de citar '92% en MMLU' o diseñar una eval, verifica si el benchmark aún discrimina algo.",
|
| 1467 |
"inv.v08.saturation": "<strong>📈 Saturation</strong> — ¿sigue siendo útil tu benchmark, o están todos los frontiers empatados arriba?",
|
| 1468 |
+
"inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — cada pain documentado mapeado a un mode tafagent o herramienta externa curada. No reinventes — encuentra.",
|
| 1469 |
+
"help.v081.hub.title": "🧭 Solutions Hub",
|
| 1470 |
+
"help.v081.hub.body": "tafagent como integrador, no silo. 30+ pains en 7 categorías (eval reliability · diagnósticos · setup · training · retrieval · multimodal · observability), cada uno mapeado a (a) el mode tafagent que lo resuelve, si existe, y (b) las herramientas externas best-of-breed que la comunidad ya usa (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Caja de búsqueda matchea pain, scenario, y nombre de herramienta. <em>Caso de uso</em>: 'tengo problema X — ¿lo resuelve tafagent, y si no, quién?'",
|
| 1471 |
+
"hub.title": "🧭 Solutions Hub",
|
| 1472 |
+
"hub.tip": "Mapa de cada pain de LLM-eval documentado: qué mode tafagent lo resuelve (si alguno), y las herramientas externas best-of-breed que la comunidad ya usa. Objetivo: cobertura total. Si la herramienta canónica existe en otra parte, enlazamos en vez de rebuildear.",
|
| 1473 |
+
"hub.desc": "<strong>No reinventes — encuentra.</strong> 30+ pains mapeados a modes tafagent + herramientas externas curadas. Navega por categoría, busca por keyword, o ve los huecos donde nuevos modes ayudarían más.",
|
| 1474 |
+
"hub.clear_btn": "✕ Limpiar",
|
| 1475 |
+
"hub.no_mode": "externo",
|
| 1476 |
+
"hub.planned": "planeado:",
|
| 1477 |
+
"hub.best_for": "Mejor para",
|
| 1478 |
+
"hub.not_for": "No para",
|
| 1479 |
+
"hub.tools": "Herramientas externas",
|
| 1480 |
+
"hub.status.loaded": "✅ Cargados {total} pains en {categories} categorías — {covered} cubiertos por modes tafagent, {externalLinks} enlaces externos curados. Compilado {compiled}.",
|
| 1481 |
+
"hub.status.fail": "⚠ No se pudo cargar Solutions Hub.",
|
| 1482 |
+
"hub.search.empty": "Sin coincidencias para '{query}'. Prueba términos más amplios (ej. 'eval', 'rag', 'tokenizer').",
|
| 1483 |
+
"hub.search.results": "Encontradas {n} coincidencia(s) para '{query}'.",
|
| 1484 |
|
| 1485 |
// v0.7.7 — Tiles de tareas (UX restructure: 14 modos agrupados por intención)
|
| 1486 |
"tiles.title": "🎯 ¿Qué quieres hacer?",
|
|
|
|
| 2211 |
"mode_desc.niah": "Prédit les taux de réussite NIAH (retrieval) et reasoning multi-hop à n'importe quel contexte. Résout : les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte (paper RULER).",
|
| 2212 |
"modes.saturation": "📈 Saturation",
|
| 2213 |
"mode_desc.saturation": "Indique si un benchmark discrimine encore les frontier models ou s'il est saturé (ex. MMLU 88-94% top, AIME 2025 déjà 96-100%). Retourne top-3 + verdict + remplacements recommandés.",
|
| 2214 |
+
"modes.hub": "🧭 Solutions",
|
| 2215 |
+
"mode_desc.hub": "Carte de chaque problème documenté de LLM-eval → mode tafagent (si couvert) + outils externes curés. Trouvez la solution sans la réinventer. 30+ pains, 7 catégories.",
|
| 2216 |
"niah.title": "🔍 Gap NIAH → Reasoning",
|
| 2217 |
"niah.tip": "NIAH (Needle in a Haystack) teste le retrieval : 'trouve ce fait dans un long texte'. Le reasoning multi-hop teste l'inférence : 'combine les faits X+Y au début avec le fait Z à la fin'. Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte. Cet outil prédit les deux taux à partir de la seule architecture.",
|
| 2218 |
"niah.desc": "<strong>Votre modèle revendique 128k de contexte. Va-t-il vraiment raisonner à 64k, ou seulement retrouver ?</strong> Collez un model id HF et un contexte cible — l'outil prédit les taux de réussite NIAH et reasoning multi-hop, le gap, et un 'contexte sûr' où le reasoning reste ≥65%.",
|
|
|
|
| 2291 |
"help.v08.saturation.title": "📈 Détecteur de saturation des benchmarks",
|
| 2292 |
"help.v08.saturation.body": "MMLU est saturé (top 88-94%), AIME 2025 saturé en quelques mois après sa sortie, HumanEval presque saturé. Choisissez un benchmark et l'outil retourne top-3 frontier scores, spread, moyenne, et un verdict — saturated / near-saturated / discriminative — plus un remplacement recommandé (ex. MMLU → MMLU-Pro / GPQA / HLE). Fetch en direct depuis DemandSphere AI Frontier Tracker (CC BY-NC 4.0) si accessible ; snapshot baked 2026-05-05 sinon. <em>Cas d'usage</em> : avant de citer '92% sur MMLU' ou de concevoir une eval, vérifiez si le benchmark discrimine encore quelque chose.",
|
| 2293 |
"inv.v08.saturation": "<strong>📈 Saturation</strong> — votre benchmark est-il encore utile, ou tous les frontiers sont-ils à égalité au sommet ?",
|
| 2294 |
+
"inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — chaque pain documenté mappé à un mode tafagent ou outil externe curé. Ne réinventez pas — trouvez.",
|
| 2295 |
+
"help.v081.hub.title": "🧭 Solutions Hub",
|
| 2296 |
+
"help.v081.hub.body": "tafagent comme intégrateur, pas silo. 30+ pains à travers 7 catégories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), chacun mappé à (a) le mode tafagent qui le résout, s'il existe, et (b) les outils externes best-of-breed que la communauté utilise déjà (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). La barre de recherche matche pain, scénario, et nom d'outil. <em>Cas d'usage</em> : 'j'ai le problème X — tafagent le résout-il, et sinon, qui ?'",
|
| 2297 |
+
"hub.title": "🧭 Solutions Hub",
|
| 2298 |
+
"hub.tip": "Carte de chaque pain de LLM-eval documenté : quel mode tafagent l'adresse (si applicable), et les outils externes best-of-breed que la communauté utilise déjà. Objectif : couverture totale. Si l'outil canonique existe ailleurs, nous lions plutôt que de reconstruire.",
|
| 2299 |
+
"hub.desc": "<strong>Ne réinventez pas — trouvez.</strong> 30+ pains mappés à des modes tafagent + outils externes curés. Naviguez par catégorie, recherchez par mot-clé, ou voyez les lacunes où de nouveaux modes aideraient le plus.",
|
| 2300 |
+
"hub.clear_btn": "✕ Effacer",
|
| 2301 |
+
"hub.no_mode": "externe",
|
| 2302 |
+
"hub.planned": "prévu :",
|
| 2303 |
+
"hub.best_for": "Idéal pour",
|
| 2304 |
+
"hub.not_for": "Pas pour",
|
| 2305 |
+
"hub.tools": "Outils externes",
|
| 2306 |
+
"hub.status.loaded": "✅ Chargés {total} pains dans {categories} catégories — {covered} couverts par des modes tafagent, {externalLinks} liens externes curés. Compilé {compiled}.",
|
| 2307 |
+
"hub.status.fail": "⚠ Impossible de charger Solutions Hub.",
|
| 2308 |
+
"hub.search.empty": "Aucune correspondance pour '{query}'. Essayez des termes plus larges (ex. 'eval', 'rag', 'tokenizer').",
|
| 2309 |
+
"hub.search.results": "{n} correspondance(s) trouvée(s) pour '{query}'.",
|
| 2310 |
|
| 2311 |
// v0.7.7 — Tuiles de tâches (refonte UX : 14 modes regroupés par intention)
|
| 2312 |
"tiles.title": "🎯 Que voulez-vous faire ?",
|
|
|
|
| 3037 |
"mode_desc.niah": "在任意上下文下预测 NIAH(检索)与多跳 reasoning 通过率。解决:长上下文模型常常通过 NIAH 但在同一上下文上 reasoning 失败(RULER 论文)。",
|
| 3038 |
"modes.saturation": "📈 饱和度",
|
| 3039 |
"mode_desc.saturation": "告诉你某个 benchmark 是否仍能区分 frontier 模型,或者已经饱和(例如 MMLU 88-94% 顶部,AIME 2025 已经 96-100%)。返回 top-3 + 判定 + 推荐替代品。",
|
| 3040 |
+
"modes.hub": "🧭 方案",
|
| 3041 |
+
"mode_desc.hub": "每个 LLM-eval 问题的地图 → tafagent 模式(若覆盖)+ 精选外部工具。找到方案而非重新发明。30+ 问题,7 类别。",
|
| 3042 |
"niah.title": "🔍 NIAH → Reasoning Gap",
|
| 3043 |
"niah.tip": "NIAH(Needle in a Haystack)测试检索:\"在长文本中找到这个事实\"。多跳 reasoning 测试推理:\"把开头的事实 X+Y 与结尾的事实 Z 结合\"。RULER 论文(NVIDIA 2024)显示长上下文模型经常通过 NIAH 但在相同上下文上 reasoning 失败。本工具仅根据架构预测两种通过率。",
|
| 3044 |
"niah.desc": "<strong>你的模型声称 128k 上下文。它在 64k 是真的能 reasoning,还是只能检索?</strong>粘贴 HF 模型 id 和目标 eval 上下文 — 工具预测 NIAH 与多跳 reasoning 通过率、gap,以及 reasoning 保持 ≥65% 的 \"安全上下文\"。",
|
|
|
|
| 3117 |
"help.v08.saturation.title": "📈 Benchmark 饱和度检测器",
|
| 3118 |
"help.v08.saturation.body": "MMLU 已饱和(top 88-94%),AIME 2025 上线几个月就饱和,HumanEval 接近饱和。选任何 benchmark,工具返回 top-3 frontier 分数、spread、平均,以及判定 — saturated / near-saturated / discriminative — 加上推荐替代品(例如 MMLU → MMLU-Pro / GPQA / HLE)。可达时从 DemandSphere AI Frontier Tracker(CC BY-NC 4.0)实时 fetch;不可达时使用 2026-05-05 的 baked 快照。<em>用例</em>:在引用\"92% on MMLU\"或设计 eval 之前,检查 benchmark 是否仍能区分任何东西。",
|
| 3119 |
"inv.v08.saturation": "<strong>📈 Saturation</strong> — 你的 benchmark 还有用吗,还是所有 frontier 都在顶部并列?",
|
| 3120 |
+
"inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — 每个文档化的问题都映射到一个 tafagent 模式或精选外部工具。别重复发明 — 去找。",
|
| 3121 |
+
"help.v081.hub.title": "🧭 Solutions Hub",
|
| 3122 |
+
"help.v081.hub.body": "tafagent 作为集成者而非孤岛。30+ 问题跨 7 类别(评估可靠性 · 诊断 · 设置 · 训练 · 检索 · 多模态 · 可观测性),每个映射到(a)解决它的 tafagent 模式(若存在),以及(b)社区已信任的最佳外部工具(RAGAS、MTEB、HELM、MCP Schema Validator、llm-stats、llguidance、GlitchMiner 等)。搜索框匹配 pain、场景和工具名称。<em>用例</em>:'我有问题 X — tafagent 解决它吗,如果不,谁解决?'",
|
| 3123 |
+
"hub.title": "🧭 Solutions Hub",
|
| 3124 |
+
"hub.tip": "我们已知的每个 LLM-eval 问题的地图:哪个 tafagent 模式能解决它(若有),以及社区已信任的最佳外部工具。目标:全覆盖。如果规范工具已在别处,我们链接而非重建。",
|
| 3125 |
+
"hub.desc": "<strong>别重新发明 — 去找。</strong>30+ 问题映射到 tafagent 模式 + 精选外部工具。按类别浏览、按关键字搜索,或查看新模式最有帮助的空缺。",
|
| 3126 |
+
"hub.clear_btn": "✕ 清空",
|
| 3127 |
+
"hub.no_mode": "外部",
|
| 3128 |
+
"hub.planned": "计划:",
|
| 3129 |
+
"hub.best_for": "适合",
|
| 3130 |
+
"hub.not_for": "不适合",
|
| 3131 |
+
"hub.tools": "外部工具",
|
| 3132 |
+
"hub.status.loaded": "✅ 已加载 {total} 个问题,跨 {categories} 类别 — {covered} 个由 tafagent 模式覆盖,精选 {externalLinks} 个外部链接。编译于 {compiled}。",
|
| 3133 |
+
"hub.status.fail": "⚠ 无法加载 Solutions Hub。",
|
| 3134 |
+
"hub.search.empty": "无 '{query}' 的匹配。尝试更宽泛的词(如 'eval'、'rag'、'tokenizer')。",
|
| 3135 |
+
"hub.search.results": "为 '{query}' 找到 {n} 个匹配。",
|
| 3136 |
|
| 3137 |
// v0.7.7 — 任务卡片(UX 重构:按用户意图分组的 14 个模式)
|
| 3138 |
"tiles.title": "🎯 你想做什么?",
|
|
@@ -23,6 +23,10 @@ import {
|
|
| 23 |
loadSaturationKB, classifyAll, classifyBenchmark,
|
| 24 |
listBenchmarks, attribution as saturationAttribution, tryFetchLive,
|
| 25 |
} from "./saturation_detector.js";
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
// Attach HF Hub search-as-you-type to all 5 model id inputs (Profile, Recipe,
|
| 28 |
// Unmask, Template, Quant). Hits public huggingface.co/api/models. Idempotent.
|
|
@@ -212,6 +216,7 @@ document.addEventListener("click", (e) => {
|
|
| 212 |
template: "template-section", arena: "arena-section", contam: "contam-section",
|
| 213 |
quant: "quant-section", drift: "drift-section", niah: "niah-section",
|
| 214 |
saturation: "saturation-section",
|
|
|
|
| 215 |
}[targetMode];
|
| 216 |
if (sectionId) {
|
| 217 |
const sec = document.getElementById(sectionId);
|
|
@@ -236,7 +241,7 @@ document.querySelectorAll(".mode-btn").forEach(btn => {
|
|
| 236 |
"diagnose-section", "phase-section", "unmask-section",
|
| 237 |
"template-section", "arena-section", "contam-section",
|
| 238 |
"quant-section", "drift-section", "niah-section",
|
| 239 |
-
"saturation-section"].forEach(id => {
|
| 240 |
const el = $(id);
|
| 241 |
if (el) el.style.display = "none";
|
| 242 |
});
|
|
@@ -248,12 +253,14 @@ document.querySelectorAll(".mode-btn").forEach(btn => {
|
|
| 248 |
template: "template-section", arena: "arena-section", contam: "contam-section",
|
| 249 |
quant: "quant-section", drift: "drift-section", niah: "niah-section",
|
| 250 |
saturation: "saturation-section",
|
|
|
|
| 251 |
};
|
| 252 |
const sectionId = sectionMap[mode];
|
| 253 |
if (sectionId) $(sectionId).style.display = "";
|
| 254 |
$("mode-desc").textContent = t(`mode_desc.${mode}`) || "";
|
| 255 |
if (mode === "phase") initPhaseDiagram();
|
| 256 |
if (mode === "saturation") initSaturation();
|
|
|
|
| 257 |
});
|
| 258 |
});
|
| 259 |
|
|
@@ -3277,6 +3284,106 @@ function runSaturationAll() {
|
|
| 3277 |
$("saturation-run-btn")?.addEventListener("click", runSaturationOne);
|
| 3278 |
$("saturation-all-btn")?.addEventListener("click", runSaturationAll);
|
| 3279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3280 |
// ════════════════════════════════════════════════════════════════════
|
| 3281 |
// Bootstrap
|
| 3282 |
// ════════════════════════════════════════════════════════════════════
|
|
|
|
| 23 |
loadSaturationKB, classifyAll, classifyBenchmark,
|
| 24 |
listBenchmarks, attribution as saturationAttribution, tryFetchLive,
|
| 25 |
} from "./saturation_detector.js";
|
| 26 |
+
import {
|
| 27 |
+
loadHub, listCategories, listEntries, searchEntries,
|
| 28 |
+
hubStats, getCategoryMeta,
|
| 29 |
+
} from "./solutions_hub.js";
|
| 30 |
|
| 31 |
// Attach HF Hub search-as-you-type to all 5 model id inputs (Profile, Recipe,
|
| 32 |
// Unmask, Template, Quant). Hits public huggingface.co/api/models. Idempotent.
|
|
|
|
| 216 |
template: "template-section", arena: "arena-section", contam: "contam-section",
|
| 217 |
quant: "quant-section", drift: "drift-section", niah: "niah-section",
|
| 218 |
saturation: "saturation-section",
|
| 219 |
+
hub: "hub-section",
|
| 220 |
}[targetMode];
|
| 221 |
if (sectionId) {
|
| 222 |
const sec = document.getElementById(sectionId);
|
|
|
|
| 241 |
"diagnose-section", "phase-section", "unmask-section",
|
| 242 |
"template-section", "arena-section", "contam-section",
|
| 243 |
"quant-section", "drift-section", "niah-section",
|
| 244 |
+
"saturation-section", "hub-section"].forEach(id => {
|
| 245 |
const el = $(id);
|
| 246 |
if (el) el.style.display = "none";
|
| 247 |
});
|
|
|
|
| 253 |
template: "template-section", arena: "arena-section", contam: "contam-section",
|
| 254 |
quant: "quant-section", drift: "drift-section", niah: "niah-section",
|
| 255 |
saturation: "saturation-section",
|
| 256 |
+
hub: "hub-section",
|
| 257 |
};
|
| 258 |
const sectionId = sectionMap[mode];
|
| 259 |
if (sectionId) $(sectionId).style.display = "";
|
| 260 |
$("mode-desc").textContent = t(`mode_desc.${mode}`) || "";
|
| 261 |
if (mode === "phase") initPhaseDiagram();
|
| 262 |
if (mode === "saturation") initSaturation();
|
| 263 |
+
if (mode === "hub") initHub();
|
| 264 |
});
|
| 265 |
});
|
| 266 |
|
|
|
|
| 3284 |
$("saturation-run-btn")?.addEventListener("click", runSaturationOne);
|
| 3285 |
$("saturation-all-btn")?.addEventListener("click", runSaturationAll);
|
| 3286 |
|
| 3287 |
+
// ════════════════════════════════════════════════════════════════════
|
| 3288 |
+
// 🧭 Solutions Hub (v0.8.1) — integrator portal
|
| 3289 |
+
// ════════════════════════════════════════════════════════════════════
|
| 3290 |
+
// Emoji shown in front of each external tool link, keyed by the tool's
// `type` field. Types without an entry here fall back to a generic icon at
// the call site.
//
// The table has a null prototype so that a type string such as
// "constructor" or "toString" looks up as undefined instead of resolving to
// an inherited Object.prototype member. It is frozen because it is a shared
// module-level constant.
const HUB_TYPE_BADGE = Object.freeze({
  __proto__: null,
  tool: "🔧",
  leaderboard: "📊",
  paper: "📄",
  article: "📝",
  docs: "📘",
  issue: "🐛",
  spec: "📐",
  benchmark: "🧪",
});
|
| 3300 |
+
|
| 3301 |
+
// True once the hub has been loaded (or while a load is in flight); lets
// initHub() be called on every switch to the "hub" mode without re-rendering.
let __hubInited = false;

/**
 * Lazily boots the Solutions Hub: loads the hub data via loadHub(), writes a
 * status line into the "hub-status" element and renders every category.
 *
 * Subsequent calls are no-ops once a load has succeeded. If the load fails,
 * the failure message is shown and the guard flag is reset so that the next
 * call retries instead of leaving the hub permanently empty.
 *
 * @returns {Promise<void>}
 */
async function initHub() {
  if (__hubInited) return;
  // Raise the flag before awaiting so overlapping calls do not start a
  // second load while the first one is still pending.
  __hubInited = true;
  const statusEl = $("hub-status");
  try {
    await loadHub();
  } catch (e) {
    // A failed load must not count as "initialised": allow a retry the next
    // time the hub mode is opened.
    __hubInited = false;
    if (statusEl) {
      statusEl.textContent = (t("hub.status.fail") || "⚠ Could not load Solutions Hub.") + " " + (e.message || e);
    }
    return;
  }
  const stats = hubStats();
  if (statusEl) statusEl.textContent = tFmt("hub.status.loaded", stats);
  renderHubAll();
}
|
| 3316 |
+
|
| 3317 |
+
/**
 * Builds the HTML for one hub entry: a collapsible <details> panel whose
 * summary shows the pain plus a coverage badge, followed by the optional
 * "best for" / "not for" paragraphs and the list of external tool links.
 *
 * Every value that comes from the entry itself (pain, mode names, best_for,
 * not_for, tool name / url / type) is HTML-escaped before interpolation,
 * because the result is assigned to innerHTML by the callers. The i18n
 * labels returned by t() are inserted as-is.
 *
 * @param {object} e - Hub entry. Fields read: pain, tafagent_mode,
 *   tafagent_planned_mode, best_for, not_for, external_tools[] (each with
 *   name, url, type).
 * @returns {string} HTML fragment for the entry.
 */
function renderEntry(e) {
  // Escapes text for both element content and double-quoted attributes.
  const esc = (value) => String(value ?? "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");

  // Badge priority: shipped mode > planned mode > external-only.
  let modeBadge;
  if (e.tafagent_mode) {
    modeBadge = `<span class="badge" style="background:#3fb950;">${esc(e.tafagent_mode)}</span>`;
  } else if (e.tafagent_planned_mode) {
    modeBadge = `<span class="badge" style="background:#d29922;">${t("hub.planned") || "planned:"} ${esc(e.tafagent_planned_mode)}</span>`;
  } else {
    modeBadge = `<span class="badge" style="background:#6e7781;">${t("hub.no_mode") || "external"}</span>`;
  }

  const tools = (e.external_tools || [])
    .map((tl) => {
      // Only accept a real string from the table so an inherited member can
      // never end up rendered as the icon.
      const badge = HUB_TYPE_BADGE[tl.type];
      const icon = typeof badge === "string" ? badge : "🔗";
      return `<li>${icon} <a href="${esc(tl.url)}" target="_blank" rel="noopener noreferrer">${esc(tl.name)}</a> <span class="subtle" style="font-size:0.82em;">(${esc(tl.type)})</span></li>`;
    })
    .join("");

  const bestFor = e.best_for ? `<p><strong>${t("hub.best_for") || "Best for"}:</strong> ${esc(e.best_for)}</p>` : "";
  const notFor = e.not_for ? `<p><strong>${t("hub.not_for") || "Not for"}:</strong> ${esc(e.not_for)}</p>` : "";

  return `
    <details class="unmask-panel" style="margin: 0.5em 0;">
      <summary class="unmask-panel-title">${esc(e.pain)} ${modeBadge}</summary>
      ${bestFor}
      ${notFor}
      ${tools ? `<p><strong>${t("hub.tools") || "External tools"}:</strong></p><ul>${tools}</ul>` : ""}
    </details>
  `;
}
|
| 3340 |
+
|
| 3341 |
+
/**
 * Renders the full hub listing into the "hub-output" element: one open
 * <details> accordion per category (icon, label, entry count, description)
 * containing that category's rendered entries. Categories for which
 * listEntries() returns nothing are left out.
 */
function renderHubAll() {
  const sections = [];
  for (const c of listCategories()) {
    const entries = listEntries(c.key);
    if (entries.length === 0) continue;
    const inner = entries.map((entry) => renderEntry(entry)).join("");
    sections.push(`
      <details class="unmask-panel" open style="margin-top: 1em;">
        <summary class="unmask-panel-title" style="font-size:1.05em;">
          ${c.icon} ${c.label} <span class="subtle" style="font-size:0.85em;">(${c.count})</span>
        </summary>
        <p class="recipe-desc" style="font-style:italic;">${c.description}</p>
        ${inner}
      </details>
    `);
  }
  const html = sections.join("");
  $("hub-output").innerHTML = `<div class="arena-result">${html}</div>`;
}
|
| 3359 |
+
|
| 3360 |
+
/**
 * Renders the search results for `query` into the "hub-output" element:
 * either a "no matches" message or a result-count line followed by the
 * matching entries.
 *
 * `query` is typed by the user and ends up inside innerHTML, so it is
 * HTML-escaped before being handed to tFmt().
 * NOTE(review): this assumes tFmt() substitutes the placeholder values
 * verbatim; confirm it does not escape on its own.
 *
 * @param {string} query - Raw text from the search box.
 */
function renderHubSearch(query) {
  const safeQuery = String(query)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");

  // The search itself runs on the raw query; only the echoed copy is escaped.
  const matches = searchEntries(query);
  if (matches.length === 0) {
    $("hub-output").innerHTML = `<p class="recipe-desc">${tFmt("hub.search.empty", { query: safeQuery })}</p>`;
    return;
  }
  const html = matches.map(renderEntry).join("");
  $("hub-output").innerHTML = `<div class="arena-result">
    <p class="recipe-desc">${tFmt("hub.search.results", { n: matches.length, query: safeQuery })}</p>
    ${html}
  </div>`;
}
|
| 3372 |
+
|
| 3373 |
+
// Handle of the pending debounced search, or null when none is scheduled.
let __hubSearchTimer = null;

// Live search: wait 200 ms after the last keystroke, then show either the
// full listing (blank query) or the search results.
$("hub-search")?.addEventListener("input", (e) => {
  clearTimeout(__hubSearchTimer);
  const q = e.target.value;
  __hubSearchTimer = setTimeout(() => {
    if (!q.trim()) renderHubAll();
    else renderHubSearch(q);
  }, 200);
});

// Clear button: empty the search box and go back to the full listing.
$("hub-clear-btn")?.addEventListener("click", () => {
  // Cancel a search that is still waiting on the debounce timer; otherwise
  // it would fire after the clear and replace the full listing with stale
  // results while the search box is empty.
  clearTimeout(__hubSearchTimer);
  __hubSearchTimer = null;
  const searchBox = $("hub-search");
  if (searchBox) searchBox.value = "";
  renderHubAll();
});
|
| 3386 |
+
|
| 3387 |
// ════════════════════════════════════════════════════════════════════
|
| 3388 |
// Bootstrap
|
| 3389 |
// ════════════════════════════════════════════════════════════════════
|
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Solutions Hub (v0.8.1)
|
| 2 |
+
// tafagent as integrator/curator. Pain → tafagent mode (if shipped) +
|
| 3 |
+
// external best-of-breed tools. Pure logic — no human strings; main.js
|
| 4 |
+
// renders with i18n.
|
| 5 |
+
|
| 6 |
+
// Parsed hub data, cached after the first successful load (null until then).
let _hub = null;

/**
 * Fetches and caches the Solutions Hub data file.
 *
 * The first successful call stores the parsed JSON; later calls return the
 * cached object without fetching again. Nothing is cached when the request
 * fails or the payload does not have the expected shape, so a later call
 * can retry.
 *
 * @param {string} [url="./data/solutions_hub.json"] - Location of the data file.
 * @returns {Promise<object>} The hub data (`categories` object + `entries` array).
 * @throws {Error} If the HTTP response is not ok, or if the payload lacks a
 *   `categories` object or an `entries` array.
 */
export async function loadHub(url = "./data/solutions_hub.json") {
  if (_hub) return _hub;
  const res = await fetch(url);
  if (!res.ok) throw new Error(`Hub fetch failed: ${res.status}`);
  const data = await res.json();
  // The other functions in this module read `categories` and `entries`
  // directly, so reject a malformed payload here with a clear message rather
  // than caching it and failing later with a TypeError.
  const hasExpectedShape =
    data !== null &&
    typeof data === "object" &&
    data.categories !== null &&
    typeof data.categories === "object" &&
    Array.isArray(data.entries);
  if (!hasExpectedShape) {
    throw new Error("Hub data malformed: expected { categories: object, entries: array }");
  }
  _hub = data;
  return _hub;
}
|
| 15 |
+
|
| 16 |
+
/**
 * Returns the module-level `_hub` value as-is.
 *
 * @returns {object|null} Whatever `_hub` currently holds.
 */
export function getHub() {
  const current = _hub;
  return current;
}
|
| 17 |
+
|
| 18 |
+
/**
 * Lists the hub categories in the order they appear in the data file.
 *
 * Each item is the category's metadata plus its `key` and a `count` of the
 * entries whose `category` equals that key.
 *
 * @returns {object[]} One object per category; empty when no hub is loaded.
 */
export function listCategories() {
  if (!_hub) return [];
  const result = [];
  for (const key of Object.keys(_hub.categories)) {
    const meta = _hub.categories[key];
    let count = 0;
    for (const entry of _hub.entries) {
      if (entry.category === key) count += 1;
    }
    result.push({ key, ...meta, count });
  }
  return result;
}
|
| 25 |
+
|
| 26 |
+
/**
 * Returns hub entries, optionally restricted to one category.
 *
 * @param {string|null} [categoryKey=null] - Category to filter by. A falsy
 *   value returns the stored entries array itself (not a copy).
 * @returns {object[]} Matching entries; empty when no hub is loaded.
 */
export function listEntries(categoryKey = null) {
  if (!_hub) return [];
  if (!categoryKey) return _hub.entries;
  return _hub.entries.filter((entry) => entry.category === categoryKey);
}
|
| 32 |
+
|
| 33 |
+
/**
 * Case-insensitive substring search over the hub entries.
 *
 * For each entry the searched text is pain, best_for, not_for,
 * tafagent_mode and every external tool name, joined with single spaces.
 * The query is lower-cased and trimmed before matching.
 *
 * @param {string} query - Search text.
 * @returns {object[]} Entries containing the query; empty when no hub is
 *   loaded or the query is empty / whitespace only.
 */
export function searchEntries(query) {
  if (!_hub || !query) return [];
  const needle = query.toLowerCase().trim();
  if (needle === "") return [];
  const matches = [];
  for (const entry of _hub.entries) {
    const toolNames = (entry.external_tools || []).map((tool) => tool.name || "");
    const fields = [
      entry.pain || "",
      entry.best_for || "",
      entry.not_for || "",
      entry.tafagent_mode || "",
      ...toolNames,
    ];
    if (fields.join(" ").toLowerCase().includes(needle)) {
      matches.push(entry);
    }
  }
  return matches;
}
|
| 49 |
+
|
| 50 |
+
/**
 * Looks up the metadata stored for one category key.
 *
 * @param {string} key - Category key.
 * @returns {object|null} The metadata, or null when no hub is loaded, the
 *   key is unknown, or the stored value is falsy.
 */
export function getCategoryMeta(key) {
  const meta = _hub?.categories?.[key];
  if (!meta) return null;
  return meta;
}
|
| 53 |
+
|
| 54 |
+
/**
 * Summary counters for the loaded hub.
 *
 * @returns {{total: number, covered: number, planned: number,
 *   externalLinks: number, categories: number, compiled: *}|null}
 *   `covered` counts entries with a truthy tafagent_mode, `planned` those
 *   with a truthy tafagent_planned_mode, `externalLinks` sums the lengths of
 *   all external_tools arrays, `categories` is the number of category keys
 *   and `compiled` is passed through from the data file. Returns null when
 *   no hub is loaded.
 */
export function hubStats() {
  if (!_hub) return null;
  const entries = _hub.entries;
  let covered = 0;
  let planned = 0;
  let totalExternal = 0;
  for (const entry of entries) {
    if (entry.tafagent_mode) covered += 1;
    if (entry.tafagent_planned_mode) planned += 1;
    totalExternal += entry.external_tools?.length || 0;
  }
  const categoryCount = Object.keys(_hub.categories).length;
  return {
    total: entries.length,
    covered,
    planned,
    externalLinks: totalExternal,
    categories: categoryCount,
    compiled: _hub.compiled,
  };
}
|