karlexmarin Claude Opus 4.7 (1M context) committed on
Commit
fbf3edc
·
1 Parent(s): 7c80934

v0.8.1 Solutions Hub — integrator portal (30 pains × 65 external tools)

Browse files

🧭 Solutions Hub mode: every documented LLM-eval pain mapped to
(a) the tafagent mode that addresses it (16 of 30 covered) and
(b) the best-of-breed external tools the community already trusts
(65 curated links across RAGAS, MTEB, HELM, MCP Schema Validator,
llm-stats, llguidance, GlitchMiner, RULER, JSONLint, FastMCP,
LangSmith, TruLens, DeepEval, etc.).

Strategy shift: tafagent as integrator, not silo. If a canonical
solution exists publicly we link, not rebuild. Round-3 + round-4
research (2026-05-07) validated this — 6 of 10 candidate pains
had production-grade tools already (skip build). Hub closes the
loop: users land here, find the right tool, regardless of who
shipped it.

Coverage: 7 categories — eval reliability · diagnostic · setup ·
training · retrieval · multimodal · observability. Each pain entry
has: tafagent_mode (or null/planned), external_tools[]
(name+url+type), best_for, not_for. Tool types: tool / leaderboard /
paper / article / docs / issue / spec / benchmark.

UI: live search across pain+scenario+tool name, accordion per
category, badges for coverage status. i18n × 4 langs (EN/ES/FR/ZH).
Help modal entry, inventory card entry, task-tile button.

Also surfaces 2 planned tafagent gaps: 🔧 PEFT Anti-Pattern Checker
(v0.8.2 candidate, peft #2115 silent fail) and JSON CoT-aware Linter
(answer-before-reasoning bug). Both browser-feasible, no current tool.

URL validation 2026-05-07: top critical URLs fetched + confirmed alive
(HF PEFT troubleshooting docs, MCP Schema Validator, RAGAS v0.4.3
13.8k★, MTEB leaderboard).

Files: data/solutions_hub.json + js/solutions_hub.js (new);
index.html + js/main.js + js/i18n.js (modified).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (5) hide show
  1. data/solutions_hub.json +407 -0
  2. index.html +24 -0
  3. js/i18n.js +72 -0
  4. js/main.js +108 -1
  5. js/solutions_hub.js +69 -0
data/solutions_hub.json ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "0.8.1",
3
+ "compiled": "2026-05-07",
4
+ "philosophy": "tafagent as integrator, not silo. For each documented LLM-eval pain we surface: (a) the tafagent mode that addresses it, if any; (b) the best-of-breed external tools the community already trusts; (c) when to use which. Goal: complete coverage, not feature lock-in. If the canonical tool exists elsewhere we link, not rebuild.",
5
+ "verification_note": "The most critical external URLs were fetched and confirmed alive on the compiled date; other links may not have been re-checked. Treat older entries with skepticism — link rot is real. Report dead links via the GitHub issue tracker.",
6
+ "categories": {
7
+ "eval_reliability": {
8
+ "label": "Trust a benchmark score",
9
+ "icon": "✓",
10
+ "description": "Should I believe this number?"
11
+ },
12
+ "diagnostic": {
13
+ "label": "Diagnose a model",
14
+ "icon": "🔬",
15
+ "description": "Will this model work for my use case?"
16
+ },
17
+ "setup": {
18
+ "label": "Set up an eval correctly",
19
+ "icon": "⚙️",
20
+ "description": "Avoid silent failures before running."
21
+ },
22
+ "training": {
23
+ "label": "Train / fine-tune safely",
24
+ "icon": "🛠️",
25
+ "description": "Don't waste GPU time on broken setups."
26
+ },
27
+ "retrieval": {
28
+ "label": "RAG & retrieval quality",
29
+ "icon": "📚",
30
+ "description": "Is my retrieval actually retrieving?"
31
+ },
32
+ "multimodal": {
33
+ "label": "Multimodal models",
34
+ "icon": "🖼️",
35
+ "description": "Vision-language and beyond."
36
+ },
37
+ "observability": {
38
+ "label": "Observe & debug agents",
39
+ "icon": "🔭",
40
+ "description": "What is my agent actually doing?"
41
+ }
42
+ },
43
+ "entries": [
44
+ {
45
+ "id": "saturation",
46
+ "category": "eval_reliability",
47
+ "pain": "Benchmark saturation — top models all tied at 90%+, score no longer informative.",
48
+ "tafagent_mode": "📈 Saturation",
49
+ "external_tools": [
50
+ {"name": "DemandSphere AI Frontier Tracker", "url": "https://www.demandsphere.com/research/demandsphere-radar/ai-frontier-model-tracker/", "type": "leaderboard"},
51
+ {"name": "BenchLM.ai", "url": "https://benchlm.ai/", "type": "leaderboard"},
52
+ {"name": "LLM Stats", "url": "https://llm-stats.com/", "type": "leaderboard"}
53
+ ],
54
+ "best_for": "Quick check whether MMLU / AIME / HumanEval still discriminate frontier models in 2026.",
55
+ "not_for": "Predicting which model will win on a non-standard benchmark."
56
+ },
57
+ {
58
+ "id": "contamination",
59
+ "category": "eval_reliability",
60
+ "pain": "Benchmark contamination — model trained on the test set.",
61
+ "tafagent_mode": "🧪 Contamination",
62
+ "external_tools": [
63
+ {"name": "LiveBench (contamination-resistant)", "url": "https://livebench.ai/", "type": "leaderboard"},
64
+ {"name": "GSM8K-Platinum / contamination studies", "url": "https://thegrigorian.medium.com/when-benchmarks-lie-why-contamination-breaks-llm-evaluation-1fa335706f32", "type": "article"}
65
+ ],
66
+ "best_for": "Estimating contamination probability across 20+ public benchmarks per architecture.",
67
+ "not_for": "Definitive proof — needs trace inspection. Treat as prior, not certainty."
68
+ },
69
+ {
70
+ "id": "vendor_self_reported",
71
+ "category": "eval_reliability",
72
+ "pain": "Vendor-reported scores untrustworthy (Llama 4 mixed-quality reports).",
73
+ "tafagent_mode": null,
74
+ "external_tools": [
75
+ {"name": "llm-stats verified vs self-reported tags", "url": "https://llm-stats.com/benchmarks/swe-bench-verified", "type": "leaderboard"},
76
+ {"name": "BenchLM.ai confidence indicator", "url": "https://benchlm.ai/", "type": "leaderboard"},
77
+ {"name": "Vellum independent leaderboard", "url": "https://www.vellum.ai/llm-leaderboard", "type": "leaderboard"}
78
+ ],
79
+ "best_for": "Cross-checking vendor blog claims against community-verified runs before quoting.",
80
+ "not_for": "Models that have never been independently verified — assume vendor optimism."
81
+ },
82
+ {
83
+ "id": "arena_ci",
84
+ "category": "eval_reliability",
85
+ "pain": "Chatbot Arena hides confidence intervals — many top-Elo wins are statistically tied.",
86
+ "tafagent_mode": "🎯 Arena CI",
87
+ "external_tools": [
88
+ {"name": "LMArena leaderboard (raw)", "url": "https://lmarena.ai/", "type": "leaderboard"},
89
+ {"name": "Bradley-Terry methodology paper", "url": "https://arxiv.org/abs/2403.04132", "type": "paper"}
90
+ ],
91
+ "best_for": "Reconstructing 95% CIs from raw vote CSVs to flag statistical ties.",
92
+ "not_for": "Inferring true skill — Arena measures preference, not capability."
93
+ },
94
+ {
95
+ "id": "cross_drift",
96
+ "category": "eval_reliability",
97
+ "pain": "Same model, different scores on different setups — bug or noise?",
98
+ "tafagent_mode": "🔀 Drift",
99
+ "external_tools": [
100
+ {"name": "vLLM vs HF transformers consistency study", "url": "https://github.com/vllm-project/vllm/issues/12343", "type": "issue"}
101
+ ],
102
+ "best_for": "Predicting maximum admissible numerical gap between two evaluation frameworks.",
103
+ "not_for": "Identifying the exact root cause — narrows down candidates only."
104
+ },
105
+ {
106
+ "id": "sandbagging",
107
+ "category": "eval_reliability",
108
+ "pain": "Models can strategically underperform on capability evaluations.",
109
+ "tafagent_mode": null,
110
+ "external_tools": [
111
+ {"name": "AI Sandbagging paper", "url": "https://arxiv.org/abs/2406.07358", "type": "paper"},
112
+ {"name": "Covert sandbagging vs CoT monitoring", "url": "https://www.alphaxiv.org/overview/2508.00943", "type": "paper"}
113
+ ],
114
+ "best_for": "Awareness — knowing CoT monitoring can have up to 36% false-negative rate.",
115
+ "not_for": "Live detection — requires running the model and adversarial probes."
116
+ },
117
+ {
118
+ "id": "max_pos_embeddings_unmask",
119
+ "category": "diagnostic",
120
+ "pain": "Config claims 32k/128k context but model attends way less (SWA, YaRN).",
121
+ "tafagent_mode": "🪟 Unmask",
122
+ "external_tools": [
123
+ {"name": "vLLM long-context handling thread", "url": "https://github.com/vllm-project/vllm/issues/16757", "type": "issue"}
124
+ ],
125
+ "best_for": "1-second verdict (HONEST / INFLATED / SEVERELY INFLATED / YARN-EXTENDED) before paying GPU.",
126
+ "not_for": "Validating that the model reasons (vs. just retrieves) at the effective context — use NIAH→Reason."
127
+ },
128
+ {
129
+ "id": "niah_reasoning",
130
+ "category": "diagnostic",
131
+ "pain": "Long-context models pass NIAH but fail multi-hop reasoning.",
132
+ "tafagent_mode": "🔍 NIAH→Reason",
133
+ "external_tools": [
134
+ {"name": "NVIDIA RULER benchmark", "url": "https://github.com/NVIDIA/RULER", "type": "tool"},
135
+ {"name": "RULER paper / leaderboard", "url": "https://llm-stats.com/benchmarks/ruler", "type": "leaderboard"}
136
+ ],
137
+ "best_for": "Predicting NIAH and reasoning pass rates from architecture alone — no inference needed.",
138
+ "not_for": "Final go/no-go decision — re-test on your domain after architectural screening passes."
139
+ },
140
+ {
141
+ "id": "tokenizer_glitch",
142
+ "category": "diagnostic",
143
+ "pain": "Glitch tokens / merge residues break inference silently.",
144
+ "tafagent_mode": null,
145
+ "external_tools": [
146
+ {"name": "GlitchMiner (AAAI 2026)", "url": "https://arxiv.org/html/2601.14658v1", "type": "paper"},
147
+ {"name": "Tiktokenizer (browser visualization)", "url": "https://tiktokenizer.vercel.app/", "type": "tool"}
148
+ ],
149
+ "best_for": "Spotting weird tokens. ~4.3% of vocab in Llama-2 / Mistral / DeepSeek-V3 are glitches.",
150
+ "not_for": "Fixing them — requires finetuning or vocab patching."
151
+ },
152
+ {
153
+ "id": "phase_diagram",
154
+ "category": "diagnostic",
155
+ "pain": "Where does my model sit in the architecture phase space (γ × θ)?",
156
+ "tafagent_mode": "📊 Phase diagram",
157
+ "external_tools": [],
158
+ "best_for": "Visualizing 23 reference models and locating yours by Hagedorn line / Padé curve.",
159
+ "not_for": "Quantitative recipe scoring — use Profile mode instead."
160
+ },
161
+ {
162
+ "id": "profile",
163
+ "category": "diagnostic",
164
+ "pain": "Will this model fit my use case across all 5 recipes?",
165
+ "tafagent_mode": "📇 Profile",
166
+ "external_tools": [],
167
+ "best_for": "Scoring all 5 recipes (custom train vs API · long context · budget · hardware · KV cache · etc.) in one pass.",
168
+ "not_for": "Production deployment readiness — Profile is screening, not certification."
169
+ },
170
+ {
171
+ "id": "chat_template",
172
+ "category": "setup",
173
+ "pain": "Forgetting `--apply_chat_template` silently halves multi-turn accuracy.",
174
+ "tafagent_mode": "📜 Chat-template",
175
+ "external_tools": [
176
+ {"name": "lm-eval-harness #1841 (canonical issue)", "url": "https://github.com/EleutherAI/lm-evaluation-harness/issues/1841", "type": "issue"},
177
+ {"name": "HF chat-template docs", "url": "https://huggingface.co/docs/transformers/main/en/chat_templating", "type": "docs"}
178
+ ],
179
+ "best_for": "Detecting which family (Llama-3 / ChatML / Mistral / Gemma / Phi-3 / DeepSeek / Alpaca) and getting the exact CLI flag.",
180
+ "not_for": "Custom templates outside the 7 detected families — verify manually."
181
+ },
182
+ {
183
+ "id": "structured_outputs",
184
+ "category": "setup",
185
+ "pain": "JSON schema engines fail silently; CoT models commit to answer before reasoning.",
186
+ "tafagent_mode": null,
187
+ "external_tools": [
188
+ {"name": "llguidance (constrained decoding)", "url": "https://github.com/guidance-ai/llguidance", "type": "tool"},
189
+ {"name": "Outlines", "url": "https://github.com/dottxt-ai/outlines", "type": "tool"},
190
+ {"name": "JSONLint validator (browser)", "url": "https://jsonlint.com/json-schema", "type": "tool"},
191
+ {"name": "JSONSchemaBench (10K real schemas)", "url": "https://github.com/guidance-ai/jsonschemabench", "type": "benchmark"},
192
+ {"name": "Schema field-ordering anti-patterns explained", "url": "https://collinwilkins.com/articles/structured-output", "type": "article"}
193
+ ],
194
+ "best_for": "Constrained decoding for production. Use llguidance / Outlines / SGLang grammars for 100% schema-valid output.",
195
+ "not_for": "Quick prototypes — function calling is sufficient (95-99% reliable)."
196
+ },
197
+ {
198
+ "id": "mcp_conformance",
199
+ "category": "setup",
200
+ "pain": "MCP server schema doesn't conform to spec — clients silently break.",
201
+ "tafagent_mode": null,
202
+ "external_tools": [
203
+ {"name": "MCP Schema Validator (free, browser-based)", "url": "https://www.mcpserverspot.com/tools/validator", "type": "tool"},
204
+ {"name": "Official MCP spec", "url": "https://github.com/modelcontextprotocol/modelcontextprotocol", "type": "spec"},
205
+ {"name": "FastMCP 3.0 (Jan 2026)", "url": "https://github.com/jlowin/fastmcp", "type": "tool"}
206
+ ],
207
+ "best_for": "One-shot validation of tool/resource/prompt schemas before publishing an MCP server.",
208
+ "not_for": "Runtime testing — use the official inspector for live calls."
209
+ },
210
+ {
211
+ "id": "diagnose_cli",
212
+ "category": "setup",
213
+ "pain": "Need to measure γ_obs on real weights, not just predict from config.",
214
+ "tafagent_mode": "🩺 Diagnose CLI",
215
+ "external_tools": [
216
+ {"name": "TAF paper (Triangulum/karlesmarin)", "url": "https://github.com/karlesmarin/NeurIPS", "type": "paper"}
217
+ ],
218
+ "best_for": "Generating the exact `python cli/diagnose_model.py` command for your model.",
219
+ "not_for": "Browser-only diagnosis — this mode is a builder, not an executor."
220
+ },
221
+ {
222
+ "id": "peft_loading",
223
+ "category": "training",
224
+ "pain": "`get_peft_model()` before `PeftModel.from_pretrained()` silently loads base model — LoRA weights ignored.",
225
+ "tafagent_mode": null,
226
+ "external_tools": [
227
+ {"name": "HF PEFT troubleshooting (canonical)", "url": "https://huggingface.co/docs/peft/main/en/developer_guides/troubleshooting", "type": "docs"},
228
+ {"name": "peft #2115 — original bug report", "url": "https://github.com/huggingface/peft/issues/2115", "type": "issue"},
229
+ {"name": "PEFT get_layer_status() / get_model_status()", "url": "https://huggingface.co/docs/peft/main/en/package_reference/peft_model", "type": "docs"}
230
+ ],
231
+ "best_for": "If you suspect your LoRA isn't being applied, call `model.get_layer_status()` and check `active_adapters` is non-empty.",
232
+ "not_for": null,
233
+ "tafagent_planned_mode": "🔧 PEFT Anti-Pattern Checker (v0.8.2)"
234
+ },
235
+ {
236
+ "id": "intruder_dimensions",
237
+ "category": "training",
238
+ "pain": "LoRA introduces 'intruder dimensions' that contribute to forgetting.",
239
+ "tafagent_mode": null,
240
+ "external_tools": [
241
+ {"name": "PEFT reduce_intruder_dimension utility", "url": "https://huggingface.co/docs/peft/main/en/developer_guides/troubleshooting", "type": "docs"}
242
+ ],
243
+ "best_for": "Post-training cleanup if forgetting metrics regress after LoRA finetune.",
244
+ "not_for": "Heavy domain shift — intruder dim removal won't fix structural forgetting."
245
+ },
246
+ {
247
+ "id": "quant_regime",
248
+ "category": "training",
249
+ "pain": "Will quantization break my model? Which scheme for which arch?",
250
+ "tafagent_mode": "⚖️ Quant",
251
+ "external_tools": [
252
+ {"name": "Maarten Grootendorst quantization newsletter", "url": "https://newsletter.maartengrootendorst.com/p/which-quantization-method-is-right", "type": "article"},
253
+ {"name": "Jarvis Labs vLLM quantization benchmarks", "url": "https://jarvislabs.ai/blog/vllm-quantization-complete-guide-benchmarks", "type": "article"},
254
+ {"name": "oobabooga quant comparison (GPTQ/AWQ/EXL2/GGUF)", "url": "https://oobabooga.github.io/blog/posts/gptq-awq-exl2-llamacpp/", "type": "article"},
255
+ {"name": "Which Quantization (arxiv)", "url": "https://arxiv.org/pdf/2601.14277", "type": "paper"}
256
+ ],
257
+ "best_for": "Predict γ shift + ΔPPL for any (model × scheme) combo. AWQ ~95% / GGUF ~92% / GPTQ ~90% retention.",
258
+ "not_for": "Production quality cert — run a 10-prompt holdout eval after quantization."
259
+ },
260
+ {
261
+ "id": "forgetting",
262
+ "category": "training",
263
+ "pain": "Will my LoRA fine-tune destroy MMLU performance?",
264
+ "tafagent_mode": null,
265
+ "external_tools": [
266
+ {"name": "Scaling Laws for Forgetting (Kleiman et al.)", "url": "https://arxiv.org/html/2401.05605v1", "type": "paper"},
267
+ {"name": "LoRA Learns Less and Forgets Less (Biderman et al., TMLR)", "url": "https://arxiv.org/abs/2405.09673", "type": "paper"},
268
+ {"name": "How Much is Too Much? (LoRA Rank Trade-offs)", "url": "https://arxiv.org/html/2512.15634v1", "type": "paper"}
269
+ ],
270
+ "best_for": "Reading before any new fine-tune. Same (arch, rank) yields Δ from -10pp to +35pp on MMLU.",
271
+ "not_for": "A predictor — variance is too high for a closed-form heuristic. Measure your own holdout."
272
+ },
273
+ {
274
+ "id": "rag_eval",
275
+ "category": "retrieval",
276
+ "pain": "Is my RAG retrieval actually retrieving?",
277
+ "tafagent_mode": null,
278
+ "external_tools": [
279
+ {"name": "RAGAS — automated RAG eval (13.8k★)", "url": "https://github.com/explodinggradients/ragas", "type": "tool"},
280
+ {"name": "TruLens — feedback functions + tracing", "url": "https://www.trulens.org/", "type": "tool"},
281
+ {"name": "DeepEval — 50+ metrics, CI/CD ready", "url": "https://github.com/confident-ai/deepeval", "type": "tool"},
282
+ {"name": "RAG eval frameworks comparison", "url": "https://atlan.com/know/llm-evaluation-frameworks-compared/", "type": "article"}
283
+ ],
284
+ "best_for": "Production RAG monitoring. RAGAS for metric exploration, DeepEval for CI/CD gates, TruLens for dashboards.",
285
+ "not_for": "Browser-only — all three need Python + your retrieval pipeline."
286
+ },
287
+ {
288
+ "id": "embeddings",
289
+ "category": "retrieval",
290
+ "pain": "Which embedding model for my corpus?",
291
+ "tafagent_mode": null,
292
+ "external_tools": [
293
+ {"name": "MTEB Leaderboard (HF official)", "url": "https://huggingface.co/spaces/mteb/leaderboard", "type": "leaderboard"},
294
+ {"name": "MMTEB — 250+ langs", "url": "https://github.com/embeddings-benchmark/mteb", "type": "tool"},
295
+ {"name": "Best embedding models for RAG (2026)", "url": "https://blog.premai.io/best-embedding-models-for-rag-2026-ranked-by-mteb-score-cost-and-self-hosting/", "type": "article"}
296
+ ],
297
+ "best_for": "Cross-comparison of 100+ embedding models on 56 English tasks / 250+ multilingual.",
298
+ "not_for": "Predicting performance on your specific corpus — 'leaderboard ≠ your data'."
299
+ },
300
+ {
301
+ "id": "vlm_eval",
302
+ "category": "multimodal",
303
+ "pain": "Which VLM benchmark, and is my VLM actually seeing?",
304
+ "tafagent_mode": "📈 Saturation (covers MMMU/MMMU-Pro/VisScience)",
305
+ "external_tools": [
306
+ {"name": "MMMU benchmark", "url": "https://mmmu-benchmark.github.io/", "type": "leaderboard"},
307
+ {"name": "VisScience (K-12 science)", "url": "https://arxiv.org/abs/2409.13730", "type": "paper"},
308
+ {"name": "VLM survey 2025", "url": "https://arxiv.org/abs/2501.02189", "type": "paper"}
309
+ ],
310
+ "best_for": "MMMU near-saturated (top-3 ~85.6%); VisScience still discriminative (~46% mean) — pick the harder one.",
311
+ "not_for": "Visual hallucination detection — needs running the VLM with your images."
312
+ },
313
+ {
314
+ "id": "agent_observability",
315
+ "category": "observability",
316
+ "pain": "Why did my agent fail / loop? Can't tell from logs.",
317
+ "tafagent_mode": null,
318
+ "external_tools": [
319
+ {"name": "LangSmith (LangChain ecosystem)", "url": "https://www.langchain.com/langsmith/observability", "type": "tool"},
320
+ {"name": "LangGraph Studio v2 (May 2025)", "url": "https://www.langchain.com/", "type": "tool"},
321
+ {"name": "TruLens (RAG + agent traces)", "url": "https://www.trulens.org/", "type": "tool"},
322
+ {"name": "OpenLLMetry — OTLP-based tracing", "url": "https://github.com/traceloop/openllmetry", "type": "tool"}
323
+ ],
324
+ "best_for": "Visual trace viewer per LLM call / tool invocation / retrieval step. Token + cost tracking.",
325
+ "not_for": "Browser-only — all need integration into your stack."
326
+ },
327
+ {
328
+ "id": "instruction_following",
329
+ "category": "observability",
330
+ "pain": "Best agentic models follow <30% of instructions perfectly on real-world tasks.",
331
+ "tafagent_mode": null,
332
+ "external_tools": [
333
+ {"name": "AGENTIF benchmark", "url": "https://keg.cs.tsinghua.edu.cn/persons/xubin/papers/AgentIF.pdf", "type": "paper"},
334
+ {"name": "Tool output processing benchmark", "url": "https://arxiv.org/html/2510.15955v1", "type": "paper"}
335
+ ],
336
+ "best_for": "Calibrating expectations — performance falls with instruction length and tool constraints.",
337
+ "not_for": "Live testing — needs running the agent on your task suite."
338
+ },
339
+ {
340
+ "id": "saturation_meta_resources",
341
+ "category": "eval_reliability",
342
+ "pain": "I want to read the full state of LLM evaluation 2026.",
343
+ "tafagent_mode": null,
344
+ "external_tools": [
345
+ {"name": "Survey: A Survey on LLM Benchmarks (2508.15361)", "url": "https://arxiv.org/abs/2508.15361", "type": "paper"},
346
+ {"name": "Survey: LLMs-as-Judges (2412.05579)", "url": "https://arxiv.org/abs/2412.05579", "type": "paper"},
347
+ {"name": "Holistic Evaluation of Language Models (HELM)", "url": "https://crfm.stanford.edu/helm/latest/", "type": "tool"},
348
+ {"name": "Open LLM Leaderboard v3", "url": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard", "type": "leaderboard"}
349
+ ],
350
+ "best_for": "Comprehensive context on contamination, judge bias, saturation, methodology open problems.",
351
+ "not_for": "Quick decisions — these are surveys, not tools."
352
+ },
353
+ {
354
+ "id": "config_inspector",
355
+ "category": "diagnostic",
356
+ "pain": "What's actually in this model's config.json?",
357
+ "tafagent_mode": "🔍 Inspect config",
358
+ "external_tools": [
359
+ {"name": "LLM Config Comparer", "url": "https://huggingface.co/spaces/gojiteji/LLM-Comparer", "type": "tool"},
360
+ {"name": "HF Hub model card / config viewer", "url": "https://huggingface.co/", "type": "tool"}
361
+ ],
362
+ "best_for": "Paste config JSON → full TAF analysis without re-fetching.",
363
+ "not_for": "Comparing across N models — use 🆚 Compare or open-llm-leaderboard/comparator."
364
+ },
365
+ {
366
+ "id": "compare_models",
367
+ "category": "diagnostic",
368
+ "pain": "Side-by-side comparison of multiple models on multiple recipes.",
369
+ "tafagent_mode": "🆚 Compare models",
370
+ "external_tools": [
371
+ {"name": "Open LLM Leaderboard Comparator (HF official)", "url": "https://huggingface.co/spaces/open-llm-leaderboard/comparator", "type": "tool"}
372
+ ],
373
+ "best_for": "Quick recipe-by-recipe comparison up to 5 models.",
374
+ "not_for": "Production benchmark scores — use the HF comparator for benchmark results."
375
+ },
376
+ {
377
+ "id": "ask_plain_english",
378
+ "category": "diagnostic",
379
+ "pain": "I just want to ask a question in plain English.",
380
+ "tafagent_mode": "💬 Ask plain English",
381
+ "external_tools": [],
382
+ "best_for": "'Will Mistral-7B handle 16K NIAH retrieval?' → answer with the right recipe + chain.",
383
+ "not_for": "Open-ended chat — this is a routing front-end, not a chatbot."
384
+ },
385
+ {
386
+ "id": "recipe_picker",
387
+ "category": "diagnostic",
388
+ "pain": "I know my use case but not which recipe to apply.",
389
+ "tafagent_mode": "📋 Pick recipe",
390
+ "external_tools": [],
391
+ "best_for": "Browsing the 8 recipes (custom train vs API · long context · budget · hardware · etc.) when you don't know which fits.",
392
+ "not_for": "Running all of them at once — use Profile mode."
393
+ },
394
+ {
395
+ "id": "verified_math",
396
+ "category": "diagnostic",
397
+ "pain": "Can I trust the math behind the diagnostic?",
398
+ "tafagent_mode": null,
399
+ "external_tools": [
400
+ {"name": "Lean theorems (Triangulum/karlesmarin/lean-taf)", "url": "https://github.com/karlesmarin/lean-taf", "type": "spec"},
401
+ {"name": "TAF paper (NeurIPS)", "url": "https://github.com/karlesmarin/NeurIPS", "type": "paper"}
402
+ ],
403
+ "best_for": "37 theorems machine-proven in Lean 4 + Mathlib. Click any badge in the UI to open the source line.",
404
+ "not_for": "Empirical claims — Lean covers algebraic identities, not measurement protocols."
405
+ }
406
+ ]
407
+ }
index.html CHANGED
@@ -216,6 +216,9 @@
216
  <p><strong data-i18n="help.v08.saturation.title">📈 Benchmark Saturation Detector</strong></p>
217
  <p data-i18n="help.v08.saturation.body">MMLU is saturated (top 88-94%), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.</p>
218
 
 
 
 
219
  <h3 data-i18n="help.audit.title">The audit chain</h3>
220
  <p data-i18n="help.audit.body">Every result shows the full <strong>Computation Chain</strong> — each formula step with its inputs,
221
  output, and interpretation. Click any step to expand. Cite section numbers (§26.1, §19.1, etc.) refer
@@ -325,6 +328,7 @@
325
  <li data-i18n="inv.v07.drift"><strong>🔀 Drift</strong> — bug or noise? Predict max admissible gap between two evals</li>
326
  <li data-i18n="inv.v07.niah"><strong>🔍 NIAH→Reason</strong> — does your "128k context" actually reason there, or just retrieve?</li>
327
  <li data-i18n="inv.v08.saturation"><strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?</li>
 
328
  </ul>
329
  </details>
330
  </div>
@@ -383,6 +387,7 @@
383
  <button data-mode-link="drift" data-i18n="modes.drift">🔀 Drift</button>
384
  <button data-mode-link="arena" data-i18n="modes.arena">🎯 Arena CI</button>
385
  <button data-mode-link="saturation" data-i18n="modes.saturation">📈 Saturation</button>
 
386
  </div>
387
  </div>
388
  <div class="task-tile">
@@ -450,6 +455,7 @@
450
  <button class="mode-btn" data-mode="drift" role="tab" aria-selected="false" data-i18n="modes.drift">🔀 Drift</button>
451
  <button class="mode-btn" data-mode="niah" role="tab" aria-selected="false" data-i18n="modes.niah">🔍 NIAH→Reason</button>
452
  <button class="mode-btn" data-mode="saturation" role="tab" aria-selected="false" data-i18n="modes.saturation">📈 Saturation</button>
 
453
  </div>
454
  <p id="mode-desc" class="recipe-desc" data-i18n="modes.desc">
455
  <strong>Quickest start</strong>: paste any HuggingFace model id (e.g. <code>meta-llama/Meta-Llama-3-8B</code>),
@@ -997,6 +1003,24 @@
997
  </p>
998
  </section>
999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1000
  <!-- Recipe selector (mode=recipe) -->
1001
  <section id="recipe-section" style="display:none;">
1002
  <h2 data-i18n="recipe.title">📋 Recipe</h2>
 
216
  <p><strong data-i18n="help.v08.saturation.title">📈 Benchmark Saturation Detector</strong></p>
217
  <p data-i18n="help.v08.saturation.body">MMLU is saturated (top 88-94%), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.</p>
218
 
219
+ <p><strong data-i18n="help.v081.hub.title">🧭 Solutions Hub</strong></p>
220
+ <p data-i18n="help.v081.hub.body">tafagent as integrator, not silo. 30+ pains across 7 categories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), each mapped to (a) the tafagent mode that addresses it, if any, and (b) the best-of-breed external tools the community already trusts (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Search box matches across pain, scenario, and tool name. <em>Use case</em>: 'I have problem X — does tafagent solve it, and if not, who does?'</p>
221
+
222
  <h3 data-i18n="help.audit.title">The audit chain</h3>
223
  <p data-i18n="help.audit.body">Every result shows the full <strong>Computation Chain</strong> — each formula step with its inputs,
224
  output, and interpretation. Click any step to expand. Cite section numbers (§26.1, §19.1, etc.) refer
 
328
  <li data-i18n="inv.v07.drift"><strong>🔀 Drift</strong> — bug or noise? Predict max admissible gap between two evals</li>
329
  <li data-i18n="inv.v07.niah"><strong>🔍 NIAH→Reason</strong> — does your "128k context" actually reason there, or just retrieve?</li>
330
  <li data-i18n="inv.v08.saturation"><strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?</li>
331
+ <li data-i18n="inv.v081.hub"><strong>🧭 Solutions Hub</strong> — every documented pain mapped to a tafagent mode or curated external tool. Don't reinvent — find.</li>
332
  </ul>
333
  </details>
334
  </div>
 
387
  <button data-mode-link="drift" data-i18n="modes.drift">🔀 Drift</button>
388
  <button data-mode-link="arena" data-i18n="modes.arena">🎯 Arena CI</button>
389
  <button data-mode-link="saturation" data-i18n="modes.saturation">📈 Saturation</button>
390
+ <button data-mode-link="hub" data-i18n="modes.hub">🧭 Solutions</button>
391
  </div>
392
  </div>
393
  <div class="task-tile">
 
455
  <button class="mode-btn" data-mode="drift" role="tab" aria-selected="false" data-i18n="modes.drift">🔀 Drift</button>
456
  <button class="mode-btn" data-mode="niah" role="tab" aria-selected="false" data-i18n="modes.niah">🔍 NIAH→Reason</button>
457
  <button class="mode-btn" data-mode="saturation" role="tab" aria-selected="false" data-i18n="modes.saturation">📈 Saturation</button>
458
+ <button class="mode-btn" data-mode="hub" role="tab" aria-selected="false" data-i18n="modes.hub">🧭 Solutions</button>
459
  </div>
460
  <p id="mode-desc" class="recipe-desc" data-i18n="modes.desc">
461
  <strong>Quickest start</strong>: paste any HuggingFace model id (e.g. <code>meta-llama/Meta-Llama-3-8B</code>),
 
1003
  </p>
1004
  </section>
1005
 
1006
+ <!-- Solutions Hub — integrator portal (v0.8.1) -->
1007
+ <section id="hub-section" style="display:none;">
1008
+ <h2><span data-i18n="hub.title">🧭 Solutions Hub</span>
1009
+ <span class="info"><span class="tooltip" data-i18n="hub.tip">
1010
+ Map of every documented LLM-eval pain we know about: which tafagent mode addresses it (if any), and the best-of-breed external tools the community already trusts. Goal: full coverage. If a canonical tool exists elsewhere, we link rather than rebuild.
1011
+ </span></span>
1012
+ </h2>
1013
+ <p class="recipe-desc" data-i18n="hub.desc">
1014
+ <strong>Don't reinvent — find.</strong> 30+ pains mapped to tafagent modes + curated external tools. Browse by category, search by keyword, or see the gaps where new modes would help most.
1015
+ </p>
1016
+ <div class="form-row">
1017
+ <input type="text" id="hub-search" placeholder="search: e.g. 'forgetting' or 'vendor' or 'RAG'…" style="flex:1;" />
1018
+ <button type="button" id="hub-clear-btn" class="secondary" data-i18n="hub.clear_btn">✕ Clear</button>
1019
+ </div>
1020
+ <p id="hub-status" class="recipe-desc" style="font-size:0.92em;"></p>
1021
+ <div id="hub-output" style="margin-top: 1em;"></div>
1022
+ </section>
1023
+
1024
  <!-- Recipe selector (mode=recipe) -->
1025
  <section id="recipe-section" style="display:none;">
1026
  <h2 data-i18n="recipe.title">📋 Recipe</h2>
js/i18n.js CHANGED
@@ -423,6 +423,8 @@ export const TRANSLATIONS = {
423
  "mode_desc.niah": "Predicts NIAH (retrieval) and multi-hop reasoning pass rates at any context. Solves: long-context models often pass NIAH but fail reasoning at the same context (RULER paper).",
424
  "modes.saturation": "📈 Saturation",
425
  "mode_desc.saturation": "Tells you whether a benchmark still discriminates frontier models or has saturated (e.g. MMLU 88-94% top, AIME 2025 already 96-100%). Returns top-3 + verdict + recommended replacements.",
 
 
426
  "niah.title": "🔍 NIAH → Reasoning Gap",
427
  "niah.tip": "NIAH (Needle in a Haystack) tests retrieval: 'find this fact in long text'. Multi-hop reasoning tests inference: 'combine facts X+Y at the start with fact Z at the end'. RULER paper (NVIDIA 2024) shows long-context models often pass NIAH but fail reasoning at the same context. This tool predicts both pass rates from architecture alone.",
428
  "niah.desc": "<strong>Your model claims 128k context. Will it actually reason at 64k, or just retrieve?</strong> Paste an HF model id and a target eval context — tool predicts NIAH and multi-hop reasoning pass rates, the gap, and a 'safe context' where reasoning stays ≥65%.",
@@ -501,6 +503,22 @@ export const TRANSLATIONS = {
501
  "help.v08.saturation.title": "📈 Benchmark Saturation Detector",
502
  "help.v08.saturation.body": "MMLU is saturated (88-94% top), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.",
503
  "inv.v08.saturation": "<strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
 
505
  // v0.7.7 — Task tiles (UX restructure: 14 modes grouped by user intent)
506
  "tiles.title": "🎯 What do you want to do?",
@@ -1367,6 +1385,8 @@ export const TRANSLATIONS = {
1367
  "mode_desc.niah": "Predice tasas de pass de NIAH (retrieval) y reasoning multi-hop a cualquier contexto. Resuelve: modelos long-context pasan NIAH pero fallan reasoning al mismo contexto (paper RULER).",
1368
  "modes.saturation": "📈 Saturación",
1369
  "mode_desc.saturation": "Te dice si un benchmark sigue discriminando frontier models o ya está saturado (ej. MMLU 88-94% top, AIME 2025 ya 96-100%). Devuelve top-3 + veredicto + reemplazos recomendados.",
 
 
1370
  "niah.title": "🔍 Gap NIAH → Reasoning",
1371
  "niah.tip": "NIAH (Needle in a Haystack) testea retrieval: 'encuentra este hecho en texto largo'. Reasoning multi-hop testea inferencia: 'combina hechos X+Y del principio con hecho Z del final'. El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH pero fallan reasoning al mismo contexto. Esta herramienta predice ambas tasas desde la arquitectura sola.",
1372
  "niah.desc": "<strong>Tu modelo dice 128k de contexto. ¿Razonará realmente a 64k, o solo encontrará?</strong> Pega un model id HF y un contexto objetivo — la herramienta predice tasas de pass NIAH y reasoning multi-hop, el gap, y un 'contexto seguro' donde reasoning se mantiene ≥65%.",
@@ -1445,6 +1465,22 @@ export const TRANSLATIONS = {
1445
  "help.v08.saturation.title": "📈 Detector de saturación de benchmarks",
1446
  "help.v08.saturation.body": "MMLU está saturado (top 88-94%), AIME 2025 saturó a los pocos meses de salir, HumanEval near-saturated. Elige cualquier benchmark y la herramienta retorna top-3 frontier scores, spread, media, y un veredicto — saturated / near-saturated / discriminative — más un reemplazo recomendado (ej. MMLU → MMLU-Pro / GPQA / HLE). Fetch en vivo desde DemandSphere AI Frontier Tracker (CC BY-NC 4.0) cuando llega; snapshot baked 2026-05-05 cuando no. <em>Caso de uso</em>: antes de citar '92% en MMLU' o diseñar una eval, verifica si el benchmark aún discrimina algo.",
1447
  "inv.v08.saturation": "<strong>📈 Saturation</strong> — ¿sigue siendo útil tu benchmark, o están todos los frontiers empatados arriba?",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1448
 
1449
  // v0.7.7 — Tiles de tareas (UX restructure: 14 modos agrupados por intención)
1450
  "tiles.title": "🎯 ¿Qué quieres hacer?",
@@ -2175,6 +2211,8 @@ export const TRANSLATIONS = {
2175
  "mode_desc.niah": "Prédit les taux de réussite NIAH (retrieval) et reasoning multi-hop à n'importe quel contexte. Résout : les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte (paper RULER).",
2176
  "modes.saturation": "📈 Saturation",
2177
  "mode_desc.saturation": "Indique si un benchmark discrimine encore les frontier models ou s'il est saturé (ex. MMLU 88-94% top, AIME 2025 déjà 96-100%). Retourne top-3 + verdict + remplacements recommandés.",
 
 
2178
  "niah.title": "🔍 Gap NIAH → Reasoning",
2179
  "niah.tip": "NIAH (Needle in a Haystack) teste le retrieval : 'trouve ce fait dans un long texte'. Le reasoning multi-hop teste l'inférence : 'combine les faits X+Y au début avec le fait Z à la fin'. Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte. Cet outil prédit les deux taux à partir de la seule architecture.",
2180
  "niah.desc": "<strong>Votre modèle revendique 128k de contexte. Va-t-il vraiment raisonner à 64k, ou seulement retrouver ?</strong> Collez un model id HF et un contexte cible — l'outil prédit les taux de réussite NIAH et reasoning multi-hop, le gap, et un 'contexte sûr' où le reasoning reste ≥65%.",
@@ -2253,6 +2291,22 @@ export const TRANSLATIONS = {
2253
  "help.v08.saturation.title": "📈 Détecteur de saturation des benchmarks",
2254
  "help.v08.saturation.body": "MMLU est saturé (top 88-94%), AIME 2025 saturé en quelques mois après sa sortie, HumanEval presque saturé. Choisissez un benchmark et l'outil retourne top-3 frontier scores, spread, moyenne, et un verdict — saturated / near-saturated / discriminative — plus un remplacement recommandé (ex. MMLU → MMLU-Pro / GPQA / HLE). Fetch en direct depuis DemandSphere AI Frontier Tracker (CC BY-NC 4.0) si accessible ; snapshot baked 2026-05-05 sinon. <em>Cas d'usage</em> : avant de citer '92% sur MMLU' ou de concevoir une eval, vérifiez si le benchmark discrimine encore quelque chose.",
2255
  "inv.v08.saturation": "<strong>📈 Saturation</strong> — votre benchmark est-il encore utile, ou tous les frontiers sont-ils à égalité au sommet ?",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2256
 
2257
  // v0.7.7 — Tuiles de tâches (refonte UX : 14 modes regroupés par intention)
2258
  "tiles.title": "🎯 Que voulez-vous faire ?",
@@ -2983,6 +3037,8 @@ export const TRANSLATIONS = {
2983
  "mode_desc.niah": "在任意上下文下预测 NIAH(检索)与多跳 reasoning 通过率。解决:长上下文模型常常通过 NIAH 但在同一上下文上 reasoning 失败(RULER 论文)。",
2984
  "modes.saturation": "📈 饱和度",
2985
  "mode_desc.saturation": "告诉你某个 benchmark 是否仍能区分 frontier 模型,或者已经饱和(例如 MMLU 88-94% 顶部,AIME 2025 已经 96-100%)。返回 top-3 + 判定 + 推荐替代品。",
 
 
2986
  "niah.title": "🔍 NIAH → Reasoning Gap",
2987
  "niah.tip": "NIAH(Needle in a Haystack)测试检索:\"在长文本中找到这个事实\"。多跳 reasoning 测试推理:\"把开头的事实 X+Y 与结尾的事实 Z 结合\"。RULER 论文(NVIDIA 2024)显示长上下文模型经常通过 NIAH 但在相同上下文上 reasoning 失败。本工具仅根据架构预测两种通过率。",
2988
  "niah.desc": "<strong>你的模型声称 128k 上下文。它在 64k 是真的能 reasoning,还是只能检索?</strong>粘贴 HF 模型 id 和目标 eval 上下文 — 工具预测 NIAH 与多跳 reasoning 通过率、gap,以及 reasoning 保持 ≥65% 的 \"安全上下文\"。",
@@ -3061,6 +3117,22 @@ export const TRANSLATIONS = {
3061
  "help.v08.saturation.title": "📈 Benchmark 饱和度检测器",
3062
  "help.v08.saturation.body": "MMLU 已饱和(top 88-94%),AIME 2025 上线几个月就饱和,HumanEval 接近饱和。选任何 benchmark,工具返回 top-3 frontier 分数、spread、平均,以及判定 — saturated / near-saturated / discriminative — 加上推荐替代品(例如 MMLU → MMLU-Pro / GPQA / HLE)。可达时从 DemandSphere AI Frontier Tracker(CC BY-NC 4.0)实时 fetch;不可达时使用 2026-05-05 的 baked 快照。<em>用例</em>:在引用\"92% on MMLU\"或设计 eval 之前,检查 benchmark 是否仍能区分任何东西。",
3063
  "inv.v08.saturation": "<strong>📈 Saturation</strong> — 你的 benchmark 还有用吗,还是所有 frontier 都在顶部并列?",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3064
 
3065
  // v0.7.7 — 任务卡片(UX 重构:按用户意图分组的 14 个模式)
3066
  "tiles.title": "🎯 你想做什么?",
 
423
  "mode_desc.niah": "Predicts NIAH (retrieval) and multi-hop reasoning pass rates at any context. Solves: long-context models often pass NIAH but fail reasoning at the same context (RULER paper).",
424
  "modes.saturation": "📈 Saturation",
425
  "mode_desc.saturation": "Tells you whether a benchmark still discriminates frontier models or has saturated (e.g. MMLU 88-94% top, AIME 2025 already 96-100%). Returns top-3 + verdict + recommended replacements.",
426
+ "modes.hub": "🧭 Solutions",
427
+ "mode_desc.hub": "Map of every documented LLM-eval pain → tafagent mode (if covered) + curated external tools. Find the right solution without rebuilding it. 30+ pains, 7 categories.",
428
  "niah.title": "🔍 NIAH → Reasoning Gap",
429
  "niah.tip": "NIAH (Needle in a Haystack) tests retrieval: 'find this fact in long text'. Multi-hop reasoning tests inference: 'combine facts X+Y at the start with fact Z at the end'. RULER paper (NVIDIA 2024) shows long-context models often pass NIAH but fail reasoning at the same context. This tool predicts both pass rates from architecture alone.",
430
  "niah.desc": "<strong>Your model claims 128k context. Will it actually reason at 64k, or just retrieve?</strong> Paste an HF model id and a target eval context — tool predicts NIAH and multi-hop reasoning pass rates, the gap, and a 'safe context' where reasoning stays ≥65%.",
 
503
  "help.v08.saturation.title": "📈 Benchmark Saturation Detector",
504
  "help.v08.saturation.body": "MMLU is saturated (88-94% top), AIME 2025 saturated within months of release, HumanEval near-saturated. Pick any benchmark and the tool returns top-3 frontier scores, spread, mean, and a verdict — saturated / near-saturated / discriminative — plus a recommended replacement (e.g. MMLU → MMLU-Pro / GPQA / HLE). Live fetch from DemandSphere AI Frontier Tracker (CC BY-NC 4.0) when reachable; baked 2026-05-05 snapshot when not. <em>Use case</em>: before you cite '92% on MMLU' or design an eval, check whether the benchmark still discriminates anything.",
505
  "inv.v08.saturation": "<strong>📈 Saturation</strong> — is your benchmark still useful, or are all frontier models tied at the top?",
506
+ "inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — every documented pain mapped to a tafagent mode or curated external tool. Don't reinvent — find.",
507
+ "help.v081.hub.title": "🧭 Solutions Hub",
508
+ "help.v081.hub.body": "tafagent as integrator, not silo. 30+ pains across 7 categories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), each mapped to (a) the tafagent mode that addresses it, if any, and (b) the best-of-breed external tools the community already trusts (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Search box matches across pain, scenario, and tool name. <em>Use case</em>: 'I have problem X — does tafagent solve it, and if not, who does?'",
509
+ "hub.title": "🧭 Solutions Hub",
510
+ "hub.tip": "Map of every documented LLM-eval pain we know about: which tafagent mode addresses it (if any), and the best-of-breed external tools the community already trusts. Goal: full coverage. If a canonical tool exists elsewhere, we link rather than rebuild.",
511
+ "hub.desc": "<strong>Don't reinvent — find.</strong> 30+ pains mapped to tafagent modes + curated external tools. Browse by category, search by keyword, or see the gaps where new modes would help most.",
512
+ "hub.clear_btn": "✕ Clear",
513
+ "hub.no_mode": "external",
514
+ "hub.planned": "planned:",
515
+ "hub.best_for": "Best for",
516
+ "hub.not_for": "Not for",
517
+ "hub.tools": "External tools",
518
+ "hub.status.loaded": "✅ Loaded {total} pains across {categories} categories — {covered} covered by tafagent modes, {externalLinks} external links curated. Compiled {compiled}.",
519
+ "hub.status.fail": "⚠ Could not load Solutions Hub.",
520
+ "hub.search.empty": "No matches for '{query}'. Try broader terms (e.g. 'eval', 'rag', 'tokenizer').",
521
+ "hub.search.results": "Found {n} match(es) for '{query}'.",
522
 
523
  // v0.7.7 — Task tiles (UX restructure: 14 modes grouped by user intent)
524
  "tiles.title": "🎯 What do you want to do?",
 
1385
  "mode_desc.niah": "Predice tasas de pass de NIAH (retrieval) y reasoning multi-hop a cualquier contexto. Resuelve: modelos long-context pasan NIAH pero fallan reasoning al mismo contexto (paper RULER).",
1386
  "modes.saturation": "📈 Saturación",
1387
  "mode_desc.saturation": "Te dice si un benchmark sigue discriminando frontier models o ya está saturado (ej. MMLU 88-94% top, AIME 2025 ya 96-100%). Devuelve top-3 + veredicto + reemplazos recomendados.",
1388
+ "modes.hub": "🧭 Soluciones",
1389
+ "mode_desc.hub": "Mapa de cada problema documentado de LLM-eval → mode tafagent (si cubierto) + herramientas externas curadas. Encuentra la solución sin reinventarla. 30+ pains, 7 categorías.",
1390
  "niah.title": "🔍 Gap NIAH → Reasoning",
1391
  "niah.tip": "NIAH (Needle in a Haystack) testea retrieval: 'encuentra este hecho en texto largo'. Reasoning multi-hop testea inferencia: 'combina hechos X+Y del principio con hecho Z del final'. El paper RULER (NVIDIA 2024) muestra que modelos long-context a menudo pasan NIAH pero fallan reasoning al mismo contexto. Esta herramienta predice ambas tasas desde la arquitectura sola.",
1392
  "niah.desc": "<strong>Tu modelo dice 128k de contexto. ¿Razonará realmente a 64k, o solo encontrará?</strong> Pega un model id HF y un contexto objetivo — la herramienta predice tasas de pass NIAH y reasoning multi-hop, el gap, y un 'contexto seguro' donde reasoning se mantiene ≥65%.",
 
1465
  "help.v08.saturation.title": "📈 Detector de saturación de benchmarks",
1466
  "help.v08.saturation.body": "MMLU está saturado (top 88-94%), AIME 2025 saturó a los pocos meses de salir, HumanEval near-saturated. Elige cualquier benchmark y la herramienta retorna top-3 frontier scores, spread, media, y un veredicto — saturated / near-saturated / discriminative — más un reemplazo recomendado (ej. MMLU → MMLU-Pro / GPQA / HLE). Fetch en vivo desde DemandSphere AI Frontier Tracker (CC BY-NC 4.0) cuando llega; snapshot baked 2026-05-05 cuando no. <em>Caso de uso</em>: antes de citar '92% en MMLU' o diseñar una eval, verifica si el benchmark aún discrimina algo.",
1467
  "inv.v08.saturation": "<strong>📈 Saturation</strong> — ¿sigue siendo útil tu benchmark, o están todos los frontiers empatados arriba?",
1468
+ "inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — cada pain documentado mapeado a un mode tafagent o herramienta externa curada. No reinventes — encuentra.",
1469
+ "help.v081.hub.title": "🧭 Solutions Hub",
1470
+ "help.v081.hub.body": "tafagent como integrador, no silo. 30+ pains en 7 categorías (eval reliability · diagnósticos · setup · training · retrieval · multimodal · observability), cada uno mapeado a (a) el mode tafagent que lo resuelve, si existe, y (b) las herramientas externas best-of-breed que la comunidad ya usa (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). Caja de búsqueda matchea pain, scenario, y nombre de herramienta. <em>Caso de uso</em>: 'tengo problema X — ¿lo resuelve tafagent, y si no, quién?'",
1471
+ "hub.title": "🧭 Solutions Hub",
1472
+ "hub.tip": "Mapa de cada pain de LLM-eval documentado: qué mode tafagent lo resuelve (si alguno), y las herramientas externas best-of-breed que la comunidad ya usa. Objetivo: cobertura total. Si la herramienta canónica existe en otra parte, enlazamos en vez de rebuildear.",
1473
+ "hub.desc": "<strong>No reinventes — encuentra.</strong> 30+ pains mapeados a modes tafagent + herramientas externas curadas. Navega por categoría, busca por keyword, o ve los huecos donde nuevos modes ayudarían más.",
1474
+ "hub.clear_btn": "✕ Limpiar",
1475
+ "hub.no_mode": "externo",
1476
+ "hub.planned": "planeado:",
1477
+ "hub.best_for": "Mejor para",
1478
+ "hub.not_for": "No para",
1479
+ "hub.tools": "Herramientas externas",
1480
+ "hub.status.loaded": "✅ Cargados {total} pains en {categories} categorías — {covered} cubiertos por modes tafagent, {externalLinks} enlaces externos curados. Compilado {compiled}.",
1481
+ "hub.status.fail": "⚠ No se pudo cargar Solutions Hub.",
1482
+ "hub.search.empty": "Sin coincidencias para '{query}'. Prueba términos más amplios (ej. 'eval', 'rag', 'tokenizer').",
1483
+ "hub.search.results": "Encontradas {n} coincidencia(s) para '{query}'.",
1484
 
1485
  // v0.7.7 — Tiles de tareas (UX restructure: 14 modos agrupados por intención)
1486
  "tiles.title": "🎯 ¿Qué quieres hacer?",
 
2211
  "mode_desc.niah": "Prédit les taux de réussite NIAH (retrieval) et reasoning multi-hop à n'importe quel contexte. Résout : les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte (paper RULER).",
2212
  "modes.saturation": "📈 Saturation",
2213
  "mode_desc.saturation": "Indique si un benchmark discrimine encore les frontier models ou s'il est saturé (ex. MMLU 88-94% top, AIME 2025 déjà 96-100%). Retourne top-3 + verdict + remplacements recommandés.",
2214
+ "modes.hub": "🧭 Solutions",
2215
+ "mode_desc.hub": "Carte de chaque problème documenté de LLM-eval → mode tafagent (si couvert) + outils externes curés. Trouvez la solution sans la réinventer. 30+ pains, 7 catégories.",
2216
  "niah.title": "🔍 Gap NIAH → Reasoning",
2217
  "niah.tip": "NIAH (Needle in a Haystack) teste le retrieval : 'trouve ce fait dans un long texte'. Le reasoning multi-hop teste l'inférence : 'combine les faits X+Y au début avec le fait Z à la fin'. Le paper RULER (NVIDIA 2024) montre que les modèles long-context passent souvent NIAH mais échouent au reasoning au même contexte. Cet outil prédit les deux taux à partir de la seule architecture.",
2218
  "niah.desc": "<strong>Votre modèle revendique 128k de contexte. Va-t-il vraiment raisonner à 64k, ou seulement retrouver ?</strong> Collez un model id HF et un contexte cible — l'outil prédit les taux de réussite NIAH et reasoning multi-hop, le gap, et un 'contexte sûr' où le reasoning reste ≥65%.",
 
2291
  "help.v08.saturation.title": "📈 Détecteur de saturation des benchmarks",
2292
  "help.v08.saturation.body": "MMLU est saturé (top 88-94%), AIME 2025 saturé en quelques mois après sa sortie, HumanEval presque saturé. Choisissez un benchmark et l'outil retourne top-3 frontier scores, spread, moyenne, et un verdict — saturated / near-saturated / discriminative — plus un remplacement recommandé (ex. MMLU → MMLU-Pro / GPQA / HLE). Fetch en direct depuis DemandSphere AI Frontier Tracker (CC BY-NC 4.0) si accessible ; snapshot baked 2026-05-05 sinon. <em>Cas d'usage</em> : avant de citer '92% sur MMLU' ou de concevoir une eval, vérifiez si le benchmark discrimine encore quelque chose.",
2293
  "inv.v08.saturation": "<strong>📈 Saturation</strong> — votre benchmark est-il encore utile, ou tous les frontiers sont-ils à égalité au sommet ?",
2294
+ "inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — chaque pain documenté mappé à un mode tafagent ou outil externe curé. Ne réinventez pas — trouvez.",
2295
+ "help.v081.hub.title": "🧭 Solutions Hub",
2296
+ "help.v081.hub.body": "tafagent comme intégrateur, pas silo. 30+ pains à travers 7 catégories (eval reliability · diagnostics · setup · training · retrieval · multimodal · observability), chacun mappé à (a) le mode tafagent qui le résout, s'il existe, et (b) les outils externes best-of-breed que la communauté utilise déjà (RAGAS, MTEB, HELM, MCP Schema Validator, llm-stats, llguidance, GlitchMiner, etc.). La barre de recherche matche pain, scénario, et nom d'outil. <em>Cas d'usage</em> : 'j'ai le problème X — tafagent le résout-il, et sinon, qui ?'",
2297
+ "hub.title": "🧭 Solutions Hub",
2298
+ "hub.tip": "Carte de chaque pain de LLM-eval documenté : quel mode tafagent l'adresse (si applicable), et les outils externes best-of-breed que la communauté utilise déjà. Objectif : couverture totale. Si l'outil canonique existe ailleurs, nous lions plutôt que de reconstruire.",
2299
+ "hub.desc": "<strong>Ne réinventez pas — trouvez.</strong> 30+ pains mappés à des modes tafagent + outils externes curés. Naviguez par catégorie, recherchez par mot-clé, ou voyez les lacunes où de nouveaux modes aideraient le plus.",
2300
+ "hub.clear_btn": "✕ Effacer",
2301
+ "hub.no_mode": "externe",
2302
+ "hub.planned": "prévu :",
2303
+ "hub.best_for": "Idéal pour",
2304
+ "hub.not_for": "Pas pour",
2305
+ "hub.tools": "Outils externes",
2306
+ "hub.status.loaded": "✅ Chargés {total} pains dans {categories} catégories — {covered} couverts par des modes tafagent, {externalLinks} liens externes curés. Compilé {compiled}.",
2307
+ "hub.status.fail": "⚠ Impossible de charger Solutions Hub.",
2308
+ "hub.search.empty": "Aucune correspondance pour '{query}'. Essayez des termes plus larges (ex. 'eval', 'rag', 'tokenizer').",
2309
+ "hub.search.results": "{n} correspondance(s) trouvée(s) pour '{query}'.",
2310
 
2311
  // v0.7.7 — Tuiles de tâches (refonte UX : 14 modes regroupés par intention)
2312
  "tiles.title": "🎯 Que voulez-vous faire ?",
 
3037
  "mode_desc.niah": "在任意上下文下预测 NIAH(检索)与多跳 reasoning 通过率。解决:长上下文模型常常通过 NIAH 但在同一上下文上 reasoning 失败(RULER 论文)。",
3038
  "modes.saturation": "📈 饱和度",
3039
  "mode_desc.saturation": "告诉你某个 benchmark 是否仍能区分 frontier 模型,或者已经饱和(例如 MMLU 88-94% 顶部,AIME 2025 已经 96-100%)。返回 top-3 + 判定 + 推荐替代品。",
3040
+ "modes.hub": "🧭 方案",
3041
+ "mode_desc.hub": "每个 LLM-eval 问题的地图 → tafagent 模式(若覆盖)+ 精选外部工具。找到方案而非重新发明。30+ 问题,7 类别。",
3042
  "niah.title": "🔍 NIAH → Reasoning Gap",
3043
  "niah.tip": "NIAH(Needle in a Haystack)测试检索:\"在长文本中找到这个事实\"。多跳 reasoning 测试推理:\"把开头的事实 X+Y 与结尾的事实 Z 结合\"。RULER 论文(NVIDIA 2024)显示长上下文模型经常通过 NIAH 但在相同上下文上 reasoning 失败。本工具仅根据架构预测两种通过率。",
3044
  "niah.desc": "<strong>你的模型声称 128k 上下文。它在 64k 是真的能 reasoning,还是只能检索?</strong>粘贴 HF 模型 id 和目标 eval 上下文 — 工具预测 NIAH 与多跳 reasoning 通过率、gap,以及 reasoning 保持 ≥65% 的 \"安全上下文\"。",
 
3117
  "help.v08.saturation.title": "📈 Benchmark 饱和度检测器",
3118
  "help.v08.saturation.body": "MMLU 已饱和(top 88-94%),AIME 2025 上线几个月就饱和,HumanEval 接近饱和。选任何 benchmark,工具返回 top-3 frontier 分数、spread、平均,以及判定 — saturated / near-saturated / discriminative — 加上推荐替代品(例如 MMLU → MMLU-Pro / GPQA / HLE)。可达时从 DemandSphere AI Frontier Tracker(CC BY-NC 4.0)实时 fetch;不可达时使用 2026-05-05 的 baked 快照。<em>用例</em>:在引用\"92% on MMLU\"或设计 eval 之前,检查 benchmark 是否仍能区分任何东西。",
3119
  "inv.v08.saturation": "<strong>📈 Saturation</strong> — 你的 benchmark 还有用吗,还是所有 frontier 都在顶部并列?",
3120
+ "inv.v081.hub": "<strong>🧭 Solutions Hub</strong> — 每个文档化的问题都映射到一个 tafagent 模式或精选外部工具。别重复发明 — 去找。",
3121
+ "help.v081.hub.title": "🧭 Solutions Hub",
3122
+ "help.v081.hub.body": "tafagent 作为集成者而非孤岛。30+ 问题跨 7 类别(评估可靠性 · 诊断 · 设置 · 训练 · 检索 · 多模态 · 可观测性),每个映射到(a)解决它的 tafagent 模式(若存在),以及(b)社区已信任的最佳外部工具(RAGAS、MTEB、HELM、MCP Schema Validator、llm-stats、llguidance、GlitchMiner 等)。搜索框匹配 pain、场景和工具名称。<em>用例</em>:'我有问题 X — tafagent 解决它吗,如果不,谁解决?'",
3123
+ "hub.title": "🧭 Solutions Hub",
3124
+ "hub.tip": "我们已知的每个 LLM-eval 问题的地图:哪个 tafagent 模式能解决它(若有),以及社区已信任的最佳外部工具。目标:全覆盖。如果规范工具已在别处,我们链接而非重建。",
3125
+ "hub.desc": "<strong>别重新发明 — 去找。</strong>30+ 问题映射到 tafagent 模式 + 精选外部工具。按类别浏览、按关键字搜索,或查看新模式最有帮助的空缺。",
3126
+ "hub.clear_btn": "✕ 清空",
3127
+ "hub.no_mode": "外部",
3128
+ "hub.planned": "计划:",
3129
+ "hub.best_for": "适合",
3130
+ "hub.not_for": "不适合",
3131
+ "hub.tools": "外部工具",
3132
+ "hub.status.loaded": "✅ 已加载 {total} 个问题,跨 {categories} 类别 — {covered} 个由 tafagent 模式覆盖,精选 {externalLinks} 个外部链接。编译于 {compiled}。",
3133
+ "hub.status.fail": "⚠ 无法加载 Solutions Hub。",
3134
+ "hub.search.empty": "无 '{query}' 的匹配。尝试更宽泛的词(如 'eval'、'rag'、'tokenizer')。",
3135
+ "hub.search.results": "为 '{query}' 找到 {n} 个匹配。",
3136
 
3137
  // v0.7.7 — 任务卡片(UX 重构:按用户意图分组的 14 个模式)
3138
  "tiles.title": "🎯 你想做什么?",
js/main.js CHANGED
@@ -23,6 +23,10 @@ import {
23
  loadSaturationKB, classifyAll, classifyBenchmark,
24
  listBenchmarks, attribution as saturationAttribution, tryFetchLive,
25
  } from "./saturation_detector.js";
 
 
 
 
26
 
27
  // Attach HF Hub search-as-you-type to all 5 model id inputs (Profile, Recipe,
28
  // Unmask, Template, Quant). Hits public huggingface.co/api/models. Idempotent.
@@ -212,6 +216,7 @@ document.addEventListener("click", (e) => {
212
  template: "template-section", arena: "arena-section", contam: "contam-section",
213
  quant: "quant-section", drift: "drift-section", niah: "niah-section",
214
  saturation: "saturation-section",
 
215
  }[targetMode];
216
  if (sectionId) {
217
  const sec = document.getElementById(sectionId);
@@ -236,7 +241,7 @@ document.querySelectorAll(".mode-btn").forEach(btn => {
236
  "diagnose-section", "phase-section", "unmask-section",
237
  "template-section", "arena-section", "contam-section",
238
  "quant-section", "drift-section", "niah-section",
239
- "saturation-section"].forEach(id => {
240
  const el = $(id);
241
  if (el) el.style.display = "none";
242
  });
@@ -248,12 +253,14 @@ document.querySelectorAll(".mode-btn").forEach(btn => {
248
  template: "template-section", arena: "arena-section", contam: "contam-section",
249
  quant: "quant-section", drift: "drift-section", niah: "niah-section",
250
  saturation: "saturation-section",
 
251
  };
252
  const sectionId = sectionMap[mode];
253
  if (sectionId) $(sectionId).style.display = "";
254
  $("mode-desc").textContent = t(`mode_desc.${mode}`) || "";
255
  if (mode === "phase") initPhaseDiagram();
256
  if (mode === "saturation") initSaturation();
 
257
  });
258
  });
259
 
@@ -3277,6 +3284,106 @@ function runSaturationAll() {
3277
  $("saturation-run-btn")?.addEventListener("click", runSaturationOne);
3278
  $("saturation-all-btn")?.addEventListener("click", runSaturationAll);
3279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3280
  // ════════════════════════════════════════════════════════════════════
3281
  // Bootstrap
3282
  // ════════════════════════════════════════════════════════════════════
 
23
  loadSaturationKB, classifyAll, classifyBenchmark,
24
  listBenchmarks, attribution as saturationAttribution, tryFetchLive,
25
  } from "./saturation_detector.js";
26
+ import {
27
+ loadHub, listCategories, listEntries, searchEntries,
28
+ hubStats, getCategoryMeta,
29
+ } from "./solutions_hub.js";
30
 
31
  // Attach HF Hub search-as-you-type to all 5 model id inputs (Profile, Recipe,
32
  // Unmask, Template, Quant). Hits public huggingface.co/api/models. Idempotent.
 
216
  template: "template-section", arena: "arena-section", contam: "contam-section",
217
  quant: "quant-section", drift: "drift-section", niah: "niah-section",
218
  saturation: "saturation-section",
219
+ hub: "hub-section",
220
  }[targetMode];
221
  if (sectionId) {
222
  const sec = document.getElementById(sectionId);
 
241
  "diagnose-section", "phase-section", "unmask-section",
242
  "template-section", "arena-section", "contam-section",
243
  "quant-section", "drift-section", "niah-section",
244
+ "saturation-section", "hub-section"].forEach(id => {
245
  const el = $(id);
246
  if (el) el.style.display = "none";
247
  });
 
253
  template: "template-section", arena: "arena-section", contam: "contam-section",
254
  quant: "quant-section", drift: "drift-section", niah: "niah-section",
255
  saturation: "saturation-section",
256
+ hub: "hub-section",
257
  };
258
  const sectionId = sectionMap[mode];
259
  if (sectionId) $(sectionId).style.display = "";
260
  $("mode-desc").textContent = t(`mode_desc.${mode}`) || "";
261
  if (mode === "phase") initPhaseDiagram();
262
  if (mode === "saturation") initSaturation();
263
+ if (mode === "hub") initHub();
264
  });
265
  });
266
 
 
3284
  $("saturation-run-btn")?.addEventListener("click", runSaturationOne);
3285
  $("saturation-all-btn")?.addEventListener("click", runSaturationAll);
3286
 
3287
+ // ════════════════════════════════════════════════════════════════════
3288
+ // 🧭 Solutions Hub (v0.8.1) — integrator portal
3289
+ // ════════════════════════════════════════════════════════════════════
3290
// Icon prefixed to each external tool link, keyed by the tool's `type` field.
// The table is built on a null-prototype object: the key comes from data, and on
// a plain object literal a value such as "constructor" or "toString" would resolve
// to an inherited Object.prototype member (truthy), bypassing the "🔗" fallback
// that renderEntry applies to unknown types. Frozen because it is a shared constant.
const HUB_TYPE_BADGE = Object.freeze(Object.assign(Object.create(null), {
  tool: "🔧",
  leaderboard: "📊",
  paper: "📄",
  article: "📝",
  docs: "📘",
  issue: "🐛",
  spec: "📐",
  benchmark: "🧪",
}));
3300
+
3301
+ let __hubInited = false;
3302
+
3303
/**
 * Lazily initialise the Solutions Hub the first time its mode is opened:
 * load the data, write the status line, and render the initial view.
 *
 * Safe to call on every mode switch; after a successful load it is a no-op.
 * A failed load writes the error to #hub-status and releases the guard so the
 * next call retries.
 *
 * @returns {Promise<void>}
 */
async function initHub() {
  if (__hubInited) return;
  // Claim the guard before awaiting so a second call made while the fetch is
  // still in flight does not start another load.
  __hubInited = true;
  try {
    await loadHub();
  } catch (e) {
    // Release the guard: otherwise a single transient failure would leave the
    // hub permanently empty until the page is reloaded.
    __hubInited = false;
    const prefix = t("hub.status.fail") || "⚠ Could not load Solutions Hub.";
    $("hub-status").textContent = `${prefix} ${e?.message || e}`;
    return;
  }
  const stats = hubStats();
  $("hub-status").textContent = tFmt("hub.status.loaded", stats);
  // If the user already typed a query while the data was loading, show the
  // matching results instead of overwriting the view with the full listing.
  const pendingQuery = $("hub-search")?.value ?? "";
  if (pendingQuery.trim()) renderHubSearch(pendingQuery);
  else renderHubAll();
}
3316
+
3317
/**
 * Build the HTML for a single hub entry: a collapsible <details> panel whose
 * summary shows the pain plus a coverage badge, and whose body lists the
 * "best for" / "not for" notes and the external tool links.
 *
 * Entry fields are interpolated as-is (no escaping).
 *
 * @param {object} e - Hub entry; reads pain, tafagent_mode, tafagent_planned_mode,
 *   best_for, not_for and external_tools[] (name, url, type).
 * @returns {string} HTML fragment.
 */
function renderEntry(e) {
  // Coverage badge: shipped mode, else planned mode, else "external" only.
  let modeBadge;
  if (e.tafagent_mode) {
    modeBadge = `<span class="badge" style="background:#3fb950;">${e.tafagent_mode}</span>`;
  } else if (e.tafagent_planned_mode) {
    modeBadge = `<span class="badge" style="background:#d29922;">${t("hub.planned") || "planned:"} ${e.tafagent_planned_mode}</span>`;
  } else {
    modeBadge = `<span class="badge" style="background:#6e7781;">${t("hub.no_mode") || "external"}</span>`;
  }

  // One <li> per external tool; unknown tool types get the generic link icon.
  const toolItems = [];
  for (const tool of e.external_tools || []) {
    const icon = HUB_TYPE_BADGE[tool.type] || "🔗";
    toolItems.push(`<li>${icon} <a href="${tool.url}" target="_blank" rel="noopener noreferrer">${tool.name}</a> <span class="subtle" style="font-size:0.82em;">(${tool.type})</span></li>`);
  }
  const tools = toolItems.join("");

  let bestFor = "";
  if (e.best_for) {
    bestFor = `<p><strong>${t("hub.best_for") || "Best for"}:</strong> ${e.best_for}</p>`;
  }

  let notFor = "";
  if (e.not_for) {
    notFor = `<p><strong>${t("hub.not_for") || "Not for"}:</strong> ${e.not_for}</p>`;
  }

  // The tools section is omitted entirely when the entry has no tools.
  let toolsSection = "";
  if (tools) {
    toolsSection = `<p><strong>${t("hub.tools") || "External tools"}:</strong></p><ul>${tools}</ul>`;
  }

  return `
    <details class="unmask-panel" style="margin: 0.5em 0;">
      <summary class="unmask-panel-title">${e.pain} ${modeBadge}</summary>
      ${bestFor}
      ${notFor}
      ${toolsSection}
    </details>
  `;
}
3340
+
3341
/**
 * Render the full hub listing into #hub-output: one open <details> panel per
 * category (icon, label, entry count, description) containing that category's
 * entries. Categories without entries are left out.
 */
function renderHubAll() {
  const panels = [];
  for (const cat of listCategories()) {
    const entries = listEntries(cat.key);
    if (entries.length === 0) continue; // nothing to show for this category

    const inner = entries.map(renderEntry).join("");
    panels.push(`
      <details class="unmask-panel" open style="margin-top: 1em;">
        <summary class="unmask-panel-title" style="font-size:1.05em;">
          ${cat.icon} ${cat.label} <span class="subtle" style="font-size:0.85em;">(${cat.count})</span>
        </summary>
        <p class="recipe-desc" style="font-style:italic;">${cat.description}</p>
        ${inner}
      </details>
    `);
  }
  const html = panels.join("");
  $("hub-output").innerHTML = `<div class="arena-result">${html}</div>`;
}
3359
+
3360
// Replace the HTML-significant characters of `text` with entities so it can be
// placed inside markup that is assigned to innerHTML.
function __hubEscapeHtml(text) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  return String(text).replace(/[&<>"']/g, (ch) => entities[ch]);
}

/**
 * Render the search results for `query` into #hub-output: either the
 * "no matches" message or a result-count line followed by the matching entries.
 *
 * The query is user-typed text that ends up inside innerHTML, so it is
 * HTML-escaped before being substituted into the translated message.
 * NOTE(review): assumes tFmt substitutes placeholders verbatim (no escaping of
 * its own) — confirm, otherwise the query would be double-escaped.
 *
 * @param {string} query - Raw text from the search box.
 */
function renderHubSearch(query) {
  const matches = searchEntries(query);
  const safeQuery = __hubEscapeHtml(query);
  if (matches.length === 0) {
    $("hub-output").innerHTML = `<p class="recipe-desc">${tFmt("hub.search.empty", { query: safeQuery })}</p>`;
    return;
  }
  const html = matches.map(renderEntry).join("");
  $("hub-output").innerHTML = `<div class="arena-result">
    <p class="recipe-desc">${tFmt("hub.search.results", { n: matches.length, query: safeQuery })}</p>
    ${html}
  </div>`;
}
3372
+
3373
// Live search wiring for the Solutions Hub.
// Typing is debounced by 200 ms; an empty/whitespace query shows the full listing.
let __hubSearchTimer = null;
$("hub-search")?.addEventListener("input", (e) => {
  clearTimeout(__hubSearchTimer);
  const q = e.target.value;
  __hubSearchTimer = setTimeout(() => {
    __hubSearchTimer = null;
    if (q.trim()) renderHubSearch(q);
    else renderHubAll();
  }, 200);
});
$("hub-clear-btn")?.addEventListener("click", () => {
  // Cancel any debounced search still pending; otherwise it would fire after
  // the reset and render results for a query that is no longer in the box.
  clearTimeout(__hubSearchTimer);
  __hubSearchTimer = null;
  $("hub-search").value = "";
  renderHubAll();
});
3386
+
3387
  // ════════════════════════════════════════════════════════════════════
3388
  // Bootstrap
3389
  // ════════════════════════════════════════════════════════════════════
js/solutions_hub.js ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Solutions Hub (v0.8.1)
2
+ // tafagent as integrator/curator. Pain → tafagent mode (if shipped) +
3
+ // external best-of-breed tools. Pure logic — no human strings; main.js
4
+ // renders with i18n.
5
+
6
// Parsed hub payload; stays null until loadHub() has succeeded once.
let _hub = null;
// Promise of the load currently in flight, shared by concurrent callers;
// null whenever no load is running.
let _hubLoading = null;

/**
 * Fetch and cache the Solutions Hub JSON.
 *
 * The first successful load is cached for the lifetime of the module, so
 * later calls return the same object without touching the network (the
 * `url` of later calls is then ignored). Calls made while a load is still
 * in flight share that load instead of issuing duplicate requests. A failed
 * load is not cached, so the next call retries.
 *
 * @param {string} [url="./data/solutions_hub.json"] - Location of the hub JSON.
 * @returns {Promise<object>} The parsed hub payload.
 * @throws {Error} If the response is not OK, or the payload lacks a
 *   `categories` object or an `entries` array (both are dereferenced by the
 *   list/search helpers of this module).
 */
export async function loadHub(url = "./data/solutions_hub.json") {
  if (_hub) return _hub;
  if (!_hubLoading) {
    _hubLoading = (async () => {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`Hub fetch failed: ${res.status}`);
      const data = await res.json();
      // Reject malformed payloads here rather than caching them and failing
      // later with an opaque TypeError inside a render path.
      const hasCategories =
        data !== null &&
        typeof data === "object" &&
        data.categories !== null &&
        typeof data.categories === "object";
      if (!hasCategories || !Array.isArray(data.entries)) {
        throw new Error("Hub data malformed: expected a `categories` object and an `entries` array");
      }
      _hub = data;
      return data;
    })().finally(() => {
      // Success or failure, the in-flight slot is released: success is now
      // served from _hub, failure must remain retryable.
      _hubLoading = null;
    });
  }
  return _hubLoading;
}
15
+
16
/**
 * Synchronous accessor for the cached hub payload.
 *
 * @returns {object|null} Whatever the module-level `_hub` cache currently
 *   holds; it is null until a load has stored a payload there.
 */
export function getHub() {
  return _hub;
}
17
+
18
/**
 * List every category declared in the hub together with its entry count.
 *
 * @returns {Array<object>} One object per key of `categories`, in key order:
 *   `{ key, ...categoryMeta, count }`, where `count` is the number of entries
 *   whose `category` equals that key (0 when none). Empty array when the hub
 *   is not loaded or declares no categories.
 */
export function listCategories() {
  if (!_hub) return [];
  // Tally entries per category in one pass instead of re-filtering the whole
  // entry list once for every category.
  const tally = new Map();
  // A payload without `entries` / `categories` yields zero counts / an empty
  // list rather than a TypeError.
  for (const entry of _hub.entries ?? []) {
    tally.set(entry.category, (tally.get(entry.category) ?? 0) + 1);
  }
  return Object.entries(_hub.categories ?? {}).map(([key, meta]) => ({
    key,
    ...meta,
    count: tally.get(key) ?? 0,
  }));
}
25
+
26
/**
 * Return hub entries, optionally restricted to one category.
 *
 * @param {string|null} [categoryKey=null] - Category to filter by. Any falsy
 *   value (null, undefined, "") selects all entries.
 * @returns {Array<object>} The hub's own `entries` array when unfiltered, a
 *   new filtered array otherwise, or an empty array when the hub is not loaded.
 */
export function listEntries(categoryKey = null) {
  if (!_hub) return [];
  if (!categoryKey) return _hub.entries;
  return _hub.entries.filter((entry) => entry.category === categoryKey);
}
32
+
33
/**
 * Case-insensitive substring search over hub entries.
 *
 * For each entry the searched text is its `pain`, `best_for`, `not_for` and
 * `tafagent_mode` fields plus the `name` of every external tool, joined with
 * single spaces; missing fields count as empty strings.
 *
 * @param {string} query - Search text; surrounding whitespace is ignored.
 * @returns {Array<object>} Matching entries in hub order. Empty when the hub
 *   is not loaded or the query is empty or blank.
 */
export function searchEntries(query) {
  if (!_hub || !query) return [];
  const needle = query.toLowerCase().trim();
  if (needle === "") return [];

  const matches = [];
  for (const entry of _hub.entries) {
    const toolNames = (entry.external_tools || []).map((tool) => tool.name || "");
    const searchable = [
      entry.pain || "",
      entry.best_for || "",
      entry.not_for || "",
      entry.tafagent_mode || "",
      ...toolNames,
    ];
    if (searchable.join(" ").toLowerCase().includes(needle)) {
      matches.push(entry);
    }
  }
  return matches;
}
49
+
50
/**
 * Look up the metadata object of a single category.
 *
 * @param {string} key - Category key as used in the hub's `categories` map.
 * @returns {object|null} The metadata stored under `key`, or null when the
 *   hub is not loaded, has no categories, or does not define that key.
 */
export function getCategoryMeta(key) {
  const categories = _hub?.categories;
  // Only own keys count: a plain property read would also resolve inherited
  // members such as "toString" or "constructor" and hand back a function.
  if (!categories || !Object.prototype.hasOwnProperty.call(categories, key)) {
    return null;
  }
  return categories[key] || null;
}
53
+
54
/**
 * Summary figures for the inventory header.
 *
 * @returns {object|null} null when the hub is not loaded; otherwise
 *   `{ total, covered, planned, externalLinks, categories, compiled }` where
 *   `covered` / `planned` count entries with a truthy `tafagent_mode` /
 *   `tafagent_planned_mode`, `externalLinks` sums the lengths of all
 *   `external_tools` lists, `categories` is the number of category keys and
 *   `compiled` is passed through from the hub payload.
 */
export function hubStats() {
  if (!_hub) return null;

  const entries = _hub.entries;
  let covered = 0;
  let planned = 0;
  let externalLinks = 0;
  for (const entry of entries) {
    if (entry.tafagent_mode) covered += 1;
    if (entry.tafagent_planned_mode) planned += 1;
    externalLinks += entry.external_tools?.length || 0;
  }

  return {
    total: entries.length,
    covered,
    planned,
    externalLinks,
    categories: Object.keys(_hub.categories).length,
    compiled: _hub.compiled,
  };
}