Food Desert committed on
Commit
33fc1b0
·
1 Parent(s): a48a025

Refine tag toggle UI ordering/colors and add category assignment analysis artifacts

Browse files
app.py CHANGED
@@ -1,144 +1,409 @@
1
- import gradio as gr
2
- import os
3
- import logging
4
- import time
5
- import json
6
- from datetime import datetime
7
- from PIL import Image
8
- from pathlib import Path
9
- from typing import Any, Dict, List, Set
10
- from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
11
-
12
- from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
13
- from psq_rag.llm.rewrite import llm_rewrite_prompt
14
- from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
15
- from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags, llm_infer_probe_tags
16
- from psq_rag.retrieval.state import expand_tags_via_implications
17
- from psq_rag.ui.group_ranked_display import rank_groups_from_tfidf, _load_enabled_groups
18
-
19
-
20
- def _split_prompt_commas(s: str) -> List[str]:
21
- return [p.strip() for p in (s or "").split(",") if p.strip()]
22
-
23
- def _norm_for_dedupe(tag: str) -> str:
24
- # your canonical form for lookup/dedupe
25
- return _norm_tag_for_lookup(tag.lower())
26
-
27
- def compose_final_prompt(rewritten_prompt: str, selected_tags: List[str]) -> str:
28
- parts = _split_prompt_commas(rewritten_prompt)
29
- parts.extend(selected_tags)
30
-
31
- seen = set()
32
- out = []
33
- for p in parts:
34
- key = _norm_for_dedupe(p)
35
- if key in seen:
36
- continue
37
- seen.add(key)
38
- out.append(p)
39
-
40
- return ", ".join(out)
41
-
42
-
 
 
43
  def _display_tag_text(tag: str) -> str:
44
  return tag.replace("_", " ")
45
 
46
 
47
- def _escape_prompt_tag(tag: str) -> str:
48
- return (
49
- tag.replace("_", " ")
50
- .replace("(", "\\(")
51
- .replace(")", "\\)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  )
53
 
 
 
 
 
 
54
 
55
- def _ordered_selected_for_prompt(selected: Set[str], row_defs: List[Dict[str, Any]]) -> List[str]:
56
- out: List[str] = []
57
- seen: Set[str] = set()
58
- for row in row_defs:
59
- for tag in row.get("tags", []):
60
- if tag in selected and tag not in seen:
61
- out.append(tag)
62
- seen.add(tag)
63
- # Fallback for any selected tags not present in current rows.
64
- for tag in sorted(selected):
65
- if tag not in seen:
66
- out.append(tag)
67
- seen.add(tag)
68
- return out
69
-
70
 
71
- def _compose_toggle_prompt_text(selected_tags: List[str], row_defs: List[Dict[str, Any]]) -> str:
72
- selected = {t for t in (selected_tags or []) if t}
73
- ordered = _ordered_selected_for_prompt(selected, row_defs or [])
74
- return ", ".join(_escape_prompt_tag(t) for t in ordered)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def _build_toggle_rows(
78
  *,
79
  seed_terms: List[str],
80
- llm_selected_tags: List[str],
 
 
81
  top_groups: int,
82
  top_tags_per_group: int,
83
  group_rank_top_k: int,
84
  ) -> List[Dict[str, Any]]:
85
- ranked_rows = rank_groups_from_tfidf(
86
- seed_terms=seed_terms,
87
- top_groups=max(1, int(top_groups)),
88
- top_tags_per_group=max(1, int(top_tags_per_group)),
89
- group_rank_top_k=max(1, int(group_rank_top_k)),
90
- )
91
  groups_map = _load_enabled_groups()
92
- llm_selected = list(dict.fromkeys(_norm_tag_for_lookup(t) for t in llm_selected_tags if t))
93
-
94
- row_defs: List[Dict[str, Any]] = []
95
- displayed_group_names = [r.group_name for r in ranked_rows]
96
- displayed_group_tag_sets: Dict[str, Set[str]] = {
97
- name: set(groups_map.get(name, [])) for name in displayed_group_names
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  }
99
- tags_in_any_displayed_group: Set[str] = set()
100
- for tag_set in displayed_group_tag_sets.values():
101
- tags_in_any_displayed_group.update(tag_set)
102
-
103
- llm_other = [t for t in llm_selected if t not in tags_in_any_displayed_group]
104
  row_defs.append(
105
  {
106
- "name": "llm_selected_other",
107
- "label": "LLM Selected (Other)",
108
- "tags": llm_other,
 
109
  }
110
  )
111
 
112
  for row in ranked_rows:
113
  group_name = row.group_name
114
  group_tag_set = displayed_group_tag_sets.get(group_name, set())
115
- selected_in_group = [t for t in llm_selected if t in group_tag_set]
116
- ranked_tags = [t for t, _ in row.tags]
 
 
 
 
 
 
 
 
 
 
117
  merged = selected_in_group + [t for t in ranked_tags if t not in selected_in_group]
118
  keep_n = max(max(1, int(top_tags_per_group)), len(selected_in_group))
119
  merged = merged[:keep_n]
 
 
 
 
 
 
 
120
  row_defs.append(
121
  {
122
  "name": group_name,
123
  "label": f"{group_name} (E={row.expected_count:.2f})",
124
  "tags": merged,
 
125
  }
126
  )
127
-
128
- return row_defs
129
-
130
-
131
- def _build_row_component_updates(
132
- row_defs: List[Dict[str, Any]],
133
- selected_tags: List[str],
134
- max_rows: int,
135
- ):
136
- selected = {t for t in (selected_tags or []) if t}
137
- row_values_state: List[List[str]] = []
138
- header_updates = []
139
- checkbox_updates = []
140
-
141
- for idx in range(max_rows):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  if idx < len(row_defs):
143
  row = row_defs[idx]
144
  tags = list(dict.fromkeys(row.get("tags", [])))
@@ -146,716 +411,993 @@ def _build_row_component_updates(
146
  row_values_state.append(values)
147
  visible = bool(tags)
148
  header_updates.append(gr.update(value=f"**{row.get('label', '')}**", visible=visible))
149
- choices = [(_display_tag_text(t), t) for t in tags]
 
 
 
 
 
 
150
  checkbox_updates.append(
151
  gr.update(
152
  choices=choices,
153
  value=values,
154
  visible=visible,
155
- )
156
- )
157
- else:
158
- header_updates.append(gr.update(value="", visible=False))
159
- checkbox_updates.append(gr.update(choices=[], value=[], visible=False))
160
-
161
- prompt_text = _compose_toggle_prompt_text(list(selected), row_defs)
162
- return prompt_text, row_values_state, header_updates, checkbox_updates
163
-
164
-
165
  def _on_toggle_row(
166
  row_idx: int,
167
  changed_values: List[str],
168
  selected_tags_state: List[str],
169
- row_defs_state: List[Dict[str, Any]],
170
- row_values_state: List[List[str]],
171
- max_rows: int,
172
  ):
173
  row_defs = row_defs_state or []
174
  selected = set(selected_tags_state or [])
175
- prev_values = list(row_values_state or [])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
 
177
- while len(prev_values) < len(row_defs):
178
- prev_values.append([])
 
 
 
 
179
 
180
- prev_set = set(prev_values[row_idx]) if row_idx < len(prev_values) else set()
181
- new_set = set(changed_values or [])
182
- selected.update(new_set - prev_set)
183
- selected.difference_update(prev_set - new_set)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
185
- prompt_text, new_row_values_state, _header_updates, checkbox_updates = _build_row_component_updates(
186
- row_defs=row_defs,
187
- selected_tags=list(selected),
188
- max_rows=max_rows,
189
- )
190
 
191
- return [sorted(selected), new_row_values_state, prompt_text, *checkbox_updates]
 
 
 
 
 
 
 
 
 
 
 
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- def _build_ui_payload(
195
- *,
196
- console_text: str,
197
- legacy_prompt_text: str,
198
- row_defs: List[Dict[str, Any]],
199
- selected_tags: List[str],
200
- ):
201
- prompt_text, row_values_state, header_updates, checkbox_updates = _build_row_component_updates(
202
- row_defs=row_defs,
203
- selected_tags=selected_tags,
204
- max_rows=display_max_rows_default,
205
- )
206
- return [
207
- console_text,
208
- legacy_prompt_text,
209
- prompt_text,
210
- sorted(set(selected_tags or [])),
211
- row_defs,
212
- row_values_state,
213
- *header_updates,
214
- *checkbox_updates,
215
- ]
216
-
217
-
218
- def _build_selection_query(
219
- prompt_in: str,
220
- rewritten: str,
221
- structural_tags: List[str],
222
- probe_tags: List[str],
223
- ) -> str:
224
- lines = [f"IMAGE DESCRIPTION: {prompt_in.strip()}"]
225
- if rewritten and rewritten.strip():
226
- lines.append(f"REWRITE PHRASES: {rewritten.strip()}")
227
- hint_tags = []
228
- if structural_tags:
229
- hint_tags.extend(structural_tags)
230
- if probe_tags:
231
- hint_tags.extend(probe_tags)
232
- if hint_tags:
233
- # Keep hints as context only; selection still must choose by candidate indices.
234
- lines.append(
235
- "INFERRED TAG HINTS (context only): " + ", ".join(sorted(set(hint_tags)))
236
- )
237
- return "\n".join(lines)
238
-
239
-
240
- # Set up logging
241
- # Minimal prod logging: warnings+ to stderr, no file by default
242
- import os, logging
243
-
244
- LOG_LEVEL = os.environ.get("PSQ_LOG_LEVEL", "WARNING").upper()
245
- logging.basicConfig(
246
- level=getattr(logging, LOG_LEVEL, logging.WARNING),
247
- format="%(asctime)s %(levelname)s:%(message)s",
248
- handlers=[logging.StreamHandler()] # no file -> avoids huge logs on Spaces
249
- )
250
-
251
- # Quiet down common noisy libs (optional)
252
- for _name in ("gensim", "gradio", "hnswlib", "httpx", "uvicorn"):
253
- logging.getLogger(_name).setLevel(logging.ERROR)
254
-
255
- # Turn off Gradio analytics phone-home to avoid those background thread errors (optional)
256
- os.environ["GRADIO_ANALYTICS_ENABLED"] = "0"
257
-
258
-
259
- MASCOT_DIR = Path(__file__).parent / "mascotimages"
260
- MASCOT_FILE = MASCOT_DIR / "transparentsquirrel.png"
261
-
262
-
263
- def _load_mascot_image():
264
- """Load mascot image if available; return None when missing/unreadable."""
265
- if not MASCOT_FILE.exists():
266
- logging.warning("Mascot image missing: %s", MASCOT_FILE)
267
- return None
268
- try:
269
- return Image.open(MASCOT_FILE).convert("RGBA")
270
- except Exception as e:
271
- logging.warning("Failed to load mascot image (%s): %s", MASCOT_FILE, e)
272
- return None
273
-
274
- try:
275
- from gradio_client import utils as _gc_utils
276
-
277
- _orig_get_type = _gc_utils.get_type
278
- _orig_j2p = _gc_utils._json_schema_to_python_type
279
- _orig_pub = _gc_utils.json_schema_to_python_type
280
-
281
- def _get_type_safe(schema):
282
- # Sometimes schema is a bare True/False (JSON Schema boolean form)
283
- if not isinstance(schema, dict):
284
- return "any"
285
- return _orig_get_type(schema)
286
-
287
- def _j2p_safe(schema, defs=None):
288
- # Accept non-dict schemas (True/False/None) and treat as "any"
289
- if not isinstance(schema, dict):
290
- return "any"
291
- return _orig_j2p(schema, defs or schema.get("$defs"))
292
-
293
- def _pub_safe(schema):
294
- # Public wrapper used by Gradio; keep it resilient too
295
- if not isinstance(schema, dict):
296
- return "any"
297
- return _j2p_safe(schema, schema.get("$defs"))
298
-
299
- _gc_utils.get_type = _get_type_safe
300
- _gc_utils._json_schema_to_python_type = _j2p_safe
301
- _gc_utils.json_schema_to_python_type = _pub_safe
302
-
303
- except Exception as e:
304
- print("gradio_client hotfix not applied:", e)
305
- # -------------------------------------------------------------------------------
306
-
307
-
308
- allow_nsfw_tags = False
309
- def _is_production_runtime() -> bool:
310
- """Best-effort detection for deployed runtime (HF Spaces or explicit env)."""
311
- if os.environ.get("PSQ_PRODUCTION", "").strip().lower() in {"1", "true", "yes"}:
312
- return True
313
- if os.environ.get("SPACE_ID"):
314
- return True
315
- if os.environ.get("HF_SPACE_ID"):
316
- return True
317
- if os.environ.get("SYSTEM") == "spaces":
318
- return True
319
- return False
320
-
321
-
322
- verbose_retrieval_default = "0" if _is_production_runtime() else "1"
323
- verbose_retrieval = os.environ.get("PSQ_VERBOSE_RETRIEVAL", verbose_retrieval_default).strip().lower() in {"1", "true", "yes"}
324
- verbose_retrieval_all = False
325
- verbose_retrieval_limit = 20
326
- enable_probe_tags = os.environ.get("PSQ_ENABLE_PROBE", "1").strip() not in {"0", "false", "False"}
327
- display_top_groups_default = int(os.environ.get("PSQ_DISPLAY_TOP_GROUPS", "10"))
328
- display_top_tags_per_group_default = int(os.environ.get("PSQ_DISPLAY_TOP_TAGS_PER_GROUP", "5"))
329
- display_rank_top_k_default = int(os.environ.get("PSQ_DISPLAY_GROUP_RANK_TOP_K", "5"))
330
- display_max_rows_default = int(os.environ.get("PSQ_DISPLAY_MAX_ROWS", "14"))
331
- retrieval_global_k = int(os.environ.get("PSQ_RETRIEVAL_GLOBAL_K", "300"))
332
- retrieval_per_phrase_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_K", "10"))
333
- retrieval_per_phrase_final_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_FINAL_K", "1"))
334
- selection_mode = os.environ.get("PSQ_SELECTION_MODE", "chunked_map_union").strip()
335
- selection_chunk_size = int(os.environ.get("PSQ_SELECTION_CHUNK_SIZE", "60"))
336
- selection_per_phrase_k = int(os.environ.get("PSQ_SELECTION_PER_PHRASE_K", "2"))
337
- selection_candidate_cap = int(os.environ.get("PSQ_SELECTION_CANDIDATE_CAP", "0"))
338
- stage1_rewrite_timeout_s = float(os.environ.get("PSQ_TIMEOUT_REWRITE_S", "45"))
339
- stage1_struct_timeout_s = float(os.environ.get("PSQ_TIMEOUT_STRUCT_S", "45"))
340
- stage1_probe_timeout_s = float(os.environ.get("PSQ_TIMEOUT_PROBE_S", "45"))
341
- stage3_select_timeout_s = float(os.environ.get("PSQ_TIMEOUT_SELECT_S", "45"))
342
- timing_log_path = Path(os.environ.get("PSQ_TIMING_LOG_PATH", "data/runtime_metrics/ui_pipeline_timings.jsonl"))
343
 
344
- css = """
345
- .scrollable-content{
346
- max-height: 420px;
347
- overflow-y: scroll; /* always show scrollbar */
348
- overflow-x: hidden;
349
- padding-right: 8px;
350
- padding-bottom: 14px; /* <— add this */
351
- scrollbar-gutter: stable; /* prevent layout shift as it fills */
352
-
353
- /* Firefox */
354
- scrollbar-width: auto;
355
- scrollbar-color: rgba(180,180,180,.9) rgba(0,0,0,.15);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  }
357
 
358
- /* WebKit/Chromium (Chrome/Edge/Safari) */
359
- .scrollable-content::-webkit-scrollbar{ width: 10px; }
360
- .scrollable-content::-webkit-scrollbar-thumb{ background: rgba(180,180,180,.9); border-radius: 8px; }
361
- .scrollable-content::-webkit-scrollbar-track{ background: rgba(0,0,0,.15); }
 
 
 
362
 
363
- /* (Optional) make both scroll panes taller so they fill more of the column */
364
- .pane-left .scrollable-content,
365
- .pane-right .scrollable-content {
366
- max-height: 610px; /* was 420px; tweak to taste */
367
  }
368
 
369
- .lego-tags .gr-checkboxgroup {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  display: flex;
371
  flex-wrap: wrap;
372
  gap: 8px;
 
373
  }
374
 
375
- .lego-tags .gr-checkboxgroup label {
376
- margin: 0;
377
- padding: 0;
 
 
 
 
 
 
 
 
378
  }
379
 
380
- .lego-tags .gr-checkboxgroup input[type="checkbox"] {
381
- display: none;
 
 
 
382
  }
383
 
384
- .lego-tags .gr-checkboxgroup span {
385
- display: inline-block;
386
- padding: 7px 12px;
387
- border: 1px solid #8a8a8a;
388
- border-radius: 10px;
389
- background: #f4f4f4;
390
- color: #222;
391
- font-size: 0.95rem;
392
- line-height: 1.2;
393
- cursor: pointer;
394
- user-select: none;
395
- box-shadow: 0 1px 0 rgba(0,0,0,0.12), inset 0 1px 0 rgba(255,255,255,0.7);
396
- }
397
 
398
- .lego-tags .gr-checkboxgroup input[type="checkbox"]:checked + span {
399
- background: #ffd86a;
400
- border-color: #c49a00;
401
- box-shadow: 0 2px 0 #a98000, inset 0 1px 0 rgba(255,255,255,0.65);
402
- transform: translateY(1px);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  }
404
  """
405
 
406
 
407
  def rag_pipeline_ui(
408
- user_prompt: str,
409
- display_top_groups: float,
410
- display_top_tags_per_group: float,
411
- display_rank_top_k: float,
412
- ):
413
- logs = []
414
- def log(s): logs.append(s)
415
-
416
- try:
417
- stage_timings = {}
418
-
419
- def _record_timing(stage: str, dt_s: float):
420
- stage_timings[stage] = float(dt_s)
421
-
422
- def _emit_timing_summary(total_s: float):
423
- summary_order = [
424
- "preprocess",
425
- "rewrite",
426
- "structural",
427
- "probe",
428
- "retrieval",
429
- "selection",
430
- "implication_expansion",
431
- "prompt_composition",
432
- "group_display",
433
- ]
434
- lines = []
435
- for k in summary_order:
436
- if k in stage_timings:
437
- lines.append(f"{k}={stage_timings[k]:.2f}s")
438
- slowest = max(stage_timings.items(), key=lambda kv: kv[1])[0] if stage_timings else "n/a"
439
- log("Timing Summary: " + ", ".join(lines))
440
- log(f"Timing Slowest Stage: {slowest}")
441
- log(f"Timing Total: {total_s:.2f}s")
442
-
443
- def _append_timing_jsonl(total_s: float):
444
- try:
445
- timing_log_path.parent.mkdir(parents=True, exist_ok=True)
446
- rec = {
447
- "timestamp_utc": datetime.utcnow().isoformat(timespec="seconds") + "Z",
448
- "stages_s": stage_timings,
449
- "total_s": float(total_s),
450
- "config": {
451
- "timeout_rewrite_s": stage1_rewrite_timeout_s,
452
- "timeout_struct_s": stage1_struct_timeout_s,
453
- "timeout_probe_s": stage1_probe_timeout_s,
454
- "timeout_select_s": stage3_select_timeout_s,
455
- },
456
- }
457
- with timing_log_path.open("a", encoding="utf-8") as f:
458
- f.write(json.dumps(rec, ensure_ascii=True) + "\n")
459
- log(f"Timing Log: wrote {timing_log_path}")
460
- except Exception as e:
461
- log(f"Timing Log: failed ({type(e).__name__}: {e})")
462
-
463
- def _future_with_timeout(fut, timeout_s: float, stage_name: str, fallback):
464
- t0 = time.perf_counter()
465
- try:
466
- out = fut.result(timeout=max(1.0, float(timeout_s)))
467
- dt = time.perf_counter() - t0
468
- log(f"{stage_name}: {dt:.2f}s")
469
- stage_key = {
470
- "Rewrite": "rewrite",
471
- "Structural inference": "structural",
472
- "Probe inference": "probe",
473
- "Index selection": "selection",
474
- }.get(stage_name)
475
- if stage_key:
476
- _record_timing(stage_key, dt)
477
- return out
478
- except FutureTimeoutError:
479
- fut.cancel()
480
- log(f"{stage_name}: timed out after {timeout_s:.0f}s; using fallback")
481
- return fallback
482
- except Exception as e:
483
- log(f"{stage_name}: failed ({type(e).__name__}: {e}); using fallback")
484
- return fallback
485
-
486
- t_total0 = time.perf_counter()
487
- log("Start: received prompt")
488
- prompt_in = (user_prompt or "").strip()
489
- if not prompt_in:
490
- return _build_ui_payload(
491
- console_text="Error: empty prompt",
492
- legacy_prompt_text="",
493
- row_defs=[],
494
- selected_tags=[],
495
- )
496
-
497
- log("Input:")
498
- log(prompt_in)
499
- log("")
500
- log(
501
- "Runtime config: "
502
- f"retrieval_global_k={retrieval_global_k} "
503
- f"retrieval_per_phrase_k={retrieval_per_phrase_k} "
504
- f"retrieval_per_phrase_final_k={retrieval_per_phrase_final_k} "
505
- f"selection_mode={selection_mode} "
506
- f"selection_chunk_size={selection_chunk_size} "
507
- f"selection_per_phrase_k={selection_per_phrase_k}"
508
- )
509
- log("")
510
-
511
- t0 = time.perf_counter()
512
- user_tags = extract_user_provided_tags_upto_3_words(prompt_in)
513
- dt = time.perf_counter()-t0
514
- _record_timing("preprocess", dt)
515
- log(f"Preprocess (user tag extraction): {dt:.2f}s")
516
- log("Heuristically extracted user tags:")
517
- if user_tags:
518
- log(", ".join(user_tags))
519
- else:
520
- log("(none)")
521
- log("")
522
-
523
- log("Step 1: LLM rewrite + structural inference + probe (concurrent)")
524
- max_workers = 3 if enable_probe_tags else 2
525
- with ThreadPoolExecutor(max_workers=max_workers) as ex:
526
- fut_rewrite = ex.submit(llm_rewrite_prompt, prompt_in, log)
527
- fut_struct = ex.submit(llm_infer_structural_tags, prompt_in, log=log)
528
- fut_probe = ex.submit(llm_infer_probe_tags, prompt_in, log=log) if enable_probe_tags else None
529
-
530
- rewritten = _future_with_timeout(
531
- fut_rewrite, stage1_rewrite_timeout_s, "Rewrite", prompt_in
532
- )
533
- structural_tags = _future_with_timeout(
534
- fut_struct, stage1_struct_timeout_s, "Structural inference", []
535
- )
536
- probe_tags = (
537
- _future_with_timeout(fut_probe, stage1_probe_timeout_s, "Probe inference", [])
538
- if fut_probe else []
539
- )
540
-
541
- log("Rewrite:")
542
- log(rewritten if rewritten else "(empty)")
543
- log("")
544
-
545
- rewrite_for_retrieval = rewritten
546
- if user_tags:
547
- # keep them separate in logs, but allow them to help retrieval
548
- rewrite_for_retrieval = (rewrite_for_retrieval + ", " + ", ".join(user_tags)).strip(", ").strip()
549
-
550
-
551
- log("Step 2: Prompt Squirrel retrieval (hidden)")
552
- try:
553
- t0 = time.perf_counter()
554
- retrieval_context_tags = list(dict.fromkeys((structural_tags or []) + (probe_tags or [])))
555
- rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
556
- retrieval_result = psq_candidates_from_rewrite_phrases(
557
- rewrite_phrases=rewrite_phrases,
558
- allow_nsfw_tags=allow_nsfw_tags,
559
- context_tags=retrieval_context_tags,
560
- global_k=max(1, retrieval_global_k),
561
- per_phrase_k=max(1, retrieval_per_phrase_k),
562
- per_phrase_final_k=max(1, retrieval_per_phrase_final_k),
563
- verbose=verbose_retrieval,
564
- )
565
- if isinstance(retrieval_result, tuple):
566
- candidates, phrase_reports = retrieval_result
567
- else:
568
- candidates, phrase_reports = retrieval_result, []
569
- if selection_candidate_cap > 0 and len(candidates) > selection_candidate_cap:
570
- candidates = candidates[:selection_candidate_cap]
571
- log(f"Selection candidate cap applied: {selection_candidate_cap}")
572
- dt = time.perf_counter()-t0
573
- _record_timing("retrieval", dt)
574
- log(f"Retrieval: {dt:.2f}s")
575
- log(f"Retrieved {len(candidates)} candidate tags")
576
- if verbose_retrieval:
577
- log(f"Total unique candidates: {len(candidates)}")
578
- limit = None if verbose_retrieval_all else max(1, int(verbose_retrieval_limit))
579
- for report in phrase_reports:
580
- phrase = report.get("normalized") or report.get("phrase") or ""
581
- lookup = report.get("lookup") or ""
582
- tfidf_vocab = report.get("tfidf_vocab")
583
- log(f"Phrase: {phrase} (lookup={lookup}) tfidf_vocab={tfidf_vocab}")
584
- rows = report.get("candidates", [])
585
- shown = rows if limit is None else rows[:limit]
586
- for row in shown:
587
- tag = row.get("tag")
588
- alias_token = row.get("alias_token")
589
- score_fasttext = row.get("score_fasttext")
590
- score_context = row.get("score_context")
591
- score_combined = row.get("score_combined")
592
- count = row.get("count")
593
- alias_part = ""
594
- if alias_token and alias_token != tag:
595
- alias_part = f" [alias_token={alias_token}]"
596
- fasttext_str = (
597
- f"{score_fasttext:.3f}" if isinstance(score_fasttext, (int, float)) else score_fasttext
598
- )
599
- if score_context is None:
600
- context_str = "None"
601
- else:
602
- context_str = (
603
- f"{score_context:.3f}" if isinstance(score_context, (int, float)) else score_context
604
- )
605
- combined_str = (
606
- f"{score_combined:.3f}" if isinstance(score_combined, (int, float)) else score_combined
607
- )
608
- log(
609
- f" {tag}{alias_part} | fasttext={fasttext_str} context={context_str} "
610
- f"combined={combined_str} count={count}"
611
- )
612
- if limit is not None and len(rows) > limit:
613
- log(f" ... ({len(rows) - limit} more)")
614
- except Exception as e:
615
- log(f"Retrieval fallback: {type(e).__name__}: {e}")
616
- candidates = []
617
-
618
- log("Step 3: LLM index selection (uses rewrite + structural/probe context)")
619
- selection_query = _build_selection_query(
620
- prompt_in=prompt_in,
621
- rewritten=rewritten,
622
- structural_tags=structural_tags,
623
- probe_tags=probe_tags,
624
- )
625
- with ThreadPoolExecutor(max_workers=1) as ex:
626
- fut_sel = ex.submit(
627
- llm_select_indices,
628
- query_text=selection_query,
629
- candidates=candidates,
630
- max_pick=0,
631
- log=log,
632
- mode=selection_mode,
633
- chunk_size=max(1, selection_chunk_size),
634
- per_phrase_k=max(1, selection_per_phrase_k),
635
- )
636
- picked_indices = _future_with_timeout(
637
- fut_sel, stage3_select_timeout_s, "Index selection", []
638
- )
639
-
640
- selected_tags = [candidates[i].tag for i in picked_indices] if picked_indices else []
641
-
642
- if structural_tags:
643
- # Add structural tags that aren't already selected
644
- existing = {t for t in selected_tags}
645
- new_structural = [t for t in structural_tags if t not in existing]
646
- selected_tags.extend(new_structural)
647
- log(f" Added {len(new_structural)} structural tags: {', '.join(new_structural)}")
648
- else:
649
- log(" No structural tags inferred")
650
-
651
- if probe_tags:
652
- existing = {t for t in selected_tags}
653
- new_probe = [t for t in probe_tags if t not in existing]
654
- selected_tags.extend(new_probe)
655
- log(f" Added {len(new_probe)} probe tags: {', '.join(new_probe)}")
656
- elif enable_probe_tags:
657
- log(" No probe tags inferred")
658
-
659
- llm_selected_tags = list(dict.fromkeys(selected_tags))
660
-
661
- log("Step 3c: Expand via tag implications")
662
- t0 = time.perf_counter()
663
- tag_set = set(selected_tags)
664
- expanded, implied_only = expand_tags_via_implications(tag_set)
665
- dt = time.perf_counter()-t0
666
- _record_timing("implication_expansion", dt)
667
- log(f"Implication expansion: {dt:.2f}s")
668
- if implied_only:
669
- selected_tags.extend(sorted(implied_only))
670
- log(f" Added {len(implied_only)} implied tags: {', '.join(sorted(implied_only))}")
671
- else:
672
- log(" No additional implied tags")
673
-
674
- log("Step 4: Compose final prompt")
675
- t0 = time.perf_counter()
676
- final_prompt = compose_final_prompt(rewritten, selected_tags)
677
- dt = time.perf_counter()-t0
678
- _record_timing("prompt_composition", dt)
679
- log(f"Prompt composition: {dt:.2f}s")
680
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
681
  log("Step 5: Build ranked group/category display")
682
  t0 = time.perf_counter()
683
  seed_terms = []
684
  seed_terms.extend(user_tags)
685
  seed_terms.extend([p.strip() for p in (rewritten or "").split(",") if p.strip()])
686
- seed_terms.extend(structural_tags or [])
687
- seed_terms.extend(probe_tags or [])
688
- seed_terms.extend(selected_tags)
689
- seed_terms = list(dict.fromkeys(seed_terms))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
690
 
691
  toggle_rows = _build_toggle_rows(
692
  seed_terms=seed_terms,
693
- llm_selected_tags=llm_selected_tags,
 
 
694
  top_groups=max(1, int(display_top_groups)),
695
  top_tags_per_group=max(1, int(display_top_tags_per_group)),
696
  group_rank_top_k=max(1, int(display_rank_top_k)),
697
  )
698
- dt = time.perf_counter()-t0
699
- _record_timing("group_display", dt)
700
- log(f"Ranked group display: {dt:.2f}s ({len(toggle_rows)} rows)")
701
-
702
- total_dt = time.perf_counter()-t_total0
703
- _emit_timing_summary(total_dt)
704
- _append_timing_jsonl(total_dt)
705
- log("Done: final prompt ready")
706
- return _build_ui_payload(
707
- console_text="\n".join(logs),
708
- legacy_prompt_text=final_prompt,
709
- row_defs=toggle_rows,
710
- selected_tags=llm_selected_tags,
711
- )
712
-
713
- except Exception as e:
714
- log(f"Error: {type(e).__name__}: {e}")
715
- return _build_ui_payload(
716
- console_text="\n".join(logs),
717
- legacy_prompt_text="",
718
- row_defs=[],
719
- selected_tags=[],
720
- )
721
-
722
-
723
-
724
- with gr.Blocks(css=css) as app:
725
- with gr.Row():
726
- with gr.Column(scale=3, elem_classes=["prompt-col"]):
727
- image_tags = gr.Textbox(
728
- label="Enter Prompt",
729
- placeholder="e.g. fox, outside, detailed background, .",
730
- lines=1
731
- )
732
- with gr.Column(scale=1):
733
- _mascot_pil = _load_mascot_image()
734
- if _mascot_pil is not None:
735
- mascot_img = gr.Image(
736
- value=_mascot_pil,
737
- show_label=False,
738
- interactive=False,
739
- height=220,
740
- elem_id="mascot"
741
- )
742
- else:
743
- mascot_img = gr.Markdown("`(mascot image unavailable)`")
744
- submit_button = gr.Button("Run", variant="primary")
745
-
746
- gr.Markdown(
747
- """
748
- ### Prompt Squirrel RAG (pipeline version)
749
-
750
- Type a rough prompt. This tool rewrites it and aligns it to an e621-style tag vocabulary using Prompt Squirrel internally,
751
- then returns a cleaned, model-friendly prompt.
752
- """.strip()
753
- )
754
-
755
- console = gr.Textbox(
756
- label="Console",
757
- lines=10,
758
- interactive=False,
759
- placeholder="Progress logs will appear here."
760
- )
761
-
762
- suggested_prompt = gr.Textbox(
763
- label="Suggested Prompt (From Toggled Tags)",
764
- lines=3,
765
- interactive=False,
766
- show_copy_button=True,
767
- placeholder="Comma-separated tags selected in the rows below."
768
- )
769
-
770
- with gr.Accordion("Legacy Pipeline Prompt (for reference)", open=False):
771
- legacy_final_prompt = gr.Textbox(
772
- label="Legacy Final Prompt",
773
- lines=3,
774
- interactive=False,
775
- show_copy_button=True,
776
- )
777
-
 
 
 
 
 
 
 
 
778
  selected_tags_state = gr.State([])
779
  row_defs_state = gr.State([])
780
  row_values_state = gr.State([])
781
 
782
  gr.Markdown("### Toggle Tag Rows")
783
- row_headers: List[gr.Markdown] = []
784
- row_checkboxes: List[gr.CheckboxGroup] = []
785
- for _ in range(display_max_rows_default):
786
- row_headers.append(gr.Markdown(value="", visible=False))
787
- row_checkboxes.append(
788
- gr.CheckboxGroup(
789
- choices=[],
790
- value=[],
791
- visible=False,
792
- interactive=True,
793
- container=False,
794
- elem_classes=["lego-tags"],
795
- )
796
- )
797
-
798
- gr.Markdown(
799
- "Toggling a tag in any row toggles it everywhere else that tag appears."
800
- )
801
-
802
- with gr.Accordion("Display Settings", open=False):
803
- with gr.Row():
804
- display_top_groups = gr.Number(
805
- value=display_top_groups_default,
806
- precision=0,
807
- label="Rows (Top Groups/Categories)",
808
- minimum=1,
809
- )
810
- display_top_tags_per_group = gr.Number(
811
- value=display_top_tags_per_group_default,
812
- precision=0,
813
- label="Top Tags Shown Per Row",
814
- minimum=1,
815
- )
816
- display_rank_top_k = gr.Number(
817
- value=display_rank_top_k_default,
818
- precision=0,
819
- label="Top Tags Used for Row Ranking",
820
- minimum=1,
821
- )
822
-
823
- run_outputs = [
824
- console,
825
- legacy_final_prompt,
826
- suggested_prompt,
827
- selected_tags_state,
828
- row_defs_state,
829
- row_values_state,
830
- *row_headers,
831
- *row_checkboxes,
832
- ]
833
-
834
- submit_button.click(
835
- rag_pipeline_ui,
836
- inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
837
- outputs=run_outputs
838
  )
839
-
840
- image_tags.submit(
841
- rag_pipeline_ui,
842
- inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
843
- outputs=run_outputs
844
  )
845
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
846
  for idx, row_cb in enumerate(row_checkboxes):
847
  row_cb.change(
848
  fn=lambda changed_values, selected_state, row_defs, row_values, i=idx: _on_toggle_row(
849
  i,
850
  changed_values,
851
- selected_state,
852
- row_defs,
853
- row_values,
854
- display_max_rows_default,
855
  ),
856
  inputs=[row_cb, selected_tags_state, row_defs_state, row_values_state],
857
  outputs=[selected_tags_state, row_values_state, suggested_prompt, *row_checkboxes],
 
 
858
  )
859
-
860
- if __name__ == "__main__":
861
- app.queue().launch(allowed_paths=[str(MASCOT_DIR)])
 
1
+ import gradio as gr
2
+ import os
3
+ import logging
4
+ import time
5
+ import json
6
+ import csv
7
+ from datetime import datetime
8
+ from functools import lru_cache
9
+ from PIL import Image
10
+ from pathlib import Path
11
+ from typing import Any, Dict, List, Set, Tuple
12
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
13
+
14
+ from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
15
+ from psq_rag.llm.rewrite import llm_rewrite_prompt
16
+ from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
17
+ from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags, llm_infer_probe_tags
18
+ from psq_rag.retrieval.state import expand_tags_via_implications, get_tag_type_name, get_tag_implications
19
+ from psq_rag.ui.group_ranked_display import rank_groups_from_tfidf, _load_enabled_groups
20
+
21
+
22
+ def _split_prompt_commas(s: str) -> List[str]:
23
+ return [p.strip() for p in (s or "").split(",") if p.strip()]
24
+
25
def _norm_for_dedupe(tag: str) -> str:
    """Return the key used to detect duplicate prompt parts.

    The tag is lower-cased and then passed through ``_norm_tag_for_lookup``,
    the canonical form used for lookup/dedupe.
    """
    lowered = tag.lower()
    return _norm_tag_for_lookup(lowered)
28
+
29
def compose_final_prompt(rewritten_prompt: str, selected_tags: List[str]) -> str:
    """Join the rewritten prompt parts and the selected tags into one prompt.

    Parts are taken in order (comma-split rewritten phrases first, then
    ``selected_tags``). A part is dropped when an earlier part has the same
    ``_norm_for_dedupe`` key, so the first spelling seen is the one kept.

    Returns:
        The surviving parts joined with ``", "``.
    """
    candidates = [*_split_prompt_commas(rewritten_prompt), *selected_tags]

    # dicts preserve insertion order, so setdefault keeps the first spelling per key.
    first_by_key: Dict[str, str] = {}
    for part in candidates:
        first_by_key.setdefault(_norm_for_dedupe(part), part)

    return ", ".join(first_by_key.values())
43
+
44
+
45
  def _display_tag_text(tag: str) -> str:
46
  return tag.replace("_", " ")
47
 
48
 
49
+ def _normalize_selection_origin(origin: str) -> str:
50
+ o = (origin or "").strip().lower()
51
+ if o in {"rewrite", "selection", "probe", "structural", "user", "candidate"}:
52
+ return o
53
+ return "selection"
54
+
55
+
56
def _choice_label_with_source_meta(tag: str, *, origin: str, preselected: bool) -> str:
    """Build the checkbox label for ``tag`` with a trailing metadata marker.

    The marker has the form ``[[psq:<origin>:<0|1>]]``. It is stripped
    client-side and converted into data attributes for CSS-driven colors.
    """
    flag = "1" if preselected else "0"
    marker = f"[[psq:{_normalize_selection_origin(origin)}:{flag}]]"
    return f"{_display_tag_text(tag)} {marker}"
61
+
62
+
63
def _selection_source_rank(origin: str) -> int:
    """Return the row-ordering priority of a selection origin (lower sorts first).

    ``structural`` ranks 0 and ``probe`` ranks 1. Every other origin ranks 2,
    which keeps rewrite/user in the same priority band as general selection.
    """
    band_by_origin = {"structural": 0, "probe": 1}
    return band_by_origin.get(_normalize_selection_origin(origin), 2)
71
+
72
+
73
def _build_implied_parent_map(
    direct_tags_ordered: List[str],
    implied_tags: List[str],
) -> Dict[str, str]:
    """Map each implied tag to the direct tag it is attributed to.

    Starting from every direct tag (in the given order), the implication table
    returned by ``get_tag_implications()`` is walked transitively. An implied
    tag reachable from several direct tags is attributed to the first direct
    tag that reaches it. Keys and values are in ``_norm_tag_for_lookup`` form.

    Returns:
        ``{implied_tag: direct_tag}``; empty when either input is empty.
    """
    wanted = {_norm_tag_for_lookup(tag) for tag in (implied_tags or []) if tag}
    if not (wanted and direct_tags_ordered):
        return {}

    implications = get_tag_implications()
    owner_of: Dict[str, str] = {}
    for raw_direct in direct_tags_ordered:
        root = _norm_tag_for_lookup(raw_direct)
        if not root:
            continue
        # Per-root visited set: the same node may be reached again from another root.
        visited = {root}
        pending = [root]
        while pending:
            current = pending.pop()
            for raw_parent in implications.get(current, ()):
                node = _norm_tag_for_lookup(raw_parent)
                if node and node not in visited:
                    visited.add(node)
                    if node in wanted:
                        # First direct tag to reach the node keeps ownership.
                        owner_of.setdefault(node, root)
                    pending.append(node)
    return owner_of
99
+
100
+
101
def _order_selected_tags_for_row(
    *,
    row_selected_tags: List[str],
    selected_index: Dict[str, int],
    tag_selection_origins: Dict[str, str],
    implied_parent_map: Dict[str, str],
) -> List[str]:
    """Order one row's selected tags for display.

    Tags are normalized with ``_norm_tag_for_lookup``. Tags that are not keys of
    ``implied_parent_map`` come first, sorted by source rank, then by their
    position in ``selected_index``, then by name. Each of them is followed
    directly by the implied tags attributed to it (sorted by position, then
    name). Implied tags whose parent is not emitted in this row are appended at
    the end, ordered by their parent's rank and position. Duplicates are
    emitted once.
    """
    unranked = 10**9  # sort position for tags missing from selected_index

    def position(tag: str) -> int:
        return selected_index.get(tag, unranked)

    def band(tag: str) -> int:
        return _selection_source_rank(tag_selection_origins.get(tag, "selection"))

    normalized = [_norm_tag_for_lookup(raw) for raw in (row_selected_tags or []) if raw]
    implied_here = {tag for tag in normalized if tag in implied_parent_map}

    directs = sorted(
        (tag for tag in normalized if tag not in implied_here),
        key=lambda tag: (band(tag), position(tag), tag),
    )

    # Group the row's implied tags under the direct tag they are attributed to.
    followers: Dict[str, List[str]] = {}
    for tag in implied_here:
        owner = implied_parent_map.get(tag)
        if owner:
            followers.setdefault(owner, []).append(tag)
    for group in followers.values():
        group.sort(key=lambda tag: (position(tag), tag))

    ordered: List[str] = []
    emitted: Set[str] = set()

    def emit(tag: str) -> None:
        if tag not in emitted:
            emitted.add(tag)
            ordered.append(tag)

    for tag in directs:
        if tag in emitted:
            continue
        emit(tag)
        for follower in followers.get(tag, []):
            emit(follower)

    # Whatever is left is implied by a tag that is not a base tag of this row.
    leftovers = sorted(
        (tag for tag in normalized if tag not in emitted),
        key=lambda tag: (
            band(implied_parent_map.get(tag, "")),
            position(implied_parent_map.get(tag, "")),
            position(tag),
            tag,
        ),
    )
    for tag in leftovers:
        emit(tag)
    return ordered
155
 
156
 
157
+ def _escape_prompt_tag(tag: str) -> str:
158
+ return (
159
+ tag.replace("_", " ")
160
+ .replace("(", "\\(")
161
+ .replace(")", "\\)")
162
+ )
163
+
164
+
165
+ def _ordered_selected_for_prompt(selected: Set[str], row_defs: List[Dict[str, Any]]) -> List[str]:
166
+ out: List[str] = []
167
+ seen: Set[str] = set()
168
+ for row in row_defs:
169
+ for tag in row.get("tags", []):
170
+ if tag in selected and tag not in seen:
171
+ out.append(tag)
172
+ seen.add(tag)
173
+ # Fallback for any selected tags not present in current rows.
174
+ for tag in sorted(selected):
175
+ if tag not in seen:
176
+ out.append(tag)
177
+ seen.add(tag)
178
+ return out
179
+
180
+
181
def _compose_toggle_prompt_text(selected_tags: List[str], row_defs: List[Dict[str, Any]]) -> str:
    """Render the currently selected tags as the suggested prompt text.

    Falsy entries are ignored. The remaining tags are ordered by
    ``_ordered_selected_for_prompt``, escaped with ``_escape_prompt_tag`` and
    joined with ``", "``.
    """
    chosen = set(filter(None, selected_tags or []))
    parts = [
        _escape_prompt_tag(tag)
        for tag in _ordered_selected_for_prompt(chosen, row_defs or [])
    ]
    return ", ".join(parts)
185
+
186
+
187
def _is_artist_tag(tag: str) -> bool:
    """Tell whether ``tag`` should be treated as an artist tag.

    The tag is normalized first; an empty normalized tag is never an artist
    tag. Otherwise it counts as one when ``get_tag_type_name`` reports
    ``"artist"`` or the normalized tag starts with ``by_``.
    """
    key = _norm_tag_for_lookup(str(tag))
    if key:
        # Keep a resilient fallback for malformed/missing tag typing metadata.
        return get_tag_type_name(key) == "artist" or key.startswith("by_")
    return False
193
+
194
+
195
+ @lru_cache(maxsize=1)
196
+ def _load_excluded_recommendation_tags() -> Set[str]:
197
+ csv_path = Path("data/analysis/category_registry.csv")
198
+ out: Set[str] = set()
199
+ if not csv_path.exists():
200
+ return out
201
+ try:
202
+ with csv_path.open("r", encoding="utf-8", newline="") as f:
203
+ reader = csv.DictReader(f)
204
+ for row in reader:
205
+ tag = _norm_tag_for_lookup(str(row.get("tag") or ""))
206
+ if not tag:
207
+ continue
208
+ status = str(row.get("category_status") or "").strip().lower()
209
+ if status == "excluded":
210
+ out.add(tag)
211
+ except Exception:
212
+ return set()
213
+ return out
214
+
215
+
216
def _is_excluded_recommendation_tag(tag: str) -> bool:
    """Tell whether the normalized ``tag`` is in the excluded-tag registry.

    A tag that normalizes to an empty string is never considered excluded.
    """
    key = _norm_tag_for_lookup(str(tag))
    return bool(key) and key in _load_excluded_recommendation_tags()
221
+
222
+
223
def _filter_excluded_recommendation_tags(tags: List[str]) -> Tuple[List[str], List[str]]:
    """Split ``tags`` into kept tags and removed (registry-excluded) tags.

    Every entry is normalized with ``_norm_tag_for_lookup``. Falsy entries and
    entries that normalize to an empty string are ignored. The same rules apply
    whether or not the exclusion registry is empty, so the result does not
    change shape depending on whether the registry file is present.

    Returns:
        ``(kept, removed)``: ``kept`` holds the non-excluded normalized tags in
        first-seen order without duplicates; ``removed`` holds the excluded
        tags that were encountered, unique and sorted.
    """
    excluded = _load_excluded_recommendation_tags()

    keep: List[str] = []
    removed: Set[str] = set()
    seen: Set[str] = set()
    for raw in (tags or []):
        if not raw:
            # Skip None/"" up front so str(None) never becomes the tag "none".
            continue
        t = _norm_tag_for_lookup(str(raw))
        if not t:
            continue
        if t in excluded:
            removed.add(t)
            continue
        if t in seen:
            continue
        seen.add(t)
        keep.append(t)
    return keep, sorted(removed)
243
+
244
+
245
def _build_toggle_rows(
    *,
    seed_terms: List[str],
    selected_tags: List[str],
    tag_selection_origins: Dict[str, str],
    implied_parent_map: Dict[str, str],
    top_groups: int,
    top_tags_per_group: int,
    group_rank_top_k: int,
) -> List[Dict[str, Any]]:
    """Build the row definitions for the tag-toggle UI.

    The first row is always ``selected_other``: the active selected tags that
    belong to none of the displayed groups. It is followed by one row per group
    returned by ``rank_groups_from_tfidf``. Within a group row, selected tags
    come first (ordered by ``_order_selected_tags_for_row``), then the group's
    ranked tags. The row is cut to ``top_tags_per_group`` entries but is never
    cut below the number of selected tags in it.

    Artist tags and registry-excluded tags are left out of the selected set and
    out of the ranked suggestions. ``None`` is accepted for ``selected_tags``,
    ``tag_selection_origins`` and ``implied_parent_map`` and treated as empty.

    Returns:
        A list of dicts with keys ``name``, ``label``, ``tags`` and
        ``tag_meta`` (per tag: normalized ``origin`` and ``preselected``).
    """
    origins = tag_selection_origins or {}
    parents = implied_parent_map or {}
    per_group_cap = max(1, int(top_tags_per_group))

    ranked_rows = rank_groups_from_tfidf(
        seed_terms=seed_terms,
        top_groups=max(1, int(top_groups)),
        top_tags_per_group=per_group_cap,
        group_rank_top_k=max(1, int(group_rank_top_k)),
    )
    groups_map = _load_enabled_groups()

    # Normalized, de-duplicated selection in first-seen order.
    selected_active = list(
        dict.fromkeys(
            _norm_tag_for_lookup(t)
            for t in (selected_tags or [])
            if t and not _is_artist_tag(t) and not _is_excluded_recommendation_tag(t)
        )
    )
    # Also serves as the O(1) "is this tag selected?" lookup below.
    selected_index: Dict[str, int] = {t: i for i, t in enumerate(selected_active)}

    def _origin_of(tag: str) -> str:
        return _normalize_selection_origin(origins.get(tag, "selection"))

    def _ordered(tags: List[str]) -> List[str]:
        return _order_selected_tags_for_row(
            row_selected_tags=tags,
            selected_index=selected_index,
            tag_selection_origins=origins,
            implied_parent_map=parents,
        )

    displayed_group_tag_sets: Dict[str, Set[str]] = {
        r.group_name: {t for t in groups_map.get(r.group_name, []) if not _is_artist_tag(t)}
        for r in ranked_rows
    }
    tags_in_any_displayed_group: Set[str] = set()
    for tag_set in displayed_group_tag_sets.values():
        tags_in_any_displayed_group.update(tag_set)

    selected_other = _ordered(
        [t for t in selected_active if t not in tags_in_any_displayed_group]
    )
    row_defs: List[Dict[str, Any]] = [
        {
            "name": "selected_other",
            "label": "Selected (Other)",
            "tags": selected_other,
            "tag_meta": {
                t: {"origin": _origin_of(t), "preselected": True}
                for t in selected_other
            },
        }
    ]

    for row in ranked_rows:
        group_name = row.group_name
        group_tag_set = displayed_group_tag_sets.get(group_name, set())
        selected_in_group = _ordered([t for t in selected_active if t in group_tag_set])
        already_listed = set(selected_in_group)
        ranked_tags = [
            t
            for t, _ in row.tags
            if not _is_artist_tag(t) and not _is_excluded_recommendation_tag(t)
        ]
        merged = selected_in_group + [t for t in ranked_tags if t not in already_listed]
        # Never truncate away a selected tag, even if that exceeds the cap.
        merged = merged[: max(per_group_cap, len(selected_in_group))]
        row_defs.append(
            {
                "name": group_name,
                "label": f"{group_name} (E={row.expected_count:.2f})",
                "tags": merged,
                "tag_meta": {
                    t: {"origin": _origin_of(t), "preselected": t in selected_index}
                    for t in merged
                },
            }
        )

    return row_defs
339
+
340
+
341
def _build_display_audit_line(
    row_defs: List[Dict[str, Any]],
    *,
    active_selected_tags: List[str],
    direct_selected_tags: List[str],
    implied_selected_tags: List[str],
) -> str:
    """Render a one-line JSON audit of every tag shown in the toggle rows.

    For each displayed tag the audit records the labels of the rows it appears
    in and a sorted list of sources: which kind of row showed it
    (``selected_other_row`` / ``ranked_group_row``) and which of the selected
    sets (active / direct / implied) contain it. Artist tags are left out of
    those selected sets.

    Returns:
        ``"Display Tag Audit: "`` followed by the ASCII-only JSON payload.
    """

    def _norm_set(tags: List[str]) -> Set[str]:
        return {
            _norm_tag_for_lookup(t)
            for t in (tags or [])
            if t and not _is_artist_tag(t)
        }

    membership = (
        ("selected_active", _norm_set(active_selected_tags)),
        ("selected_direct", _norm_set(direct_selected_tags)),
        ("selected_implied", _norm_set(implied_selected_tags)),
    )

    info_by_tag: Dict[str, Dict[str, Any]] = {}
    for row in row_defs or []:
        row_name = row.get("name", "")
        row_label = row.get("label", row_name)
        row_source = "selected_other_row" if row_name == "selected_other" else "ranked_group_row"
        for tag in row.get("tags", []):
            rec = info_by_tag.setdefault(tag, {"rows": [], "sources": set()})
            rec["rows"].append(row_label)
            rec["sources"].add(row_source)
            rec["sources"].update(label for label, members in membership if tag in members)

    entries = [
        {"tag": tag, "rows": rec["rows"], "sources": sorted(rec["sources"])}
        for tag, rec in sorted(info_by_tag.items())
    ]
    payload = {"n_tags": len(info_by_tag), "tags": entries}
    return "Display Tag Audit: " + json.dumps(payload, ensure_ascii=True)
394
+
395
+
396
+ def _build_row_component_updates(
397
+ row_defs: List[Dict[str, Any]],
398
+ selected_tags: List[str],
399
+ max_rows: int,
400
+ ):
401
+ selected = {t for t in (selected_tags or []) if t}
402
+ row_values_state: List[List[str]] = []
403
+ header_updates = []
404
+ checkbox_updates = []
405
+
406
+ for idx in range(max_rows):
407
  if idx < len(row_defs):
408
  row = row_defs[idx]
409
  tags = list(dict.fromkeys(row.get("tags", [])))
 
411
  row_values_state.append(values)
412
  visible = bool(tags)
413
  header_updates.append(gr.update(value=f"**{row.get('label', '')}**", visible=visible))
414
+ tag_meta = row.get("tag_meta", {}) if isinstance(row.get("tag_meta", {}), dict) else {}
415
+ choices = []
416
+ for t in tags:
417
+ meta = tag_meta.get(t, {}) if isinstance(tag_meta.get(t, {}), dict) else {}
418
+ origin = _normalize_selection_origin(str(meta.get("origin", "selection")))
419
+ preselected = bool(meta.get("preselected", False))
420
+ choices.append((_choice_label_with_source_meta(t, origin=origin, preselected=preselected), t))
421
  checkbox_updates.append(
422
  gr.update(
423
  choices=choices,
424
  value=values,
425
  visible=visible,
426
+ )
427
+ )
428
+ else:
429
+ header_updates.append(gr.update(value="", visible=False))
430
+ checkbox_updates.append(gr.update(choices=[], value=[], visible=False))
431
+
432
+ prompt_text = _compose_toggle_prompt_text(list(selected), row_defs)
433
+ return prompt_text, row_values_state, header_updates, checkbox_updates
434
+
435
+
436
def _on_toggle_row(
    row_idx: int,
    changed_values: List[str],
    selected_tags_state: List[str],
    row_defs_state: List[Dict[str, Any]],
    row_values_state: List[List[str]],
    max_rows: int,
):
    """Apply a checkbox change in one row to the global selection.

    The toggled row's tags are removed from the selection and replaced by the
    values now checked in that row. Only rows containing a tag whose state
    changed (plus the toggled row itself) receive a value update; every other
    checkbox slot gets a no-op ``gr.update()``. ``row_values_state`` is accepted
    but not read; the per-row values are recomputed from scratch.

    Returns:
        ``[sorted selected tags, per-row selected values, prompt text,
        *one checkbox update per slot up to max_rows]``.
    """
    row_defs = row_defs_state or []
    selected = set(selected_tags_state or [])

    toggled_row = row_defs[row_idx] if 0 <= row_idx < len(row_defs) else {}
    row_tags = list(dict.fromkeys(toggled_row.get("tags", [])))
    row_tag_set = set(row_tags)
    row_tag_by_norm = {_norm_tag_for_lookup(t): t for t in row_tags}

    # Be tolerant to UI payload forms: canonical tag values, display labels, or normalized variants.
    new_set: Set[str] = set()
    for raw in (changed_values or []):
        if raw in row_tag_set:
            new_set.add(raw)
        else:
            mapped = row_tag_by_norm.get(_norm_tag_for_lookup(str(raw)))
            if mapped:
                new_set.add(mapped)

    prev_row_selected = selected & row_tag_set
    selected = (selected - row_tag_set) | new_set
    toggled_tags = prev_row_selected ^ new_set

    # Recompute row selections, but only push UI updates to rows touched by the toggled tags.
    new_row_values_state: List[List[str]] = []
    affected_rows: Set[int] = {row_idx}
    for idx, row in enumerate(row_defs):
        tags = list(dict.fromkeys(row.get("tags", [])))
        new_row_values_state.append([t for t in tags if t in selected])
        if not toggled_tags.isdisjoint(tags):
            affected_rows.add(idx)

    checkbox_updates = [
        gr.update(value=new_row_values_state[idx])
        if idx < len(row_defs) and idx in affected_rows
        else gr.update()
        for idx in range(max_rows)
    ]

    ordered_selected = sorted(selected)
    prompt_text = _compose_toggle_prompt_text(ordered_selected, row_defs)
    return [ordered_selected, new_row_values_state, prompt_text, *checkbox_updates]
486
+
487
+
488
def _build_ui_payload(
    *,
    console_text: str,
    legacy_prompt_text: str,
    row_defs: List[Dict[str, Any]],
    selected_tags: List[str],
):
    """Assemble the flat list of output values for a pipeline run.

    The order is: console text, legacy prompt, suggested prompt, sorted unique
    selected tags, row definitions, per-row selected values, then the header
    updates followed by the checkbox updates produced by
    ``_build_row_component_updates`` for ``display_max_rows_default`` rows.
    """
    prompt_text, row_values_state, header_updates, checkbox_updates = (
        _build_row_component_updates(
            row_defs=row_defs,
            selected_tags=selected_tags,
            max_rows=display_max_rows_default,
        )
    )
    payload = [
        console_text,
        legacy_prompt_text,
        prompt_text,
        sorted(set(selected_tags or [])),
        row_defs,
        row_values_state,
    ]
    payload.extend(header_updates)
    payload.extend(checkbox_updates)
    return payload
510
+
511
+
512
+ def _build_selection_query(
513
+ prompt_in: str,
514
+ rewritten: str,
515
+ structural_tags: List[str],
516
+ probe_tags: List[str],
517
+ ) -> str:
518
+ lines = [f"IMAGE DESCRIPTION: {prompt_in.strip()}"]
519
+ if rewritten and rewritten.strip():
520
+ lines.append(f"REWRITE PHRASES: {rewritten.strip()}")
521
+ hint_tags = []
522
+ if structural_tags:
523
+ hint_tags.extend(structural_tags)
524
+ if probe_tags:
525
+ hint_tags.extend(probe_tags)
526
+ if hint_tags:
527
+ # Keep hints as context only; selection still must choose by candidate indices.
528
+ lines.append(
529
+ "INFERRED TAG HINTS (context only): " + ", ".join(sorted(set(hint_tags)))
530
+ )
531
+ return "\n".join(lines)
532
+
533
+
534
# Logging setup.
# Minimal production logging: warnings and above go to a stream handler; no
# log file is created by default.
import logging
import os

LOG_LEVEL = os.environ.get("PSQ_LOG_LEVEL", "WARNING").upper()
logging.basicConfig(
    # Unknown level names fall back to WARNING.
    level=getattr(logging, LOG_LEVEL, logging.WARNING),
    format="%(asctime)s %(levelname)s:%(message)s",
    handlers=[logging.StreamHandler()],  # no file -> avoids huge logs on Spaces
)

# Quiet down common noisy libs (optional).
for _name in ("gensim", "gradio", "hnswlib", "httpx", "uvicorn"):
    logging.getLogger(_name).setLevel(logging.ERROR)

# Turn off Gradio analytics phone-home to avoid background thread errors (optional).
os.environ["GRADIO_ANALYTICS_ENABLED"] = "0"
551
+
552
+
553
+ MASCOT_DIR = Path(__file__).parent / "mascotimages"
554
+ MASCOT_FILE = MASCOT_DIR / "transparentsquirrel.png"
555
+
556
+
557
def _load_mascot_image():
    """Load the mascot image as RGBA, or return None when it is missing/unreadable.

    Both failure cases are logged as warnings rather than raised.
    """
    if MASCOT_FILE.exists():
        try:
            return Image.open(MASCOT_FILE).convert("RGBA")
        except Exception as e:
            logging.warning("Failed to load mascot image (%s): %s", MASCOT_FILE, e)
            return None
    logging.warning("Mascot image missing: %s", MASCOT_FILE)
    return None
567
+
568
# Hotfix: make gradio_client's JSON-schema helpers tolerate non-dict schemas.
# Any failure while applying it is reported with print() and otherwise ignored.
try:
    from gradio_client import utils as _gc_utils

    # Keep references to the originals before they are replaced below.
    _orig_get_type = _gc_utils.get_type
    _orig_j2p = _gc_utils._json_schema_to_python_type
    _orig_pub = _gc_utils.json_schema_to_python_type

    def _get_type_safe(schema):
        """Return "any" for a non-dict schema (e.g. bare True/False), else defer to the original."""
        return _orig_get_type(schema) if isinstance(schema, dict) else "any"

    def _j2p_safe(schema, defs=None):
        """Convert a schema to a type string, treating non-dict schemas (True/False/None) as "any"."""
        if isinstance(schema, dict):
            return _orig_j2p(schema, defs or schema.get("$defs"))
        return "any"

    def _pub_safe(schema):
        """Resilient replacement for the public wrapper; routes through ``_j2p_safe``."""
        if isinstance(schema, dict):
            return _j2p_safe(schema, schema.get("$defs"))
        return "any"

    _gc_utils.get_type = _get_type_safe
    _gc_utils._json_schema_to_python_type = _j2p_safe
    _gc_utils.json_schema_to_python_type = _pub_safe

except Exception as e:
    print("gradio_client hotfix not applied:", e)
599
+ # -------------------------------------------------------------------------------
600
+
601
+
602
+ allow_nsfw_tags = False
603
+ def _is_production_runtime() -> bool:
604
+ """Best-effort detection for deployed runtime (HF Spaces or explicit env)."""
605
+ if os.environ.get("PSQ_PRODUCTION", "").strip().lower() in {"1", "true", "yes"}:
606
+ return True
607
+ if os.environ.get("SPACE_ID"):
608
+ return True
609
+ if os.environ.get("HF_SPACE_ID"):
610
+ return True
611
+ if os.environ.get("SYSTEM") == "spaces":
612
+ return True
613
+ return False
614
+
615
+
616
# Runtime configuration, read once from PSQ_* environment variables at import
# time. Numeric values are parsed with int()/float(), so a malformed value
# raises ValueError on import. Boolean flags are parsed case-insensitively.

# Verbose retrieval logging defaults to off in a production runtime, on otherwise.
verbose_retrieval_default = "0" if _is_production_runtime() else "1"
verbose_retrieval = os.environ.get("PSQ_VERBOSE_RETRIEVAL", verbose_retrieval_default).strip().lower() in {"1", "true", "yes"}
verbose_retrieval_all = False
verbose_retrieval_limit = 20

# Probe inference is enabled unless explicitly disabled. Lower-casing first
# makes "FALSE"/"No" disable it too, mirroring the 1/true/yes vocabulary above.
enable_probe_tags = os.environ.get("PSQ_ENABLE_PROBE", "1").strip().lower() not in {"0", "false", "no"}

# Defaults for the tag-toggle display.
display_top_groups_default = int(os.environ.get("PSQ_DISPLAY_TOP_GROUPS", "10"))
display_top_tags_per_group_default = int(os.environ.get("PSQ_DISPLAY_TOP_TAGS_PER_GROUP", "5"))
display_rank_top_k_default = int(os.environ.get("PSQ_DISPLAY_GROUP_RANK_TOP_K", "5"))
display_max_rows_default = int(os.environ.get("PSQ_DISPLAY_MAX_ROWS", "14"))

# Retrieval and selection tuning.
retrieval_global_k = int(os.environ.get("PSQ_RETRIEVAL_GLOBAL_K", "300"))
retrieval_per_phrase_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_K", "10"))
retrieval_per_phrase_final_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_FINAL_K", "1"))
selection_mode = os.environ.get("PSQ_SELECTION_MODE", "chunked_map_union").strip()
selection_chunk_size = int(os.environ.get("PSQ_SELECTION_CHUNK_SIZE", "60"))
selection_per_phrase_k = int(os.environ.get("PSQ_SELECTION_PER_PHRASE_K", "2"))
selection_candidate_cap = int(os.environ.get("PSQ_SELECTION_CANDIDATE_CAP", "0"))

# Per-stage timeouts, in seconds.
stage1_rewrite_timeout_s = float(os.environ.get("PSQ_TIMEOUT_REWRITE_S", "45"))
stage1_struct_timeout_s = float(os.environ.get("PSQ_TIMEOUT_STRUCT_S", "45"))
stage1_probe_timeout_s = float(os.environ.get("PSQ_TIMEOUT_PROBE_S", "45"))
stage3_select_timeout_s = float(os.environ.get("PSQ_TIMEOUT_SELECT_S", "45"))

# JSONL file that per-run stage timings are appended to.
timing_log_path = Path(os.environ.get("PSQ_TIMING_LOG_PATH", "data/runtime_metrics/ui_pipeline_timings.jsonl"))
637
+
638
+ css = """
639
+ .scrollable-content{
640
+ max-height: 420px;
641
+ overflow-y: scroll; /* always show scrollbar */
642
+ overflow-x: hidden;
643
+ padding-right: 8px;
644
+ padding-bottom: 14px; /* <— add this */
645
+ scrollbar-gutter: stable; /* prevent layout shift as it fills */
646
+
647
+ /* Firefox */
648
+ scrollbar-width: auto;
649
+ scrollbar-color: rgba(180,180,180,.9) rgba(0,0,0,.15);
650
+ }
651
+
652
+ /* WebKit/Chromium (Chrome/Edge/Safari) */
653
+ .scrollable-content::-webkit-scrollbar{ width: 10px; }
654
+ .scrollable-content::-webkit-scrollbar-thumb{ background: rgba(180,180,180,.9); border-radius: 8px; }
655
+ .scrollable-content::-webkit-scrollbar-track{ background: rgba(0,0,0,.15); }
656
+
657
+ /* (Optional) make both scroll panes taller so they fill more of the column */
658
+ .pane-left .scrollable-content,
659
+ .pane-right .scrollable-content {
660
+ max-height: 610px; /* was 420px; tweak to taste */
661
+ }
662
+
663
+ .lego-tags .gr-checkboxgroup,
664
+ .lego-tags .wrap {
665
+ display: flex !important;
666
+ flex-wrap: wrap !important;
667
+ gap: 10px !important;
668
+ }
669
 
670
+ .lego-tags label {
671
+ margin: 0 !important;
672
+ padding: 0 !important;
673
+ position: relative !important;
674
+ }
675
 
676
+ /* Hide native checkbox visuals completely */
677
+ .lego-tags input[type="checkbox"] {
678
+ appearance: none !important;
679
+ -webkit-appearance: none !important;
680
+ -moz-appearance: none !important;
681
+ position: absolute !important;
682
+ width: 1px !important;
683
+ height: 1px !important;
684
+ opacity: 0 !important;
685
+ pointer-events: none !important;
686
+ display: none !important;
687
+ }
688
 
689
+ /* Brick button skin (works for both +span and ~span structures) */
690
+ .lego-tags input[type="checkbox"] + span,
691
+ .lego-tags input[type="checkbox"] ~ span {
692
+ --on-bg1: #ffd166;
693
+ --on-bg2: #f39c4a;
694
+ --on-border: #b86e21;
695
+ --on-text: #2e1706;
696
+ position: relative !important;
697
+ display: inline-flex !important;
698
+ align-items: center !important;
699
+ min-height: 40px !important;
700
+ padding: 10px 15px 9px !important;
701
+ border: 2px solid #7d8897 !important;
702
+ border-radius: 10px !important;
703
+ background: linear-gradient(180deg, #e8ecf2 0%, #c7ced8 100%) !important;
704
+ color: #2d3440 !important;
705
+ font-size: 0.97rem !important;
706
+ font-weight: 800 !important;
707
+ line-height: 1.15 !important;
708
+ cursor: pointer !important;
709
+ user-select: none !important;
710
+ letter-spacing: 0.01em !important;
711
+ box-shadow: 0 4px 0 rgba(0,0,0,0.22), inset 0 1px 0 rgba(255,255,255,0.72) !important;
712
+ transition: transform 0.08s ease, box-shadow 0.08s ease, filter 0.08s ease !important;
713
+ }
714
 
715
+ .lego-tags input[type="checkbox"] + span::before,
716
+ .lego-tags input[type="checkbox"] ~ span::before {
717
+ content: "" !important;
718
+ position: absolute !important;
719
+ top: 5px !important;
720
+ left: 8px !important;
721
+ width: 8px !important;
722
+ height: 8px !important;
723
+ border-radius: 50% !important;
724
+ background: rgba(255,255,255,0.58) !important;
725
+ box-shadow: 22px 0 0 rgba(255,255,255,0.58) !important;
726
+ pointer-events: none !important;
727
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728
 
729
+ /* Bright color cycle used only when selected */
730
+ .lego-tags label:nth-child(8n+1) span { --on-bg1: #ffd166; --on-bg2: #f39c4a; --on-border: #b86e21; --on-text: #2e1706; }
731
+ .lego-tags label:nth-child(8n+2) span { --on-bg1: #6ee7ff; --on-bg2: #1fb7ff; --on-border: #157cb3; --on-text: #07263c; }
732
+ .lego-tags label:nth-child(8n+3) span { --on-bg1: #9dff8f; --on-bg2: #45c96f; --on-border: #2a8b4b; --on-text: #0d2917; }
733
+ .lego-tags label:nth-child(8n+4) span { --on-bg1: #ff8fab; --on-bg2: #ff5c7a; --on-border: #b83956; --on-text: #3f0f1d; }
734
+ .lego-tags label:nth-child(8n+5) span { --on-bg1: #d0a8ff; --on-bg2: #a46cff; --on-border: #7147b3; --on-text: #25143f; }
735
+ .lego-tags label:nth-child(8n+6) span { --on-bg1: #ffe27a; --on-bg2: #f7bf39; --on-border: #ad7f1f; --on-text: #332407; }
736
+ .lego-tags label:nth-child(8n+7) span { --on-bg1: #8effd5; --on-bg2: #2ed6b5; --on-border: #1e947d; --on-text: #0d2a25; }
737
+ .lego-tags label:nth-child(8n+8) span { --on-bg1: #ffb47e; --on-bg2: #ff8753; --on-border: #b95b2d; --on-text: #391a0a; }
738
+
739
+ /* Source-driven selected colors (applies when tags are preselected by the pipeline). */
740
+ .lego-tags label[data-psq-preselected="1"][data-psq-origin="rewrite"] span {
741
+ --on-bg1: #77f0d7;
742
+ --on-bg2: #26b9a3;
743
+ --on-border: #187869;
744
+ --on-text: #062923;
745
+ }
746
+ .lego-tags label[data-psq-preselected="1"][data-psq-origin="selection"] span {
747
+ --on-bg1: #ffd98a;
748
+ --on-bg2: #f0a93c;
749
+ --on-border: #a66f1f;
750
+ --on-text: #382206;
751
+ }
752
+ .lego-tags label[data-psq-preselected="1"][data-psq-origin="probe"] span {
753
+ --on-bg1: #d8b4ff;
754
+ --on-bg2: #9a6cff;
755
+ --on-border: #6745b0;
756
+ --on-text: #24143b;
757
+ }
758
+ .lego-tags label[data-psq-preselected="1"][data-psq-origin="structural"] span {
759
+ --on-bg1: #a6f79a;
760
+ --on-bg2: #53c368;
761
+ --on-border: #2f8442;
762
+ --on-text: #102d17;
763
+ }
764
+ .lego-tags label[data-psq-preselected="1"][data-psq-origin="implied"] span {
765
+ --on-bg1: #d7dde8;
766
+ --on-bg2: #a8b3c4;
767
+ --on-border: #6f7e95;
768
+ --on-text: #1d2633;
769
  }
770
 
771
+ /* User-selected tags (not initially selected by the pipeline). */
772
+ .lego-tags label[data-psq-preselected="0"] span {
773
+ --on-bg1: #9ec5ff;
774
+ --on-bg2: #4f86ff;
775
+ --on-border: #2f5fbf;
776
+ --on-text: #0b1f42;
777
+ }
778
 
779
+ .lego-tags label:hover span {
780
+ filter: brightness(1.02) !important;
781
+ transform: translateY(1px) !important;
 
782
  }
783
 
784
+ /* ON state: brighter + visibly recessed */
785
+ .lego-tags input[type="checkbox"]:checked + span,
786
+ .lego-tags input[type="checkbox"]:checked ~ span,
787
+ .lego-tags label:has(input[type="checkbox"]:checked) span {
788
+ background: linear-gradient(180deg, var(--on-bg1) 0%, var(--on-bg2) 100%) !important;
789
+ color: var(--on-text) !important;
790
+ border-color: var(--on-border) !important;
791
+ filter: saturate(1.2) brightness(1.12) !important;
792
+ transform: translateY(-2px) !important;
793
+ box-shadow:
794
+ inset 0 3px 6px rgba(0,0,0,0.20),
795
+ inset 0 -1px 0 rgba(255,255,255,0.36),
796
+ 0 6px 0 rgba(0,0,0,0.32) !important;
797
+ }
798
+
799
+ .source-legend {
800
  display: flex;
801
  flex-wrap: wrap;
802
  gap: 8px;
803
+ margin: 4px 0 10px 0;
804
  }
805
 
806
+ .source-legend .chip {
807
+ display: inline-flex;
808
+ align-items: center;
809
+ gap: 8px;
810
+ border-radius: 999px;
811
+ border: 1px solid #8792a2;
812
+ padding: 5px 10px;
813
+ font-size: 0.85rem;
814
+ font-weight: 700;
815
+ color: #1f2430;
816
+ background: #f3f6fb;
817
  }
818
 
819
+ .source-legend .swatch {
820
+ width: 12px;
821
+ height: 12px;
822
+ border-radius: 50%;
823
+ border: 1px solid rgba(0,0,0,0.2);
824
  }
825
 
826
+ .source-legend .rewrite { background: #26b9a3; }
827
+ .source-legend .selection { background: #f0a93c; }
828
+ .source-legend .probe { background: #9a6cff; }
829
+ .source-legend .structural { background: #53c368; }
830
+ .source-legend .implied { background: #a8b3c4; }
831
+ .source-legend .user { background: #4f86ff; }
832
+ .source-legend .unselected { background: #c7ced8; }
833
+ """
 
 
 
 
 
834
 
835
+ client_js = """
836
+ () => {
837
+ const markerRe = /\\s*\\[\\[psq:([a-z_]+):(0|1)\\]\\]\\s*$/;
838
+ const applyTagMeta = () => {
839
+ const labels = document.querySelectorAll(".lego-tags label");
840
+ labels.forEach((label) => {
841
+ const span = label.querySelector("span");
842
+ if (!span) return;
843
+ const text = span.textContent || "";
844
+ const match = text.match(markerRe);
845
+ if (!match) return;
846
+ label.dataset.psqOrigin = match[1];
847
+ label.dataset.psqPreselected = match[2];
848
+ span.textContent = text.replace(markerRe, "");
849
+ });
850
+ };
851
+
852
+ applyTagMeta();
853
+ const observer = new MutationObserver(() => applyTagMeta());
854
+ observer.observe(document.body, { childList: true, subtree: true, characterData: true });
855
  }
856
  """
857
 
858
 
859
def rag_pipeline_ui(
    user_prompt: str,
    display_top_groups: float,
    display_top_tags_per_group: float,
    display_rank_top_k: float,
):
    """Run the prompt pipeline for one request and build the UI payload.

    Stages, in order: heuristic user-tag extraction, concurrent LLM rewrite /
    structural inference / optional probe inference, candidate retrieval, LLM
    index selection, implication expansion, final prompt composition and the
    ranked toggle-row display. Every stage appends progress lines to an
    in-memory log that becomes the console text of the returned payload.

    Args:
        user_prompt: Raw prompt text; surrounding whitespace is stripped.
        display_top_groups: Passed to ``_build_toggle_rows`` as ``top_groups``
            after ``int()`` conversion, with a floor of 1.
        display_top_tags_per_group: Passed as ``top_tags_per_group`` (same
            conversion and floor).
        display_rank_top_k: Passed as ``group_rank_top_k`` (same conversion
            and floor).

    Returns:
        The value produced by ``_build_ui_payload``. An empty prompt, or any
        unexpected exception, yields a payload with an empty legacy prompt,
        no rows and no selected tags.
    """
    logs = []

    def log(s):
        logs.append(s)

    try:
        stage_timings = {}

        def _record_timing(stage: str, dt_s: float):
            stage_timings[stage] = float(dt_s)

        def _emit_timing_summary(total_s: float):
            # Fixed order so summaries from different runs line up.
            summary_order = [
                "preprocess",
                "rewrite",
                "structural",
                "probe",
                "retrieval",
                "selection",
                "implication_expansion",
                "prompt_composition",
                "group_display",
            ]
            lines = [
                f"{k}={stage_timings[k]:.2f}s"
                for k in summary_order
                if k in stage_timings
            ]
            slowest = max(stage_timings.items(), key=lambda kv: kv[1])[0] if stage_timings else "n/a"
            log("Timing Summary: " + ", ".join(lines))
            log(f"Timing Slowest Stage: {slowest}")
            log(f"Timing Total: {total_s:.2f}s")

        def _append_timing_jsonl(total_s: float):
            # Best effort: a failure to persist timings must never fail the request.
            try:
                timing_log_path.parent.mkdir(parents=True, exist_ok=True)
                rec = {
                    # Same "YYYY-MM-DDTHH:MM:SSZ" text the deprecated
                    # datetime.utcnow().isoformat(timespec="seconds") + "Z" produced.
                    "timestamp_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
                    "stages_s": stage_timings,
                    "total_s": float(total_s),
                    "config": {
                        "timeout_rewrite_s": stage1_rewrite_timeout_s,
                        "timeout_struct_s": stage1_struct_timeout_s,
                        "timeout_probe_s": stage1_probe_timeout_s,
                        "timeout_select_s": stage3_select_timeout_s,
                    },
                }
                with timing_log_path.open("a", encoding="utf-8") as f:
                    f.write(json.dumps(rec, ensure_ascii=True) + "\n")
                log(f"Timing Log: wrote {timing_log_path}")
            except Exception as e:
                log(f"Timing Log: failed ({type(e).__name__}: {e})")

        def _future_with_timeout(fut, timeout_s: float, stage_name: str, fallback):
            # Wait for one stage; on timeout or failure log it and return `fallback`.
            t0 = time.perf_counter()
            try:
                out = fut.result(timeout=max(1.0, float(timeout_s)))
                dt = time.perf_counter() - t0
                log(f"{stage_name}: {dt:.2f}s")
                stage_key = {
                    "Rewrite": "rewrite",
                    "Structural inference": "structural",
                    "Probe inference": "probe",
                    "Index selection": "selection",
                }.get(stage_name)
                if stage_key:
                    _record_timing(stage_key, dt)
                return out
            except FutureTimeoutError:
                # cancel() only helps if the task has not started yet; a running
                # worker cannot be interrupted and is simply abandoned.
                fut.cancel()
                log(f"{stage_name}: timed out after {timeout_s:.0f}s; using fallback")
                return fallback
            except Exception as e:
                log(f"{stage_name}: failed ({type(e).__name__}: {e}); using fallback")
                return fallback

        t_total0 = time.perf_counter()
        log("Start: received prompt")
        prompt_in = (user_prompt or "").strip()
        if not prompt_in:
            return _build_ui_payload(
                console_text="Error: empty prompt",
                legacy_prompt_text="",
                row_defs=[],
                selected_tags=[],
            )

        log("Input:")
        log(prompt_in)
        log("")
        log(
            "Runtime config: "
            f"retrieval_global_k={retrieval_global_k} "
            f"retrieval_per_phrase_k={retrieval_per_phrase_k} "
            f"retrieval_per_phrase_final_k={retrieval_per_phrase_final_k} "
            f"selection_mode={selection_mode} "
            f"selection_chunk_size={selection_chunk_size} "
            f"selection_per_phrase_k={selection_per_phrase_k}"
        )
        log("")

        t0 = time.perf_counter()
        user_tags = extract_user_provided_tags_upto_3_words(prompt_in)
        dt = time.perf_counter() - t0
        _record_timing("preprocess", dt)
        log(f"Preprocess (user tag extraction): {dt:.2f}s")
        log("Heuristically extracted user tags:")
        if user_tags:
            log(", ".join(user_tags))
        else:
            log("(none)")
        log("")

        log("Step 1: LLM rewrite + structural inference + probe (concurrent)")
        max_workers = 3 if enable_probe_tags else 2
        # Deliberately not a `with` block: leaving a `with ThreadPoolExecutor`
        # joins every worker, so a stage that already "timed out" would still
        # hold the request until its call returned.
        ex = ThreadPoolExecutor(max_workers=max_workers)
        try:
            fut_rewrite = ex.submit(llm_rewrite_prompt, prompt_in, log)
            fut_struct = ex.submit(llm_infer_structural_tags, prompt_in, log=log)
            fut_probe = ex.submit(llm_infer_probe_tags, prompt_in, log=log) if enable_probe_tags else None

            rewritten = _future_with_timeout(
                fut_rewrite, stage1_rewrite_timeout_s, "Rewrite", prompt_in
            )
            structural_tags = _future_with_timeout(
                fut_struct, stage1_struct_timeout_s, "Structural inference", []
            )
            probe_tags = (
                _future_with_timeout(fut_probe, stage1_probe_timeout_s, "Probe inference", [])
                if fut_probe else []
            )
        finally:
            # Abandoned workers may still append late lines to `logs`; harmless.
            ex.shutdown(wait=False)

        log("Rewrite:")
        log(rewritten if rewritten else "(empty)")
        log("")

        # The rewrite may come back empty/None; normalise before concatenating.
        rewrite_for_retrieval = rewritten or ""
        if user_tags:
            # keep them separate in logs, but allow them to help retrieval
            rewrite_for_retrieval = (rewrite_for_retrieval + ", " + ", ".join(user_tags)).strip(", ").strip()

        log("Step 2: Prompt Squirrel retrieval (hidden)")
        try:
            t0 = time.perf_counter()
            retrieval_context_tags = list(dict.fromkeys((structural_tags or []) + (probe_tags or [])))
            rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
            retrieval_result = psq_candidates_from_rewrite_phrases(
                rewrite_phrases=rewrite_phrases,
                allow_nsfw_tags=allow_nsfw_tags,
                context_tags=retrieval_context_tags,
                global_k=max(1, retrieval_global_k),
                per_phrase_k=max(1, retrieval_per_phrase_k),
                per_phrase_final_k=max(1, retrieval_per_phrase_final_k),
                verbose=verbose_retrieval,
            )
            # The retriever returns either the candidates alone or a
            # (candidates, per-phrase reports) pair.
            if isinstance(retrieval_result, tuple):
                candidates, phrase_reports = retrieval_result
            else:
                candidates, phrase_reports = retrieval_result, []
            if selection_candidate_cap > 0 and len(candidates) > selection_candidate_cap:
                candidates = candidates[:selection_candidate_cap]
                log(f"Selection candidate cap applied: {selection_candidate_cap}")
            dt = time.perf_counter() - t0
            _record_timing("retrieval", dt)
            log(f"Retrieval: {dt:.2f}s")
            log(f"Retrieved {len(candidates)} candidate tags")
            if verbose_retrieval:
                log(f"Total unique candidates: {len(candidates)}")
                limit = None if verbose_retrieval_all else max(1, int(verbose_retrieval_limit))
                for report in phrase_reports:
                    phrase = report.get("normalized") or report.get("phrase") or ""
                    lookup = report.get("lookup") or ""
                    tfidf_vocab = report.get("tfidf_vocab")
                    log(f"Phrase: {phrase} (lookup={lookup}) tfidf_vocab={tfidf_vocab}")
                    rows = report.get("candidates", [])
                    shown = rows if limit is None else rows[:limit]
                    for row in shown:
                        tag = row.get("tag")
                        alias_token = row.get("alias_token")
                        score_fasttext = row.get("score_fasttext")
                        score_context = row.get("score_context")
                        score_combined = row.get("score_combined")
                        count = row.get("count")
                        alias_part = ""
                        if alias_token and alias_token != tag:
                            alias_part = f" [alias_token={alias_token}]"
                        fasttext_str = (
                            f"{score_fasttext:.3f}" if isinstance(score_fasttext, (int, float)) else score_fasttext
                        )
                        if score_context is None:
                            context_str = "None"
                        else:
                            context_str = (
                                f"{score_context:.3f}" if isinstance(score_context, (int, float)) else score_context
                            )
                        combined_str = (
                            f"{score_combined:.3f}" if isinstance(score_combined, (int, float)) else score_combined
                        )
                        log(
                            f" {tag}{alias_part} | fasttext={fasttext_str} context={context_str} "
                            f"combined={combined_str} count={count}"
                        )
                    if limit is not None and len(rows) > limit:
                        log(f" ... ({len(rows) - limit} more)")
        except Exception as e:
            # Retrieval is best effort: continue with no candidates.
            log(f"Retrieval fallback: {type(e).__name__}: {e}")
            candidates = []

        log("Step 3: LLM index selection (uses rewrite + structural/probe context)")
        selection_query = _build_selection_query(
            prompt_in=prompt_in,
            rewritten=rewritten,
            structural_tags=structural_tags,
            probe_tags=probe_tags,
        )
        # Same reasoning as in step 1: avoid the blocking join of a `with` block.
        ex = ThreadPoolExecutor(max_workers=1)
        try:
            fut_sel = ex.submit(
                llm_select_indices,
                query_text=selection_query,
                candidates=candidates,
                max_pick=0,
                log=log,
                mode=selection_mode,
                chunk_size=max(1, selection_chunk_size),
                per_phrase_k=max(1, selection_per_phrase_k),
            )
            picked_indices = _future_with_timeout(
                fut_sel, stage3_select_timeout_s, "Index selection", []
            )
        finally:
            ex.shutdown(wait=False)

        # Indices come from an LLM: drop anything outside the candidate list
        # instead of crashing (too large) or silently wrapping around (negative).
        picked_indices = list(picked_indices or [])
        valid_indices = [i for i in picked_indices if 0 <= i < len(candidates)]
        if len(valid_indices) != len(picked_indices):
            log(f" Ignored {len(picked_indices) - len(valid_indices)} out-of-range selection indices")
        selection_selected_tags = [candidates[i].tag for i in valid_indices]
        selected_tags = list(selection_selected_tags)

        if structural_tags:
            # Add structural tags that aren't already selected
            existing = set(selected_tags)
            new_structural = [t for t in structural_tags if t not in existing]
            selected_tags.extend(new_structural)
            log(f" Added {len(new_structural)} structural tags: {', '.join(new_structural)}")
        else:
            log(" No structural tags inferred")

        if probe_tags:
            existing = set(selected_tags)
            new_probe = [t for t in probe_tags if t not in existing]
            selected_tags.extend(new_probe)
            log(f" Added {len(new_probe)} probe tags: {', '.join(new_probe)}")
        elif enable_probe_tags:
            log(" No probe tags inferred")

        selected_tags, removed_excluded_direct = _filter_excluded_recommendation_tags(selected_tags)
        if removed_excluded_direct:
            log(f" Removed {len(removed_excluded_direct)} excluded tags: {', '.join(removed_excluded_direct)}")

        # Snapshot of the tags chosen before implication expansion.
        direct_selected_tags = list(dict.fromkeys(selected_tags))

        log("Step 3c: Expand via tag implications")
        t0 = time.perf_counter()
        tag_set = set(selected_tags)
        expanded, implied_only = expand_tags_via_implications(tag_set)
        dt = time.perf_counter() - t0
        _record_timing("implication_expansion", dt)
        log(f"Implication expansion: {dt:.2f}s")
        implied_selected_tags = sorted(implied_only) if implied_only else []
        if implied_only:
            selected_tags.extend(sorted(implied_only))
            log(f" Added {len(implied_only)} implied tags: {', '.join(sorted(implied_only))}")
        else:
            log(" No additional implied tags")

        selected_tags, removed_excluded_implied = _filter_excluded_recommendation_tags(selected_tags)
        implied_selected_tags = [
            t for t in implied_selected_tags if not _is_excluded_recommendation_tag(t)
        ]
        if removed_excluded_implied:
            log(
                f" Removed {len(removed_excluded_implied)} excluded tags after implications: "
                f"{', '.join(removed_excluded_implied)}"
            )

        log("Step 4: Compose final prompt")
        t0 = time.perf_counter()
        final_prompt = compose_final_prompt(rewritten, selected_tags)
        dt = time.perf_counter() - t0
        _record_timing("prompt_composition", dt)
        log(f"Prompt composition: {dt:.2f}s")

        log("Step 5: Build ranked group/category display")
        t0 = time.perf_counter()
        seed_terms = []
        seed_terms.extend(user_tags)
        seed_terms.extend([p.strip() for p in (rewritten or "").split(",") if p.strip()])
        seed_terms.extend(structural_tags or [])
        seed_terms.extend(probe_tags or [])
        seed_terms.extend(selected_tags)
        seed_terms = list(dict.fromkeys(seed_terms))

        active_selected_tags = list(dict.fromkeys(selected_tags))
        structural_set = {_norm_tag_for_lookup(t) for t in (structural_tags or []) if t}
        probe_set = {_norm_tag_for_lookup(t) for t in (probe_tags or []) if t}
        implied_set = {_norm_tag_for_lookup(t) for t in (implied_selected_tags or []) if t}
        rewrite_set = {
            _norm_tag_for_lookup(t)
            for t in (list(user_tags or []) + [p.strip() for p in (rewritten or "").split(",") if p.strip()])
            if t
        }
        selection_set = {_norm_tag_for_lookup(t) for t in (selection_selected_tags or []) if t}
        # First matching source wins:
        # structural > probe > rewrite > selection > implied.
        tag_selection_origins: Dict[str, str] = {}
        for tag in active_selected_tags:
            tag_norm = _norm_tag_for_lookup(tag)
            if tag_norm in structural_set:
                origin = "structural"
            elif tag_norm in probe_set:
                origin = "probe"
            elif tag_norm in rewrite_set:
                origin = "rewrite"
            elif tag_norm in selection_set:
                origin = "selection"
            elif tag_norm in implied_set:
                origin = "implied"
            else:
                # Unknown/fallback tags use selection color.
                origin = "selection"
            tag_selection_origins[tag] = origin
            # Also index by the normalised spelling so either form resolves.
            if tag_norm and tag_norm != tag:
                tag_selection_origins[tag_norm] = origin

        direct_tags_for_implied = list(
            dict.fromkeys(_norm_tag_for_lookup(t) for t in (direct_selected_tags or []) if t)
        )
        direct_tags_for_implied_idx = {t: i for i, t in enumerate(direct_tags_for_implied)}
        # Order by source rank, keeping first-seen order within a source.
        direct_tags_for_implied.sort(
            key=lambda t: (
                _selection_source_rank(tag_selection_origins.get(t, "selection")),
                direct_tags_for_implied_idx.get(t, 10**9),
            )
        )
        implied_parent_map = _build_implied_parent_map(
            direct_tags_ordered=direct_tags_for_implied,
            implied_tags=implied_selected_tags,
        )

        toggle_rows = _build_toggle_rows(
            seed_terms=seed_terms,
            selected_tags=active_selected_tags,
            tag_selection_origins=tag_selection_origins,
            implied_parent_map=implied_parent_map,
            top_groups=max(1, int(display_top_groups)),
            top_tags_per_group=max(1, int(display_top_tags_per_group)),
            group_rank_top_k=max(1, int(display_rank_top_k)),
        )
        dt = time.perf_counter() - t0
        _record_timing("group_display", dt)
        log(f"Ranked group display: {dt:.2f}s ({len(toggle_rows)} rows)")
        log(
            _build_display_audit_line(
                toggle_rows,
                active_selected_tags=active_selected_tags,
                direct_selected_tags=direct_selected_tags,
                implied_selected_tags=implied_selected_tags,
            )
        )

        total_dt = time.perf_counter() - t_total0
        _emit_timing_summary(total_dt)
        _append_timing_jsonl(total_dt)
        log("Done: final prompt ready")
        return _build_ui_payload(
            console_text="\n".join(logs),
            legacy_prompt_text=final_prompt,
            row_defs=toggle_rows,
            selected_tags=active_selected_tags,
        )

    except Exception as e:
        # Top-level boundary for the UI callback: report the failure in the
        # console instead of surfacing a traceback to the user.
        log(f"Error: {type(e).__name__}: {e}")
        return _build_ui_payload(
            console_text="\n".join(logs),
            legacy_prompt_text="",
            row_defs=[],
            selected_tags=[],
        )
1244
+
1245
+
1246
+
1247
+ with gr.Blocks(css=css, js=client_js) as app:
1248
+ with gr.Row():
1249
+ with gr.Column(scale=3, elem_classes=["prompt-col"]):
1250
+ image_tags = gr.Textbox(
1251
+ label="Enter Prompt",
1252
+ placeholder="e.g. fox, outside, detailed background, .",
1253
+ lines=1
1254
+ )
1255
+ with gr.Column(scale=1):
1256
+ _mascot_pil = _load_mascot_image()
1257
+ if _mascot_pil is not None:
1258
+ mascot_img = gr.Image(
1259
+ value=_mascot_pil,
1260
+ show_label=False,
1261
+ interactive=False,
1262
+ height=220,
1263
+ elem_id="mascot"
1264
+ )
1265
+ else:
1266
+ mascot_img = gr.Markdown("`(mascot image unavailable)`")
1267
+ submit_button = gr.Button("Run", variant="primary")
1268
+
1269
+ gr.Markdown(
1270
+ """
1271
+ ### Prompt Squirrel RAG (pipeline version)
1272
+
1273
+ Type a rough prompt. This tool rewrites it and aligns it to an e621-style tag vocabulary using Prompt Squirrel internally,
1274
+ then returns a cleaned, model-friendly prompt.
1275
+ """.strip()
1276
+ )
1277
+
1278
+ console = gr.Textbox(
1279
+ label="Console",
1280
+ lines=10,
1281
+ interactive=False,
1282
+ placeholder="Progress logs will appear here."
1283
+ )
1284
+
1285
+ suggested_prompt = gr.Textbox(
1286
+ label="Suggested Prompt (From Toggled Tags)",
1287
+ lines=3,
1288
+ interactive=False,
1289
+ show_copy_button=True,
1290
+ placeholder="Comma-separated tags selected in the rows below."
1291
+ )
1292
+
1293
+ with gr.Accordion("Legacy Pipeline Prompt (for reference)", open=False):
1294
+ legacy_final_prompt = gr.Textbox(
1295
+ label="Legacy Final Prompt",
1296
+ lines=3,
1297
+ interactive=False,
1298
+ show_copy_button=True,
1299
+ )
1300
+
1301
  selected_tags_state = gr.State([])
1302
  row_defs_state = gr.State([])
1303
  row_values_state = gr.State([])
1304
 
1305
  gr.Markdown("### Toggle Tag Rows")
1306
+ gr.HTML(
1307
+ """
1308
+ <div class="source-legend">
1309
+ <span class="chip"><span class="swatch rewrite"></span>Rewrite phrase</span>
1310
+ <span class="chip"><span class="swatch selection"></span>General selection</span>
1311
+ <span class="chip"><span class="swatch probe"></span>Probe query</span>
1312
+ <span class="chip"><span class="swatch structural"></span>Structural query</span>
1313
+ <span class="chip"><span class="swatch implied"></span>Implied</span>
1314
+ <span class="chip"><span class="swatch user"></span>User-toggled</span>
1315
+ <span class="chip"><span class="swatch unselected"></span>Unselected</span>
1316
+ </div>
1317
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1318
  )
1319
+ gr.Markdown(
1320
+ "Rows are ranked by expected tag count (E). Within each row: structural -> probe -> selected, "
1321
+ "implied tags follow their triggering selected tag when possible, then unselected tags in confidence order."
 
 
1322
  )
1323
+ row_headers: List[gr.Markdown] = []
1324
+ row_checkboxes: List[gr.CheckboxGroup] = []
1325
+ for _ in range(display_max_rows_default):
1326
+ row_headers.append(gr.Markdown(value="", visible=False))
1327
+ row_checkboxes.append(
1328
+ gr.CheckboxGroup(
1329
+ choices=[],
1330
+ value=[],
1331
+ visible=False,
1332
+ interactive=True,
1333
+ container=False,
1334
+ elem_classes=["lego-tags"],
1335
+ )
1336
+ )
1337
+
1338
+ gr.Markdown(
1339
+ "Toggling a tag in any row toggles it everywhere else that tag appears."
1340
+ )
1341
+
1342
+ with gr.Accordion("Display Settings", open=False):
1343
+ with gr.Row():
1344
+ display_top_groups = gr.Number(
1345
+ value=display_top_groups_default,
1346
+ precision=0,
1347
+ label="Rows (Top Groups/Categories)",
1348
+ minimum=1,
1349
+ )
1350
+ display_top_tags_per_group = gr.Number(
1351
+ value=display_top_tags_per_group_default,
1352
+ precision=0,
1353
+ label="Top Tags Shown Per Row",
1354
+ minimum=1,
1355
+ )
1356
+ display_rank_top_k = gr.Number(
1357
+ value=display_rank_top_k_default,
1358
+ precision=0,
1359
+ label="Top Tags Used for Row Ranking",
1360
+ minimum=1,
1361
+ )
1362
+
1363
+ run_outputs = [
1364
+ console,
1365
+ legacy_final_prompt,
1366
+ suggested_prompt,
1367
+ selected_tags_state,
1368
+ row_defs_state,
1369
+ row_values_state,
1370
+ *row_headers,
1371
+ *row_checkboxes,
1372
+ ]
1373
+
1374
+ submit_button.click(
1375
+ rag_pipeline_ui,
1376
+ inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
1377
+ outputs=run_outputs
1378
+ )
1379
+
1380
+ image_tags.submit(
1381
+ rag_pipeline_ui,
1382
+ inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
1383
+ outputs=run_outputs
1384
+ )
1385
+
1386
  for idx, row_cb in enumerate(row_checkboxes):
1387
  row_cb.change(
1388
  fn=lambda changed_values, selected_state, row_defs, row_values, i=idx: _on_toggle_row(
1389
  i,
1390
  changed_values,
1391
+ selected_state,
1392
+ row_defs,
1393
+ row_values,
1394
+ display_max_rows_default,
1395
  ),
1396
  inputs=[row_cb, selected_tags_state, row_defs_state, row_values_state],
1397
  outputs=[selected_tags_state, row_values_state, suggested_prompt, *row_checkboxes],
1398
+ queue=False,
1399
+ show_progress="hidden",
1400
  )
1401
+
1402
+ if __name__ == "__main__":
1403
+ app.queue().launch(allowed_paths=[str(MASCOT_DIR)])
data/analysis/category_registry.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/analysis/hybrid_category_assignment_preview.json ADDED
@@ -0,0 +1,2753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "tfidf_weight": 0.6,
4
+ "wiki_weight": 0.4,
5
+ "tfidf_temp": 0.08,
6
+ "single_top1_min": 0.55,
7
+ "single_margin_min": 0.18,
8
+ "single_top2_max": 0.35,
9
+ "multi_top1_min": 0.42,
10
+ "multi_top2_min": 0.3,
11
+ "multi_pair_min": 0.78,
12
+ "sample_size": 20,
13
+ "seed": 42
14
+ },
15
+ "inputs": {
16
+ "registry_csv": "E:\\image\\backup\\Prompt_Squirrel_RAG\\data\\analysis\\category_registry.csv",
17
+ "wiki_pages_csv": "E:\\image\\backup\\Prompt_Squirrel_RAG\\wiki_pages-2023-08-08.csv",
18
+ "uncategorized_tags": 6261,
19
+ "active_categories_for_centroids": 19,
20
+ "centroid_seed_sizes": {
21
+ "anatomy_features": 407,
22
+ "background_composition": 84,
23
+ "body_decor": 9,
24
+ "body_type": 7,
25
+ "clothing_detail": 418,
26
+ "color_markings": 234,
27
+ "count": 5,
28
+ "expression_detail": 31,
29
+ "franchise_series": 86,
30
+ "gaze_detail": 22,
31
+ "gender": 3,
32
+ "objects_props": 264,
33
+ "organization": 8,
34
+ "perspective": 6,
35
+ "pose_action_detail": 100,
36
+ "resolution": 3,
37
+ "species": 13,
38
+ "style": 9,
39
+ "text": 4
40
+ }
41
+ },
42
+ "summary": {
43
+ "counts": {
44
+ "uncategorized_total": 6261,
45
+ "scored_rows": 6261,
46
+ "has_tfidf_vector": 5089,
47
+ "has_wiki_page": 4368,
48
+ "has_wiki_category_votes": 1957,
49
+ "signals": {
50
+ "tfidf_only": 3176,
51
+ "both": 1913,
52
+ "none": 1128,
53
+ "wiki_only": 44
54
+ },
55
+ "assignments": {
56
+ "hold": 5997,
57
+ "multi": 31,
58
+ "single": 233
59
+ },
60
+ "newly_categorized": 264,
61
+ "remaining_uncategorized": 5997,
62
+ "multi_category_additions": 62
63
+ },
64
+ "top_single_categories": [
65
+ [
66
+ "franchise_series",
67
+ 177
68
+ ],
69
+ [
70
+ "clothing_detail",
71
+ 13
72
+ ],
73
+ [
74
+ "anatomy_features",
75
+ 9
76
+ ],
77
+ [
78
+ "text",
79
+ 8
80
+ ],
81
+ [
82
+ "organization",
83
+ 7
84
+ ],
85
+ [
86
+ "body_type",
87
+ 6
88
+ ],
89
+ [
90
+ "style",
91
+ 2
92
+ ],
93
+ [
94
+ "species",
95
+ 2
96
+ ],
97
+ [
98
+ "body_decor",
99
+ 2
100
+ ],
101
+ [
102
+ "objects_props",
103
+ 2
104
+ ],
105
+ [
106
+ "background_composition",
107
+ 1
108
+ ],
109
+ [
110
+ "color_markings",
111
+ 1
112
+ ],
113
+ [
114
+ "expression_detail",
115
+ 1
116
+ ],
117
+ [
118
+ "pose_action_detail",
119
+ 1
120
+ ],
121
+ [
122
+ "count",
123
+ 1
124
+ ]
125
+ ],
126
+ "top_multi_category_pairs": [
127
+ {
128
+ "categories": [
129
+ "body_type",
130
+ "franchise_series"
131
+ ],
132
+ "count": 7
133
+ },
134
+ {
135
+ "categories": [
136
+ "objects_props",
137
+ "pose_action_detail"
138
+ ],
139
+ "count": 2
140
+ },
141
+ {
142
+ "categories": [
143
+ "franchise_series",
144
+ "gender"
145
+ ],
146
+ "count": 2
147
+ },
148
+ {
149
+ "categories": [
150
+ "body_type",
151
+ "species"
152
+ ],
153
+ "count": 2
154
+ },
155
+ {
156
+ "categories": [
157
+ "color_markings",
158
+ "franchise_series"
159
+ ],
160
+ "count": 2
161
+ },
162
+ {
163
+ "categories": [
164
+ "anatomy_features",
165
+ "color_markings"
166
+ ],
167
+ "count": 2
168
+ },
169
+ {
170
+ "categories": [
171
+ "expression_detail",
172
+ "pose_action_detail"
173
+ ],
174
+ "count": 1
175
+ },
176
+ {
177
+ "categories": [
178
+ "expression_detail",
179
+ "text"
180
+ ],
181
+ "count": 1
182
+ },
183
+ {
184
+ "categories": [
185
+ "color_markings",
186
+ "style"
187
+ ],
188
+ "count": 1
189
+ },
190
+ {
191
+ "categories": [
192
+ "anatomy_features",
193
+ "objects_props"
194
+ ],
195
+ "count": 1
196
+ },
197
+ {
198
+ "categories": [
199
+ "franchise_series",
200
+ "species"
201
+ ],
202
+ "count": 1
203
+ },
204
+ {
205
+ "categories": [
206
+ "perspective",
207
+ "pose_action_detail"
208
+ ],
209
+ "count": 1
210
+ },
211
+ {
212
+ "categories": [
213
+ "count",
214
+ "franchise_series"
215
+ ],
216
+ "count": 1
217
+ },
218
+ {
219
+ "categories": [
220
+ "clothing_detail",
221
+ "franchise_series"
222
+ ],
223
+ "count": 1
224
+ },
225
+ {
226
+ "categories": [
227
+ "body_decor",
228
+ "objects_props"
229
+ ],
230
+ "count": 1
231
+ },
232
+ {
233
+ "categories": [
234
+ "background_composition",
235
+ "franchise_series"
236
+ ],
237
+ "count": 1
238
+ },
239
+ {
240
+ "categories": [
241
+ "anatomy_features",
242
+ "species"
243
+ ],
244
+ "count": 1
245
+ },
246
+ {
247
+ "categories": [
248
+ "anatomy_features",
249
+ "franchise_series"
250
+ ],
251
+ "count": 1
252
+ },
253
+ {
254
+ "categories": [
255
+ "body_type",
256
+ "gender"
257
+ ],
258
+ "count": 1
259
+ },
260
+ {
261
+ "categories": [
262
+ "clothing_detail",
263
+ "color_markings"
264
+ ],
265
+ "count": 1
266
+ }
267
+ ],
268
+ "samples": {
269
+ "single": [
270
+ {
271
+ "tag": "eeveelution",
272
+ "count": 58150,
273
+ "signal": "both",
274
+ "assigned_categories": [
275
+ "franchise_series"
276
+ ],
277
+ "top_fused": [
278
+ [
279
+ "franchise_series",
280
+ 0.721
281
+ ],
282
+ [
283
+ "gender",
284
+ 0.0689
285
+ ],
286
+ [
287
+ "resolution",
288
+ 0.0234
289
+ ]
290
+ ],
291
+ "top_tfidf": [
292
+ [
293
+ "franchise_series",
294
+ 0.535
295
+ ],
296
+ [
297
+ "gender",
298
+ 0.1148
299
+ ],
300
+ [
301
+ "resolution",
302
+ 0.039
303
+ ]
304
+ ],
305
+ "top_wiki": [
306
+ [
307
+ "franchise_series",
308
+ 1.0
309
+ ],
310
+ [
311
+ "text",
312
+ 0.0
313
+ ],
314
+ [
315
+ "background_composition",
316
+ 0.0
317
+ ]
318
+ ],
319
+ "wiki_vote_count": 2,
320
+ "wiki_link_count": 13
321
+ },
322
+ {
323
+ "tag": "boss_monster",
324
+ "count": 19924,
325
+ "signal": "wiki_only",
326
+ "assigned_categories": [
327
+ "anatomy_features"
328
+ ],
329
+ "top_fused": [
330
+ [
331
+ "anatomy_features",
332
+ 1.0
333
+ ],
334
+ [
335
+ "franchise_series",
336
+ 0.0
337
+ ],
338
+ [
339
+ "background_composition",
340
+ 0.0
341
+ ]
342
+ ],
343
+ "top_tfidf": [],
344
+ "top_wiki": [
345
+ [
346
+ "anatomy_features",
347
+ 1.0
348
+ ],
349
+ [
350
+ "franchise_series",
351
+ 0.0
352
+ ],
353
+ [
354
+ "background_composition",
355
+ 0.0
356
+ ]
357
+ ],
358
+ "wiki_vote_count": 4,
359
+ "wiki_link_count": 19
360
+ },
361
+ {
362
+ "tag": "blaze_the_cat",
363
+ "count": 7169,
364
+ "signal": "both",
365
+ "assigned_categories": [
366
+ "franchise_series"
367
+ ],
368
+ "top_fused": [
369
+ [
370
+ "franchise_series",
371
+ 0.6294
372
+ ],
373
+ [
374
+ "resolution",
375
+ 0.0338
376
+ ],
377
+ [
378
+ "expression_detail",
379
+ 0.0334
380
+ ]
381
+ ],
382
+ "top_tfidf": [
383
+ [
384
+ "franchise_series",
385
+ 0.3824
386
+ ],
387
+ [
388
+ "resolution",
389
+ 0.0563
390
+ ],
391
+ [
392
+ "expression_detail",
393
+ 0.0556
394
+ ]
395
+ ],
396
+ "top_wiki": [
397
+ [
398
+ "franchise_series",
399
+ 1.0
400
+ ],
401
+ [
402
+ "text",
403
+ 0.0
404
+ ],
405
+ [
406
+ "background_composition",
407
+ 0.0
408
+ ]
409
+ ],
410
+ "wiki_vote_count": 2,
411
+ "wiki_link_count": 6
412
+ },
413
+ {
414
+ "tag": "espeon",
415
+ "count": 7029,
416
+ "signal": "tfidf_only",
417
+ "assigned_categories": [
418
+ "franchise_series"
419
+ ],
420
+ "top_fused": [
421
+ [
422
+ "franchise_series",
423
+ 0.6445
424
+ ],
425
+ [
426
+ "gender",
427
+ 0.0854
428
+ ],
429
+ [
430
+ "resolution",
431
+ 0.0296
432
+ ]
433
+ ],
434
+ "top_tfidf": [
435
+ [
436
+ "franchise_series",
437
+ 0.6445
438
+ ],
439
+ [
440
+ "gender",
441
+ 0.0854
442
+ ],
443
+ [
444
+ "resolution",
445
+ 0.0296
446
+ ]
447
+ ],
448
+ "top_wiki": [],
449
+ "wiki_vote_count": 0,
450
+ "wiki_link_count": 11
451
+ },
452
+ {
453
+ "tag": "zangoose",
454
+ "count": 6959,
455
+ "signal": "tfidf_only",
456
+ "assigned_categories": [
457
+ "franchise_series"
458
+ ],
459
+ "top_fused": [
460
+ [
461
+ "franchise_series",
462
+ 0.6316
463
+ ],
464
+ [
465
+ "gender",
466
+ 0.0872
467
+ ],
468
+ [
469
+ "resolution",
470
+ 0.0371
471
+ ]
472
+ ],
473
+ "top_tfidf": [
474
+ [
475
+ "franchise_series",
476
+ 0.6316
477
+ ],
478
+ [
479
+ "gender",
480
+ 0.0872
481
+ ],
482
+ [
483
+ "resolution",
484
+ 0.0371
485
+ ]
486
+ ],
487
+ "top_wiki": [],
488
+ "wiki_vote_count": 0,
489
+ "wiki_link_count": 4
490
+ },
491
+ {
492
+ "tag": "snivy",
493
+ "count": 3315,
494
+ "signal": "tfidf_only",
495
+ "assigned_categories": [
496
+ "franchise_series"
497
+ ],
498
+ "top_fused": [
499
+ [
500
+ "franchise_series",
501
+ 0.7953
502
+ ],
503
+ [
504
+ "gender",
505
+ 0.0605
506
+ ],
507
+ [
508
+ "resolution",
509
+ 0.0192
510
+ ]
511
+ ],
512
+ "top_tfidf": [
513
+ [
514
+ "franchise_series",
515
+ 0.7953
516
+ ],
517
+ [
518
+ "gender",
519
+ 0.0605
520
+ ],
521
+ [
522
+ "resolution",
523
+ 0.0192
524
+ ]
525
+ ],
526
+ "top_wiki": [],
527
+ "wiki_vote_count": 0,
528
+ "wiki_link_count": 6
529
+ },
530
+ {
531
+ "tag": "buizel",
532
+ "count": 3220,
533
+ "signal": "tfidf_only",
534
+ "assigned_categories": [
535
+ "franchise_series"
536
+ ],
537
+ "top_fused": [
538
+ [
539
+ "franchise_series",
540
+ 0.6631
541
+ ],
542
+ [
543
+ "gender",
544
+ 0.0802
545
+ ],
546
+ [
547
+ "resolution",
548
+ 0.0254
549
+ ]
550
+ ],
551
+ "top_tfidf": [
552
+ [
553
+ "franchise_series",
554
+ 0.6631
555
+ ],
556
+ [
557
+ "gender",
558
+ 0.0802
559
+ ],
560
+ [
561
+ "resolution",
562
+ 0.0254
563
+ ]
564
+ ],
565
+ "top_wiki": [],
566
+ "wiki_vote_count": 0,
567
+ "wiki_link_count": 2
568
+ },
569
+ {
570
+ "tag": "floatzel",
571
+ "count": 2957,
572
+ "signal": "tfidf_only",
573
+ "assigned_categories": [
574
+ "franchise_series"
575
+ ],
576
+ "top_fused": [
577
+ [
578
+ "franchise_series",
579
+ 0.674
580
+ ],
581
+ [
582
+ "gender",
583
+ 0.0738
584
+ ],
585
+ [
586
+ "resolution",
587
+ 0.0261
588
+ ]
589
+ ],
590
+ "top_tfidf": [
591
+ [
592
+ "franchise_series",
593
+ 0.674
594
+ ],
595
+ [
596
+ "gender",
597
+ 0.0738
598
+ ],
599
+ [
600
+ "resolution",
601
+ 0.0261
602
+ ]
603
+ ],
604
+ "top_wiki": [],
605
+ "wiki_vote_count": 0,
606
+ "wiki_link_count": 2
607
+ },
608
+ {
609
+ "tag": "charmeleon",
610
+ "count": 2899,
611
+ "signal": "tfidf_only",
612
+ "assigned_categories": [
613
+ "franchise_series"
614
+ ],
615
+ "top_fused": [
616
+ [
617
+ "franchise_series",
618
+ 0.6974
619
+ ],
620
+ [
621
+ "gender",
622
+ 0.0566
623
+ ],
624
+ [
625
+ "count",
626
+ 0.0233
627
+ ]
628
+ ],
629
+ "top_tfidf": [
630
+ [
631
+ "franchise_series",
632
+ 0.6974
633
+ ],
634
+ [
635
+ "gender",
636
+ 0.0566
637
+ ],
638
+ [
639
+ "count",
640
+ 0.0233
641
+ ]
642
+ ],
643
+ "top_wiki": [],
644
+ "wiki_vote_count": 0,
645
+ "wiki_link_count": 9
646
+ },
647
+ {
648
+ "tag": "dragonite",
649
+ "count": 2477,
650
+ "signal": "tfidf_only",
651
+ "assigned_categories": [
652
+ "franchise_series"
653
+ ],
654
+ "top_fused": [
655
+ [
656
+ "franchise_series",
657
+ 0.6717
658
+ ],
659
+ [
660
+ "gender",
661
+ 0.0623
662
+ ],
663
+ [
664
+ "resolution",
665
+ 0.0246
666
+ ]
667
+ ],
668
+ "top_tfidf": [
669
+ [
670
+ "franchise_series",
671
+ 0.6717
672
+ ],
673
+ [
674
+ "gender",
675
+ 0.0623
676
+ ],
677
+ [
678
+ "resolution",
679
+ 0.0246
680
+ ]
681
+ ],
682
+ "top_wiki": [],
683
+ "wiki_vote_count": 0,
684
+ "wiki_link_count": 3
685
+ },
686
+ {
687
+ "tag": "ampharos",
688
+ "count": 2449,
689
+ "signal": "tfidf_only",
690
+ "assigned_categories": [
691
+ "franchise_series"
692
+ ],
693
+ "top_fused": [
694
+ [
695
+ "franchise_series",
696
+ 0.7534
697
+ ],
698
+ [
699
+ "gender",
700
+ 0.0559
701
+ ],
702
+ [
703
+ "resolution",
704
+ 0.0207
705
+ ]
706
+ ],
707
+ "top_tfidf": [
708
+ [
709
+ "franchise_series",
710
+ 0.7534
711
+ ],
712
+ [
713
+ "gender",
714
+ 0.0559
715
+ ],
716
+ [
717
+ "resolution",
718
+ 0.0207
719
+ ]
720
+ ],
721
+ "top_wiki": [],
722
+ "wiki_vote_count": 0,
723
+ "wiki_link_count": 4
724
+ },
725
+ {
726
+ "tag": "pichu",
727
+ "count": 1980,
728
+ "signal": "tfidf_only",
729
+ "assigned_categories": [
730
+ "franchise_series"
731
+ ],
732
+ "top_fused": [
733
+ [
734
+ "franchise_series",
735
+ 0.7864
736
+ ],
737
+ [
738
+ "gender",
739
+ 0.0402
740
+ ],
741
+ [
742
+ "resolution",
743
+ 0.0197
744
+ ]
745
+ ],
746
+ "top_tfidf": [
747
+ [
748
+ "franchise_series",
749
+ 0.7864
750
+ ],
751
+ [
752
+ "gender",
753
+ 0.0402
754
+ ],
755
+ [
756
+ "resolution",
757
+ 0.0197
758
+ ]
759
+ ],
760
+ "top_wiki": [],
761
+ "wiki_vote_count": 0,
762
+ "wiki_link_count": 5
763
+ },
764
+ {
765
+ "tag": "quiver",
766
+ "count": 1372,
767
+ "signal": "wiki_only",
768
+ "assigned_categories": [
769
+ "objects_props"
770
+ ],
771
+ "top_fused": [
772
+ [
773
+ "objects_props",
774
+ 1.0
775
+ ],
776
+ [
777
+ "text",
778
+ 0.0
779
+ ],
780
+ [
781
+ "franchise_series",
782
+ 0.0
783
+ ]
784
+ ],
785
+ "top_tfidf": [],
786
+ "top_wiki": [
787
+ [
788
+ "objects_props",
789
+ 1.0
790
+ ],
791
+ [
792
+ "text",
793
+ 0.0
794
+ ],
795
+ [
796
+ "franchise_series",
797
+ 0.0
798
+ ]
799
+ ],
800
+ "wiki_vote_count": 1,
801
+ "wiki_link_count": 4
802
+ },
803
+ {
804
+ "tag": "snorlax",
805
+ "count": 1079,
806
+ "signal": "tfidf_only",
807
+ "assigned_categories": [
808
+ "franchise_series"
809
+ ],
810
+ "top_fused": [
811
+ [
812
+ "franchise_series",
813
+ 0.6034
814
+ ],
815
+ [
816
+ "gender",
817
+ 0.0515
818
+ ],
819
+ [
820
+ "resolution",
821
+ 0.0309
822
+ ]
823
+ ],
824
+ "top_tfidf": [
825
+ [
826
+ "franchise_series",
827
+ 0.6034
828
+ ],
829
+ [
830
+ "gender",
831
+ 0.0515
832
+ ],
833
+ [
834
+ "resolution",
835
+ 0.0309
836
+ ]
837
+ ],
838
+ "top_wiki": [],
839
+ "wiki_vote_count": 0,
840
+ "wiki_link_count": 2
841
+ },
842
+ {
843
+ "tag": "blastoise",
844
+ "count": 1006,
845
+ "signal": "tfidf_only",
846
+ "assigned_categories": [
847
+ "franchise_series"
848
+ ],
849
+ "top_fused": [
850
+ [
851
+ "franchise_series",
852
+ 0.6609
853
+ ],
854
+ [
855
+ "gender",
856
+ 0.0453
857
+ ],
858
+ [
859
+ "count",
860
+ 0.0243
861
+ ]
862
+ ],
863
+ "top_tfidf": [
864
+ [
865
+ "franchise_series",
866
+ 0.6609
867
+ ],
868
+ [
869
+ "gender",
870
+ 0.0453
871
+ ],
872
+ [
873
+ "count",
874
+ 0.0243
875
+ ]
876
+ ],
877
+ "top_wiki": [],
878
+ "wiki_vote_count": 0,
879
+ "wiki_link_count": 4
880
+ },
881
+ {
882
+ "tag": "roserade",
883
+ "count": 871,
884
+ "signal": "tfidf_only",
885
+ "assigned_categories": [
886
+ "franchise_series"
887
+ ],
888
+ "top_fused": [
889
+ [
890
+ "franchise_series",
891
+ 0.6325
892
+ ],
893
+ [
894
+ "gender",
895
+ 0.052
896
+ ],
897
+ [
898
+ "resolution",
899
+ 0.0336
900
+ ]
901
+ ],
902
+ "top_tfidf": [
903
+ [
904
+ "franchise_series",
905
+ 0.6325
906
+ ],
907
+ [
908
+ "gender",
909
+ 0.052
910
+ ],
911
+ [
912
+ "resolution",
913
+ 0.0336
914
+ ]
915
+ ],
916
+ "top_wiki": [],
917
+ "wiki_vote_count": 0,
918
+ "wiki_link_count": 3
919
+ },
920
+ {
921
+ "tag": "alolan_raichu",
922
+ "count": 730,
923
+ "signal": "tfidf_only",
924
+ "assigned_categories": [
925
+ "franchise_series"
926
+ ],
927
+ "top_fused": [
928
+ [
929
+ "franchise_series",
930
+ 0.6985
931
+ ],
932
+ [
933
+ "gender",
934
+ 0.0612
935
+ ],
936
+ [
937
+ "resolution",
938
+ 0.0263
939
+ ]
940
+ ],
941
+ "top_tfidf": [
942
+ [
943
+ "franchise_series",
944
+ 0.6985
945
+ ],
946
+ [
947
+ "gender",
948
+ 0.0612
949
+ ],
950
+ [
951
+ "resolution",
952
+ 0.0263
953
+ ]
954
+ ],
955
+ "top_wiki": [],
956
+ "wiki_vote_count": 0,
957
+ "wiki_link_count": 4
958
+ },
959
+ {
960
+ "tag": "nickit",
961
+ "count": 663,
962
+ "signal": "tfidf_only",
963
+ "assigned_categories": [
964
+ "franchise_series"
965
+ ],
966
+ "top_fused": [
967
+ [
968
+ "franchise_series",
969
+ 0.6143
970
+ ],
971
+ [
972
+ "gender",
973
+ 0.0972
974
+ ],
975
+ [
976
+ "resolution",
977
+ 0.0326
978
+ ]
979
+ ],
980
+ "top_tfidf": [
981
+ [
982
+ "franchise_series",
983
+ 0.6143
984
+ ],
985
+ [
986
+ "gender",
987
+ 0.0972
988
+ ],
989
+ [
990
+ "resolution",
991
+ 0.0326
992
+ ]
993
+ ],
994
+ "top_wiki": [],
995
+ "wiki_vote_count": 0,
996
+ "wiki_link_count": 0
997
+ },
998
+ {
999
+ "tag": "linoone",
1000
+ "count": 628,
1001
+ "signal": "tfidf_only",
1002
+ "assigned_categories": [
1003
+ "franchise_series"
1004
+ ],
1005
+ "top_fused": [
1006
+ [
1007
+ "franchise_series",
1008
+ 0.7373
1009
+ ],
1010
+ [
1011
+ "gender",
1012
+ 0.0406
1013
+ ],
1014
+ [
1015
+ "resolution",
1016
+ 0.0196
1017
+ ]
1018
+ ],
1019
+ "top_tfidf": [
1020
+ [
1021
+ "franchise_series",
1022
+ 0.7373
1023
+ ],
1024
+ [
1025
+ "gender",
1026
+ 0.0406
1027
+ ],
1028
+ [
1029
+ "resolution",
1030
+ 0.0196
1031
+ ]
1032
+ ],
1033
+ "top_wiki": [],
1034
+ "wiki_vote_count": 0,
1035
+ "wiki_link_count": 2
1036
+ },
1037
+ {
1038
+ "tag": "amped_toxtricity",
1039
+ "count": 568,
1040
+ "signal": "tfidf_only",
1041
+ "assigned_categories": [
1042
+ "franchise_series"
1043
+ ],
1044
+ "top_fused": [
1045
+ [
1046
+ "franchise_series",
1047
+ 0.6964
1048
+ ],
1049
+ [
1050
+ "gender",
1051
+ 0.0587
1052
+ ],
1053
+ [
1054
+ "resolution",
1055
+ 0.0257
1056
+ ]
1057
+ ],
1058
+ "top_tfidf": [
1059
+ [
1060
+ "franchise_series",
1061
+ 0.6964
1062
+ ],
1063
+ [
1064
+ "gender",
1065
+ 0.0587
1066
+ ],
1067
+ [
1068
+ "resolution",
1069
+ 0.0257
1070
+ ]
1071
+ ],
1072
+ "top_wiki": [],
1073
+ "wiki_vote_count": 0,
1074
+ "wiki_link_count": 0
1075
+ }
1076
+ ],
1077
+ "multi": [
1078
+ {
1079
+ "tag": "cub",
1080
+ "count": 147547,
1081
+ "signal": "wiki_only",
1082
+ "assigned_categories": [
1083
+ "species",
1084
+ "body_type"
1085
+ ],
1086
+ "top_fused": [
1087
+ [
1088
+ "species",
1089
+ 0.6
1090
+ ],
1091
+ [
1092
+ "body_type",
1093
+ 0.4
1094
+ ],
1095
+ [
1096
+ "text",
1097
+ 0.0
1098
+ ]
1099
+ ],
1100
+ "top_tfidf": [],
1101
+ "top_wiki": [
1102
+ [
1103
+ "species",
1104
+ 0.6
1105
+ ],
1106
+ [
1107
+ "body_type",
1108
+ 0.4
1109
+ ],
1110
+ [
1111
+ "text",
1112
+ 0.0
1113
+ ]
1114
+ ],
1115
+ "wiki_vote_count": 5,
1116
+ "wiki_link_count": 11
1117
+ },
1118
+ {
1119
+ "tag": "dock",
1120
+ "count": 16478,
1121
+ "signal": "wiki_only",
1122
+ "assigned_categories": [
1123
+ "anatomy_features",
1124
+ "objects_props"
1125
+ ],
1126
+ "top_fused": [
1127
+ [
1128
+ "anatomy_features",
1129
+ 0.5
1130
+ ],
1131
+ [
1132
+ "objects_props",
1133
+ 0.5
1134
+ ],
1135
+ [
1136
+ "franchise_series",
1137
+ 0.0
1138
+ ]
1139
+ ],
1140
+ "top_tfidf": [],
1141
+ "top_wiki": [
1142
+ [
1143
+ "anatomy_features",
1144
+ 0.5
1145
+ ],
1146
+ [
1147
+ "objects_props",
1148
+ 0.5
1149
+ ],
1150
+ [
1151
+ "franchise_series",
1152
+ 0.0
1153
+ ]
1154
+ ],
1155
+ "wiki_vote_count": 2,
1156
+ "wiki_link_count": 3
1157
+ },
1158
+ {
1159
+ "tag": "teenager",
1160
+ "count": 13700,
1161
+ "signal": "wiki_only",
1162
+ "assigned_categories": [
1163
+ "body_type",
1164
+ "gender"
1165
+ ],
1166
+ "top_fused": [
1167
+ [
1168
+ "body_type",
1169
+ 0.5
1170
+ ],
1171
+ [
1172
+ "gender",
1173
+ 0.5
1174
+ ],
1175
+ [
1176
+ "text",
1177
+ 0.0
1178
+ ]
1179
+ ],
1180
+ "top_tfidf": [],
1181
+ "top_wiki": [
1182
+ [
1183
+ "body_type",
1184
+ 0.5
1185
+ ],
1186
+ [
1187
+ "gender",
1188
+ 0.5
1189
+ ],
1190
+ [
1191
+ "text",
1192
+ 0.0
1193
+ ]
1194
+ ],
1195
+ "wiki_vote_count": 4,
1196
+ "wiki_link_count": 19
1197
+ },
1198
+ {
1199
+ "tag": "ringtail",
1200
+ "count": 6643,
1201
+ "signal": "wiki_only",
1202
+ "assigned_categories": [
1203
+ "anatomy_features",
1204
+ "color_markings"
1205
+ ],
1206
+ "top_fused": [
1207
+ [
1208
+ "anatomy_features",
1209
+ 0.6667
1210
+ ],
1211
+ [
1212
+ "color_markings",
1213
+ 0.3333
1214
+ ],
1215
+ [
1216
+ "franchise_series",
1217
+ 0.0
1218
+ ]
1219
+ ],
1220
+ "top_tfidf": [],
1221
+ "top_wiki": [
1222
+ [
1223
+ "anatomy_features",
1224
+ 0.6667
1225
+ ],
1226
+ [
1227
+ "color_markings",
1228
+ 0.3333
1229
+ ],
1230
+ [
1231
+ "franchise_series",
1232
+ 0.0
1233
+ ]
1234
+ ],
1235
+ "wiki_vote_count": 3,
1236
+ "wiki_link_count": 5
1237
+ },
1238
+ {
1239
+ "tag": "greninja",
1240
+ "count": 3805,
1241
+ "signal": "both",
1242
+ "assigned_categories": [
1243
+ "franchise_series",
1244
+ "body_type"
1245
+ ],
1246
+ "top_fused": [
1247
+ [
1248
+ "franchise_series",
1249
+ 0.4591
1250
+ ],
1251
+ [
1252
+ "body_type",
1253
+ 0.4072
1254
+ ],
1255
+ [
1256
+ "gender",
1257
+ 0.0343
1258
+ ]
1259
+ ],
1260
+ "top_tfidf": [
1261
+ [
1262
+ "franchise_series",
1263
+ 0.7651
1264
+ ],
1265
+ [
1266
+ "gender",
1267
+ 0.0571
1268
+ ],
1269
+ [
1270
+ "resolution",
1271
+ 0.0246
1272
+ ]
1273
+ ],
1274
+ "top_wiki": [
1275
+ [
1276
+ "body_type",
1277
+ 1.0
1278
+ ],
1279
+ [
1280
+ "text",
1281
+ 0.0
1282
+ ],
1283
+ [
1284
+ "franchise_series",
1285
+ 0.0
1286
+ ]
1287
+ ],
1288
+ "wiki_vote_count": 1,
1289
+ "wiki_link_count": 9
1290
+ },
1291
+ {
1292
+ "tag": "roxanne_wolf_(fnaf)",
1293
+ "count": 3637,
1294
+ "signal": "wiki_only",
1295
+ "assigned_categories": [
1296
+ "anatomy_features",
1297
+ "color_markings"
1298
+ ],
1299
+ "top_fused": [
1300
+ [
1301
+ "anatomy_features",
1302
+ 0.6
1303
+ ],
1304
+ [
1305
+ "color_markings",
1306
+ 0.4
1307
+ ],
1308
+ [
1309
+ "franchise_series",
1310
+ 0.0
1311
+ ]
1312
+ ],
1313
+ "top_tfidf": [],
1314
+ "top_wiki": [
1315
+ [
1316
+ "anatomy_features",
1317
+ 0.6
1318
+ ],
1319
+ [
1320
+ "color_markings",
1321
+ 0.4
1322
+ ],
1323
+ [
1324
+ "franchise_series",
1325
+ 0.0
1326
+ ]
1327
+ ],
1328
+ "wiki_vote_count": 5,
1329
+ "wiki_link_count": 21
1330
+ },
1331
+ {
1332
+ "tag": "pet",
1333
+ "count": 3461,
1334
+ "signal": "wiki_only",
1335
+ "assigned_categories": [
1336
+ "body_decor",
1337
+ "objects_props"
1338
+ ],
1339
+ "top_fused": [
1340
+ [
1341
+ "body_decor",
1342
+ 0.5
1343
+ ],
1344
+ [
1345
+ "objects_props",
1346
+ 0.5
1347
+ ],
1348
+ [
1349
+ "text",
1350
+ 0.0
1351
+ ]
1352
+ ],
1353
+ "top_tfidf": [],
1354
+ "top_wiki": [
1355
+ [
1356
+ "body_decor",
1357
+ 0.5
1358
+ ],
1359
+ [
1360
+ "objects_props",
1361
+ 0.5
1362
+ ],
1363
+ [
1364
+ "text",
1365
+ 0.0
1366
+ ]
1367
+ ],
1368
+ "wiki_vote_count": 2,
1369
+ "wiki_link_count": 18
1370
+ },
1371
+ {
1372
+ "tag": "zorua",
1373
+ "count": 3167,
1374
+ "signal": "both",
1375
+ "assigned_categories": [
1376
+ "franchise_series",
1377
+ "body_type"
1378
+ ],
1379
+ "top_fused": [
1380
+ [
1381
+ "franchise_series",
1382
+ 0.4565
1383
+ ],
1384
+ [
1385
+ "body_type",
1386
+ 0.4079
1387
+ ],
1388
+ [
1389
+ "gender",
1390
+ 0.0393
1391
+ ]
1392
+ ],
1393
+ "top_tfidf": [
1394
+ [
1395
+ "franchise_series",
1396
+ 0.7608
1397
+ ],
1398
+ [
1399
+ "gender",
1400
+ 0.0655
1401
+ ],
1402
+ [
1403
+ "resolution",
1404
+ 0.0223
1405
+ ]
1406
+ ],
1407
+ "top_wiki": [
1408
+ [
1409
+ "body_type",
1410
+ 1.0
1411
+ ],
1412
+ [
1413
+ "text",
1414
+ 0.0
1415
+ ],
1416
+ [
1417
+ "franchise_series",
1418
+ 0.0
1419
+ ]
1420
+ ],
1421
+ "wiki_vote_count": 1,
1422
+ "wiki_link_count": 3
1423
+ },
1424
+ {
1425
+ "tag": "kirlia",
1426
+ "count": 3140,
1427
+ "signal": "both",
1428
+ "assigned_categories": [
1429
+ "franchise_series",
1430
+ "body_type"
1431
+ ],
1432
+ "top_fused": [
1433
+ [
1434
+ "franchise_series",
1435
+ 0.4781
1436
+ ],
1437
+ [
1438
+ "body_type",
1439
+ 0.4081
1440
+ ],
1441
+ [
1442
+ "gender",
1443
+ 0.0273
1444
+ ]
1445
+ ],
1446
+ "top_tfidf": [
1447
+ [
1448
+ "franchise_series",
1449
+ 0.7969
1450
+ ],
1451
+ [
1452
+ "gender",
1453
+ 0.0455
1454
+ ],
1455
+ [
1456
+ "resolution",
1457
+ 0.0237
1458
+ ]
1459
+ ],
1460
+ "top_wiki": [
1461
+ [
1462
+ "body_type",
1463
+ 1.0
1464
+ ],
1465
+ [
1466
+ "text",
1467
+ 0.0
1468
+ ],
1469
+ [
1470
+ "franchise_series",
1471
+ 0.0
1472
+ ]
1473
+ ],
1474
+ "wiki_vote_count": 1,
1475
+ "wiki_link_count": 8
1476
+ },
1477
+ {
1478
+ "tag": "simba",
1479
+ "count": 2566,
1480
+ "signal": "wiki_only",
1481
+ "assigned_categories": [
1482
+ "franchise_series",
1483
+ "gender"
1484
+ ],
1485
+ "top_fused": [
1486
+ [
1487
+ "franchise_series",
1488
+ 0.5
1489
+ ],
1490
+ [
1491
+ "gender",
1492
+ 0.5
1493
+ ],
1494
+ [
1495
+ "text",
1496
+ 0.0
1497
+ ]
1498
+ ],
1499
+ "top_tfidf": [],
1500
+ "top_wiki": [
1501
+ [
1502
+ "franchise_series",
1503
+ 0.5
1504
+ ],
1505
+ [
1506
+ "gender",
1507
+ 0.5
1508
+ ],
1509
+ [
1510
+ "text",
1511
+ 0.0
1512
+ ]
1513
+ ],
1514
+ "wiki_vote_count": 2,
1515
+ "wiki_link_count": 14
1516
+ },
1517
+ {
1518
+ "tag": "colorful",
1519
+ "count": 2402,
1520
+ "signal": "wiki_only",
1521
+ "assigned_categories": [
1522
+ "color_markings",
1523
+ "style"
1524
+ ],
1525
+ "top_fused": [
1526
+ [
1527
+ "color_markings",
1528
+ 0.6667
1529
+ ],
1530
+ [
1531
+ "style",
1532
+ 0.3333
1533
+ ],
1534
+ [
1535
+ "text",
1536
+ 0.0
1537
+ ]
1538
+ ],
1539
+ "top_tfidf": [],
1540
+ "top_wiki": [
1541
+ [
1542
+ "color_markings",
1543
+ 0.6667
1544
+ ],
1545
+ [
1546
+ "style",
1547
+ 0.3333
1548
+ ],
1549
+ [
1550
+ "text",
1551
+ 0.0
1552
+ ]
1553
+ ],
1554
+ "wiki_vote_count": 3,
1555
+ "wiki_link_count": 5
1556
+ },
1557
+ {
1558
+ "tag": "mawile",
1559
+ "count": 2121,
1560
+ "signal": "both",
1561
+ "assigned_categories": [
1562
+ "franchise_series",
1563
+ "clothing_detail"
1564
+ ],
1565
+ "top_fused": [
1566
+ [
1567
+ "franchise_series",
1568
+ 0.4989
1569
+ ],
1570
+ [
1571
+ "clothing_detail",
1572
+ 0.4019
1573
+ ],
1574
+ [
1575
+ "gender",
1576
+ 0.0226
1577
+ ]
1578
+ ],
1579
+ "top_tfidf": [
1580
+ [
1581
+ "franchise_series",
1582
+ 0.8316
1583
+ ],
1584
+ [
1585
+ "gender",
1586
+ 0.0377
1587
+ ],
1588
+ [
1589
+ "resolution",
1590
+ 0.0176
1591
+ ]
1592
+ ],
1593
+ "top_wiki": [
1594
+ [
1595
+ "clothing_detail",
1596
+ 1.0
1597
+ ],
1598
+ [
1599
+ "text",
1600
+ 0.0
1601
+ ],
1602
+ [
1603
+ "franchise_series",
1604
+ 0.0
1605
+ ]
1606
+ ],
1607
+ "wiki_vote_count": 1,
1608
+ "wiki_link_count": 6
1609
+ },
1610
+ {
1611
+ "tag": "troll",
1612
+ "count": 1556,
1613
+ "signal": "wiki_only",
1614
+ "assigned_categories": [
1615
+ "species",
1616
+ "body_type"
1617
+ ],
1618
+ "top_fused": [
1619
+ [
1620
+ "species",
1621
+ 0.5
1622
+ ],
1623
+ [
1624
+ "body_type",
1625
+ 0.5
1626
+ ],
1627
+ [
1628
+ "text",
1629
+ 0.0
1630
+ ]
1631
+ ],
1632
+ "top_tfidf": [],
1633
+ "top_wiki": [
1634
+ [
1635
+ "species",
1636
+ 0.5
1637
+ ],
1638
+ [
1639
+ "body_type",
1640
+ 0.5
1641
+ ],
1642
+ [
1643
+ "text",
1644
+ 0.0
1645
+ ]
1646
+ ],
1647
+ "wiki_vote_count": 2,
1648
+ "wiki_link_count": 10
1649
+ },
1650
+ {
1651
+ "tag": "squirtle",
1652
+ "count": 1167,
1653
+ "signal": "both",
1654
+ "assigned_categories": [
1655
+ "franchise_series",
1656
+ "body_type"
1657
+ ],
1658
+ "top_fused": [
1659
+ [
1660
+ "franchise_series",
1661
+ 0.4597
1662
+ ],
1663
+ [
1664
+ "body_type",
1665
+ 0.4074
1666
+ ],
1667
+ [
1668
+ "gender",
1669
+ 0.0188
1670
+ ]
1671
+ ],
1672
+ "top_tfidf": [
1673
+ [
1674
+ "franchise_series",
1675
+ 0.7662
1676
+ ],
1677
+ [
1678
+ "gender",
1679
+ 0.0313
1680
+ ],
1681
+ [
1682
+ "resolution",
1683
+ 0.0187
1684
+ ]
1685
+ ],
1686
+ "top_wiki": [
1687
+ [
1688
+ "body_type",
1689
+ 1.0
1690
+ ],
1691
+ [
1692
+ "text",
1693
+ 0.0
1694
+ ],
1695
+ [
1696
+ "franchise_series",
1697
+ 0.0
1698
+ ]
1699
+ ],
1700
+ "wiki_vote_count": 1,
1701
+ "wiki_link_count": 10
1702
+ },
1703
+ {
1704
+ "tag": "oshawott",
1705
+ "count": 1157,
1706
+ "signal": "both",
1707
+ "assigned_categories": [
1708
+ "franchise_series",
1709
+ "body_type"
1710
+ ],
1711
+ "top_fused": [
1712
+ [
1713
+ "franchise_series",
1714
+ 0.5151
1715
+ ],
1716
+ [
1717
+ "body_type",
1718
+ 0.4049
1719
+ ],
1720
+ [
1721
+ "gender",
1722
+ 0.019
1723
+ ]
1724
+ ],
1725
+ "top_tfidf": [
1726
+ [
1727
+ "franchise_series",
1728
+ 0.8585
1729
+ ],
1730
+ [
1731
+ "gender",
1732
+ 0.0316
1733
+ ],
1734
+ [
1735
+ "resolution",
1736
+ 0.013
1737
+ ]
1738
+ ],
1739
+ "top_wiki": [
1740
+ [
1741
+ "body_type",
1742
+ 1.0
1743
+ ],
1744
+ [
1745
+ "text",
1746
+ 0.0
1747
+ ],
1748
+ [
1749
+ "franchise_series",
1750
+ 0.0
1751
+ ]
1752
+ ],
1753
+ "wiki_vote_count": 2,
1754
+ "wiki_link_count": 9
1755
+ },
1756
+ {
1757
+ "tag": "cosplay_pikachu_(character)",
1758
+ "count": 1138,
1759
+ "signal": "both",
1760
+ "assigned_categories": [
1761
+ "gender",
1762
+ "franchise_series"
1763
+ ],
1764
+ "top_fused": [
1765
+ [
1766
+ "gender",
1767
+ 0.4429
1768
+ ],
1769
+ [
1770
+ "franchise_series",
1771
+ 0.343
1772
+ ],
1773
+ [
1774
+ "resolution",
1775
+ 0.0276
1776
+ ]
1777
+ ],
1778
+ "top_tfidf": [
1779
+ [
1780
+ "franchise_series",
1781
+ 0.5717
1782
+ ],
1783
+ [
1784
+ "gender",
1785
+ 0.0715
1786
+ ],
1787
+ [
1788
+ "resolution",
1789
+ 0.046
1790
+ ]
1791
+ ],
1792
+ "top_wiki": [
1793
+ [
1794
+ "gender",
1795
+ 1.0
1796
+ ],
1797
+ [
1798
+ "text",
1799
+ 0.0
1800
+ ],
1801
+ [
1802
+ "franchise_series",
1803
+ 0.0
1804
+ ]
1805
+ ],
1806
+ "wiki_vote_count": 1,
1807
+ "wiki_link_count": 5
1808
+ },
1809
+ {
1810
+ "tag": "legendary_duo",
1811
+ "count": 1059,
1812
+ "signal": "both",
1813
+ "assigned_categories": [
1814
+ "franchise_series",
1815
+ "count"
1816
+ ],
1817
+ "top_fused": [
1818
+ [
1819
+ "franchise_series",
1820
+ 0.477
1821
+ ],
1822
+ [
1823
+ "count",
1824
+ 0.4082
1825
+ ],
1826
+ [
1827
+ "gender",
1828
+ 0.0209
1829
+ ]
1830
+ ],
1831
+ "top_tfidf": [
1832
+ [
1833
+ "franchise_series",
1834
+ 0.7951
1835
+ ],
1836
+ [
1837
+ "gender",
1838
+ 0.0348
1839
+ ],
1840
+ [
1841
+ "resolution",
1842
+ 0.0174
1843
+ ]
1844
+ ],
1845
+ "top_wiki": [
1846
+ [
1847
+ "count",
1848
+ 1.0
1849
+ ],
1850
+ [
1851
+ "text",
1852
+ 0.0
1853
+ ],
1854
+ [
1855
+ "franchise_series",
1856
+ 0.0
1857
+ ]
1858
+ ],
1859
+ "wiki_vote_count": 1,
1860
+ "wiki_link_count": 26
1861
+ },
1862
+ {
1863
+ "tag": "sobble",
1864
+ "count": 762,
1865
+ "signal": "both",
1866
+ "assigned_categories": [
1867
+ "franchise_series",
1868
+ "anatomy_features"
1869
+ ],
1870
+ "top_fused": [
1871
+ [
1872
+ "franchise_series",
1873
+ 0.4854
1874
+ ],
1875
+ [
1876
+ "anatomy_features",
1877
+ 0.3047
1878
+ ],
1879
+ [
1880
+ "color_markings",
1881
+ 0.1033
1882
+ ]
1883
+ ],
1884
+ "top_tfidf": [
1885
+ [
1886
+ "franchise_series",
1887
+ 0.8089
1888
+ ],
1889
+ [
1890
+ "gender",
1891
+ 0.0414
1892
+ ],
1893
+ [
1894
+ "resolution",
1895
+ 0.0166
1896
+ ]
1897
+ ],
1898
+ "top_wiki": [
1899
+ [
1900
+ "anatomy_features",
1901
+ 0.75
1902
+ ],
1903
+ [
1904
+ "color_markings",
1905
+ 0.25
1906
+ ],
1907
+ [
1908
+ "franchise_series",
1909
+ 0.0
1910
+ ]
1911
+ ],
1912
+ "wiki_vote_count": 4,
1913
+ "wiki_link_count": 20
1914
+ },
1915
+ {
1916
+ "tag": "chesnaught",
1917
+ "count": 718,
1918
+ "signal": "both",
1919
+ "assigned_categories": [
1920
+ "franchise_series",
1921
+ "body_type"
1922
+ ],
1923
+ "top_fused": [
1924
+ [
1925
+ "franchise_series",
1926
+ 0.4713
1927
+ ],
1928
+ [
1929
+ "body_type",
1930
+ 0.406
1931
+ ],
1932
+ [
1933
+ "gender",
1934
+ 0.0226
1935
+ ]
1936
+ ],
1937
+ "top_tfidf": [
1938
+ [
1939
+ "franchise_series",
1940
+ 0.7856
1941
+ ],
1942
+ [
1943
+ "gender",
1944
+ 0.0377
1945
+ ],
1946
+ [
1947
+ "resolution",
1948
+ 0.0181
1949
+ ]
1950
+ ],
1951
+ "top_wiki": [
1952
+ [
1953
+ "body_type",
1954
+ 1.0
1955
+ ],
1956
+ [
1957
+ "text",
1958
+ 0.0
1959
+ ],
1960
+ [
1961
+ "franchise_series",
1962
+ 0.0
1963
+ ]
1964
+ ],
1965
+ "wiki_vote_count": 1,
1966
+ "wiki_link_count": 7
1967
+ },
1968
+ {
1969
+ "tag": "</3",
1970
+ "count": 712,
1971
+ "signal": "wiki_only",
1972
+ "assigned_categories": [
1973
+ "expression_detail",
1974
+ "pose_action_detail"
1975
+ ],
1976
+ "top_fused": [
1977
+ [
1978
+ "expression_detail",
1979
+ 0.6667
1980
+ ],
1981
+ [
1982
+ "pose_action_detail",
1983
+ 0.3333
1984
+ ],
1985
+ [
1986
+ "text",
1987
+ 0.0
1988
+ ]
1989
+ ],
1990
+ "top_tfidf": [],
1991
+ "top_wiki": [
1992
+ [
1993
+ "expression_detail",
1994
+ 0.6667
1995
+ ],
1996
+ [
1997
+ "pose_action_detail",
1998
+ 0.3333
1999
+ ],
2000
+ [
2001
+ "text",
2002
+ 0.0
2003
+ ]
2004
+ ],
2005
+ "wiki_vote_count": 3,
2006
+ "wiki_link_count": 7
2007
+ }
2008
+ ],
2009
+ "hold": [
2010
+ {
2011
+ "tag": "helmet",
2012
+ "count": 24793,
2013
+ "signal": "both",
2014
+ "assigned_categories": [],
2015
+ "top_fused": [
2016
+ [
2017
+ "clothing_detail",
2018
+ 0.2934
2019
+ ],
2020
+ [
2021
+ "body_decor",
2022
+ 0.0901
2023
+ ],
2024
+ [
2025
+ "resolution",
2026
+ 0.0744
2027
+ ]
2028
+ ],
2029
+ "top_tfidf": [
2030
+ [
2031
+ "resolution",
2032
+ 0.124
2033
+ ],
2034
+ [
2035
+ "count",
2036
+ 0.1153
2037
+ ],
2038
+ [
2039
+ "body_type",
2040
+ 0.0904
2041
+ ]
2042
+ ],
2043
+ "top_wiki": [
2044
+ [
2045
+ "clothing_detail",
2046
+ 0.7143
2047
+ ],
2048
+ [
2049
+ "franchise_series",
2050
+ 0.1429
2051
+ ],
2052
+ [
2053
+ "body_decor",
2054
+ 0.1429
2055
+ ]
2056
+ ],
2057
+ "wiki_vote_count": 7,
2058
+ "wiki_link_count": 87
2059
+ },
2060
+ {
2061
+ "tag": "poster",
2062
+ "count": 6434,
2063
+ "signal": "both",
2064
+ "assigned_categories": [],
2065
+ "top_fused": [
2066
+ [
2067
+ "organization",
2068
+ 0.4334
2069
+ ],
2070
+ [
2071
+ "objects_props",
2072
+ 0.1036
2073
+ ],
2074
+ [
2075
+ "pose_action_detail",
2076
+ 0.0671
2077
+ ]
2078
+ ],
2079
+ "top_tfidf": [
2080
+ [
2081
+ "objects_props",
2082
+ 0.1727
2083
+ ],
2084
+ [
2085
+ "pose_action_detail",
2086
+ 0.1119
2087
+ ],
2088
+ [
2089
+ "background_composition",
2090
+ 0.078
2091
+ ]
2092
+ ],
2093
+ "top_wiki": [
2094
+ [
2095
+ "organization",
2096
+ 1.0
2097
+ ],
2098
+ [
2099
+ "text",
2100
+ 0.0
2101
+ ],
2102
+ [
2103
+ "franchise_series",
2104
+ 0.0
2105
+ ]
2106
+ ],
2107
+ "wiki_vote_count": 2,
2108
+ "wiki_link_count": 19
2109
+ },
2110
+ {
2111
+ "tag": "bottomless_female",
2112
+ "count": 4337,
2113
+ "signal": "both",
2114
+ "assigned_categories": [],
2115
+ "top_fused": [
2116
+ [
2117
+ "clothing_detail",
2118
+ 0.4343
2119
+ ],
2120
+ [
2121
+ "pose_action_detail",
2122
+ 0.0644
2123
+ ],
2124
+ [
2125
+ "gaze_detail",
2126
+ 0.0573
2127
+ ]
2128
+ ],
2129
+ "top_tfidf": [
2130
+ [
2131
+ "pose_action_detail",
2132
+ 0.1074
2133
+ ],
2134
+ [
2135
+ "gaze_detail",
2136
+ 0.0954
2137
+ ],
2138
+ [
2139
+ "expression_detail",
2140
+ 0.0898
2141
+ ]
2142
+ ],
2143
+ "top_wiki": [
2144
+ [
2145
+ "clothing_detail",
2146
+ 1.0
2147
+ ],
2148
+ [
2149
+ "text",
2150
+ 0.0
2151
+ ],
2152
+ [
2153
+ "franchise_series",
2154
+ 0.0
2155
+ ]
2156
+ ],
2157
+ "wiki_vote_count": 11,
2158
+ "wiki_link_count": 17
2159
+ },
2160
+ {
2161
+ "tag": "guardians_of_the_galaxy",
2162
+ "count": 3013,
2163
+ "signal": "tfidf_only",
2164
+ "assigned_categories": [],
2165
+ "top_fused": [
2166
+ [
2167
+ "style",
2168
+ 0.0935
2169
+ ],
2170
+ [
2171
+ "pose_action_detail",
2172
+ 0.0824
2173
+ ],
2174
+ [
2175
+ "expression_detail",
2176
+ 0.0768
2177
+ ]
2178
+ ],
2179
+ "top_tfidf": [
2180
+ [
2181
+ "style",
2182
+ 0.0935
2183
+ ],
2184
+ [
2185
+ "pose_action_detail",
2186
+ 0.0824
2187
+ ],
2188
+ [
2189
+ "expression_detail",
2190
+ 0.0768
2191
+ ]
2192
+ ],
2193
+ "top_wiki": [],
2194
+ "wiki_vote_count": 0,
2195
+ "wiki_link_count": 5
2196
+ },
2197
+ {
2198
+ "tag": "barbel_(anatomy)",
2199
+ "count": 2627,
2200
+ "signal": "tfidf_only",
2201
+ "assigned_categories": [],
2202
+ "top_fused": [
2203
+ [
2204
+ "anatomy_features",
2205
+ 0.1965
2206
+ ],
2207
+ [
2208
+ "color_markings",
2209
+ 0.1306
2210
+ ],
2211
+ [
2212
+ "perspective",
2213
+ 0.0843
2214
+ ]
2215
+ ],
2216
+ "top_tfidf": [
2217
+ [
2218
+ "anatomy_features",
2219
+ 0.1965
2220
+ ],
2221
+ [
2222
+ "color_markings",
2223
+ 0.1306
2224
+ ],
2225
+ [
2226
+ "perspective",
2227
+ 0.0843
2228
+ ]
2229
+ ],
2230
+ "top_wiki": [],
2231
+ "wiki_vote_count": 0,
2232
+ "wiki_link_count": 5
2233
+ },
2234
+ {
2235
+ "tag": "millie_(helluva_boss)",
2236
+ "count": 2009,
2237
+ "signal": "both",
2238
+ "assigned_categories": [],
2239
+ "top_fused": [
2240
+ [
2241
+ "clothing_detail",
2242
+ 0.4248
2243
+ ],
2244
+ [
2245
+ "expression_detail",
2246
+ 0.0566
2247
+ ],
2248
+ [
2249
+ "text",
2250
+ 0.0562
2251
+ ]
2252
+ ],
2253
+ "top_tfidf": [
2254
+ [
2255
+ "expression_detail",
2256
+ 0.0944
2257
+ ],
2258
+ [
2259
+ "text",
2260
+ 0.0936
2261
+ ],
2262
+ [
2263
+ "body_decor",
2264
+ 0.09
2265
+ ]
2266
+ ],
2267
+ "top_wiki": [
2268
+ [
2269
+ "clothing_detail",
2270
+ 1.0
2271
+ ],
2272
+ [
2273
+ "text",
2274
+ 0.0
2275
+ ],
2276
+ [
2277
+ "franchise_series",
2278
+ 0.0
2279
+ ]
2280
+ ],
2281
+ "wiki_vote_count": 1,
2282
+ "wiki_link_count": 13
2283
+ },
2284
+ {
2285
+ "tag": "hill",
2286
+ "count": 1443,
2287
+ "signal": "tfidf_only",
2288
+ "assigned_categories": [],
2289
+ "top_fused": [
2290
+ [
2291
+ "background_composition",
2292
+ 0.1306
2293
+ ],
2294
+ [
2295
+ "objects_props",
2296
+ 0.0996
2297
+ ],
2298
+ [
2299
+ "style",
2300
+ 0.0904
2301
+ ]
2302
+ ],
2303
+ "top_tfidf": [
2304
+ [
2305
+ "background_composition",
2306
+ 0.1306
2307
+ ],
2308
+ [
2309
+ "objects_props",
2310
+ 0.0996
2311
+ ],
2312
+ [
2313
+ "style",
2314
+ 0.0904
2315
+ ]
2316
+ ],
2317
+ "top_wiki": [],
2318
+ "wiki_vote_count": 0,
2319
+ "wiki_link_count": 3
2320
+ },
2321
+ {
2322
+ "tag": "electric_fan",
2323
+ "count": 1093,
2324
+ "signal": "tfidf_only",
2325
+ "assigned_categories": [],
2326
+ "top_fused": [
2327
+ [
2328
+ "objects_props",
2329
+ 0.1294
2330
+ ],
2331
+ [
2332
+ "pose_action_detail",
2333
+ 0.0944
2334
+ ],
2335
+ [
2336
+ "expression_detail",
2337
+ 0.0939
2338
+ ]
2339
+ ],
2340
+ "top_tfidf": [
2341
+ [
2342
+ "objects_props",
2343
+ 0.1294
2344
+ ],
2345
+ [
2346
+ "pose_action_detail",
2347
+ 0.0944
2348
+ ],
2349
+ [
2350
+ "expression_detail",
2351
+ 0.0939
2352
+ ]
2353
+ ],
2354
+ "top_wiki": [],
2355
+ "wiki_vote_count": 0,
2356
+ "wiki_link_count": 4
2357
+ },
2358
+ {
2359
+ "tag": "gammamon",
2360
+ "count": 1023,
2361
+ "signal": "both",
2362
+ "assigned_categories": [],
2363
+ "top_fused": [
2364
+ [
2365
+ "franchise_series",
2366
+ 0.4428
2367
+ ],
2368
+ [
2369
+ "body_type",
2370
+ 0.0619
2371
+ ],
2372
+ [
2373
+ "gender",
2374
+ 0.0359
2375
+ ]
2376
+ ],
2377
+ "top_tfidf": [
2378
+ [
2379
+ "body_type",
2380
+ 0.1032
2381
+ ],
2382
+ [
2383
+ "franchise_series",
2384
+ 0.0714
2385
+ ],
2386
+ [
2387
+ "gender",
2388
+ 0.0598
2389
+ ]
2390
+ ],
2391
+ "top_wiki": [
2392
+ [
2393
+ "franchise_series",
2394
+ 1.0
2395
+ ],
2396
+ [
2397
+ "text",
2398
+ 0.0
2399
+ ],
2400
+ [
2401
+ "background_composition",
2402
+ 0.0
2403
+ ]
2404
+ ],
2405
+ "wiki_vote_count": 2,
2406
+ "wiki_link_count": 13
2407
+ },
2408
+ {
2409
+ "tag": "zazush-una",
2410
+ "count": 971,
2411
+ "signal": "none",
2412
+ "assigned_categories": [],
2413
+ "top_fused": [
2414
+ [
2415
+ "text",
2416
+ 0.0
2417
+ ],
2418
+ [
2419
+ "franchise_series",
2420
+ 0.0
2421
+ ],
2422
+ [
2423
+ "background_composition",
2424
+ 0.0
2425
+ ]
2426
+ ],
2427
+ "top_tfidf": [],
2428
+ "top_wiki": [],
2429
+ "wiki_vote_count": 0,
2430
+ "wiki_link_count": 0
2431
+ },
2432
+ {
2433
+ "tag": "radio",
2434
+ "count": 842,
2435
+ "signal": "tfidf_only",
2436
+ "assigned_categories": [],
2437
+ "top_fused": [
2438
+ [
2439
+ "objects_props",
2440
+ 0.2006
2441
+ ],
2442
+ [
2443
+ "pose_action_detail",
2444
+ 0.1172
2445
+ ],
2446
+ [
2447
+ "background_composition",
2448
+ 0.0925
2449
+ ]
2450
+ ],
2451
+ "top_tfidf": [
2452
+ [
2453
+ "objects_props",
2454
+ 0.2006
2455
+ ],
2456
+ [
2457
+ "pose_action_detail",
2458
+ 0.1172
2459
+ ],
2460
+ [
2461
+ "background_composition",
2462
+ 0.0925
2463
+ ]
2464
+ ],
2465
+ "top_wiki": [],
2466
+ "wiki_vote_count": 0,
2467
+ "wiki_link_count": 5
2468
+ },
2469
+ {
2470
+ "tag": "by_bambii_dog",
2471
+ "count": 811,
2472
+ "signal": "none",
2473
+ "assigned_categories": [],
2474
+ "top_fused": [
2475
+ [
2476
+ "text",
2477
+ 0.0
2478
+ ],
2479
+ [
2480
+ "franchise_series",
2481
+ 0.0
2482
+ ],
2483
+ [
2484
+ "background_composition",
2485
+ 0.0
2486
+ ]
2487
+ ],
2488
+ "top_tfidf": [],
2489
+ "top_wiki": [],
2490
+ "wiki_vote_count": 0,
2491
+ "wiki_link_count": 0
2492
+ },
2493
+ {
2494
+ "tag": "cabin",
2495
+ "count": 694,
2496
+ "signal": "tfidf_only",
2497
+ "assigned_categories": [],
2498
+ "top_fused": [
2499
+ [
2500
+ "style",
2501
+ 0.0748
2502
+ ],
2503
+ [
2504
+ "organization",
2505
+ 0.074
2506
+ ],
2507
+ [
2508
+ "background_composition",
2509
+ 0.0733
2510
+ ]
2511
+ ],
2512
+ "top_tfidf": [
2513
+ [
2514
+ "style",
2515
+ 0.0748
2516
+ ],
2517
+ [
2518
+ "organization",
2519
+ 0.074
2520
+ ],
2521
+ [
2522
+ "background_composition",
2523
+ 0.0733
2524
+ ]
2525
+ ],
2526
+ "top_wiki": [],
2527
+ "wiki_vote_count": 0,
2528
+ "wiki_link_count": 0
2529
+ },
2530
+ {
2531
+ "tag": "by_luckypan",
2532
+ "count": 681,
2533
+ "signal": "none",
2534
+ "assigned_categories": [],
2535
+ "top_fused": [
2536
+ [
2537
+ "text",
2538
+ 0.0
2539
+ ],
2540
+ [
2541
+ "franchise_series",
2542
+ 0.0
2543
+ ],
2544
+ [
2545
+ "background_composition",
2546
+ 0.0
2547
+ ]
2548
+ ],
2549
+ "top_tfidf": [],
2550
+ "top_wiki": [],
2551
+ "wiki_vote_count": 0,
2552
+ "wiki_link_count": 0
2553
+ },
2554
+ {
2555
+ "tag": "silverstream_(mlp)",
2556
+ "count": 635,
2557
+ "signal": "tfidf_only",
2558
+ "assigned_categories": [],
2559
+ "top_fused": [
2560
+ [
2561
+ "body_type",
2562
+ 0.37
2563
+ ],
2564
+ [
2565
+ "resolution",
2566
+ 0.141
2567
+ ],
2568
+ [
2569
+ "count",
2570
+ 0.0959
2571
+ ]
2572
+ ],
2573
+ "top_tfidf": [
2574
+ [
2575
+ "body_type",
2576
+ 0.37
2577
+ ],
2578
+ [
2579
+ "resolution",
2580
+ 0.141
2581
+ ],
2582
+ [
2583
+ "count",
2584
+ 0.0959
2585
+ ]
2586
+ ],
2587
+ "top_wiki": [],
2588
+ "wiki_vote_count": 0,
2589
+ "wiki_link_count": 13
2590
+ },
2591
+ {
2592
+ "tag": "by_angrboda",
2593
+ "count": 622,
2594
+ "signal": "none",
2595
+ "assigned_categories": [],
2596
+ "top_fused": [
2597
+ [
2598
+ "text",
2599
+ 0.0
2600
+ ],
2601
+ [
2602
+ "franchise_series",
2603
+ 0.0
2604
+ ],
2605
+ [
2606
+ "background_composition",
2607
+ 0.0
2608
+ ]
2609
+ ],
2610
+ "top_tfidf": [],
2611
+ "top_wiki": [],
2612
+ "wiki_vote_count": 0,
2613
+ "wiki_link_count": 0
2614
+ },
2615
+ {
2616
+ "tag": "glistening_arms",
2617
+ "count": 599,
2618
+ "signal": "tfidf_only",
2619
+ "assigned_categories": [],
2620
+ "top_fused": [
2621
+ [
2622
+ "color_markings",
2623
+ 0.3229
2624
+ ],
2625
+ [
2626
+ "gaze_detail",
2627
+ 0.196
2628
+ ],
2629
+ [
2630
+ "anatomy_features",
2631
+ 0.1557
2632
+ ]
2633
+ ],
2634
+ "top_tfidf": [
2635
+ [
2636
+ "color_markings",
2637
+ 0.3229
2638
+ ],
2639
+ [
2640
+ "gaze_detail",
2641
+ 0.196
2642
+ ],
2643
+ [
2644
+ "anatomy_features",
2645
+ 0.1557
2646
+ ]
2647
+ ],
2648
+ "top_wiki": [],
2649
+ "wiki_vote_count": 0,
2650
+ "wiki_link_count": 0
2651
+ },
2652
+ {
2653
+ "tag": "by_evilymasterful",
2654
+ "count": 571,
2655
+ "signal": "none",
2656
+ "assigned_categories": [],
2657
+ "top_fused": [
2658
+ [
2659
+ "text",
2660
+ 0.0
2661
+ ],
2662
+ [
2663
+ "franchise_series",
2664
+ 0.0
2665
+ ],
2666
+ [
2667
+ "background_composition",
2668
+ 0.0
2669
+ ]
2670
+ ],
2671
+ "top_tfidf": [],
2672
+ "top_wiki": [],
2673
+ "wiki_vote_count": 0,
2674
+ "wiki_link_count": 0
2675
+ },
2676
+ {
2677
+ "tag": "by_0laffson",
2678
+ "count": 563,
2679
+ "signal": "none",
2680
+ "assigned_categories": [],
2681
+ "top_fused": [
2682
+ [
2683
+ "text",
2684
+ 0.0
2685
+ ],
2686
+ [
2687
+ "franchise_series",
2688
+ 0.0
2689
+ ],
2690
+ [
2691
+ "background_composition",
2692
+ 0.0
2693
+ ]
2694
+ ],
2695
+ "top_tfidf": [],
2696
+ "top_wiki": [],
2697
+ "wiki_vote_count": 0,
2698
+ "wiki_link_count": 0
2699
+ },
2700
+ {
2701
+ "tag": "daffy_duck",
2702
+ "count": 562,
2703
+ "signal": "both",
2704
+ "assigned_categories": [],
2705
+ "top_fused": [
2706
+ [
2707
+ "body_type",
2708
+ 0.4204
2709
+ ],
2710
+ [
2711
+ "objects_props",
2712
+ 0.0542
2713
+ ],
2714
+ [
2715
+ "species",
2716
+ 0.0486
2717
+ ]
2718
+ ],
2719
+ "top_tfidf": [
2720
+ [
2721
+ "objects_props",
2722
+ 0.0904
2723
+ ],
2724
+ [
2725
+ "species",
2726
+ 0.081
2727
+ ],
2728
+ [
2729
+ "pose_action_detail",
2730
+ 0.0674
2731
+ ]
2732
+ ],
2733
+ "top_wiki": [
2734
+ [
2735
+ "body_type",
2736
+ 1.0
2737
+ ],
2738
+ [
2739
+ "text",
2740
+ 0.0
2741
+ ],
2742
+ [
2743
+ "franchise_series",
2744
+ 0.0
2745
+ ]
2746
+ ],
2747
+ "wiki_vote_count": 1,
2748
+ "wiki_link_count": 6
2749
+ }
2750
+ ]
2751
+ }
2752
+ }
2753
+ }
data/runtime_metrics/ui_pipeline_timings.jsonl CHANGED
@@ -1,3 +1,16 @@
1
  {"timestamp_utc": "2026-03-02T12:44:26Z", "stages_s": {"preprocess": 7.90999984019436e-05, "rewrite": 1.9136111999978311, "structural": 1.0946640000038315, "probe": 0.5859509000001708, "retrieval": 4.595289600001706, "selection": 37.53351300000213, "implication_expansion": 0.15133090000017546, "prompt_composition": 6.299999949987978e-05, "group_display": 0.04701460000069346}, "total_s": 45.927563900004316, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
2
  {"timestamp_utc": "2026-03-02T16:08:08Z", "stages_s": {"preprocess": 6.989999383222312e-05, "rewrite": 3.0064916999981506, "structural": 4.2000028770416975e-06, "probe": 3.01228209999681, "retrieval": 3.3860946000058902, "selection": 5.285027000005357, "implication_expansion": 0.147530000002007, "prompt_composition": 3.850000211969018e-05, "group_display": 0.10624819999793544}, "total_s": 14.949083599989535, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
3
  {"timestamp_utc": "2026-03-02T16:08:37Z", "stages_s": {"preprocess": 7.179999374784529e-05, "rewrite": 4.608368299988797, "structural": 3.6999990697950125e-06, "probe": 1.5999976312741637e-06, "retrieval": 3.4574174999870593, "selection": 8.8562099999981, "implication_expansion": 0.14937499999359716, "prompt_composition": 3.650000144261867e-05, "group_display": 0.04632819999824278}, "total_s": 17.122792900001514, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {"timestamp_utc": "2026-03-02T12:44:26Z", "stages_s": {"preprocess": 7.90999984019436e-05, "rewrite": 1.9136111999978311, "structural": 1.0946640000038315, "probe": 0.5859509000001708, "retrieval": 4.595289600001706, "selection": 37.53351300000213, "implication_expansion": 0.15133090000017546, "prompt_composition": 6.299999949987978e-05, "group_display": 0.04701460000069346}, "total_s": 45.927563900004316, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
2
  {"timestamp_utc": "2026-03-02T16:08:08Z", "stages_s": {"preprocess": 6.989999383222312e-05, "rewrite": 3.0064916999981506, "structural": 4.2000028770416975e-06, "probe": 3.01228209999681, "retrieval": 3.3860946000058902, "selection": 5.285027000005357, "implication_expansion": 0.147530000002007, "prompt_composition": 3.850000211969018e-05, "group_display": 0.10624819999793544}, "total_s": 14.949083599989535, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
3
  {"timestamp_utc": "2026-03-02T16:08:37Z", "stages_s": {"preprocess": 7.179999374784529e-05, "rewrite": 4.608368299988797, "structural": 3.6999990697950125e-06, "probe": 1.5999976312741637e-06, "retrieval": 3.4574174999870593, "selection": 8.8562099999981, "implication_expansion": 0.14937499999359716, "prompt_composition": 3.650000144261867e-05, "group_display": 0.04632819999824278}, "total_s": 17.122792900001514, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
4
+ {"timestamp_utc": "2026-03-06T21:33:29Z", "stages_s": {"preprocess": 9.789998875930905e-05, "rewrite": 7.193461999995634, "structural": 3.3999676816165447e-06, "probe": 0.9721586999949068, "retrieval": 2.3267829000251368, "selection": 1.0979214000399224, "implication_expansion": 0.2668229000410065, "prompt_composition": 3.819999983534217e-05, "group_display": 0.08292249997612089}, "total_s": 11.945365399995353, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
5
+ {"timestamp_utc": "2026-03-06T21:38:28Z", "stages_s": {"preprocess": 1.9300030544400215e-05, "rewrite": 1.5391526999883354, "structural": 0.5504020000225864, "probe": 0.2567070999648422, "retrieval": 0.5546861999901012, "selection": 10.549223000009079, "implication_expansion": 3.8300000596791506e-05, "prompt_composition": 3.0399998649954796e-05, "group_display": 0.025673600030131638}, "total_s": 13.487254999985453, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
6
+ {"timestamp_utc": "2026-03-06T22:33:39Z", "stages_s": {"preprocess": 0.00016080000204965472, "rewrite": 1.9639222000259906, "structural": 0.7869719000300393, "probe": 0.503746600006707, "retrieval": 2.3870767999906093, "selection": 1.7242823000415228, "implication_expansion": 0.2691484999959357, "prompt_composition": 4.0899962186813354e-05, "group_display": 0.07882960001006722}, "total_s": 7.719753100012895, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
7
+ {"timestamp_utc": "2026-03-06T22:44:28Z", "stages_s": {"preprocess": 7.639999967068434e-05, "rewrite": 2.190264799981378, "structural": 0.5781105000060052, "probe": 0.23918199999025092, "retrieval": 2.4492038000025786, "selection": 0.4502618000260554, "implication_expansion": 0.1491194000118412, "prompt_composition": 3.100000321865082e-05, "group_display": 0.08213570003863424}, "total_s": 6.144094199989922, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
8
+ {"timestamp_utc": "2026-03-06T22:45:12Z", "stages_s": {"preprocess": 7.840001489967108e-05, "rewrite": 3.431444200046826, "structural": 3.400025889277458e-06, "probe": 1.3075993999955244, "retrieval": 2.425993000040762, "selection": 6.9358377999742515, "implication_expansion": 0.14671080000698566, "prompt_composition": 3.8300000596791506e-05, "group_display": 0.0784414000227116}, "total_s": 14.331572700000834, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
9
+ {"timestamp_utc": "2026-03-07T01:26:11Z", "stages_s": {"preprocess": 0.00019039999460801482, "rewrite": 1.972552299965173, "structural": 0.45487100002355874, "probe": 0.5801937999785878, "retrieval": 2.885647799994331, "selection": 2.3159614999894984, "implication_expansion": 0.27620089997071773, "prompt_composition": 3.2800016924738884e-05, "group_display": 0.07873340003425255}, "total_s": 8.597678899997845, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
10
+ {"timestamp_utc": "2026-03-07T01:37:49Z", "stages_s": {"preprocess": 0.00017070001922547817, "rewrite": 2.3397521000006236, "structural": 0.2748573999851942, "probe": 0.9656308999983594, "retrieval": 2.379494299995713, "selection": 1.9972827999736182, "implication_expansion": 0.26782700000330806, "prompt_composition": 2.889998722821474e-05, "group_display": 0.0790697000338696}, "total_s": 8.337543100002222, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
11
+ {"timestamp_utc": "2026-03-07T02:36:05Z", "stages_s": {"preprocess": 0.00019479996990412474, "rewrite": 2.3221199000254273, "structural": 0.8951652999967337, "probe": 0.9059996000141837, "retrieval": 7.6194937999825925, "selection": 10.099894999992102, "implication_expansion": 0.27516779996221885, "prompt_composition": 3.519997699186206e-05, "group_display": 0.08530839998275042}, "total_s": 22.24463780003134, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
12
+ {"timestamp_utc": "2026-03-07T02:37:16Z", "stages_s": {"preprocess": 2.0799983758479357e-05, "rewrite": 4.862703899969347, "structural": 3.8000289350748062e-06, "probe": 2.00001522898674e-06, "retrieval": 0.49216449999948964, "selection": 7.8598584000137635, "implication_expansion": 2.9799994081258774e-05, "prompt_composition": 2.4500011932104826e-05, "group_display": 0.03247090004151687}, "total_s": 13.258490999985952, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
13
+ {"timestamp_utc": "2026-03-07T02:59:49Z", "stages_s": {"preprocess": 8.230004459619522e-05, "rewrite": 1.6606152999447659, "structural": 0.6319172999938019, "probe": 0.32008590002078563, "retrieval": 2.676332700008061, "selection": 2.007969399972353, "implication_expansion": 0.26375650003319606, "prompt_composition": 3.090000245720148e-05, "group_display": 0.08008919999701902}, "total_s": 7.673184300016146, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
14
+ {"timestamp_utc": "2026-03-07T03:01:39Z", "stages_s": {"preprocess": 1.6900012269616127e-05, "rewrite": 1.713694000034593, "structural": 5.799985956400633e-06, "probe": 0.049874700023792684, "retrieval": 0.35970670002279803, "selection": 0.9267913000076078, "implication_expansion": 3.909994848072529e-05, "prompt_composition": 3.7299992982298136e-05, "group_display": 0.026757099956739694}, "total_s": 3.089661000005435, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
15
+ {"timestamp_utc": "2026-03-07T03:09:53Z", "stages_s": {"preprocess": 0.00012510002125054598, "rewrite": 2.249713899975177, "structural": 0.5107482000021264, "probe": 3.300025127828121e-06, "retrieval": 2.3757353999535553, "selection": 2.9089593999669887, "implication_expansion": 0.2682994999922812, "prompt_composition": 3.070000093430281e-05, "group_display": 0.07982710003852844}, "total_s": 8.42714020004496, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
16
+ {"timestamp_utc": "2026-03-07T03:37:54Z", "stages_s": {"preprocess": 0.00011760002234950662, "rewrite": 1.968222199997399, "structural": 1.1845426999498159, "probe": 2.214354399999138, "retrieval": 2.452574900002219, "selection": 0.8585481999907643, "implication_expansion": 0.27041040000040084, "prompt_composition": 3.319996176287532e-05, "group_display": 0.07736879994627088}, "total_s": 9.059251800004859, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
scripts/analyze_hybrid_category_assignment.py ADDED
@@ -0,0 +1,502 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Analyze hybrid category assignment for uncategorized tags.
2
+
3
+ Implements an offline analysis pipeline (no registry mutation):
4
+ 1) TF-IDF centroid scoring over current active categories.
5
+ 2) Wiki-link graph scoring from raw wiki pages.
6
+ 3) Weighted fusion of TF-IDF and wiki signals.
7
+ 4) Optional multi-category auto-assignment when top-2 fused probabilities are strong.
8
+
9
+ Outputs:
10
+ - data/analysis/hybrid_category_assignment_preview.json (default; overwritten)
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import csv
16
+ import json
17
+ import random
18
+ import re
19
+ import sys
20
+ from collections import Counter, defaultdict
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+ from typing import Dict, List, Sequence, Set, Tuple
24
+
25
+ import numpy as np
26
+
27
+ REPO = Path(__file__).resolve().parents[1]
28
+ if str(REPO) not in sys.path:
29
+ sys.path.insert(0, str(REPO))
30
+
31
+ from psq_rag.retrieval.state import get_tfidf_tag_vectors
32
+
33
+
34
# Input artifacts, resolved relative to the repository root.
REGISTRY_CSV = REPO.joinpath("data", "analysis", "category_registry.csv")
WIKI_PAGES_CSV = REPO.joinpath("wiki_pages-2023-08-08.csv")
# Default destination of the JSON report (can be overridden with --out-json).
OUT_JSON = REPO.joinpath("data", "analysis", "hybrid_category_assignment_preview.json")

# Registry rows whose category_status is one of these never seed a category.
SKIP_STATUSES = {"excluded", "review_bucket", "special_exclusion"}

# Wiki link markup: [[target|label]] and [[target]]; group 1 is the link target.
LINK_PIPE_RE = re.compile(r"\[\[([^\]|]+)\|[^\]]+\]\]")
LINK_PLAIN_RE = re.compile(r"\[\[([^\]|]+)\]\]")
42
+
43
+
44
@dataclass
class TagScoreRow:
    """Scoring outcome for a single uncategorized tag."""

    # Normalized tag name.
    tag: str
    # Usage count looked up for the tag (0 when unknown).
    count: int
    # Which evidence was available: "both", "tfidf_only", "wiki_only" or "none".
    signal: str
    # Decision bucket: "single", "multi" or "hold".
    assignment: str
    # Categories chosen for the tag; empty when the tag is held.
    assigned_categories: List[str]
    # Best (category, probability) pairs, highest first, for each score source.
    top_fused: List[Tuple[str, float]]
    top_tfidf: List[Tuple[str, float]]
    top_wiki: List[Tuple[str, float]]
    # Total category votes contributed by the tag's wiki links.
    wiki_vote_count: int
    # Number of distinct links extracted from the tag's wiki page.
    wiki_link_count: int
56
+
57
+
58
def _normalize_tag(tok: str) -> str:
    """Return *tok* trimmed, lowercased and with spaces turned into underscores.

    Falsy input (``None`` or ``""``) yields the empty string.
    """
    if not tok:
        return ""
    trimmed = tok.strip()
    lowered = trimmed.lower()
    return lowered.replace(" ", "_")
60
+
61
+
62
def _is_enabled(v: str) -> bool:
    """Tell whether a registry flag value reads as "on".

    The value is stringified, trimmed and lowercased; only ``1``, ``true``
    and ``yes`` count as enabled.
    """
    text = str(v).strip().lower()
    return text == "1" or text == "true" or text == "yes"
64
+
65
+
66
def _softmax(values: np.ndarray, temperature: float) -> np.ndarray:
    """Return a temperature-scaled softmax of *values* as a float32 array.

    Args:
        values: Raw scores to convert into probabilities.
        temperature: Softmax temperature; clamped to at least ``1e-6`` so the
            division below can never blow up.

    Returns:
        A float32 array with the same shape as *values*. It sums to 1 for
        finite input. It is all zeros when the input is empty or when the
        scores are not finite (NaN/inf), so that downstream thresholding sees
        "no signal" instead of NaN probabilities.
    """
    if values.size == 0:
        # Keep the return dtype consistent with the non-empty path.
        return np.zeros_like(values, dtype=np.float32)
    t = max(1e-6, float(temperature))
    # Subtracting the maximum keeps every exponent <= 0 (no overflow).
    shifted = (values - float(np.max(values))) / t
    expv = np.exp(shifted)
    denom = float(np.sum(expv))
    # For finite input denom >= 1 (the max element contributes exp(0)), so the
    # only way to land here is a NaN/inf score poisoning the sum.
    if not np.isfinite(denom) or denom <= 0.0:
        return np.zeros_like(values, dtype=np.float32)
    return (expv / denom).astype(np.float32)
76
+
77
+
78
def _topk_with_names(names: Sequence[str], arr: np.ndarray, k: int) -> List[Tuple[str, float]]:
    """Return the *k* highest-scoring ``(name, score)`` pairs, best first.

    Args:
        names: Labels aligned index-for-index with *arr*.
        arr: One score per label.
        k: Number of pairs wanted; values below 1 are treated as 1.

    Returns:
        Up to ``k`` pairs sorted by descending score, or an empty list when
        *arr* is empty. Equal scores keep the order of *names*, so the result
        is reproducible (a reversed ascending argsort would order ties
        arbitrarily). NaN scores sort last.
    """
    if arr.size == 0:
        return []
    limit = max(1, int(k))
    # A stable sort on the negated scores gives descending order with ties
    # resolved by original position.
    order = np.argsort(-arr, kind="stable")[:limit]
    return [(names[int(i)], float(arr[int(i)])) for i in order]
83
+
84
+
85
def load_registry(
    path: Path,
) -> Tuple[Dict[str, Set[str]], Dict[str, Set[str]], Dict[str, int], Set[str]]:
    """Read the category registry CSV and split it into seeds and review tags.

    Rows without a tag or a category name are ignored. Rows in the
    ``uncategorized_review`` category feed the uncategorized set; every other
    row counts as a category seed only when its category is enabled and its
    status is not in ``SKIP_STATUSES``.

    Args:
        path: Registry CSV with the columns ``tag``, ``category_name``,
            ``category_status``, ``category_enabled`` and
            ``tag_fluffyrock_count``.

    Returns:
        ``(active_category_to_tags, tag_to_active_categories, tag_counts,
        uncategorized_tags)``. ``tag_counts`` keeps the largest count seen
        for each tag across all of its rows.
    """
    active_category_tags: Dict[str, Set[str]] = defaultdict(set)
    tag_to_active_categories: Dict[str, Set[str]] = defaultdict(set)
    tag_counts: Dict[str, int] = {}
    uncategorized: Set[str] = set()

    with path.open("r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            tag = _normalize_tag(row.get("tag") or "")
            category = (row.get("category_name") or "").strip()
            status = (row.get("category_status") or "").strip().lower()
            enabled = _is_enabled(row.get("category_enabled") or "")
            if not tag or not category:
                continue

            # Counts may be blank, non-numeric, "nan" or "inf"; treat all of
            # those as 0 without masking unrelated errors.
            try:
                cnt = int(float(row.get("tag_fluffyrock_count") or "0"))
            except (TypeError, ValueError, OverflowError):
                cnt = 0
            if tag not in tag_counts or cnt > tag_counts[tag]:
                tag_counts[tag] = cnt

            # Review-bucket rows are the tags to be scored, never seeds.
            if category == "uncategorized_review":
                uncategorized.add(tag)
                continue

            if status in SKIP_STATUSES or not enabled:
                continue

            active_category_tags[category].add(tag)
            tag_to_active_categories[tag].add(category)

    return active_category_tags, tag_to_active_categories, tag_counts, uncategorized
122
+
123
+
124
def build_centroids(
    active_category_tags: Dict[str, Set[str]],
    tag_to_row: Dict[str, int],
    vectors_norm: np.ndarray,
) -> Tuple[List[str], np.ndarray, Dict[str, int]]:
    """Compute one unit-length centroid per category from its seed tag vectors.

    Categories are visited in sorted name order. A category is dropped when
    fewer than two of its seed tags have a vector, or when the mean vector has
    (near-)zero length.

    Returns:
        ``(category_names, centroid_matrix, seed_sizes)`` where row ``i`` of
        the float32 matrix belongs to ``category_names[i]`` and ``seed_sizes``
        maps each kept category to the number of vectors averaged. When no
        category qualifies the result is ``([], zeros((0, 0)), {})``.
    """
    kept_names: List[str] = []
    unit_centroids: List[np.ndarray] = []
    seed_sizes: Dict[str, int] = {}

    for name in sorted(active_category_tags):
        # Only seed tags that actually have a vector can contribute.
        rows = [tag_to_row[tag] for tag in active_category_tags[name] if tag in tag_to_row]
        if len(rows) < 2:
            continue
        mean_vec = vectors_norm[rows].mean(axis=0)
        length = float(np.linalg.norm(mean_vec))
        if length <= 1e-12:
            continue
        kept_names.append(name)
        unit_centroids.append((mean_vec / length).astype(np.float32))
        seed_sizes[name] = len(rows)

    if kept_names:
        return kept_names, np.vstack(unit_centroids).astype(np.float32), seed_sizes
    return [], np.zeros((0, 0), dtype=np.float32), {}
150
+
151
+
152
def _extract_links_from_body(body: str) -> List[str]:
    """Collect the distinct tag targets linked from a wiki page body.

    Piped links (``[[target|label]]``) are scanned first, then plain links
    (``[[target]]``). Targets are normalized with ``_normalize_tag``; empty
    targets and targets that start with a URL scheme, a ``help:`` / ``e621:`` /
    ``tag_group:`` namespace or ``#`` are skipped. Each tag is returned once,
    in first-seen order.
    """
    ignored_prefixes = ("http://", "https://", "help:", "e621:", "tag_group:", "#")
    # A dict keeps insertion order, giving ordered de-duplication for free.
    ordered: Dict[str, None] = {}
    for pattern in (LINK_PIPE_RE, LINK_PLAIN_RE):
        for raw_target in pattern.findall(body):
            candidate = _normalize_tag(raw_target)
            if candidate and not candidate.startswith(ignored_prefixes):
                ordered.setdefault(candidate, None)
    return list(ordered)
173
+
174
+
175
def build_wiki_votes(
    wiki_csv: Path,
    uncategorized_tags: Set[str],
    tag_to_active_categories: Dict[str, Set[str]],
) -> Tuple[Dict[str, Counter], Set[str], Dict[str, int]]:
    """Tally category votes for uncategorized tags from their wiki page links.

    Only wiki rows whose normalized ``title`` is an uncategorized tag are
    considered. Every link on such a page that points at a tag with active
    categories casts one vote for each of those categories.

    Returns:
        ``(wiki_votes_by_tag, tags_with_wiki_page, wiki_link_count_by_tag)``.
        Tags whose page produced no votes are absent from the first mapping.
    """
    votes_by_tag: Dict[str, Counter] = {}
    pages_seen: Set[str] = set()
    links_per_tag: Dict[str, int] = {}

    with wiki_csv.open("r", encoding="utf-8", newline="") as handle:
        for record in csv.DictReader(handle):
            page = _normalize_tag(record.get("title") or "")
            if page not in uncategorized_tags:
                continue
            pages_seen.add(page)

            outgoing = _extract_links_from_body(record.get("body") or "")
            links_per_tag[page] = len(outgoing)

            tally: Counter = Counter()
            for target in outgoing:
                # Links to tags without an active category cast no vote.
                tally.update(tag_to_active_categories.get(target) or ())
            if tally:
                votes_by_tag[page] = tally

    return votes_by_tag, pages_seen, links_per_tag
207
+
208
+
209
def score_tags(
    *,
    uncategorized_tags: Set[str],
    tag_counts: Dict[str, int],
    categories: List[str],
    centroid_matrix: np.ndarray,
    tag_to_row: Dict[str, int],
    vectors_norm: np.ndarray,
    wiki_votes: Dict[str, Counter],
    wiki_link_counts: Dict[str, int],
    tfidf_weight: float,
    wiki_weight: float,
    tfidf_temp: float,
    single_top1_min: float,
    single_margin_min: float,
    single_top2_max: float,
    multi_top1_min: float,
    multi_top2_min: float,
    multi_pair_min: float,
) -> List[TagScoreRow]:
    """Score every uncategorized tag against the categories and bucket it.

    For each tag (in sorted order) two optional distributions over
    ``categories`` are built: a softmax over centroid similarities (TF-IDF)
    and the normalized wiki vote counts. When both exist they are combined
    with the given non-negative weights and renormalized; otherwise whichever
    exists is used as is. The two best fused probabilities then decide the
    bucket: ``"multi"`` (top two categories), ``"single"`` (top category) or
    ``"hold"``. The multi rule is checked before the single rule.

    Returns:
        One ``TagScoreRow`` per tag, in sorted tag order.
    """
    results: List[TagScoreRow] = []
    index_of = {name: pos for pos, name in enumerate(categories)}
    n_categories = len(categories)
    # Negative weights are clamped to zero.
    w_tfidf = max(0.0, float(tfidf_weight))
    w_wiki = max(0.0, float(wiki_weight))

    for tag in sorted(uncategorized_tags):
        # --- TF-IDF signal: softmax over similarities to every centroid.
        tfidf_probs = None
        vec_row = tag_to_row.get(tag)
        if vec_row is not None and centroid_matrix.size > 0:
            similarities = centroid_matrix @ vectors_norm[int(vec_row)]
            tfidf_probs = _softmax(similarities.astype(np.float32), temperature=tfidf_temp)

        # --- Wiki signal: vote shares over the categories that have a centroid.
        wiki_probs = None
        vote_total = 0
        tag_votes = wiki_votes.get(tag)
        if tag_votes:
            vote_total = int(sum(tag_votes.values()))
            tallies = np.zeros(n_categories, dtype=np.float32)
            for cat_name, n_votes in tag_votes.items():
                pos = index_of.get(cat_name)
                if pos is not None:
                    tallies[pos] += float(n_votes)
            mass = float(np.sum(tallies))
            if mass > 0.0:
                tallies /= mass
                wiki_probs = tallies

        # --- Fusion of whatever signals are available.
        have_tfidf = tfidf_probs is not None
        have_wiki = wiki_probs is not None
        if have_tfidf and have_wiki:
            signal = "both"
            fused = w_tfidf * tfidf_probs + w_wiki * wiki_probs
            total = float(np.sum(fused))
            if total > 0.0:
                fused /= total
        elif have_tfidf:
            signal = "tfidf_only"
            fused = tfidf_probs
        elif have_wiki:
            signal = "wiki_only"
            fused = wiki_probs
        else:
            signal = "none"
            fused = np.zeros(n_categories, dtype=np.float32)

        top_fused = _topk_with_names(categories, fused, 3)
        top_tfidf = _topk_with_names(categories, tfidf_probs, 3) if have_tfidf else []
        top_wiki = _topk_with_names(categories, wiki_probs, 3) if have_wiki else []

        # --- Decision: hold unless one of the threshold rules fires.
        assignment = "hold"
        assigned: List[str] = []
        if top_fused:
            best_cat, best_p = top_fused[0]
            second_cat, second_p = top_fused[1] if len(top_fused) > 1 else ("", 0.0)
            strong_pair = (
                best_p >= multi_top1_min
                and second_p >= multi_top2_min
                and (best_p + second_p) >= multi_pair_min
            )
            clear_winner = best_p >= single_top1_min and (
                (best_p - second_p) >= single_margin_min or second_p <= single_top2_max
            )
            if strong_pair:
                assignment = "multi"
                assigned = [best_cat, second_cat]
            elif clear_winner:
                assignment = "single"
                assigned = [best_cat]

        results.append(
            TagScoreRow(
                tag=tag,
                count=int(tag_counts.get(tag, 0)),
                signal=signal,
                assignment=assignment,
                assigned_categories=assigned,
                top_fused=top_fused,
                top_tfidf=top_tfidf,
                top_wiki=top_wiki,
                wiki_vote_count=vote_total,
                wiki_link_count=int(wiki_link_counts.get(tag, 0)),
            )
        )
    return results
311
+
312
+
313
def summarize_rows(
    rows: List[TagScoreRow],
    *,
    n_uncat_total: int,
    n_has_tfidf: int,
    n_wiki_page: int,
    n_wiki_votes: int,
    sample_size: int,
    seed: int,
) -> Dict[str, object]:
    """Condense scored rows into report counts, category tallies and samples.

    Args:
        rows: Scored tags.
        n_uncat_total, n_has_tfidf, n_wiki_page, n_wiki_votes: Pre-computed
            totals copied into the ``counts`` section.
        sample_size: Maximum number of random examples per assignment bucket.
        seed: Seed for the sampler, so examples are reproducible.

    Returns:
        A dict with the keys ``counts``, ``top_single_categories``,
        ``top_multi_category_pairs`` and ``samples``.
    """
    by_assignment = Counter(row.assignment for row in rows)
    by_signal = Counter(row.signal for row in rows)

    held = int(by_assignment.get("hold", 0))
    categorized = int(by_assignment.get("single", 0) + by_assignment.get("multi", 0))
    multi_additions = int(
        sum(len(row.assigned_categories) for row in rows if row.assignment == "multi")
    )

    singles = Counter(
        row.assigned_categories[0]
        for row in rows
        if row.assignment == "single" and row.assigned_categories
    )
    # Pairs are sorted so (a, b) and (b, a) land in the same bucket.
    pairs = Counter(
        tuple(sorted(row.assigned_categories[:2]))
        for row in rows
        if row.assignment == "multi" and len(row.assigned_categories) >= 2
    )

    rng = random.Random(int(seed))

    def _rounded(scored: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
        return [(name, round(prob, 4)) for name, prob in scored]

    def sample_assignment(kind: str) -> List[Dict[str, object]]:
        """Draw up to ``sample_size`` rows of one bucket, most frequent tag first."""
        pool = [row for row in rows if row.assignment == kind]
        if not pool:
            return []
        chosen = rng.sample(pool, min(int(sample_size), len(pool)))
        chosen.sort(key=lambda row: (-row.count, row.tag))
        return [
            {
                "tag": row.tag,
                "count": row.count,
                "signal": row.signal,
                "assigned_categories": row.assigned_categories,
                "top_fused": _rounded(row.top_fused),
                "top_tfidf": _rounded(row.top_tfidf),
                "top_wiki": _rounded(row.top_wiki),
                "wiki_vote_count": row.wiki_vote_count,
                "wiki_link_count": row.wiki_link_count,
            }
            for row in chosen
        ]

    counts = {
        "uncategorized_total": int(n_uncat_total),
        "scored_rows": int(len(rows)),
        "has_tfidf_vector": int(n_has_tfidf),
        "has_wiki_page": int(n_wiki_page),
        "has_wiki_category_votes": int(n_wiki_votes),
        "signals": dict(by_signal),
        "assignments": dict(by_assignment),
        "newly_categorized": categorized,
        "remaining_uncategorized": held,
        "multi_category_additions": multi_additions,
    }
    top_pairs = [
        {"categories": list(pair), "count": int(cnt)} for pair, cnt in pairs.most_common(20)
    ]
    # The buckets share one RNG, so they must be sampled in this fixed order.
    samples = {kind: sample_assignment(kind) for kind in ("single", "multi", "hold")}

    return {
        "counts": counts,
        "top_single_categories": singles.most_common(20),
        "top_multi_category_pairs": top_pairs,
        "samples": samples,
    }
384
+
385
+
386
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the analysis run.

    Returns:
        Namespace holding the fusion weights, softmax temperature, the single-
        and multi-assignment thresholds, the sampling options and the output
        path.
    """
    parser = argparse.ArgumentParser(
        description="Analyze hybrid TF-IDF + wiki category assignment for uncategorized tags."
    )

    # (flag, default, help) for every float-valued option, in display order.
    float_options = (
        ("--tfidf-weight", 0.6, "Weight for TF-IDF centroid probabilities."),
        ("--wiki-weight", 0.4, "Weight for wiki-link graph probabilities."),
        ("--tfidf-temp", 0.08, "Softmax temperature for TF-IDF similarities."),
        ("--single-top1-min", 0.55, "Single-assign threshold: top1 probability min."),
        ("--single-margin-min", 0.18, "Single-assign threshold: top1-top2 margin min."),
        ("--single-top2-max", 0.35, "Single-assign threshold: top2 probability max."),
        ("--multi-top1-min", 0.42, "Multi-assign threshold: top1 probability min."),
        ("--multi-top2-min", 0.30, "Multi-assign threshold: top2 probability min."),
        ("--multi-pair-min", 0.78, "Multi-assign threshold: (top1+top2) min."),
    )
    for flag, default, help_text in float_options:
        parser.add_argument(flag, type=float, default=default, help=help_text)

    parser.add_argument("--sample-size", type=int, default=20, help="Random examples per assignment bucket.")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for examples.")
    parser.add_argument(
        "--out-json",
        type=Path,
        default=OUT_JSON,
        help="Output JSON report (overwritten each run).",
    )
    return parser.parse_args()
409
+
410
+
411
def main() -> None:
    """Run the offline analysis end to end and write the JSON report.

    Loads the registry and the TF-IDF tag vectors, builds category centroids,
    gathers wiki votes, scores every uncategorized tag, writes the report to
    ``--out-json`` and prints a short summary.

    Raises:
        RuntimeError: If no category yields a centroid.
    """
    args = parse_args()

    active_category_tags, tag_to_active_categories, tag_counts, uncategorized = load_registry(REGISTRY_CSV)
    tfidf = get_tfidf_tag_vectors()
    vectors_norm = tfidf["reduced_matrix_norm"]
    tag_to_row = tfidf["tag_to_row_index"]

    categories, centroid_matrix, seed_sizes = build_centroids(active_category_tags, tag_to_row, vectors_norm)
    if not categories:
        raise RuntimeError("No centroids available from active categories. Check category registry content.")

    wiki_votes, has_wiki_page, wiki_link_counts = build_wiki_votes(
        WIKI_PAGES_CSV,
        uncategorized_tags=uncategorized,
        tag_to_active_categories=tag_to_active_categories,
    )

    # Scoring knobs are forwarded to score_tags and echoed into the report
    # under the same names, so collect them once.
    scoring_options = {
        name: getattr(args, name)
        for name in (
            "tfidf_weight",
            "wiki_weight",
            "tfidf_temp",
            "single_top1_min",
            "single_margin_min",
            "single_top2_max",
            "multi_top1_min",
            "multi_top2_min",
            "multi_pair_min",
        )
    }

    rows = score_tags(
        uncategorized_tags=uncategorized,
        tag_counts=tag_counts,
        categories=categories,
        centroid_matrix=centroid_matrix,
        tag_to_row=tag_to_row,
        vectors_norm=vectors_norm,
        wiki_votes=wiki_votes,
        wiki_link_counts=wiki_link_counts,
        **scoring_options,
    )

    summary = summarize_rows(
        rows,
        n_uncat_total=len(uncategorized),
        n_has_tfidf=sum(1 for t in uncategorized if t in tag_to_row),
        n_wiki_page=len(has_wiki_page),
        n_wiki_votes=len(wiki_votes),
        sample_size=args.sample_size,
        seed=args.seed,
    )

    report = {
        "config": {
            **scoring_options,
            "sample_size": args.sample_size,
            "seed": args.seed,
        },
        "inputs": {
            "registry_csv": str(REGISTRY_CSV),
            "wiki_pages_csv": str(WIKI_PAGES_CSV),
            "uncategorized_tags": len(uncategorized),
            "active_categories_for_centroids": len(categories),
            "centroid_seed_sizes": seed_sizes,
        },
        "summary": summary,
    }

    out_path = args.out_json
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")

    counts = summary["counts"]
    print("Hybrid category assignment analysis complete")
    print(f"Active categories (centroids): {len(categories)}")
    print(
        f"Signals: tfidf={counts['has_tfidf_vector']} "
        f"wiki_page={counts['has_wiki_page']} "
        f"wiki_votes={counts['has_wiki_category_votes']}"
    )
    print(f"Assignments: {counts['assignments']}")
    print(f"Remaining uncategorized: {counts['remaining_uncategorized']}")
    print(f"Wrote: {out_path}")


if __name__ == "__main__":
    main()