Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

App Files Files Community

Food Desert committed on Mar 7

Commit

33fc1b0

1 Parent(s): a48a025

Refine tag toggle UI ordering/colors and add category assignment analysis artifacts

Browse files

Files changed (5) hide show

app.py +1292 -750
data/analysis/category_registry.csv +0 -0
data/analysis/hybrid_category_assignment_preview.json +2753 -0
data/runtime_metrics/ui_pipeline_timings.jsonl +13 -0
scripts/analyze_hybrid_category_assignment.py +502 -0

app.py CHANGED Viewed

@@ -1,144 +1,409 @@
-import gradio as gr
-import os
-import logging
-import time
-import json
-from datetime import datetime
-from PIL import Image
-from pathlib import Path
-from typing import Any, Dict, List, Set
-from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
-from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
-from psq_rag.llm.rewrite import llm_rewrite_prompt
-from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
-from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags, llm_infer_probe_tags
-from psq_rag.retrieval.state import expand_tags_via_implications
-from psq_rag.ui.group_ranked_display import rank_groups_from_tfidf, _load_enabled_groups
-def _split_prompt_commas(s: str) -> List[str]:
-    return [p.strip() for p in (s or "").split(",") if p.strip()]
-def _norm_for_dedupe(tag: str) -> str:
-    # your canonical form for lookup/dedupe
-    return _norm_tag_for_lookup(tag.lower())
-def compose_final_prompt(rewritten_prompt: str, selected_tags: List[str]) -> str:
-    parts = _split_prompt_commas(rewritten_prompt)
-    parts.extend(selected_tags)
-    seen = set()
-    out = []
-    for p in parts:
-        key = _norm_for_dedupe(p)
-        if key in seen:
-            continue
-        seen.add(key)
-        out.append(p)
-    return ", ".join(out)
 def _display_tag_text(tag: str) -> str:
     return tag.replace("_", " ")
-def _escape_prompt_tag(tag: str) -> str:
-    return (
-        tag.replace("_", " ")
-        .replace("(", "\\(")
-        .replace(")", "\\)")
     )
-def _ordered_selected_for_prompt(selected: Set[str], row_defs: List[Dict[str, Any]]) -> List[str]:
-    out: List[str] = []
-    seen: Set[str] = set()
-    for row in row_defs:
-        for tag in row.get("tags", []):
-            if tag in selected and tag not in seen:
-                out.append(tag)
-                seen.add(tag)
-    # Fallback for any selected tags not present in current rows.
-    for tag in sorted(selected):
-        if tag not in seen:
-            out.append(tag)
-            seen.add(tag)
-    return out
-def _compose_toggle_prompt_text(selected_tags: List[str], row_defs: List[Dict[str, Any]]) -> str:
-    selected = {t for t in (selected_tags or []) if t}
-    ordered = _ordered_selected_for_prompt(selected, row_defs or [])
-    return ", ".join(_escape_prompt_tag(t) for t in ordered)
 def _build_toggle_rows(
     *,
     seed_terms: List[str],
-    llm_selected_tags: List[str],
     top_groups: int,
     top_tags_per_group: int,
     group_rank_top_k: int,
 ) -> List[Dict[str, Any]]:
-    ranked_rows = rank_groups_from_tfidf(
-        seed_terms=seed_terms,
-        top_groups=max(1, int(top_groups)),
-        top_tags_per_group=max(1, int(top_tags_per_group)),
-        group_rank_top_k=max(1, int(group_rank_top_k)),
-    )
     groups_map = _load_enabled_groups()
-    llm_selected = list(dict.fromkeys(_norm_tag_for_lookup(t) for t in llm_selected_tags if t))
-    row_defs: List[Dict[str, Any]] = []
-    displayed_group_names = [r.group_name for r in ranked_rows]
-    displayed_group_tag_sets: Dict[str, Set[str]] = {
-        name: set(groups_map.get(name, [])) for name in displayed_group_names
     }
-    tags_in_any_displayed_group: Set[str] = set()
-    for tag_set in displayed_group_tag_sets.values():
-        tags_in_any_displayed_group.update(tag_set)
-    llm_other = [t for t in llm_selected if t not in tags_in_any_displayed_group]
     row_defs.append(
         {
-            "name": "llm_selected_other",
-            "label": "LLM Selected (Other)",
-            "tags": llm_other,
         }
     )
     for row in ranked_rows:
         group_name = row.group_name
         group_tag_set = displayed_group_tag_sets.get(group_name, set())
-        selected_in_group = [t for t in llm_selected if t in group_tag_set]
-        ranked_tags = [t for t, _ in row.tags]
         merged = selected_in_group + [t for t in ranked_tags if t not in selected_in_group]
         keep_n = max(max(1, int(top_tags_per_group)), len(selected_in_group))
         merged = merged[:keep_n]
         row_defs.append(
             {
                 "name": group_name,
                 "label": f"{group_name} (E={row.expected_count:.2f})",
                 "tags": merged,
             }
         )
-    return row_defs
-def _build_row_component_updates(
-    row_defs: List[Dict[str, Any]],
-    selected_tags: List[str],
-    max_rows: int,
-):
-    selected = {t for t in (selected_tags or []) if t}
-    row_values_state: List[List[str]] = []
-    header_updates = []
-    checkbox_updates = []
-    for idx in range(max_rows):
         if idx < len(row_defs):
             row = row_defs[idx]
             tags = list(dict.fromkeys(row.get("tags", [])))
@@ -146,716 +411,993 @@ def _build_row_component_updates(
             row_values_state.append(values)
             visible = bool(tags)
             header_updates.append(gr.update(value=f"**{row.get('label', '')}**", visible=visible))
-            choices = [(_display_tag_text(t), t) for t in tags]
             checkbox_updates.append(
                 gr.update(
                     choices=choices,
                     value=values,
                     visible=visible,
-                )
-            )
-        else:
-            header_updates.append(gr.update(value="", visible=False))
-            checkbox_updates.append(gr.update(choices=[], value=[], visible=False))
-    prompt_text = _compose_toggle_prompt_text(list(selected), row_defs)
-    return prompt_text, row_values_state, header_updates, checkbox_updates
 def _on_toggle_row(
     row_idx: int,
     changed_values: List[str],
     selected_tags_state: List[str],
-    row_defs_state: List[Dict[str, Any]],
-    row_values_state: List[List[str]],
-    max_rows: int,
 ):
     row_defs = row_defs_state or []
     selected = set(selected_tags_state or [])
-    prev_values = list(row_values_state or [])
-    while len(prev_values) < len(row_defs):
-        prev_values.append([])
-    prev_set = set(prev_values[row_idx]) if row_idx < len(prev_values) else set()
-    new_set = set(changed_values or [])
-    selected.update(new_set - prev_set)
-    selected.difference_update(prev_set - new_set)
-    prompt_text, new_row_values_state, _header_updates, checkbox_updates = _build_row_component_updates(
-        row_defs=row_defs,
-        selected_tags=list(selected),
-        max_rows=max_rows,
-    )
-    return [sorted(selected), new_row_values_state, prompt_text, *checkbox_updates]
-def _build_ui_payload(
-    *,
-    console_text: str,
-    legacy_prompt_text: str,
-    row_defs: List[Dict[str, Any]],
-    selected_tags: List[str],
-):
-    prompt_text, row_values_state, header_updates, checkbox_updates = _build_row_component_updates(
-        row_defs=row_defs,
-        selected_tags=selected_tags,
-        max_rows=display_max_rows_default,
-    )
-    return [
-        console_text,
-        legacy_prompt_text,
-        prompt_text,
-        sorted(set(selected_tags or [])),
-        row_defs,
-        row_values_state,
-        *header_updates,
-        *checkbox_updates,
-    ]
-def _build_selection_query(
-    prompt_in: str,
-    rewritten: str,
-    structural_tags: List[str],
-    probe_tags: List[str],
-) -> str:
-    lines = [f"IMAGE DESCRIPTION: {prompt_in.strip()}"]
-    if rewritten and rewritten.strip():
-        lines.append(f"REWRITE PHRASES: {rewritten.strip()}")
-    hint_tags = []
-    if structural_tags:
-        hint_tags.extend(structural_tags)
-    if probe_tags:
-        hint_tags.extend(probe_tags)
-    if hint_tags:
-        # Keep hints as context only; selection still must choose by candidate indices.
-        lines.append(
-            "INFERRED TAG HINTS (context only): " + ", ".join(sorted(set(hint_tags)))
-        )
-    return "\n".join(lines)
-# Set up logging
-# Minimal prod logging: warnings+ to stderr, no file by default
-import os, logging
-LOG_LEVEL = os.environ.get("PSQ_LOG_LEVEL", "WARNING").upper()
-logging.basicConfig(
-    level=getattr(logging, LOG_LEVEL, logging.WARNING),
-    format="%(asctime)s %(levelname)s:%(message)s",
-    handlers=[logging.StreamHandler()]  # no file -> avoids huge logs on Spaces
-)
-# Quiet down common noisy libs (optional)
-for _name in ("gensim", "gradio", "hnswlib", "httpx", "uvicorn"):
-    logging.getLogger(_name).setLevel(logging.ERROR)
-# Turn off Gradio analytics phone-home to avoid those background thread errors (optional)
-os.environ["GRADIO_ANALYTICS_ENABLED"] = "0"
-MASCOT_DIR = Path(__file__).parent / "mascotimages"
-MASCOT_FILE = MASCOT_DIR / "transparentsquirrel.png"
-def _load_mascot_image():
-    """Load mascot image if available; return None when missing/unreadable."""
-    if not MASCOT_FILE.exists():
-        logging.warning("Mascot image missing: %s", MASCOT_FILE)
-        return None
-    try:
-        return Image.open(MASCOT_FILE).convert("RGBA")
-    except Exception as e:
-        logging.warning("Failed to load mascot image (%s): %s", MASCOT_FILE, e)
-        return None
-try:
-    from gradio_client import utils as _gc_utils
-    _orig_get_type = _gc_utils.get_type
-    _orig_j2p = _gc_utils._json_schema_to_python_type
-    _orig_pub = _gc_utils.json_schema_to_python_type
-    def _get_type_safe(schema):
-        # Sometimes schema is a bare True/False (JSON Schema boolean form)
-        if not isinstance(schema, dict):
-            return "any"
-        return _orig_get_type(schema)
-    def _j2p_safe(schema, defs=None):
-        # Accept non-dict schemas (True/False/None) and treat as "any"
-        if not isinstance(schema, dict):
-            return "any"
-        return _orig_j2p(schema, defs or schema.get("$defs"))
-    def _pub_safe(schema):
-        # Public wrapper used by Gradio; keep it resilient too
-        if not isinstance(schema, dict):
-            return "any"
-        return _j2p_safe(schema, schema.get("$defs"))
-    _gc_utils.get_type = _get_type_safe
-    _gc_utils._json_schema_to_python_type = _j2p_safe
-    _gc_utils.json_schema_to_python_type = _pub_safe
-except Exception as e:
-    print("gradio_client hotfix not applied:", e)
-# -------------------------------------------------------------------------------
-allow_nsfw_tags = False
-def _is_production_runtime() -> bool:
-    """Best-effort detection for deployed runtime (HF Spaces or explicit env)."""
-    if os.environ.get("PSQ_PRODUCTION", "").strip().lower() in {"1", "true", "yes"}:
-        return True
-    if os.environ.get("SPACE_ID"):
-        return True
-    if os.environ.get("HF_SPACE_ID"):
-        return True
-    if os.environ.get("SYSTEM") == "spaces":
-        return True
-    return False
-verbose_retrieval_default = "0" if _is_production_runtime() else "1"
-verbose_retrieval = os.environ.get("PSQ_VERBOSE_RETRIEVAL", verbose_retrieval_default).strip().lower() in {"1", "true", "yes"}
-verbose_retrieval_all = False
-verbose_retrieval_limit = 20
-enable_probe_tags = os.environ.get("PSQ_ENABLE_PROBE", "1").strip() not in {"0", "false", "False"}
-display_top_groups_default = int(os.environ.get("PSQ_DISPLAY_TOP_GROUPS", "10"))
-display_top_tags_per_group_default = int(os.environ.get("PSQ_DISPLAY_TOP_TAGS_PER_GROUP", "5"))
-display_rank_top_k_default = int(os.environ.get("PSQ_DISPLAY_GROUP_RANK_TOP_K", "5"))
-display_max_rows_default = int(os.environ.get("PSQ_DISPLAY_MAX_ROWS", "14"))
-retrieval_global_k = int(os.environ.get("PSQ_RETRIEVAL_GLOBAL_K", "300"))
-retrieval_per_phrase_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_K", "10"))
-retrieval_per_phrase_final_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_FINAL_K", "1"))
-selection_mode = os.environ.get("PSQ_SELECTION_MODE", "chunked_map_union").strip()
-selection_chunk_size = int(os.environ.get("PSQ_SELECTION_CHUNK_SIZE", "60"))
-selection_per_phrase_k = int(os.environ.get("PSQ_SELECTION_PER_PHRASE_K", "2"))
-selection_candidate_cap = int(os.environ.get("PSQ_SELECTION_CANDIDATE_CAP", "0"))
-stage1_rewrite_timeout_s = float(os.environ.get("PSQ_TIMEOUT_REWRITE_S", "45"))
-stage1_struct_timeout_s = float(os.environ.get("PSQ_TIMEOUT_STRUCT_S", "45"))
-stage1_probe_timeout_s = float(os.environ.get("PSQ_TIMEOUT_PROBE_S", "45"))
-stage3_select_timeout_s = float(os.environ.get("PSQ_TIMEOUT_SELECT_S", "45"))
-timing_log_path = Path(os.environ.get("PSQ_TIMING_LOG_PATH", "data/runtime_metrics/ui_pipeline_timings.jsonl"))
-css = """
-.scrollable-content{
-  max-height: 420px;
-  overflow-y: scroll;          /* always show scrollbar */
-  overflow-x: hidden;
-  padding-right: 8px;
-  padding-bottom: 14px;   /* <— add this */
-  scrollbar-gutter: stable;    /* prevent layout shift as it fills */
-  /* Firefox */
-  scrollbar-width: auto;
-  scrollbar-color: rgba(180,180,180,.9) rgba(0,0,0,.15);
 }
-/* WebKit/Chromium (Chrome/Edge/Safari) */
-.scrollable-content::-webkit-scrollbar{ width: 10px; }
-.scrollable-content::-webkit-scrollbar-thumb{ background: rgba(180,180,180,.9); border-radius: 8px; }
-.scrollable-content::-webkit-scrollbar-track{ background: rgba(0,0,0,.15); }
-/* (Optional) make both scroll panes taller so they fill more of the column */
-.pane-left  .scrollable-content,
-.pane-right .scrollable-content {
-  max-height: 610px;                /* was 420px; tweak to taste */
 }
-.lego-tags .gr-checkboxgroup {
   display: flex;
   flex-wrap: wrap;
   gap: 8px;
 }
-.lego-tags .gr-checkboxgroup label {
-  margin: 0;
-  padding: 0;
 }
-.lego-tags .gr-checkboxgroup input[type="checkbox"] {
-  display: none;
 }
-.lego-tags .gr-checkboxgroup span {
-  display: inline-block;
-  padding: 7px 12px;
-  border: 1px solid #8a8a8a;
-  border-radius: 10px;
-  background: #f4f4f4;
-  color: #222;
-  font-size: 0.95rem;
-  line-height: 1.2;
-  cursor: pointer;
-  user-select: none;
-  box-shadow: 0 1px 0 rgba(0,0,0,0.12), inset 0 1px 0 rgba(255,255,255,0.7);
-}
-.lego-tags .gr-checkboxgroup input[type="checkbox"]:checked + span {
-  background: #ffd86a;
-  border-color: #c49a00;
-  box-shadow: 0 2px 0 #a98000, inset 0 1px 0 rgba(255,255,255,0.65);
-  transform: translateY(1px);
 }
 """
 def rag_pipeline_ui(
-    user_prompt: str,
-    display_top_groups: float,
-    display_top_tags_per_group: float,
-    display_rank_top_k: float,
-):
-    logs = []
-    def log(s): logs.append(s)
-    try:
-        stage_timings = {}
-        def _record_timing(stage: str, dt_s: float):
-            stage_timings[stage] = float(dt_s)
-        def _emit_timing_summary(total_s: float):
-            summary_order = [
-                "preprocess",
-                "rewrite",
-                "structural",
-                "probe",
-                "retrieval",
-                "selection",
-                "implication_expansion",
-                "prompt_composition",
-                "group_display",
-            ]
-            lines = []
-            for k in summary_order:
-                if k in stage_timings:
-                    lines.append(f"{k}={stage_timings[k]:.2f}s")
-            slowest = max(stage_timings.items(), key=lambda kv: kv[1])[0] if stage_timings else "n/a"
-            log("Timing Summary: " + ", ".join(lines))
-            log(f"Timing Slowest Stage: {slowest}")
-            log(f"Timing Total: {total_s:.2f}s")
-        def _append_timing_jsonl(total_s: float):
-            try:
-                timing_log_path.parent.mkdir(parents=True, exist_ok=True)
-                rec = {
-                    "timestamp_utc": datetime.utcnow().isoformat(timespec="seconds") + "Z",
-                    "stages_s": stage_timings,
-                    "total_s": float(total_s),
-                    "config": {
-                        "timeout_rewrite_s": stage1_rewrite_timeout_s,
-                        "timeout_struct_s": stage1_struct_timeout_s,
-                        "timeout_probe_s": stage1_probe_timeout_s,
-                        "timeout_select_s": stage3_select_timeout_s,
-                    },
-                }
-                with timing_log_path.open("a", encoding="utf-8") as f:
-                    f.write(json.dumps(rec, ensure_ascii=True) + "\n")
-                log(f"Timing Log: wrote {timing_log_path}")
-            except Exception as e:
-                log(f"Timing Log: failed ({type(e).__name__}: {e})")
-        def _future_with_timeout(fut, timeout_s: float, stage_name: str, fallback):
-            t0 = time.perf_counter()
-            try:
-                out = fut.result(timeout=max(1.0, float(timeout_s)))
-                dt = time.perf_counter() - t0
-                log(f"{stage_name}: {dt:.2f}s")
-                stage_key = {
-                    "Rewrite": "rewrite",
-                    "Structural inference": "structural",
-                    "Probe inference": "probe",
-                    "Index selection": "selection",
-                }.get(stage_name)
-                if stage_key:
-                    _record_timing(stage_key, dt)
-                return out
-            except FutureTimeoutError:
-                fut.cancel()
-                log(f"{stage_name}: timed out after {timeout_s:.0f}s; using fallback")
-                return fallback
-            except Exception as e:
-                log(f"{stage_name}: failed ({type(e).__name__}: {e}); using fallback")
-                return fallback
-        t_total0 = time.perf_counter()
-        log("Start: received prompt")
-        prompt_in = (user_prompt or "").strip()
-        if not prompt_in:
-            return _build_ui_payload(
-                console_text="Error: empty prompt",
-                legacy_prompt_text="",
-                row_defs=[],
-                selected_tags=[],
-            )
-        log("Input:")
-        log(prompt_in)
-        log("")
-        log(
-            "Runtime config: "
-            f"retrieval_global_k={retrieval_global_k} "
-            f"retrieval_per_phrase_k={retrieval_per_phrase_k} "
-            f"retrieval_per_phrase_final_k={retrieval_per_phrase_final_k} "
-            f"selection_mode={selection_mode} "
-            f"selection_chunk_size={selection_chunk_size} "
-            f"selection_per_phrase_k={selection_per_phrase_k}"
-        )
-        log("")
-        t0 = time.perf_counter()
-        user_tags = extract_user_provided_tags_upto_3_words(prompt_in)
-        dt = time.perf_counter()-t0
-        _record_timing("preprocess", dt)
-        log(f"Preprocess (user tag extraction): {dt:.2f}s")
-        log("Heuristically extracted user tags:")
-        if user_tags:
-            log(", ".join(user_tags))
-        else:
-            log("(none)")
-        log("")
-        log("Step 1: LLM rewrite + structural inference + probe (concurrent)")
-        max_workers = 3 if enable_probe_tags else 2
-        with ThreadPoolExecutor(max_workers=max_workers) as ex:
-            fut_rewrite = ex.submit(llm_rewrite_prompt, prompt_in, log)
-            fut_struct = ex.submit(llm_infer_structural_tags, prompt_in, log=log)
-            fut_probe = ex.submit(llm_infer_probe_tags, prompt_in, log=log) if enable_probe_tags else None
-            rewritten = _future_with_timeout(
-                fut_rewrite, stage1_rewrite_timeout_s, "Rewrite", prompt_in
-            )
-            structural_tags = _future_with_timeout(
-                fut_struct, stage1_struct_timeout_s, "Structural inference", []
-            )
-            probe_tags = (
-                _future_with_timeout(fut_probe, stage1_probe_timeout_s, "Probe inference", [])
-                if fut_probe else []
-            )
-        log("Rewrite:")
-        log(rewritten if rewritten else "(empty)")
-        log("")
-        rewrite_for_retrieval = rewritten
-        if user_tags:
-            # keep them separate in logs, but allow them to help retrieval
-            rewrite_for_retrieval = (rewrite_for_retrieval + ", " + ", ".join(user_tags)).strip(", ").strip()
-        log("Step 2: Prompt Squirrel retrieval (hidden)")
-        try:
-            t0 = time.perf_counter()
-            retrieval_context_tags = list(dict.fromkeys((structural_tags or []) + (probe_tags or [])))
-            rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
-            retrieval_result = psq_candidates_from_rewrite_phrases(
-                rewrite_phrases=rewrite_phrases,
-                allow_nsfw_tags=allow_nsfw_tags,
-                context_tags=retrieval_context_tags,
-                global_k=max(1, retrieval_global_k),
-                per_phrase_k=max(1, retrieval_per_phrase_k),
-                per_phrase_final_k=max(1, retrieval_per_phrase_final_k),
-                verbose=verbose_retrieval,
-            )
-            if isinstance(retrieval_result, tuple):
-                candidates, phrase_reports = retrieval_result
-            else:
-                candidates, phrase_reports = retrieval_result, []
-            if selection_candidate_cap > 0 and len(candidates) > selection_candidate_cap:
-                candidates = candidates[:selection_candidate_cap]
-                log(f"Selection candidate cap applied: {selection_candidate_cap}")
-            dt = time.perf_counter()-t0
-            _record_timing("retrieval", dt)
-            log(f"Retrieval: {dt:.2f}s")
-            log(f"Retrieved {len(candidates)} candidate tags")
-            if verbose_retrieval:
-                log(f"Total unique candidates: {len(candidates)}")
-                limit = None if verbose_retrieval_all else max(1, int(verbose_retrieval_limit))
-                for report in phrase_reports:
-                    phrase = report.get("normalized") or report.get("phrase") or ""
-                    lookup = report.get("lookup") or ""
-                    tfidf_vocab = report.get("tfidf_vocab")
-                    log(f"Phrase: {phrase} (lookup={lookup}) tfidf_vocab={tfidf_vocab}")
-                    rows = report.get("candidates", [])
-                    shown = rows if limit is None else rows[:limit]
-                    for row in shown:
-                          tag = row.get("tag")
-                          alias_token = row.get("alias_token")
-                          score_fasttext = row.get("score_fasttext")
-                          score_context = row.get("score_context")
-                          score_combined = row.get("score_combined")
-                          count = row.get("count")
-                          alias_part = ""
-                          if alias_token and alias_token != tag:
-                              alias_part = f" [alias_token={alias_token}]"
-                          fasttext_str = (
-                              f"{score_fasttext:.3f}" if isinstance(score_fasttext, (int, float)) else score_fasttext
-                          )
-                          if score_context is None:
-                              context_str = "None"
-                          else:
-                              context_str = (
-                                  f"{score_context:.3f}" if isinstance(score_context, (int, float)) else score_context
-                              )
-                          combined_str = (
-                              f"{score_combined:.3f}" if isinstance(score_combined, (int, float)) else score_combined
-                          )
-                          log(
-                              f"  {tag}{alias_part} | fasttext={fasttext_str} context={context_str} "
-                              f"combined={combined_str} count={count}"
-                          )
-                    if limit is not None and len(rows) > limit:
-                        log(f"  ... ({len(rows) - limit} more)")
-        except Exception as e:
-            log(f"Retrieval fallback: {type(e).__name__}: {e}")
-            candidates = []
-        log("Step 3: LLM index selection (uses rewrite + structural/probe context)")
-        selection_query = _build_selection_query(
-            prompt_in=prompt_in,
-            rewritten=rewritten,
-            structural_tags=structural_tags,
-            probe_tags=probe_tags,
-        )
-        with ThreadPoolExecutor(max_workers=1) as ex:
-            fut_sel = ex.submit(
-                llm_select_indices,
-                query_text=selection_query,
-                candidates=candidates,
-                max_pick=0,
-                log=log,
-                mode=selection_mode,
-                chunk_size=max(1, selection_chunk_size),
-                per_phrase_k=max(1, selection_per_phrase_k),
-            )
-            picked_indices = _future_with_timeout(
-                fut_sel, stage3_select_timeout_s, "Index selection", []
-            )
-        selected_tags = [candidates[i].tag for i in picked_indices] if picked_indices else []
-        if structural_tags:
-            # Add structural tags that aren't already selected
-            existing = {t for t in selected_tags}
-            new_structural = [t for t in structural_tags if t not in existing]
-            selected_tags.extend(new_structural)
-            log(f"  Added {len(new_structural)} structural tags: {', '.join(new_structural)}")
-        else:
-            log("  No structural tags inferred")
-        if probe_tags:
-            existing = {t for t in selected_tags}
-            new_probe = [t for t in probe_tags if t not in existing]
-            selected_tags.extend(new_probe)
-            log(f"  Added {len(new_probe)} probe tags: {', '.join(new_probe)}")
-        elif enable_probe_tags:
-            log("  No probe tags inferred")
-        llm_selected_tags = list(dict.fromkeys(selected_tags))
-        log("Step 3c: Expand via tag implications")
-        t0 = time.perf_counter()
-        tag_set = set(selected_tags)
-        expanded, implied_only = expand_tags_via_implications(tag_set)
-        dt = time.perf_counter()-t0
-        _record_timing("implication_expansion", dt)
-        log(f"Implication expansion: {dt:.2f}s")
-        if implied_only:
-            selected_tags.extend(sorted(implied_only))
-            log(f"  Added {len(implied_only)} implied tags: {', '.join(sorted(implied_only))}")
-        else:
-            log("  No additional implied tags")
-        log("Step 4: Compose final prompt")
-        t0 = time.perf_counter()
-        final_prompt = compose_final_prompt(rewritten, selected_tags)
-        dt = time.perf_counter()-t0
-        _record_timing("prompt_composition", dt)
-        log(f"Prompt composition: {dt:.2f}s")
         log("Step 5: Build ranked group/category display")
         t0 = time.perf_counter()
         seed_terms = []
         seed_terms.extend(user_tags)
         seed_terms.extend([p.strip() for p in (rewritten or "").split(",") if p.strip()])
-        seed_terms.extend(structural_tags or [])
-        seed_terms.extend(probe_tags or [])
-        seed_terms.extend(selected_tags)
-        seed_terms = list(dict.fromkeys(seed_terms))
         toggle_rows = _build_toggle_rows(
             seed_terms=seed_terms,
-            llm_selected_tags=llm_selected_tags,
             top_groups=max(1, int(display_top_groups)),
             top_tags_per_group=max(1, int(display_top_tags_per_group)),
             group_rank_top_k=max(1, int(display_rank_top_k)),
         )
-        dt = time.perf_counter()-t0
-        _record_timing("group_display", dt)
-        log(f"Ranked group display: {dt:.2f}s ({len(toggle_rows)} rows)")
-        total_dt = time.perf_counter()-t_total0
-        _emit_timing_summary(total_dt)
-        _append_timing_jsonl(total_dt)
-        log("Done: final prompt ready")
-        return _build_ui_payload(
-            console_text="\n".join(logs),
-            legacy_prompt_text=final_prompt,
-            row_defs=toggle_rows,
-            selected_tags=llm_selected_tags,
-        )
-    except Exception as e:
-        log(f"Error: {type(e).__name__}: {e}")
-        return _build_ui_payload(
-            console_text="\n".join(logs),
-            legacy_prompt_text="",
-            row_defs=[],
-            selected_tags=[],
-        )
-with gr.Blocks(css=css) as app:
-    with gr.Row():
-        with gr.Column(scale=3, elem_classes=["prompt-col"]):
-            image_tags = gr.Textbox(
-                label="Enter Prompt",
-                placeholder="e.g. fox, outside, detailed background, .",
-                lines=1
-            )
-        with gr.Column(scale=1):
-            _mascot_pil = _load_mascot_image()
-            if _mascot_pil is not None:
-                mascot_img = gr.Image(
-                    value=_mascot_pil,
-                    show_label=False,
-                    interactive=False,
-                    height=220,
-                    elem_id="mascot"
-                )
-            else:
-                mascot_img = gr.Markdown("`(mascot image unavailable)`")
-            submit_button = gr.Button("Run", variant="primary")
-    gr.Markdown(
-        """
-### Prompt Squirrel RAG (pipeline version)
-Type a rough prompt. This tool rewrites it and aligns it to an e621-style tag vocabulary using Prompt Squirrel internally,
-then returns a cleaned, model-friendly prompt.
-        """.strip()
-    )
-    console = gr.Textbox(
-        label="Console",
-        lines=10,
-        interactive=False,
-        placeholder="Progress logs will appear here."
-    )
-    suggested_prompt = gr.Textbox(
-        label="Suggested Prompt (From Toggled Tags)",
-        lines=3,
-        interactive=False,
-        show_copy_button=True,
-        placeholder="Comma-separated tags selected in the rows below."
-    )
-    with gr.Accordion("Legacy Pipeline Prompt (for reference)", open=False):
-        legacy_final_prompt = gr.Textbox(
-            label="Legacy Final Prompt",
-            lines=3,
-            interactive=False,
-            show_copy_button=True,
-        )
     selected_tags_state = gr.State([])
     row_defs_state = gr.State([])
     row_values_state = gr.State([])
     gr.Markdown("### Toggle Tag Rows")
-    row_headers: List[gr.Markdown] = []
-    row_checkboxes: List[gr.CheckboxGroup] = []
-    for _ in range(display_max_rows_default):
-        row_headers.append(gr.Markdown(value="", visible=False))
-        row_checkboxes.append(
-            gr.CheckboxGroup(
-                choices=[],
-                value=[],
-                visible=False,
-                interactive=True,
-                container=False,
-                elem_classes=["lego-tags"],
-            )
-        )
-    gr.Markdown(
-        "Toggling a tag in any row toggles it everywhere else that tag appears."
-    )
-    with gr.Accordion("Display Settings", open=False):
-        with gr.Row():
-            display_top_groups = gr.Number(
-                value=display_top_groups_default,
-                precision=0,
-                label="Rows (Top Groups/Categories)",
-                minimum=1,
-            )
-            display_top_tags_per_group = gr.Number(
-                value=display_top_tags_per_group_default,
-                precision=0,
-                label="Top Tags Shown Per Row",
-                minimum=1,
-            )
-            display_rank_top_k = gr.Number(
-                value=display_rank_top_k_default,
-                precision=0,
-                label="Top Tags Used for Row Ranking",
-                minimum=1,
-            )
-    run_outputs = [
-        console,
-        legacy_final_prompt,
-        suggested_prompt,
-        selected_tags_state,
-        row_defs_state,
-        row_values_state,
-        *row_headers,
-        *row_checkboxes,
-    ]
-    submit_button.click(
-        rag_pipeline_ui,
-        inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
-        outputs=run_outputs
     )
-    image_tags.submit(
-        rag_pipeline_ui,
-        inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
-        outputs=run_outputs
     )
     for idx, row_cb in enumerate(row_checkboxes):
         row_cb.change(
             fn=lambda changed_values, selected_state, row_defs, row_values, i=idx: _on_toggle_row(
                 i,
                 changed_values,
-                selected_state,
-                row_defs,
-                row_values,
-                display_max_rows_default,
             ),
             inputs=[row_cb, selected_tags_state, row_defs_state, row_values_state],
             outputs=[selected_tags_state, row_values_state, suggested_prompt, *row_checkboxes],
         )
-if __name__ == "__main__":
-    app.queue().launch(allowed_paths=[str(MASCOT_DIR)])

+import gradio as gr
+import os
+import logging
+import time
+import json
+import csv
+from datetime import datetime
+from functools import lru_cache
+from PIL import Image
+from pathlib import Path
+from typing import Any, Dict, List, Set, Tuple
+from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
+from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
+from psq_rag.llm.rewrite import llm_rewrite_prompt
+from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
+from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags, llm_infer_probe_tags
+from psq_rag.retrieval.state import expand_tags_via_implications, get_tag_type_name, get_tag_implications
+from psq_rag.ui.group_ranked_display import rank_groups_from_tfidf, _load_enabled_groups
+def _split_prompt_commas(s: str) -> List[str]:
+    """Split a comma-separated prompt into trimmed, non-empty pieces.
+
+    A falsy ``s`` (``None`` or ``""``) yields an empty list.
+    """
+    pieces: List[str] = []
+    for chunk in (s or "").split(","):
+        trimmed = chunk.strip()
+        if trimmed:
+            pieces.append(trimmed)
+    return pieces
+def _norm_for_dedupe(tag: str) -> str:
+    """Return the canonical key used for tag lookup/dedupe.
+
+    The tag is lowercased and then passed through ``_norm_tag_for_lookup``.
+    """
+    lowered = tag.lower()
+    return _norm_tag_for_lookup(lowered)
+def compose_final_prompt(rewritten_prompt: str, selected_tags: List[str]) -> str:
+    """Join the rewritten prompt pieces and the selected tags into one prompt.
+
+    Pieces are de-duplicated on their ``_norm_for_dedupe`` key; the first
+    spelling seen for each key is kept, in first-seen order, and the result is
+    joined with ``", "``.
+    """
+    first_by_key: Dict[str, str] = {}
+    for piece in [*_split_prompt_commas(rewritten_prompt), *selected_tags]:
+        # setdefault keeps the earliest spelling and its position.
+        first_by_key.setdefault(_norm_for_dedupe(piece), piece)
+    return ", ".join(first_by_key.values())
 def _display_tag_text(tag: str) -> str:
     return tag.replace("_", " ")
+def _normalize_selection_origin(origin: str) -> str:
+    """Return a known, lowercase selection-origin name for ``origin``.
+
+    The value is trimmed and lowercased; ``None``/empty or unrecognized values
+    fall back to ``"selection"``.
+
+    ``"implied"`` is accepted because the stylesheet and the source legend in
+    this file define a dedicated color for ``data-psq-origin="implied"``;
+    folding it into ``"selection"`` made that rule unreachable, since every
+    label marker is produced from this function's return value.
+    """
+    known_origins = {
+        "rewrite",
+        "selection",
+        "probe",
+        "structural",
+        "implied",
+        "user",
+        "candidate",
+    }
+    o = (origin or "").strip().lower()
+    return o if o in known_origins else "selection"
+def _choice_label_with_source_meta(tag: str, *, origin: str, preselected: bool) -> str:
+    """Build a checkbox label ending in a ``[[psq:<origin>:<0|1>]]`` marker.
+
+    The marker is stripped client-side and converted into data attributes for
+    CSS-driven colors.
+    """
+    flag = "0"
+    if preselected:
+        flag = "1"
+    label = _display_tag_text(tag)
+    return f"{label} [[psq:{_normalize_selection_origin(origin)}:{flag}]]"
+def _selection_source_rank(origin: str) -> int:
+    """Return the row-ordering priority of a selection origin (lower sorts first).
+
+    ``structural`` is 0, ``probe`` is 1, and every other origin is 2, so
+    rewrite/user tags share the priority band of general selection.
+    """
+    rank_by_origin = {"structural": 0, "probe": 1}
+    return rank_by_origin.get(_normalize_selection_origin(origin), 2)
+def _build_implied_parent_map(
+    direct_tags_ordered: List[str],
+    implied_tags: List[str],
+) -> Dict[str, str]:
+    """Map each implied tag to the first direct tag whose implications reach it.
+
+    Direct tags are visited in the given order and the implication mapping
+    returned by ``get_tag_implications()`` is followed transitively; the first
+    direct tag that reaches an implied tag is recorded as its parent.  All tags
+    are compared in their ``_norm_tag_for_lookup`` form.  Returns an empty dict
+    when either input is empty.
+    """
+    wanted = {_norm_tag_for_lookup(tag) for tag in (implied_tags or []) if tag}
+    if not (wanted and direct_tags_ordered):
+        return {}
+    implications = get_tag_implications()
+    owner_of: Dict[str, str] = {}
+    for raw_direct in direct_tags_ordered:
+        root = _norm_tag_for_lookup(raw_direct)
+        if not root:
+            continue
+        visited = {root}
+        pending = [root]
+        while pending:
+            current = pending.pop()
+            for raw_parent in implications.get(current, ()):
+                node = _norm_tag_for_lookup(raw_parent)
+                if not node or node in visited:
+                    continue
+                visited.add(node)
+                if node in wanted:
+                    # Earlier direct tags keep ownership of an implied tag.
+                    owner_of.setdefault(node, root)
+                pending.append(node)
+    return owner_of
+def _order_selected_tags_for_row(
+    *,
+    row_selected_tags: List[str],
+    selected_index: Dict[str, int],
+    tag_selection_origins: Dict[str, str],
+    implied_parent_map: Dict[str, str],
+) -> List[str]:
+    """Order one row's selected tags, keeping implied tags next to their parent.
+
+    Tags that are not keys of ``implied_parent_map`` ("base" tags) are sorted by
+    origin rank (``_selection_source_rank``), then by their position in
+    ``selected_index``, then alphabetically.  Each base tag is followed by the
+    implied tags of this row that map to it.  Implied tags whose parent is not a
+    base tag of this row are appended last, ordered by their parent's origin
+    rank and index, then by their own index and name.
+
+    Returns the tags in ``_norm_tag_for_lookup`` form, without duplicates.
+    """
+    row_selected_norm = [_norm_tag_for_lookup(t) for t in (row_selected_tags or []) if t]
+    implied_in_row = {t for t in row_selected_norm if t in implied_parent_map}
+    base_tags = [t for t in row_selected_norm if t not in implied_in_row]
+    # 10**9 is a "sort last" sentinel for tags missing from selected_index.
+    base_tags.sort(
+        key=lambda t: (
+            _selection_source_rank(tag_selection_origins.get(t, "selection")),
+            selected_index.get(t, 10**9),
+            t,
+        )
     )
+    # Group the row's implied tags under the parent recorded for each of them.
+    children_by_parent: Dict[str, List[str]] = {}
+    for implied in implied_in_row:
+        parent = implied_parent_map.get(implied)
+        if parent:
+            children_by_parent.setdefault(parent, []).append(implied)
+    for parent, children in children_by_parent.items():
+        children.sort(key=lambda t: (selected_index.get(t, 10**9), t))
+    ordered: List[str] = []
+    emitted: Set[str] = set()
+    # Emit each base tag immediately followed by its implied children.
+    for tag in base_tags:
+        if tag in emitted:
+            continue
+        ordered.append(tag)
+        emitted.add(tag)
+        for child in children_by_parent.get(tag, []):
+            if child not in emitted:
+                ordered.append(child)
+                emitted.add(child)
+    # Whatever is left is an implied tag whose parent was not emitted above.
+    remaining_implied = [t for t in row_selected_norm if t not in emitted]
+    remaining_implied.sort(
+        key=lambda t: (
+            _selection_source_rank(tag_selection_origins.get(implied_parent_map.get(t, ""), "selection")),
+            selected_index.get(implied_parent_map.get(t, ""), 10**9),
+            selected_index.get(t, 10**9),
+            t,
+        )
+    )
+    for t in remaining_implied:
+        if t not in emitted:
+            ordered.append(t)
+            emitted.add(t)
+    return ordered
+def _escape_prompt_tag(tag: str) -> str:
+    """Format a tag for prompt text.
+
+    Underscores become spaces and parentheses are backslash-escaped.
+    """
+    substitutions = str.maketrans({"_": " ", "(": "\\(", ")": "\\)"})
+    return tag.translate(substitutions)
+def _ordered_selected_for_prompt(selected: Set[str], row_defs: List[Dict[str, Any]]) -> List[str]:
+    """Order the selected tags by their first appearance in the display rows.
+
+    Selected tags that are not present in any row are appended afterwards in
+    sorted order.  Each tag appears once.
+    """
+    # A dict is used as an insertion-ordered set.
+    in_order: Dict[str, None] = {}
+    for row_def in row_defs:
+        for candidate in row_def.get("tags", []):
+            if candidate in selected:
+                in_order.setdefault(candidate, None)
+    # Fallback for any selected tags not present in current rows.
+    for leftover in sorted(selected):
+        in_order.setdefault(leftover, None)
+    return list(in_order)
+def _compose_toggle_prompt_text(selected_tags: List[str], row_defs: List[Dict[str, Any]]) -> str:
+    """Render the toggled tags as a comma-separated prompt string.
+
+    Falsy tags are dropped, the rest are ordered by
+    ``_ordered_selected_for_prompt`` and escaped with ``_escape_prompt_tag``.
+    """
+    chosen = set(filter(None, selected_tags or []))
+    pieces = map(_escape_prompt_tag, _ordered_selected_for_prompt(chosen, row_defs or []))
+    return ", ".join(pieces)
+def _is_artist_tag(tag: str) -> bool:
+    """Return True when ``tag`` is typed as an artist or starts with ``by_``.
+
+    The tag is normalized first; a tag that normalizes to empty is not an
+    artist tag.
+    """
+    t = _norm_tag_for_lookup(str(tag))
+    if t:
+        if get_tag_type_name(t) == "artist":
+            return True
+        # Keep a resilient fallback for malformed/missing tag typing metadata.
+        return t.startswith("by_")
+    return False
+@lru_cache(maxsize=1)
+def _load_excluded_recommendation_tags() -> Set[str]:
+    """Load the tags marked ``excluded`` in the category registry CSV.
+
+    Reads ``data/analysis/category_registry.csv`` (a relative path, so it is
+    resolved against the current working directory) and collects the
+    normalized ``tag`` of every row whose ``category_status`` is ``excluded``.
+
+    Returns an empty set when the file is missing or cannot be read/parsed; a
+    read/parse failure is logged rather than silently discarded.  The result is
+    cached by ``lru_cache``, so every caller receives the same set object and
+    should treat it as read-only.
+    """
+    csv_path = Path("data/analysis/category_registry.csv")
+    out: Set[str] = set()
+    if not csv_path.exists():
+        return out
+    try:
+        with csv_path.open("r", encoding="utf-8", newline="") as f:
+            for row in csv.DictReader(f):
+                tag = _norm_tag_for_lookup(str(row.get("tag") or ""))
+                if not tag:
+                    continue
+                status = str(row.get("category_status") or "").strip().lower()
+                if status == "excluded":
+                    out.add(tag)
+    except Exception as e:
+        # Best-effort: a broken registry must not take the UI down, but it
+        # disables every exclusion, so make the failure visible in the logs.
+        logging.warning("Failed to load excluded tags from %s: %s", csv_path, e)
+        return set()
+    return out
+def _is_excluded_recommendation_tag(tag: str) -> bool:
+    """Return True when the normalized ``tag`` is in the excluded-tag set.
+
+    A tag that normalizes to empty is never excluded, and the exclusion set is
+    not consulted for it.
+    """
+    t = _norm_tag_for_lookup(str(tag))
+    return bool(t) and t in _load_excluded_recommendation_tags()
+def _filter_excluded_recommendation_tags(tags: List[str]) -> Tuple[List[str], List[str]]:
+    """Split ``tags`` into kept tags and the excluded tags that were dropped.
+
+    Falsy entries are skipped; every other entry is coerced with ``str`` and
+    normalized via ``_norm_tag_for_lookup``, and entries that normalize to an
+    empty string are skipped as well.  The same rules apply whether or not any
+    exclusions are loaded, so the kept list does not change shape depending on
+    the registry being present.
+
+    Returns:
+        ``(keep, removed)`` where ``keep`` holds the surviving normalized tags,
+        de-duplicated in first-seen order, and ``removed`` is the sorted,
+        de-duplicated list of normalized tags found in the exclusion set.
+    """
+    excluded = _load_excluded_recommendation_tags()
+    keep: Dict[str, None] = {}  # dict used as an insertion-ordered set
+    removed: Set[str] = set()
+    for raw in (tags or []):
+        if not raw:
+            # Avoid turning None into the literal tag "none" via str().
+            continue
+        t = _norm_tag_for_lookup(str(raw))
+        if not t:
+            continue
+        if t in excluded:
+            removed.add(t)
+        else:
+            keep.setdefault(t, None)
+    return list(keep), sorted(removed)
 def _build_toggle_rows(
     *,
     seed_terms: List[str],
+    selected_tags: List[str],
+    tag_selection_origins: Dict[str, str],
+    implied_parent_map: Dict[str, str],
     top_groups: int,
     top_tags_per_group: int,
     group_rank_top_k: int,
 ) -> List[Dict[str, Any]]:
+    """Build the row definitions for the tag-toggle UI.
+
+    The first row is always ``selected_other``: the selected tags that do not
+    belong to any displayed group.  It is followed by one row per group returned
+    by ``rank_groups_from_tfidf``; each group row lists the selected tags of
+    that group first (ordered by ``_order_selected_tags_for_row``) and then the
+    group's ranked tags, truncated to ``top_tags_per_group`` or to the number of
+    selected tags in the group, whichever is larger.
+
+    Artist tags and excluded recommendation tags are dropped from both the
+    selected tags and the ranked tags.  ``top_groups``, ``top_tags_per_group``
+    and ``group_rank_top_k`` are converted with ``int`` and clamped to at
+    least 1.
+
+    Returns:
+        A list of row dicts with the keys ``name``, ``label``, ``tags`` and
+        ``tag_meta`` (``tag -> {"origin": ..., "preselected": ...}``).
+    """
+    ranked_rows = rank_groups_from_tfidf(
+        seed_terms=seed_terms,
+        top_groups=max(1, int(top_groups)),
+        top_tags_per_group=max(1, int(top_tags_per_group)),
+        group_rank_top_k=max(1, int(group_rank_top_k)),
+    )
     groups_map = _load_enabled_groups()
+    # Normalized, order-preserving, de-duplicated selection without artist or
+    # excluded tags.
+    selected_active = list(
+        dict.fromkeys(
+            _norm_tag_for_lookup(t)
+            for t in selected_tags
+            if t and not _is_artist_tag(t) and not _is_excluded_recommendation_tag(t)
+        )
+    )
+    selected_index: Dict[str, int] = {t: i for i, t in enumerate(selected_active)}
+    row_defs: List[Dict[str, Any]] = []
+    displayed_group_names = [r.group_name for r in ranked_rows]
+    # Membership sets of the displayed groups (artist tags removed).
+    displayed_group_tag_sets: Dict[str, Set[str]] = {
+        name: {t for t in groups_map.get(name, []) if not _is_artist_tag(t)}
+        for name in displayed_group_names
+    }
+    tags_in_any_displayed_group: Set[str] = set()
+    for tag_set in displayed_group_tag_sets.values():
+        tags_in_any_displayed_group.update(tag_set)
+    # Selected tags that no displayed group would show go to the first row.
+    selected_other_raw = [t for t in selected_active if t not in tags_in_any_displayed_group]
+    selected_other = _order_selected_tags_for_row(
+        row_selected_tags=selected_other_raw,
+        selected_index=selected_index,
+        tag_selection_origins=tag_selection_origins,
+        implied_parent_map=implied_parent_map,
+    )
+    selected_other_meta = {
+        t: {
+            "origin": _normalize_selection_origin(tag_selection_origins.get(t, "selection")),
+            "preselected": True,
+        }
+        for t in selected_other
     }
     row_defs.append(
         {
+            "name": "selected_other",
+            "label": "Selected (Other)",
+            "tags": selected_other,
+            "tag_meta": selected_other_meta,
         }
     )
     for row in ranked_rows:
         group_name = row.group_name
         group_tag_set = displayed_group_tag_sets.get(group_name, set())
+        selected_in_group_raw = [t for t in selected_active if t in group_tag_set]
+        selected_in_group = _order_selected_tags_for_row(
+            row_selected_tags=selected_in_group_raw,
+            selected_index=selected_index,
+            tag_selection_origins=tag_selection_origins,
+            implied_parent_map=implied_parent_map,
+        )
+        ranked_tags = [
+            t
+            for t, _ in row.tags
+            if not _is_artist_tag(t) and not _is_excluded_recommendation_tag(t)
+        ]
+        # Selected tags lead the row and are never cut off by the per-row limit.
         merged = selected_in_group + [t for t in ranked_tags if t not in selected_in_group]
         keep_n = max(max(1, int(top_tags_per_group)), len(selected_in_group))
         merged = merged[:keep_n]
+        tag_meta = {
+            t: {
+                "origin": _normalize_selection_origin(tag_selection_origins.get(t, "selection")),
+                "preselected": t in selected_active,
+            }
+            for t in merged
+        }
         row_defs.append(
             {
                 "name": group_name,
                 "label": f"{group_name} (E={row.expected_count:.2f})",
                 "tags": merged,
+                "tag_meta": tag_meta,
             }
         )
+    return row_defs
+def _build_display_audit_line(
+    row_defs: List[Dict[str, Any]],
+    *,
+    active_selected_tags: List[str],
+    direct_selected_tags: List[str],
+    implied_selected_tags: List[str],
+) -> str:
+    """Return a one-line JSON audit of which displayed tag came from where.
+
+    For every tag shown in ``row_defs`` the audit lists the labels of the rows
+    it appears in and a sorted list of source markers: the kind of row
+    (``selected_other_row`` / ``ranked_group_row``) plus ``selected_active``,
+    ``selected_direct`` and ``selected_implied`` when the tag is in the
+    corresponding input list (normalized, artist tags ignored).
+    """
+
+    def _non_artist_norm_set(tags: List[str]) -> Set[str]:
+        return {_norm_tag_for_lookup(t) for t in (tags or []) if t and not _is_artist_tag(t)}
+
+    membership = (
+        ("selected_active", _non_artist_norm_set(active_selected_tags)),
+        ("selected_direct", _non_artist_norm_set(direct_selected_tags)),
+        ("selected_implied", _non_artist_norm_set(implied_selected_tags)),
+    )
+    info_by_tag: Dict[str, Dict[str, Any]] = {}
+    for row in row_defs or []:
+        row_name = row.get("name", "")
+        row_label = row.get("label", row_name)
+        row_source = "selected_other_row" if row_name == "selected_other" else "ranked_group_row"
+        for tag in row.get("tags", []):
+            rec = info_by_tag.setdefault(tag, {"rows": [], "sources": set()})
+            rec["rows"].append(row_label)
+            rec["sources"].add(row_source)
+            rec["sources"].update(marker for marker, members in membership if tag in members)
+    entries = []
+    for tag, rec in sorted(info_by_tag.items()):
+        entries.append({"tag": tag, "rows": rec["rows"], "sources": sorted(rec["sources"])})
+    payload = {"n_tags": len(info_by_tag), "tags": entries}
+    return "Display Tag Audit: " + json.dumps(payload, ensure_ascii=True)
+def _build_row_component_updates(
+    row_defs: List[Dict[str, Any]],
+    selected_tags: List[str],
+    max_rows: int,
+):
+    """Build the Gradio updates for the fixed pool of ``max_rows`` UI rows.
+
+    Each slot with a matching entry in ``row_defs`` gets a header update and a
+    checkbox-group update whose choices are ``(label, tag)`` pairs; the label
+    carries the origin/preselected marker from
+    ``_choice_label_with_source_meta``.  A row without tags, and every slot
+    beyond ``len(row_defs)``, is hidden.
+
+    Returns:
+        ``(prompt_text, row_values_state, header_updates, checkbox_updates)``.
+    """
+    selected = {t for t in (selected_tags or []) if t}
+    row_values_state: List[List[str]] = []
+    header_updates = []
+    checkbox_updates = []
+    for idx in range(max_rows):
         if idx < len(row_defs):
             row = row_defs[idx]
             tags = list(dict.fromkeys(row.get("tags", [])))
+            # NOTE(review): ``values`` is not assigned anywhere in the lines
+            # visible here. Presumably it is the row's tags that are in
+            # ``selected`` (as in _on_toggle_row) and the assignment was dropped
+            # from this view; confirm against the full file.
             row_values_state.append(values)
             visible = bool(tags)
             header_updates.append(gr.update(value=f"**{row.get('label', '')}**", visible=visible))
+            # Tolerate a missing or malformed tag_meta by falling back to {}.
+            tag_meta = row.get("tag_meta", {}) if isinstance(row.get("tag_meta", {}), dict) else {}
+            choices = []
+            for t in tags:
+                meta = tag_meta.get(t, {}) if isinstance(tag_meta.get(t, {}), dict) else {}
+                origin = _normalize_selection_origin(str(meta.get("origin", "selection")))
+                preselected = bool(meta.get("preselected", False))
+                choices.append((_choice_label_with_source_meta(t, origin=origin, preselected=preselected), t))
             checkbox_updates.append(
                 gr.update(
                     choices=choices,
                     value=values,
                     visible=visible,
+                )
+            )
+        else:
+            header_updates.append(gr.update(value="", visible=False))
+            checkbox_updates.append(gr.update(choices=[], value=[], visible=False))
+    prompt_text = _compose_toggle_prompt_text(list(selected), row_defs)
+    return prompt_text, row_values_state, header_updates, checkbox_updates
 def _on_toggle_row(
     row_idx: int,
     changed_values: List[str],
     selected_tags_state: List[str],
+    row_defs_state: List[Dict[str, Any]],
+    row_values_state: List[List[str]],
+    max_rows: int,
 ):
+    """Apply a change in one checkbox row to the global tag selection.
+
+    The tags of row ``row_idx`` are removed from the selection and replaced by
+    the values now checked in that row.  Values are matched to the row's tags
+    exactly or by their ``_norm_tag_for_lookup`` form; values that match no tag
+    of the row are ignored.  Only rows containing a tag whose state changed
+    (plus the toggled row itself) receive a value update; every other slot gets
+    a no-op ``gr.update()``.
+
+    ``row_values_state`` is accepted but not read in this body.
+
+    Returns:
+        ``[sorted selected tags, per-row selected values, prompt text,
+        *checkbox updates]`` with one checkbox update per slot in ``max_rows``.
+    """
     row_defs = row_defs_state or []
     selected = set(selected_tags_state or [])
+    # An out-of-range row index is treated as an empty row.
+    row = row_defs[row_idx] if 0 <= row_idx < len(row_defs) else {}
+    row_tags = list(dict.fromkeys(row.get("tags", [])))
+    row_tag_set = set(row_tags)
+    row_tag_by_norm = {_norm_tag_for_lookup(t): t for t in row_tags}
+    # Be tolerant to UI payload forms: canonical tag values, display labels, or normalized variants.
+    new_set: Set[str] = set()
+    for raw in (changed_values or []):
+        if raw in row_tag_set:
+            new_set.add(raw)
+            continue
+        raw_norm = _norm_tag_for_lookup(str(raw))
+        mapped = row_tag_by_norm.get(raw_norm)
+        if mapped:
+            new_set.add(mapped)
+    prev_row_selected = {t for t in selected if t in row_tag_set}
+    selected.difference_update(row_tag_set)
+    selected.update(new_set)
+    # Symmetric difference: the tags that were switched on or off in this row.
+    toggled_tags = prev_row_selected ^ new_set
+    # Recompute row selections, but only push UI updates to rows touched by the toggled tags.
+    new_row_values_state: List[List[str]] = []
+    affected_rows: Set[int] = {row_idx}
+    for idx, row in enumerate(row_defs):
+        tags = list(dict.fromkeys(row.get("tags", [])))
+        values = [t for t in tags if t in selected]
+        new_row_values_state.append(values)
+        if toggled_tags and any(t in toggled_tags for t in tags):
+            affected_rows.add(idx)
+    checkbox_updates = []
+    for idx in range(max_rows):
+        if idx < len(row_defs) and idx in affected_rows:
+            checkbox_updates.append(gr.update(value=new_row_values_state[idx]))
+        else:
+            checkbox_updates.append(gr.update())
+    prompt_text = _compose_toggle_prompt_text(sorted(selected), row_defs)
+    return [sorted(selected), new_row_values_state, prompt_text, *checkbox_updates]
+def _build_ui_payload(
+    *,
+    console_text: str,
+    legacy_prompt_text: str,
+    row_defs: List[Dict[str, Any]],
+    selected_tags: List[str],
+):
+    """Assemble the flat output list returned to the UI after a pipeline run.
+
+    Order: console text, legacy prompt, toggle prompt, sorted unique selected
+    tags, row definitions, per-row selected values, then every header update
+    followed by every checkbox update (``display_max_rows_default`` of each).
+    """
+    row_updates = _build_row_component_updates(
+        row_defs=row_defs,
+        selected_tags=selected_tags,
+        max_rows=display_max_rows_default,
+    )
+    prompt_text, row_values_state, header_updates, checkbox_updates = row_updates
+    payload = [
+        console_text,
+        legacy_prompt_text,
+        prompt_text,
+        sorted(set(selected_tags or [])),
+        row_defs,
+        row_values_state,
+    ]
+    payload.extend(header_updates)
+    payload.extend(checkbox_updates)
+    return payload
+def _build_selection_query(
+    prompt_in: str,
+    rewritten: str,
+    structural_tags: List[str],
+    probe_tags: List[str],
+) -> str:
+    """Compose the newline-separated query text for the selection stage.
+
+    Always contains the image description; adds the rewrite phrases when they
+    are non-blank, and a sorted, de-duplicated list of structural/probe tags as
+    hints when any exist.
+    """
+    sections = [f"IMAGE DESCRIPTION: {prompt_in.strip()}"]
+    rewrite_text = rewritten.strip() if rewritten else ""
+    if rewrite_text:
+        sections.append(f"REWRITE PHRASES: {rewrite_text}")
+    hints = {*(structural_tags or ()), *(probe_tags or ())}
+    if hints:
+        # Keep hints as context only; selection still must choose by candidate indices.
+        sections.append("INFERRED TAG HINTS (context only): " + ", ".join(sorted(hints)))
+    return "\n".join(sections)
+# Logging setup.
+# Minimal production logging: WARNING and above go to a stream handler only;
+# no log file is created by default.  PSQ_LOG_LEVEL overrides the level, and an
+# unrecognized level name falls back to WARNING.
+# (os and logging are already imported at the top of the file; this re-import
+# is redundant but harmless.)
+import os, logging
+LOG_LEVEL = os.environ.get("PSQ_LOG_LEVEL", "WARNING").upper()
+logging.basicConfig(
+    level=getattr(logging, LOG_LEVEL, logging.WARNING),
+    format="%(asctime)s %(levelname)s:%(message)s",
+    handlers=[logging.StreamHandler()]  # no file -> avoids huge logs on Spaces
+)
+# Quiet down common noisy libraries: only their ERROR-and-above records pass (optional).
+for _name in ("gensim", "gradio", "hnswlib", "httpx", "uvicorn"):
+    logging.getLogger(_name).setLevel(logging.ERROR)
+# Turn off Gradio analytics phone-home to avoid background thread errors (optional).
+os.environ["GRADIO_ANALYTICS_ENABLED"] = "0"
+# Mascot artwork lives in a "mascotimages" directory next to this file.
+MASCOT_DIR = Path(__file__).parent / "mascotimages"
+MASCOT_FILE = MASCOT_DIR / "transparentsquirrel.png"
+def _load_mascot_image():
+    """Load mascot image if available; return None when missing/unreadable.
+
+    A warning is logged in both failure cases.  On success the image is
+    returned converted to RGBA.
+    """
+    if MASCOT_FILE.exists():
+        try:
+            return Image.open(MASCOT_FILE).convert("RGBA")
+        except Exception as e:
+            logging.warning("Failed to load mascot image (%s): %s", MASCOT_FILE, e)
+            return None
+    logging.warning("Mascot image missing: %s", MASCOT_FILE)
+    return None
+# Hotfix: wrap gradio_client's JSON-schema helpers so that a non-dict schema
+# (for example the bare True/False boolean form) is reported as "any" instead
+# of being handed to the original implementations.  If anything in this block
+# raises (import or attribute lookup), the patch is skipped and a message is
+# printed.
+try:
+    from gradio_client import utils as _gc_utils
+    # Keep references to the originals so the wrappers can delegate to them.
+    _orig_get_type = _gc_utils.get_type
+    _orig_j2p = _gc_utils._json_schema_to_python_type
+    _orig_pub = _gc_utils.json_schema_to_python_type
+    def _get_type_safe(schema):
+        # Sometimes schema is a bare True/False (JSON Schema boolean form)
+        if not isinstance(schema, dict):
+            return "any"
+        return _orig_get_type(schema)
+    def _j2p_safe(schema, defs=None):
+        # Accept non-dict schemas (True/False/None) and treat as "any"
+        if not isinstance(schema, dict):
+            return "any"
+        # Fall back to the schema's own "$defs" when the caller passed none.
+        return _orig_j2p(schema, defs or schema.get("$defs"))
+    def _pub_safe(schema):
+        # Public wrapper used by Gradio; keep it resilient too
+        if not isinstance(schema, dict):
+            return "any"
+        return _j2p_safe(schema, schema.get("$defs"))
+    _gc_utils.get_type = _get_type_safe
+    _gc_utils._json_schema_to_python_type = _j2p_safe
+    _gc_utils.json_schema_to_python_type = _pub_safe
+except Exception as e:
+    print("gradio_client hotfix not applied:", e)
+# -------------------------------------------------------------------------------
+# NOTE(review): this flag is not read anywhere in the portion of the file
+# visible to this review; presumably the pipeline further down consumes it.
+# Confirm where it is used.
+allow_nsfw_tags = False
+def _is_production_runtime() -> bool:
+    """Best-effort detection for deployed runtime (HF Spaces or explicit env).
+
+    True when ``PSQ_PRODUCTION`` is ``1``/``true``/``yes`` (case-insensitive),
+    when ``SPACE_ID`` or ``HF_SPACE_ID`` is set to a non-empty value, or when
+    ``SYSTEM`` equals ``spaces``.
+    """
+    explicit = os.environ.get("PSQ_PRODUCTION", "").strip().lower()
+    return bool(
+        explicit in {"1", "true", "yes"}
+        or os.environ.get("SPACE_ID")
+        or os.environ.get("HF_SPACE_ID")
+        or os.environ.get("SYSTEM") == "spaces"
+    )
+# Runtime configuration, read once from environment variables at import time.
+# The int()/float() conversions raise ValueError on a malformed value.
+#
+# Retrieval verbosity defaults to off in production and on elsewhere;
+# PSQ_VERBOSE_RETRIEVAL overrides it ("1", "true" or "yes" enable it).
+verbose_retrieval_default = "0" if _is_production_runtime() else "1"
+verbose_retrieval = os.environ.get("PSQ_VERBOSE_RETRIEVAL", verbose_retrieval_default).strip().lower() in {"1", "true", "yes"}
+verbose_retrieval_all = False
+verbose_retrieval_limit = 20
+# Probe-tag inference is on unless PSQ_ENABLE_PROBE is "0", "false" or "False".
+# Unlike the flag above, the value is not lowercased before the comparison.
+enable_probe_tags = os.environ.get("PSQ_ENABLE_PROBE", "1").strip() not in {"0", "false", "False"}
+# Display defaults for the tag-toggle rows; display_max_rows_default is the
+# number of row slots passed to _build_row_component_updates as max_rows.
+display_top_groups_default = int(os.environ.get("PSQ_DISPLAY_TOP_GROUPS", "10"))
+display_top_tags_per_group_default = int(os.environ.get("PSQ_DISPLAY_TOP_TAGS_PER_GROUP", "5"))
+display_rank_top_k_default = int(os.environ.get("PSQ_DISPLAY_GROUP_RANK_TOP_K", "5"))
+display_max_rows_default = int(os.environ.get("PSQ_DISPLAY_MAX_ROWS", "14"))
+# Retrieval-stage sizing knobs.
+retrieval_global_k = int(os.environ.get("PSQ_RETRIEVAL_GLOBAL_K", "300"))
+retrieval_per_phrase_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_K", "10"))
+retrieval_per_phrase_final_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_FINAL_K", "1"))
+# Selection-stage settings.
+selection_mode = os.environ.get("PSQ_SELECTION_MODE", "chunked_map_union").strip()
+selection_chunk_size = int(os.environ.get("PSQ_SELECTION_CHUNK_SIZE", "60"))
+selection_per_phrase_k = int(os.environ.get("PSQ_SELECTION_PER_PHRASE_K", "2"))
+selection_candidate_cap = int(os.environ.get("PSQ_SELECTION_CANDIDATE_CAP", "0"))
+# Per-stage timeouts, in seconds.
+stage1_rewrite_timeout_s = float(os.environ.get("PSQ_TIMEOUT_REWRITE_S", "45"))
+stage1_struct_timeout_s = float(os.environ.get("PSQ_TIMEOUT_STRUCT_S", "45"))
+stage1_probe_timeout_s = float(os.environ.get("PSQ_TIMEOUT_PROBE_S", "45"))
+stage3_select_timeout_s = float(os.environ.get("PSQ_TIMEOUT_SELECT_S", "45"))
+# JSONL file that per-run timing records are appended to.
+timing_log_path = Path(os.environ.get("PSQ_TIMING_LOG_PATH", "data/runtime_metrics/ui_pipeline_timings.jsonl"))
+# Stylesheet for the page, kept as one CSS string.
+#   .scrollable-content  - scroll panes with an always-visible scrollbar.
+#   .lego-tags           - restyles checkbox groups as brick-like toggle buttons.
+#                          The selected ("on") colors come from the --on-*
+#                          custom properties, which are set by an nth-child color
+#                          cycle and overridden through the data-psq-origin /
+#                          data-psq-preselected label attributes.
+#   .source-legend       - chips and swatches for the color legend.
+# NOTE(review): where this string is attached to the page is outside the lines
+# visible here.
+css = """
+.scrollable-content{
+  max-height: 420px;
+  overflow-y: scroll;          /* always show scrollbar */
+  overflow-x: hidden;
+  padding-right: 8px;
+  padding-bottom: 14px;   /* <— add this */
+  scrollbar-gutter: stable;    /* prevent layout shift as it fills */
+  /* Firefox */
+  scrollbar-width: auto;
+  scrollbar-color: rgba(180,180,180,.9) rgba(0,0,0,.15);
+}
+/* WebKit/Chromium (Chrome/Edge/Safari) */
+.scrollable-content::-webkit-scrollbar{ width: 10px; }
+.scrollable-content::-webkit-scrollbar-thumb{ background: rgba(180,180,180,.9); border-radius: 8px; }
+.scrollable-content::-webkit-scrollbar-track{ background: rgba(0,0,0,.15); }
+/* (Optional) make both scroll panes taller so they fill more of the column */
+.pane-left  .scrollable-content,
+.pane-right .scrollable-content {
+  max-height: 610px;                /* was 420px; tweak to taste */
+}
+.lego-tags .gr-checkboxgroup,
+.lego-tags .wrap {
+  display: flex !important;
+  flex-wrap: wrap !important;
+  gap: 10px !important;
+}
+.lego-tags label {
+  margin: 0 !important;
+  padding: 0 !important;
+  position: relative !important;
+}
+/* Hide native checkbox visuals completely */
+.lego-tags input[type="checkbox"] {
+  appearance: none !important;
+  -webkit-appearance: none !important;
+  -moz-appearance: none !important;
+  position: absolute !important;
+  width: 1px !important;
+  height: 1px !important;
+  opacity: 0 !important;
+  pointer-events: none !important;
+  display: none !important;
+}
+/* Brick button skin (works for both +span and ~span structures) */
+.lego-tags input[type="checkbox"] + span,
+.lego-tags input[type="checkbox"] ~ span {
+  --on-bg1: #ffd166;
+  --on-bg2: #f39c4a;
+  --on-border: #b86e21;
+  --on-text: #2e1706;
+  position: relative !important;
+  display: inline-flex !important;
+  align-items: center !important;
+  min-height: 40px !important;
+  padding: 10px 15px 9px !important;
+  border: 2px solid #7d8897 !important;
+  border-radius: 10px !important;
+  background: linear-gradient(180deg, #e8ecf2 0%, #c7ced8 100%) !important;
+  color: #2d3440 !important;
+  font-size: 0.97rem !important;
+  font-weight: 800 !important;
+  line-height: 1.15 !important;
+  cursor: pointer !important;
+  user-select: none !important;
+  letter-spacing: 0.01em !important;
+  box-shadow: 0 4px 0 rgba(0,0,0,0.22), inset 0 1px 0 rgba(255,255,255,0.72) !important;
+  transition: transform 0.08s ease, box-shadow 0.08s ease, filter 0.08s ease !important;
+}
+.lego-tags input[type="checkbox"] + span::before,
+.lego-tags input[type="checkbox"] ~ span::before {
+  content: "" !important;
+  position: absolute !important;
+  top: 5px !important;
+  left: 8px !important;
+  width: 8px !important;
+  height: 8px !important;
+  border-radius: 50% !important;
+  background: rgba(255,255,255,0.58) !important;
+  box-shadow: 22px 0 0 rgba(255,255,255,0.58) !important;
+  pointer-events: none !important;
+}
+/* Bright color cycle used only when selected */
+.lego-tags label:nth-child(8n+1) span { --on-bg1: #ffd166; --on-bg2: #f39c4a; --on-border: #b86e21; --on-text: #2e1706; }
+.lego-tags label:nth-child(8n+2) span { --on-bg1: #6ee7ff; --on-bg2: #1fb7ff; --on-border: #157cb3; --on-text: #07263c; }
+.lego-tags label:nth-child(8n+3) span { --on-bg1: #9dff8f; --on-bg2: #45c96f; --on-border: #2a8b4b; --on-text: #0d2917; }
+.lego-tags label:nth-child(8n+4) span { --on-bg1: #ff8fab; --on-bg2: #ff5c7a; --on-border: #b83956; --on-text: #3f0f1d; }
+.lego-tags label:nth-child(8n+5) span { --on-bg1: #d0a8ff; --on-bg2: #a46cff; --on-border: #7147b3; --on-text: #25143f; }
+.lego-tags label:nth-child(8n+6) span { --on-bg1: #ffe27a; --on-bg2: #f7bf39; --on-border: #ad7f1f; --on-text: #332407; }
+.lego-tags label:nth-child(8n+7) span { --on-bg1: #8effd5; --on-bg2: #2ed6b5; --on-border: #1e947d; --on-text: #0d2a25; }
+.lego-tags label:nth-child(8n+8) span { --on-bg1: #ffb47e; --on-bg2: #ff8753; --on-border: #b95b2d; --on-text: #391a0a; }
+/* Source-driven selected colors (applies when tags are preselected by the pipeline). */
+.lego-tags label[data-psq-preselected="1"][data-psq-origin="rewrite"] span {
+  --on-bg1: #77f0d7;
+  --on-bg2: #26b9a3;
+  --on-border: #187869;
+  --on-text: #062923;
+}
+.lego-tags label[data-psq-preselected="1"][data-psq-origin="selection"] span {
+  --on-bg1: #ffd98a;
+  --on-bg2: #f0a93c;
+  --on-border: #a66f1f;
+  --on-text: #382206;
+}
+.lego-tags label[data-psq-preselected="1"][data-psq-origin="probe"] span {
+  --on-bg1: #d8b4ff;
+  --on-bg2: #9a6cff;
+  --on-border: #6745b0;
+  --on-text: #24143b;
+}
+.lego-tags label[data-psq-preselected="1"][data-psq-origin="structural"] span {
+  --on-bg1: #a6f79a;
+  --on-bg2: #53c368;
+  --on-border: #2f8442;
+  --on-text: #102d17;
+}
+.lego-tags label[data-psq-preselected="1"][data-psq-origin="implied"] span {
+  --on-bg1: #d7dde8;
+  --on-bg2: #a8b3c4;
+  --on-border: #6f7e95;
+  --on-text: #1d2633;
 }
+/* User-selected tags (not initially selected by the pipeline). */
+.lego-tags label[data-psq-preselected="0"] span {
+  --on-bg1: #9ec5ff;
+  --on-bg2: #4f86ff;
+  --on-border: #2f5fbf;
+  --on-text: #0b1f42;
+}
+.lego-tags label:hover span {
+  filter: brightness(1.02) !important;
+  transform: translateY(1px) !important;
 }
+/* ON state: brighter + visibly recessed */
+.lego-tags input[type="checkbox"]:checked + span,
+.lego-tags input[type="checkbox"]:checked ~ span,
+.lego-tags label:has(input[type="checkbox"]:checked) span {
+  background: linear-gradient(180deg, var(--on-bg1) 0%, var(--on-bg2) 100%) !important;
+  color: var(--on-text) !important;
+  border-color: var(--on-border) !important;
+  filter: saturate(1.2) brightness(1.12) !important;
+  transform: translateY(-2px) !important;
+  box-shadow:
+    inset 0 3px 6px rgba(0,0,0,0.20),
+    inset 0 -1px 0 rgba(255,255,255,0.36),
+    0 6px 0 rgba(0,0,0,0.32) !important;
+}
+.source-legend {
   display: flex;
   flex-wrap: wrap;
   gap: 8px;
+  margin: 4px 0 10px 0;
 }
+.source-legend .chip {
+  display: inline-flex;
+  align-items: center;
+  gap: 8px;
+  border-radius: 999px;
+  border: 1px solid #8792a2;
+  padding: 5px 10px;
+  font-size: 0.85rem;
+  font-weight: 700;
+  color: #1f2430;
+  background: #f3f6fb;
 }
+.source-legend .swatch {
+  width: 12px;
+  height: 12px;
+  border-radius: 50%;
+  border: 1px solid rgba(0,0,0,0.2);
 }
+.source-legend .rewrite { background: #26b9a3; }
+.source-legend .selection { background: #f0a93c; }
+.source-legend .probe { background: #9a6cff; }
+.source-legend .structural { background: #53c368; }
+.source-legend .implied { background: #a8b3c4; }
+.source-legend .user { background: #4f86ff; }
+.source-legend .unselected { background: #c7ced8; }
+"""
+# Client-side script, kept as a JavaScript string.  For every label inside a
+# ".lego-tags" container it looks for a trailing "[[psq:<origin>:<0|1>]]" marker
+# (the one appended by _choice_label_with_source_meta) in the label's <span>,
+# copies the two fields to the label's data-psq-origin / data-psq-preselected
+# attributes, and removes the marker from the visible text.  A MutationObserver
+# on document.body re-runs this whenever the DOM changes.
+# The backslashes are doubled because this is a regular (non-raw) Python string.
+# NOTE(review): how this script is registered with the page is outside the
+# lines visible here.
+client_js = """
+() => {
+  const markerRe = /\\s*\\[\\[psq:([a-z_]+):(0|1)\\]\\]\\s*$/;
+  const applyTagMeta = () => {
+    const labels = document.querySelectorAll(".lego-tags label");
+    labels.forEach((label) => {
+      const span = label.querySelector("span");
+      if (!span) return;
+      const text = span.textContent || "";
+      const match = text.match(markerRe);
+      if (!match) return;
+      label.dataset.psqOrigin = match[1];
+      label.dataset.psqPreselected = match[2];
+      span.textContent = text.replace(markerRe, "");
+    });
+  };
+  applyTagMeta();
+  const observer = new MutationObserver(() => applyTagMeta());
+  observer.observe(document.body, { childList: true, subtree: true, characterData: true });
 }
 """
 def rag_pipeline_ui(
+    user_prompt: str,
+    display_top_groups: float,
+    display_top_tags_per_group: float,
+    display_rank_top_k: float,
+):
+    logs = []
+    def log(s): logs.append(s)
+    try:
+        stage_timings = {}
+        def _record_timing(stage: str, dt_s: float):
+            stage_timings[stage] = float(dt_s)
+        def _emit_timing_summary(total_s: float):
+            summary_order = [
+                "preprocess",
+                "rewrite",
+                "structural",
+                "probe",
+                "retrieval",
+                "selection",
+                "implication_expansion",
+                "prompt_composition",
+                "group_display",
+            ]
+            lines = []
+            for k in summary_order:
+                if k in stage_timings:
+                    lines.append(f"{k}={stage_timings[k]:.2f}s")
+            slowest = max(stage_timings.items(), key=lambda kv: kv[1])[0] if stage_timings else "n/a"
+            log("Timing Summary: " + ", ".join(lines))
+            log(f"Timing Slowest Stage: {slowest}")
+            log(f"Timing Total: {total_s:.2f}s")
+        def _append_timing_jsonl(total_s: float):
+            try:
+                timing_log_path.parent.mkdir(parents=True, exist_ok=True)
+                rec = {
+                    "timestamp_utc": datetime.utcnow().isoformat(timespec="seconds") + "Z",
+                    "stages_s": stage_timings,
+                    "total_s": float(total_s),
+                    "config": {
+                        "timeout_rewrite_s": stage1_rewrite_timeout_s,
+                        "timeout_struct_s": stage1_struct_timeout_s,
+                        "timeout_probe_s": stage1_probe_timeout_s,
+                        "timeout_select_s": stage3_select_timeout_s,
+                    },
+                }
+                with timing_log_path.open("a", encoding="utf-8") as f:
+                    f.write(json.dumps(rec, ensure_ascii=True) + "\n")
+                log(f"Timing Log: wrote {timing_log_path}")
+            except Exception as e:
+                log(f"Timing Log: failed ({type(e).__name__}: {e})")
+        def _future_with_timeout(fut, timeout_s: float, stage_name: str, fallback):
+            t0 = time.perf_counter()
+            try:
+                out = fut.result(timeout=max(1.0, float(timeout_s)))
+                dt = time.perf_counter() - t0
+                log(f"{stage_name}: {dt:.2f}s")
+                stage_key = {
+                    "Rewrite": "rewrite",
+                    "Structural inference": "structural",
+                    "Probe inference": "probe",
+                    "Index selection": "selection",
+                }.get(stage_name)
+                if stage_key:
+                    _record_timing(stage_key, dt)
+                return out
+            except FutureTimeoutError:
+                fut.cancel()
+                log(f"{stage_name}: timed out after {timeout_s:.0f}s; using fallback")
+                return fallback
+            except Exception as e:
+                log(f"{stage_name}: failed ({type(e).__name__}: {e}); using fallback")
+                return fallback
+        t_total0 = time.perf_counter()
+        log("Start: received prompt")
+        prompt_in = (user_prompt or "").strip()
+        if not prompt_in:
+            return _build_ui_payload(
+                console_text="Error: empty prompt",
+                legacy_prompt_text="",
+                row_defs=[],
+                selected_tags=[],
+            )
+        log("Input:")
+        log(prompt_in)
+        log("")
+        log(
+            "Runtime config: "
+            f"retrieval_global_k={retrieval_global_k} "
+            f"retrieval_per_phrase_k={retrieval_per_phrase_k} "
+            f"retrieval_per_phrase_final_k={retrieval_per_phrase_final_k} "
+            f"selection_mode={selection_mode} "
+            f"selection_chunk_size={selection_chunk_size} "
+            f"selection_per_phrase_k={selection_per_phrase_k}"
+        )
+        log("")
+        t0 = time.perf_counter()
+        user_tags = extract_user_provided_tags_upto_3_words(prompt_in)
+        dt = time.perf_counter()-t0
+        _record_timing("preprocess", dt)
+        log(f"Preprocess (user tag extraction): {dt:.2f}s")
+        log("Heuristically extracted user tags:")
+        if user_tags:
+            log(", ".join(user_tags))
+        else:
+            log("(none)")
+        log("")
+        log("Step 1: LLM rewrite + structural inference + probe (concurrent)")
+        max_workers = 3 if enable_probe_tags else 2
+        with ThreadPoolExecutor(max_workers=max_workers) as ex:
+            fut_rewrite = ex.submit(llm_rewrite_prompt, prompt_in, log)
+            fut_struct = ex.submit(llm_infer_structural_tags, prompt_in, log=log)
+            fut_probe = ex.submit(llm_infer_probe_tags, prompt_in, log=log) if enable_probe_tags else None
+            rewritten = _future_with_timeout(
+                fut_rewrite, stage1_rewrite_timeout_s, "Rewrite", prompt_in
+            )
+            structural_tags = _future_with_timeout(
+                fut_struct, stage1_struct_timeout_s, "Structural inference", []
+            )
+            probe_tags = (
+                _future_with_timeout(fut_probe, stage1_probe_timeout_s, "Probe inference", [])
+                if fut_probe else []
+            )
+        log("Rewrite:")
+        log(rewritten if rewritten else "(empty)")
+        log("")
+        rewrite_for_retrieval = rewritten
+        if user_tags:
+            # keep them separate in logs, but allow them to help retrieval
+            rewrite_for_retrieval = (rewrite_for_retrieval + ", " + ", ".join(user_tags)).strip(", ").strip()
+        log("Step 2: Prompt Squirrel retrieval (hidden)")
+        try:
+            t0 = time.perf_counter()
+            retrieval_context_tags = list(dict.fromkeys((structural_tags or []) + (probe_tags or [])))
+            rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
+            retrieval_result = psq_candidates_from_rewrite_phrases(
+                rewrite_phrases=rewrite_phrases,
+                allow_nsfw_tags=allow_nsfw_tags,
+                context_tags=retrieval_context_tags,
+                global_k=max(1, retrieval_global_k),
+                per_phrase_k=max(1, retrieval_per_phrase_k),
+                per_phrase_final_k=max(1, retrieval_per_phrase_final_k),
+                verbose=verbose_retrieval,
+            )
+            if isinstance(retrieval_result, tuple):
+                candidates, phrase_reports = retrieval_result
+            else:
+                candidates, phrase_reports = retrieval_result, []
+            if selection_candidate_cap > 0 and len(candidates) > selection_candidate_cap:
+                candidates = candidates[:selection_candidate_cap]
+                log(f"Selection candidate cap applied: {selection_candidate_cap}")
+            dt = time.perf_counter()-t0
+            _record_timing("retrieval", dt)
+            log(f"Retrieval: {dt:.2f}s")
+            log(f"Retrieved {len(candidates)} candidate tags")
+            if verbose_retrieval:
+                log(f"Total unique candidates: {len(candidates)}")
+                limit = None if verbose_retrieval_all else max(1, int(verbose_retrieval_limit))
+                for report in phrase_reports:
+                    phrase = report.get("normalized") or report.get("phrase") or ""
+                    lookup = report.get("lookup") or ""
+                    tfidf_vocab = report.get("tfidf_vocab")
+                    log(f"Phrase: {phrase} (lookup={lookup}) tfidf_vocab={tfidf_vocab}")
+                    rows = report.get("candidates", [])
+                    shown = rows if limit is None else rows[:limit]
+                    for row in shown:
+                          tag = row.get("tag")
+                          alias_token = row.get("alias_token")
+                          score_fasttext = row.get("score_fasttext")
+                          score_context = row.get("score_context")
+                          score_combined = row.get("score_combined")
+                          count = row.get("count")
+                          alias_part = ""
+                          if alias_token and alias_token != tag:
+                              alias_part = f" [alias_token={alias_token}]"
+                          fasttext_str = (
+                              f"{score_fasttext:.3f}" if isinstance(score_fasttext, (int, float)) else score_fasttext
+                          )
+                          if score_context is None:
+                              context_str = "None"
+                          else:
+                              context_str = (
+                                  f"{score_context:.3f}" if isinstance(score_context, (int, float)) else score_context
+                              )
+                          combined_str = (
+                              f"{score_combined:.3f}" if isinstance(score_combined, (int, float)) else score_combined
+                          )
+                          log(
+                              f"  {tag}{alias_part} | fasttext={fasttext_str} context={context_str} "
+                              f"combined={combined_str} count={count}"
+                          )
+                    if limit is not None and len(rows) > limit:
+                        log(f"  ... ({len(rows) - limit} more)")
+        except Exception as e:
+            log(f"Retrieval fallback: {type(e).__name__}: {e}")
+            candidates = []
+        log("Step 3: LLM index selection (uses rewrite + structural/probe context)")
+        selection_query = _build_selection_query(
+            prompt_in=prompt_in,
+            rewritten=rewritten,
+            structural_tags=structural_tags,
+            probe_tags=probe_tags,
+        )
+        with ThreadPoolExecutor(max_workers=1) as ex:
+            fut_sel = ex.submit(
+                llm_select_indices,
+                query_text=selection_query,
+                candidates=candidates,
+                max_pick=0,
+                log=log,
+                mode=selection_mode,
+                chunk_size=max(1, selection_chunk_size),
+                per_phrase_k=max(1, selection_per_phrase_k),
+            )
+            picked_indices = _future_with_timeout(
+                fut_sel, stage3_select_timeout_s, "Index selection", []
+            )
+        selection_selected_tags = [candidates[i].tag for i in picked_indices] if picked_indices else []
+        selected_tags = list(selection_selected_tags)
+        if structural_tags:
+            # Add structural tags that aren't already selected
+            existing = {t for t in selected_tags}
+            new_structural = [t for t in structural_tags if t not in existing]
+            selected_tags.extend(new_structural)
+            log(f"  Added {len(new_structural)} structural tags: {', '.join(new_structural)}")
+        else:
+            log("  No structural tags inferred")
+        if probe_tags:
+            existing = {t for t in selected_tags}
+            new_probe = [t for t in probe_tags if t not in existing]
+            selected_tags.extend(new_probe)
+            log(f"  Added {len(new_probe)} probe tags: {', '.join(new_probe)}")
+        elif enable_probe_tags:
+            log("  No probe tags inferred")
+        selected_tags, removed_excluded_direct = _filter_excluded_recommendation_tags(selected_tags)
+        if removed_excluded_direct:
+            log(f"  Removed {len(removed_excluded_direct)} excluded tags: {', '.join(removed_excluded_direct)}")
+        direct_selected_tags = list(dict.fromkeys(selected_tags))
+        log("Step 3c: Expand via tag implications")
+        t0 = time.perf_counter()
+        tag_set = set(selected_tags)
+        expanded, implied_only = expand_tags_via_implications(tag_set)
+        dt = time.perf_counter()-t0
+        _record_timing("implication_expansion", dt)
+        log(f"Implication expansion: {dt:.2f}s")
+        implied_selected_tags = sorted(implied_only) if implied_only else []
+        if implied_only:
+            selected_tags.extend(sorted(implied_only))
+            log(f"  Added {len(implied_only)} implied tags: {', '.join(sorted(implied_only))}")
+        else:
+            log("  No additional implied tags")
+        selected_tags, removed_excluded_implied = _filter_excluded_recommendation_tags(selected_tags)
+        implied_selected_tags = [
+            t for t in implied_selected_tags if not _is_excluded_recommendation_tag(t)
+        ]
+        if removed_excluded_implied:
+            log(
+                f"  Removed {len(removed_excluded_implied)} excluded tags after implications: "
+                f"{', '.join(removed_excluded_implied)}"
+            )
+        log("Step 4: Compose final prompt")
+        t0 = time.perf_counter()
+        final_prompt = compose_final_prompt(rewritten, selected_tags)
+        dt = time.perf_counter()-t0
+        _record_timing("prompt_composition", dt)
+        log(f"Prompt composition: {dt:.2f}s")
         log("Step 5: Build ranked group/category display")
         t0 = time.perf_counter()
         seed_terms = []
         seed_terms.extend(user_tags)
         seed_terms.extend([p.strip() for p in (rewritten or "").split(",") if p.strip()])
+        seed_terms.extend(structural_tags or [])
+        seed_terms.extend(probe_tags or [])
+        seed_terms.extend(selected_tags)
+        seed_terms = list(dict.fromkeys(seed_terms))
+        active_selected_tags = list(dict.fromkeys(selected_tags))
+        structural_set = {_norm_tag_for_lookup(t) for t in (structural_tags or []) if t}
+        probe_set = {_norm_tag_for_lookup(t) for t in (probe_tags or []) if t}
+        implied_set = {_norm_tag_for_lookup(t) for t in (implied_selected_tags or []) if t}
+        rewrite_set = {
+            _norm_tag_for_lookup(t)
+            for t in (list(user_tags or []) + [p.strip() for p in (rewritten or "").split(",") if p.strip()])
+            if t
+        }
+        selection_set = {_norm_tag_for_lookup(t) for t in (selection_selected_tags or []) if t}
+        tag_selection_origins: Dict[str, str] = {}
+        for tag in active_selected_tags:
+            tag_norm = _norm_tag_for_lookup(tag)
+            if tag_norm in structural_set:
+                origin = "structural"
+            elif tag_norm in probe_set:
+                origin = "probe"
+            elif tag_norm in rewrite_set:
+                origin = "rewrite"
+            elif tag_norm in selection_set:
+                origin = "selection"
+            elif tag_norm in implied_set:
+                origin = "implied"
+            else:
+                # Unknown/fallback tags use selection color.
+                origin = "selection"
+            tag_selection_origins[tag] = origin
+            if tag_norm and tag_norm != tag:
+                tag_selection_origins[tag_norm] = origin
+        direct_tags_for_implied = list(
+            dict.fromkeys(_norm_tag_for_lookup(t) for t in (direct_selected_tags or []) if t)
+        )
+        direct_tags_for_implied_idx = {t: i for i, t in enumerate(direct_tags_for_implied)}
+        direct_tags_for_implied.sort(
+            key=lambda t: (
+                _selection_source_rank(tag_selection_origins.get(t, "selection")),
+                direct_tags_for_implied_idx.get(t, 10**9),
+            )
+        )
+        implied_parent_map = _build_implied_parent_map(
+            direct_tags_ordered=direct_tags_for_implied,
+            implied_tags=implied_selected_tags,
+        )
         toggle_rows = _build_toggle_rows(
             seed_terms=seed_terms,
+            selected_tags=active_selected_tags,
+            tag_selection_origins=tag_selection_origins,
+            implied_parent_map=implied_parent_map,
             top_groups=max(1, int(display_top_groups)),
             top_tags_per_group=max(1, int(display_top_tags_per_group)),
             group_rank_top_k=max(1, int(display_rank_top_k)),
         )
+        dt = time.perf_counter()-t0
+        _record_timing("group_display", dt)
+        log(f"Ranked group display: {dt:.2f}s ({len(toggle_rows)} rows)")
+        log(
+            _build_display_audit_line(
+                toggle_rows,
+                active_selected_tags=active_selected_tags,
+                direct_selected_tags=direct_selected_tags,
+                implied_selected_tags=implied_selected_tags,
+            )
+        )
+        total_dt = time.perf_counter()-t_total0
+        _emit_timing_summary(total_dt)
+        _append_timing_jsonl(total_dt)
+        log("Done: final prompt ready")
+        return _build_ui_payload(
+            console_text="\n".join(logs),
+            legacy_prompt_text=final_prompt,
+            row_defs=toggle_rows,
+            selected_tags=active_selected_tags,
+        )
+    except Exception as e:
+        log(f"Error: {type(e).__name__}: {e}")
+        return _build_ui_payload(
+            console_text="\n".join(logs),
+            legacy_prompt_text="",
+            row_defs=[],
+            selected_tags=[],
+        )
+# Build the Gradio UI: prompt input + run button, console and prompt outputs,
+# a fixed pool of toggleable tag rows, display settings, and event wiring.
+# Components are laid out in creation order, so statement order matters here.
+with gr.Blocks(css=css, js=client_js) as app:
+    with gr.Row():
+        with gr.Column(scale=3, elem_classes=["prompt-col"]):
+            image_tags = gr.Textbox(
+                label="Enter Prompt",
+                placeholder="e.g. fox, outside, detailed background, .",
+                lines=1
+            )
+        with gr.Column(scale=1):
+            # Show the mascot image when it could be loaded, otherwise a text stand-in.
+            _mascot_pil = _load_mascot_image()
+            if _mascot_pil is not None:
+                mascot_img = gr.Image(
+                    value=_mascot_pil,
+                    show_label=False,
+                    interactive=False,
+                    height=220,
+                    elem_id="mascot"
+                )
+            else:
+                mascot_img = gr.Markdown("`(mascot image unavailable)`")
+            submit_button = gr.Button("Run", variant="primary")
+    gr.Markdown(
+        """
+### Prompt Squirrel RAG (pipeline version)
+Type a rough prompt. This tool rewrites it and aligns it to an e621-style tag vocabulary using Prompt Squirrel internally,
+then returns a cleaned, model-friendly prompt.
+        """.strip()
+    )
+    console = gr.Textbox(
+        label="Console",
+        lines=10,
+        interactive=False,
+        placeholder="Progress logs will appear here."
+    )
+    suggested_prompt = gr.Textbox(
+        label="Suggested Prompt (From Toggled Tags)",
+        lines=3,
+        interactive=False,
+        show_copy_button=True,
+        placeholder="Comma-separated tags selected in the rows below."
+    )
+    with gr.Accordion("Legacy Pipeline Prompt (for reference)", open=False):
+        legacy_final_prompt = gr.Textbox(
+            label="Legacy Final Prompt",
+            lines=3,
+            interactive=False,
+            show_copy_button=True,
+        )
+    # State holders passed to/from the run and toggle handlers.
     selected_tags_state = gr.State([])
     row_defs_state = gr.State([])
     row_values_state = gr.State([])
     gr.Markdown("### Toggle Tag Rows")
+    # Colour legend; the swatch class names match the .source-legend CSS rules.
+    gr.HTML(
+        """
+        <div class="source-legend">
+          <span class="chip"><span class="swatch rewrite"></span>Rewrite phrase</span>
+          <span class="chip"><span class="swatch selection"></span>General selection</span>
+          <span class="chip"><span class="swatch probe"></span>Probe query</span>
+          <span class="chip"><span class="swatch structural"></span>Structural query</span>
+          <span class="chip"><span class="swatch implied"></span>Implied</span>
+          <span class="chip"><span class="swatch user"></span>User-toggled</span>
+          <span class="chip"><span class="swatch unselected"></span>Unselected</span>
+        </div>
+        """
     )
+    gr.Markdown(
+        "Rows are ranked by expected tag count (E). Within each row: structural -> probe -> selected, "
+        "implied tags follow their triggering selected tag when possible, then unselected tags in confidence order."
     )
+    # Pre-create display_max_rows_default hidden header/checkbox pairs,
+    # interleaved so each header sits directly above its checkbox group.
+    # NOTE(review): presumably the run handler fills and reveals them through
+    # run_outputs - confirm against _build_ui_payload.
+    row_headers: List[gr.Markdown] = []
+    row_checkboxes: List[gr.CheckboxGroup] = []
+    for _ in range(display_max_rows_default):
+        row_headers.append(gr.Markdown(value="", visible=False))
+        row_checkboxes.append(
+            gr.CheckboxGroup(
+                choices=[],
+                value=[],
+                visible=False,
+                interactive=True,
+                container=False,
+                elem_classes=["lego-tags"],
+            )
+        )
+    gr.Markdown(
+        "Toggling a tag in any row toggles it everywhere else that tag appears."
+    )
+    with gr.Accordion("Display Settings", open=False):
+        with gr.Row():
+            display_top_groups = gr.Number(
+                value=display_top_groups_default,
+                precision=0,
+                label="Rows (Top Groups/Categories)",
+                minimum=1,
+            )
+            display_top_tags_per_group = gr.Number(
+                value=display_top_tags_per_group_default,
+                precision=0,
+                label="Top Tags Shown Per Row",
+                minimum=1,
+            )
+            display_rank_top_k = gr.Number(
+                value=display_rank_top_k_default,
+                precision=0,
+                label="Top Tags Used for Row Ranking",
+                minimum=1,
+            )
+    # Output components for a pipeline run.
+    # NOTE(review): the order presumably has to match the payload returned by
+    # rag_pipeline_ui (via _build_ui_payload) - verify when changing either side.
+    run_outputs = [
+        console,
+        legacy_final_prompt,
+        suggested_prompt,
+        selected_tags_state,
+        row_defs_state,
+        row_values_state,
+        *row_headers,
+        *row_checkboxes,
+    ]
+    # The Run button and pressing Enter in the prompt box trigger the same pipeline call.
+    submit_button.click(
+        rag_pipeline_ui,
+        inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
+        outputs=run_outputs
+    )
+    image_tags.submit(
+        rag_pipeline_ui,
+        inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
+        outputs=run_outputs
+    )
+    # One change handler per row. ``i=idx`` binds the row index as a default
+    # argument so each lambda keeps its own index instead of the loop's last value.
     for idx, row_cb in enumerate(row_checkboxes):
         row_cb.change(
             fn=lambda changed_values, selected_state, row_defs, row_values, i=idx: _on_toggle_row(
                 i,
                 changed_values,
+                selected_state,
+                row_defs,
+                row_values,
+                display_max_rows_default,
             ),
             inputs=[row_cb, selected_tags_state, row_defs_state, row_values_state],
             outputs=[selected_tags_state, row_values_state, suggested_prompt, *row_checkboxes],
+            queue=False,
+            show_progress="hidden",
         )
+# Script entry point: enable Gradio's queue and launch the app, passing
+# MASCOT_DIR as an allowed path for the server.
+if __name__ == "__main__":
+    app.queue().launch(allowed_paths=[str(MASCOT_DIR)])

data/analysis/category_registry.csv CHANGED Viewed

The diff for this file is too large to render. See raw diff

data/analysis/hybrid_category_assignment_preview.json ADDED Viewed

	@@ -0,0 +1,2753 @@

+{
+  "config": {
+    "tfidf_weight": 0.6,
+    "wiki_weight": 0.4,
+    "tfidf_temp": 0.08,
+    "single_top1_min": 0.55,
+    "single_margin_min": 0.18,
+    "single_top2_max": 0.35,
+    "multi_top1_min": 0.42,
+    "multi_top2_min": 0.3,
+    "multi_pair_min": 0.78,
+    "sample_size": 20,
+    "seed": 42
+  },
+  "inputs": {
+    "registry_csv": "E:\\image\\backup\\Prompt_Squirrel_RAG\\data\\analysis\\category_registry.csv",
+    "wiki_pages_csv": "E:\\image\\backup\\Prompt_Squirrel_RAG\\wiki_pages-2023-08-08.csv",
+    "uncategorized_tags": 6261,
+    "active_categories_for_centroids": 19,
+    "centroid_seed_sizes": {
+      "anatomy_features": 407,
+      "background_composition": 84,
+      "body_decor": 9,
+      "body_type": 7,
+      "clothing_detail": 418,
+      "color_markings": 234,
+      "count": 5,
+      "expression_detail": 31,
+      "franchise_series": 86,
+      "gaze_detail": 22,
+      "gender": 3,
+      "objects_props": 264,
+      "organization": 8,
+      "perspective": 6,
+      "pose_action_detail": 100,
+      "resolution": 3,
+      "species": 13,
+      "style": 9,
+      "text": 4
+    }
+  },
+  "summary": {
+    "counts": {
+      "uncategorized_total": 6261,
+      "scored_rows": 6261,
+      "has_tfidf_vector": 5089,
+      "has_wiki_page": 4368,
+      "has_wiki_category_votes": 1957,
+      "signals": {
+        "tfidf_only": 3176,
+        "both": 1913,
+        "none": 1128,
+        "wiki_only": 44
+      },
+      "assignments": {
+        "hold": 5997,
+        "multi": 31,
+        "single": 233
+      },
+      "newly_categorized": 264,
+      "remaining_uncategorized": 5997,
+      "multi_category_additions": 62
+    },
+    "top_single_categories": [
+      [
+        "franchise_series",
+        177
+      ],
+      [
+        "clothing_detail",
+        13
+      ],
+      [
+        "anatomy_features",
+        9
+      ],
+      [
+        "text",
+        8
+      ],
+      [
+        "organization",
+        7
+      ],
+      [
+        "body_type",
+        6
+      ],
+      [
+        "style",
+        2
+      ],
+      [
+        "species",
+        2
+      ],
+      [
+        "body_decor",
+        2
+      ],
+      [
+        "objects_props",
+        2
+      ],
+      [
+        "background_composition",
+        1
+      ],
+      [
+        "color_markings",
+        1
+      ],
+      [
+        "expression_detail",
+        1
+      ],
+      [
+        "pose_action_detail",
+        1
+      ],
+      [
+        "count",
+        1
+      ]
+    ],
+    "top_multi_category_pairs": [
+      {
+        "categories": [
+          "body_type",
+          "franchise_series"
+        ],
+        "count": 7
+      },
+      {
+        "categories": [
+          "objects_props",
+          "pose_action_detail"
+        ],
+        "count": 2
+      },
+      {
+        "categories": [
+          "franchise_series",
+          "gender"
+        ],
+        "count": 2
+      },
+      {
+        "categories": [
+          "body_type",
+          "species"
+        ],
+        "count": 2
+      },
+      {
+        "categories": [
+          "color_markings",
+          "franchise_series"
+        ],
+        "count": 2
+      },
+      {
+        "categories": [
+          "anatomy_features",
+          "color_markings"
+        ],
+        "count": 2
+      },
+      {
+        "categories": [
+          "expression_detail",
+          "pose_action_detail"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "expression_detail",
+          "text"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "color_markings",
+          "style"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "anatomy_features",
+          "objects_props"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "franchise_series",
+          "species"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "perspective",
+          "pose_action_detail"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "count",
+          "franchise_series"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "clothing_detail",
+          "franchise_series"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "body_decor",
+          "objects_props"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "background_composition",
+          "franchise_series"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "anatomy_features",
+          "species"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "anatomy_features",
+          "franchise_series"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "body_type",
+          "gender"
+        ],
+        "count": 1
+      },
+      {
+        "categories": [
+          "clothing_detail",
+          "color_markings"
+        ],
+        "count": 1
+      }
+    ],
+    "samples": {
+      "single": [
+        {
+          "tag": "eeveelution",
+          "count": 58150,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.721
+            ],
+            [
+              "gender",
+              0.0689
+            ],
+            [
+              "resolution",
+              0.0234
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.535
+            ],
+            [
+              "gender",
+              0.1148
+            ],
+            [
+              "resolution",
+              0.039
+            ]
+          ],
+          "top_wiki": [
+            [
+              "franchise_series",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 13
+        },
+        {
+          "tag": "boss_monster",
+          "count": 19924,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "anatomy_features"
+          ],
+          "top_fused": [
+            [
+              "anatomy_features",
+              1.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "anatomy_features",
+              1.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 4,
+          "wiki_link_count": 19
+        },
+        {
+          "tag": "blaze_the_cat",
+          "count": 7169,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6294
+            ],
+            [
+              "resolution",
+              0.0338
+            ],
+            [
+              "expression_detail",
+              0.0334
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.3824
+            ],
+            [
+              "resolution",
+              0.0563
+            ],
+            [
+              "expression_detail",
+              0.0556
+            ]
+          ],
+          "top_wiki": [
+            [
+              "franchise_series",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 6
+        },
+        {
+          "tag": "espeon",
+          "count": 7029,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6445
+            ],
+            [
+              "gender",
+              0.0854
+            ],
+            [
+              "resolution",
+              0.0296
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6445
+            ],
+            [
+              "gender",
+              0.0854
+            ],
+            [
+              "resolution",
+              0.0296
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 11
+        },
+        {
+          "tag": "zangoose",
+          "count": 6959,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6316
+            ],
+            [
+              "gender",
+              0.0872
+            ],
+            [
+              "resolution",
+              0.0371
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6316
+            ],
+            [
+              "gender",
+              0.0872
+            ],
+            [
+              "resolution",
+              0.0371
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 4
+        },
+        {
+          "tag": "snivy",
+          "count": 3315,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.7953
+            ],
+            [
+              "gender",
+              0.0605
+            ],
+            [
+              "resolution",
+              0.0192
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7953
+            ],
+            [
+              "gender",
+              0.0605
+            ],
+            [
+              "resolution",
+              0.0192
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 6
+        },
+        {
+          "tag": "buizel",
+          "count": 3220,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6631
+            ],
+            [
+              "gender",
+              0.0802
+            ],
+            [
+              "resolution",
+              0.0254
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6631
+            ],
+            [
+              "gender",
+              0.0802
+            ],
+            [
+              "resolution",
+              0.0254
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 2
+        },
+        {
+          "tag": "floatzel",
+          "count": 2957,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.674
+            ],
+            [
+              "gender",
+              0.0738
+            ],
+            [
+              "resolution",
+              0.0261
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.674
+            ],
+            [
+              "gender",
+              0.0738
+            ],
+            [
+              "resolution",
+              0.0261
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 2
+        },
+        {
+          "tag": "charmeleon",
+          "count": 2899,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6974
+            ],
+            [
+              "gender",
+              0.0566
+            ],
+            [
+              "count",
+              0.0233
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6974
+            ],
+            [
+              "gender",
+              0.0566
+            ],
+            [
+              "count",
+              0.0233
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 9
+        },
+        {
+          "tag": "dragonite",
+          "count": 2477,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6717
+            ],
+            [
+              "gender",
+              0.0623
+            ],
+            [
+              "resolution",
+              0.0246
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6717
+            ],
+            [
+              "gender",
+              0.0623
+            ],
+            [
+              "resolution",
+              0.0246
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 3
+        },
+        {
+          "tag": "ampharos",
+          "count": 2449,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.7534
+            ],
+            [
+              "gender",
+              0.0559
+            ],
+            [
+              "resolution",
+              0.0207
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7534
+            ],
+            [
+              "gender",
+              0.0559
+            ],
+            [
+              "resolution",
+              0.0207
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 4
+        },
+        {
+          "tag": "pichu",
+          "count": 1980,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.7864
+            ],
+            [
+              "gender",
+              0.0402
+            ],
+            [
+              "resolution",
+              0.0197
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7864
+            ],
+            [
+              "gender",
+              0.0402
+            ],
+            [
+              "resolution",
+              0.0197
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 5
+        },
+        {
+          "tag": "quiver",
+          "count": 1372,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "objects_props"
+          ],
+          "top_fused": [
+            [
+              "objects_props",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "objects_props",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 4
+        },
+        {
+          "tag": "snorlax",
+          "count": 1079,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6034
+            ],
+            [
+              "gender",
+              0.0515
+            ],
+            [
+              "resolution",
+              0.0309
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6034
+            ],
+            [
+              "gender",
+              0.0515
+            ],
+            [
+              "resolution",
+              0.0309
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 2
+        },
+        {
+          "tag": "blastoise",
+          "count": 1006,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6609
+            ],
+            [
+              "gender",
+              0.0453
+            ],
+            [
+              "count",
+              0.0243
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6609
+            ],
+            [
+              "gender",
+              0.0453
+            ],
+            [
+              "count",
+              0.0243
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 4
+        },
+        {
+          "tag": "roserade",
+          "count": 871,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6325
+            ],
+            [
+              "gender",
+              0.052
+            ],
+            [
+              "resolution",
+              0.0336
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6325
+            ],
+            [
+              "gender",
+              0.052
+            ],
+            [
+              "resolution",
+              0.0336
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 3
+        },
+        {
+          "tag": "alolan_raichu",
+          "count": 730,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6985
+            ],
+            [
+              "gender",
+              0.0612
+            ],
+            [
+              "resolution",
+              0.0263
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6985
+            ],
+            [
+              "gender",
+              0.0612
+            ],
+            [
+              "resolution",
+              0.0263
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 4
+        },
+        {
+          "tag": "nickit",
+          "count": 663,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6143
+            ],
+            [
+              "gender",
+              0.0972
+            ],
+            [
+              "resolution",
+              0.0326
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6143
+            ],
+            [
+              "gender",
+              0.0972
+            ],
+            [
+              "resolution",
+              0.0326
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "linoone",
+          "count": 628,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.7373
+            ],
+            [
+              "gender",
+              0.0406
+            ],
+            [
+              "resolution",
+              0.0196
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7373
+            ],
+            [
+              "gender",
+              0.0406
+            ],
+            [
+              "resolution",
+              0.0196
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 2
+        },
+        {
+          "tag": "amped_toxtricity",
+          "count": 568,
+          "signal": "tfidf_only",
+          "assigned_categories": [
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.6964
+            ],
+            [
+              "gender",
+              0.0587
+            ],
+            [
+              "resolution",
+              0.0257
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.6964
+            ],
+            [
+              "gender",
+              0.0587
+            ],
+            [
+              "resolution",
+              0.0257
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        }
+      ],
+      "multi": [
+        {
+          "tag": "cub",
+          "count": 147547,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "species",
+            "body_type"
+          ],
+          "top_fused": [
+            [
+              "species",
+              0.6
+            ],
+            [
+              "body_type",
+              0.4
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "species",
+              0.6
+            ],
+            [
+              "body_type",
+              0.4
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 5,
+          "wiki_link_count": 11
+        },
+        {
+          "tag": "dock",
+          "count": 16478,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "anatomy_features",
+            "objects_props"
+          ],
+          "top_fused": [
+            [
+              "anatomy_features",
+              0.5
+            ],
+            [
+              "objects_props",
+              0.5
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "anatomy_features",
+              0.5
+            ],
+            [
+              "objects_props",
+              0.5
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 3
+        },
+        {
+          "tag": "teenager",
+          "count": 13700,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "body_type",
+            "gender"
+          ],
+          "top_fused": [
+            [
+              "body_type",
+              0.5
+            ],
+            [
+              "gender",
+              0.5
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "body_type",
+              0.5
+            ],
+            [
+              "gender",
+              0.5
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 4,
+          "wiki_link_count": 19
+        },
+        {
+          "tag": "ringtail",
+          "count": 6643,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "anatomy_features",
+            "color_markings"
+          ],
+          "top_fused": [
+            [
+              "anatomy_features",
+              0.6667
+            ],
+            [
+              "color_markings",
+              0.3333
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "anatomy_features",
+              0.6667
+            ],
+            [
+              "color_markings",
+              0.3333
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 3,
+          "wiki_link_count": 5
+        },
+        {
+          "tag": "greninja",
+          "count": 3805,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "body_type"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.4591
+            ],
+            [
+              "body_type",
+              0.4072
+            ],
+            [
+              "gender",
+              0.0343
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7651
+            ],
+            [
+              "gender",
+              0.0571
+            ],
+            [
+              "resolution",
+              0.0246
+            ]
+          ],
+          "top_wiki": [
+            [
+              "body_type",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 9
+        },
+        {
+          "tag": "roxanne_wolf_(fnaf)",
+          "count": 3637,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "anatomy_features",
+            "color_markings"
+          ],
+          "top_fused": [
+            [
+              "anatomy_features",
+              0.6
+            ],
+            [
+              "color_markings",
+              0.4
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "anatomy_features",
+              0.6
+            ],
+            [
+              "color_markings",
+              0.4
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 5,
+          "wiki_link_count": 21
+        },
+        {
+          "tag": "pet",
+          "count": 3461,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "body_decor",
+            "objects_props"
+          ],
+          "top_fused": [
+            [
+              "body_decor",
+              0.5
+            ],
+            [
+              "objects_props",
+              0.5
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "body_decor",
+              0.5
+            ],
+            [
+              "objects_props",
+              0.5
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 18
+        },
+        {
+          "tag": "zorua",
+          "count": 3167,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "body_type"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.4565
+            ],
+            [
+              "body_type",
+              0.4079
+            ],
+            [
+              "gender",
+              0.0393
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7608
+            ],
+            [
+              "gender",
+              0.0655
+            ],
+            [
+              "resolution",
+              0.0223
+            ]
+          ],
+          "top_wiki": [
+            [
+              "body_type",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 3
+        },
+        {
+          "tag": "kirlia",
+          "count": 3140,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "body_type"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.4781
+            ],
+            [
+              "body_type",
+              0.4081
+            ],
+            [
+              "gender",
+              0.0273
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7969
+            ],
+            [
+              "gender",
+              0.0455
+            ],
+            [
+              "resolution",
+              0.0237
+            ]
+          ],
+          "top_wiki": [
+            [
+              "body_type",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 8
+        },
+        {
+          "tag": "simba",
+          "count": 2566,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "franchise_series",
+            "gender"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.5
+            ],
+            [
+              "gender",
+              0.5
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "franchise_series",
+              0.5
+            ],
+            [
+              "gender",
+              0.5
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 14
+        },
+        {
+          "tag": "colorful",
+          "count": 2402,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "color_markings",
+            "style"
+          ],
+          "top_fused": [
+            [
+              "color_markings",
+              0.6667
+            ],
+            [
+              "style",
+              0.3333
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "color_markings",
+              0.6667
+            ],
+            [
+              "style",
+              0.3333
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 3,
+          "wiki_link_count": 5
+        },
+        {
+          "tag": "mawile",
+          "count": 2121,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "clothing_detail"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.4989
+            ],
+            [
+              "clothing_detail",
+              0.4019
+            ],
+            [
+              "gender",
+              0.0226
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.8316
+            ],
+            [
+              "gender",
+              0.0377
+            ],
+            [
+              "resolution",
+              0.0176
+            ]
+          ],
+          "top_wiki": [
+            [
+              "clothing_detail",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 6
+        },
+        {
+          "tag": "troll",
+          "count": 1556,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "species",
+            "body_type"
+          ],
+          "top_fused": [
+            [
+              "species",
+              0.5
+            ],
+            [
+              "body_type",
+              0.5
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "species",
+              0.5
+            ],
+            [
+              "body_type",
+              0.5
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 10
+        },
+        {
+          "tag": "squirtle",
+          "count": 1167,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "body_type"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.4597
+            ],
+            [
+              "body_type",
+              0.4074
+            ],
+            [
+              "gender",
+              0.0188
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7662
+            ],
+            [
+              "gender",
+              0.0313
+            ],
+            [
+              "resolution",
+              0.0187
+            ]
+          ],
+          "top_wiki": [
+            [
+              "body_type",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 10
+        },
+        {
+          "tag": "oshawott",
+          "count": 1157,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "body_type"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.5151
+            ],
+            [
+              "body_type",
+              0.4049
+            ],
+            [
+              "gender",
+              0.019
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.8585
+            ],
+            [
+              "gender",
+              0.0316
+            ],
+            [
+              "resolution",
+              0.013
+            ]
+          ],
+          "top_wiki": [
+            [
+              "body_type",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 9
+        },
+        {
+          "tag": "cosplay_pikachu_(character)",
+          "count": 1138,
+          "signal": "both",
+          "assigned_categories": [
+            "gender",
+            "franchise_series"
+          ],
+          "top_fused": [
+            [
+              "gender",
+              0.4429
+            ],
+            [
+              "franchise_series",
+              0.343
+            ],
+            [
+              "resolution",
+              0.0276
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.5717
+            ],
+            [
+              "gender",
+              0.0715
+            ],
+            [
+              "resolution",
+              0.046
+            ]
+          ],
+          "top_wiki": [
+            [
+              "gender",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 5
+        },
+        {
+          "tag": "legendary_duo",
+          "count": 1059,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "count"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.477
+            ],
+            [
+              "count",
+              0.4082
+            ],
+            [
+              "gender",
+              0.0209
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7951
+            ],
+            [
+              "gender",
+              0.0348
+            ],
+            [
+              "resolution",
+              0.0174
+            ]
+          ],
+          "top_wiki": [
+            [
+              "count",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 26
+        },
+        {
+          "tag": "sobble",
+          "count": 762,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "anatomy_features"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.4854
+            ],
+            [
+              "anatomy_features",
+              0.3047
+            ],
+            [
+              "color_markings",
+              0.1033
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.8089
+            ],
+            [
+              "gender",
+              0.0414
+            ],
+            [
+              "resolution",
+              0.0166
+            ]
+          ],
+          "top_wiki": [
+            [
+              "anatomy_features",
+              0.75
+            ],
+            [
+              "color_markings",
+              0.25
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 4,
+          "wiki_link_count": 20
+        },
+        {
+          "tag": "chesnaught",
+          "count": 718,
+          "signal": "both",
+          "assigned_categories": [
+            "franchise_series",
+            "body_type"
+          ],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.4713
+            ],
+            [
+              "body_type",
+              0.406
+            ],
+            [
+              "gender",
+              0.0226
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "franchise_series",
+              0.7856
+            ],
+            [
+              "gender",
+              0.0377
+            ],
+            [
+              "resolution",
+              0.0181
+            ]
+          ],
+          "top_wiki": [
+            [
+              "body_type",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 7
+        },
+        {
+          "tag": "</3",
+          "count": 712,
+          "signal": "wiki_only",
+          "assigned_categories": [
+            "expression_detail",
+            "pose_action_detail"
+          ],
+          "top_fused": [
+            [
+              "expression_detail",
+              0.6667
+            ],
+            [
+              "pose_action_detail",
+              0.3333
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [
+            [
+              "expression_detail",
+              0.6667
+            ],
+            [
+              "pose_action_detail",
+              0.3333
+            ],
+            [
+              "text",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 3,
+          "wiki_link_count": 7
+        }
+      ],
+      "hold": [
+        {
+          "tag": "helmet",
+          "count": 24793,
+          "signal": "both",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "clothing_detail",
+              0.2934
+            ],
+            [
+              "body_decor",
+              0.0901
+            ],
+            [
+              "resolution",
+              0.0744
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "resolution",
+              0.124
+            ],
+            [
+              "count",
+              0.1153
+            ],
+            [
+              "body_type",
+              0.0904
+            ]
+          ],
+          "top_wiki": [
+            [
+              "clothing_detail",
+              0.7143
+            ],
+            [
+              "franchise_series",
+              0.1429
+            ],
+            [
+              "body_decor",
+              0.1429
+            ]
+          ],
+          "wiki_vote_count": 7,
+          "wiki_link_count": 87
+        },
+        {
+          "tag": "poster",
+          "count": 6434,
+          "signal": "both",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "organization",
+              0.4334
+            ],
+            [
+              "objects_props",
+              0.1036
+            ],
+            [
+              "pose_action_detail",
+              0.0671
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "objects_props",
+              0.1727
+            ],
+            [
+              "pose_action_detail",
+              0.1119
+            ],
+            [
+              "background_composition",
+              0.078
+            ]
+          ],
+          "top_wiki": [
+            [
+              "organization",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 19
+        },
+        {
+          "tag": "bottomless_female",
+          "count": 4337,
+          "signal": "both",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "clothing_detail",
+              0.4343
+            ],
+            [
+              "pose_action_detail",
+              0.0644
+            ],
+            [
+              "gaze_detail",
+              0.0573
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "pose_action_detail",
+              0.1074
+            ],
+            [
+              "gaze_detail",
+              0.0954
+            ],
+            [
+              "expression_detail",
+              0.0898
+            ]
+          ],
+          "top_wiki": [
+            [
+              "clothing_detail",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 11,
+          "wiki_link_count": 17
+        },
+        {
+          "tag": "guardians_of_the_galaxy",
+          "count": 3013,
+          "signal": "tfidf_only",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "style",
+              0.0935
+            ],
+            [
+              "pose_action_detail",
+              0.0824
+            ],
+            [
+              "expression_detail",
+              0.0768
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "style",
+              0.0935
+            ],
+            [
+              "pose_action_detail",
+              0.0824
+            ],
+            [
+              "expression_detail",
+              0.0768
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 5
+        },
+        {
+          "tag": "barbel_(anatomy)",
+          "count": 2627,
+          "signal": "tfidf_only",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "anatomy_features",
+              0.1965
+            ],
+            [
+              "color_markings",
+              0.1306
+            ],
+            [
+              "perspective",
+              0.0843
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "anatomy_features",
+              0.1965
+            ],
+            [
+              "color_markings",
+              0.1306
+            ],
+            [
+              "perspective",
+              0.0843
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 5
+        },
+        {
+          "tag": "millie_(helluva_boss)",
+          "count": 2009,
+          "signal": "both",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "clothing_detail",
+              0.4248
+            ],
+            [
+              "expression_detail",
+              0.0566
+            ],
+            [
+              "text",
+              0.0562
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "expression_detail",
+              0.0944
+            ],
+            [
+              "text",
+              0.0936
+            ],
+            [
+              "body_decor",
+              0.09
+            ]
+          ],
+          "top_wiki": [
+            [
+              "clothing_detail",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 13
+        },
+        {
+          "tag": "hill",
+          "count": 1443,
+          "signal": "tfidf_only",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "background_composition",
+              0.1306
+            ],
+            [
+              "objects_props",
+              0.0996
+            ],
+            [
+              "style",
+              0.0904
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "background_composition",
+              0.1306
+            ],
+            [
+              "objects_props",
+              0.0996
+            ],
+            [
+              "style",
+              0.0904
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 3
+        },
+        {
+          "tag": "electric_fan",
+          "count": 1093,
+          "signal": "tfidf_only",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "objects_props",
+              0.1294
+            ],
+            [
+              "pose_action_detail",
+              0.0944
+            ],
+            [
+              "expression_detail",
+              0.0939
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "objects_props",
+              0.1294
+            ],
+            [
+              "pose_action_detail",
+              0.0944
+            ],
+            [
+              "expression_detail",
+              0.0939
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 4
+        },
+        {
+          "tag": "gammamon",
+          "count": 1023,
+          "signal": "both",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "franchise_series",
+              0.4428
+            ],
+            [
+              "body_type",
+              0.0619
+            ],
+            [
+              "gender",
+              0.0359
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "body_type",
+              0.1032
+            ],
+            [
+              "franchise_series",
+              0.0714
+            ],
+            [
+              "gender",
+              0.0598
+            ]
+          ],
+          "top_wiki": [
+            [
+              "franchise_series",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 2,
+          "wiki_link_count": 13
+        },
+        {
+          "tag": "zazush-una",
+          "count": 971,
+          "signal": "none",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "radio",
+          "count": 842,
+          "signal": "tfidf_only",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "objects_props",
+              0.2006
+            ],
+            [
+              "pose_action_detail",
+              0.1172
+            ],
+            [
+              "background_composition",
+              0.0925
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "objects_props",
+              0.2006
+            ],
+            [
+              "pose_action_detail",
+              0.1172
+            ],
+            [
+              "background_composition",
+              0.0925
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 5
+        },
+        {
+          "tag": "by_bambii_dog",
+          "count": 811,
+          "signal": "none",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "cabin",
+          "count": 694,
+          "signal": "tfidf_only",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "style",
+              0.0748
+            ],
+            [
+              "organization",
+              0.074
+            ],
+            [
+              "background_composition",
+              0.0733
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "style",
+              0.0748
+            ],
+            [
+              "organization",
+              0.074
+            ],
+            [
+              "background_composition",
+              0.0733
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "by_luckypan",
+          "count": 681,
+          "signal": "none",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "silverstream_(mlp)",
+          "count": 635,
+          "signal": "tfidf_only",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "body_type",
+              0.37
+            ],
+            [
+              "resolution",
+              0.141
+            ],
+            [
+              "count",
+              0.0959
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "body_type",
+              0.37
+            ],
+            [
+              "resolution",
+              0.141
+            ],
+            [
+              "count",
+              0.0959
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 13
+        },
+        {
+          "tag": "by_angrboda",
+          "count": 622,
+          "signal": "none",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "glistening_arms",
+          "count": 599,
+          "signal": "tfidf_only",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "color_markings",
+              0.3229
+            ],
+            [
+              "gaze_detail",
+              0.196
+            ],
+            [
+              "anatomy_features",
+              0.1557
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "color_markings",
+              0.3229
+            ],
+            [
+              "gaze_detail",
+              0.196
+            ],
+            [
+              "anatomy_features",
+              0.1557
+            ]
+          ],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "by_evilymasterful",
+          "count": 571,
+          "signal": "none",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "by_0laffson",
+          "count": 563,
+          "signal": "none",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ],
+            [
+              "background_composition",
+              0.0
+            ]
+          ],
+          "top_tfidf": [],
+          "top_wiki": [],
+          "wiki_vote_count": 0,
+          "wiki_link_count": 0
+        },
+        {
+          "tag": "daffy_duck",
+          "count": 562,
+          "signal": "both",
+          "assigned_categories": [],
+          "top_fused": [
+            [
+              "body_type",
+              0.4204
+            ],
+            [
+              "objects_props",
+              0.0542
+            ],
+            [
+              "species",
+              0.0486
+            ]
+          ],
+          "top_tfidf": [
+            [
+              "objects_props",
+              0.0904
+            ],
+            [
+              "species",
+              0.081
+            ],
+            [
+              "pose_action_detail",
+              0.0674
+            ]
+          ],
+          "top_wiki": [
+            [
+              "body_type",
+              1.0
+            ],
+            [
+              "text",
+              0.0
+            ],
+            [
+              "franchise_series",
+              0.0
+            ]
+          ],
+          "wiki_vote_count": 1,
+          "wiki_link_count": 6
+        }
+      ]
+    }
+  }
+}

data/runtime_metrics/ui_pipeline_timings.jsonl CHANGED Viewed

@@ -1,3 +1,16 @@
 {"timestamp_utc": "2026-03-02T12:44:26Z", "stages_s": {"preprocess": 7.90999984019436e-05, "rewrite": 1.9136111999978311, "structural": 1.0946640000038315, "probe": 0.5859509000001708, "retrieval": 4.595289600001706, "selection": 37.53351300000213, "implication_expansion": 0.15133090000017546, "prompt_composition": 6.299999949987978e-05, "group_display": 0.04701460000069346}, "total_s": 45.927563900004316, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
 {"timestamp_utc": "2026-03-02T16:08:08Z", "stages_s": {"preprocess": 6.989999383222312e-05, "rewrite": 3.0064916999981506, "structural": 4.2000028770416975e-06, "probe": 3.01228209999681, "retrieval": 3.3860946000058902, "selection": 5.285027000005357, "implication_expansion": 0.147530000002007, "prompt_composition": 3.850000211969018e-05, "group_display": 0.10624819999793544}, "total_s": 14.949083599989535, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
 {"timestamp_utc": "2026-03-02T16:08:37Z", "stages_s": {"preprocess": 7.179999374784529e-05, "rewrite": 4.608368299988797, "structural": 3.6999990697950125e-06, "probe": 1.5999976312741637e-06, "retrieval": 3.4574174999870593, "selection": 8.8562099999981, "implication_expansion": 0.14937499999359716, "prompt_composition": 3.650000144261867e-05, "group_display": 0.04632819999824278}, "total_s": 17.122792900001514, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}

 {"timestamp_utc": "2026-03-02T12:44:26Z", "stages_s": {"preprocess": 7.90999984019436e-05, "rewrite": 1.9136111999978311, "structural": 1.0946640000038315, "probe": 0.5859509000001708, "retrieval": 4.595289600001706, "selection": 37.53351300000213, "implication_expansion": 0.15133090000017546, "prompt_composition": 6.299999949987978e-05, "group_display": 0.04701460000069346}, "total_s": 45.927563900004316, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
 {"timestamp_utc": "2026-03-02T16:08:08Z", "stages_s": {"preprocess": 6.989999383222312e-05, "rewrite": 3.0064916999981506, "structural": 4.2000028770416975e-06, "probe": 3.01228209999681, "retrieval": 3.3860946000058902, "selection": 5.285027000005357, "implication_expansion": 0.147530000002007, "prompt_composition": 3.850000211969018e-05, "group_display": 0.10624819999793544}, "total_s": 14.949083599989535, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
 {"timestamp_utc": "2026-03-02T16:08:37Z", "stages_s": {"preprocess": 7.179999374784529e-05, "rewrite": 4.608368299988797, "structural": 3.6999990697950125e-06, "probe": 1.5999976312741637e-06, "retrieval": 3.4574174999870593, "selection": 8.8562099999981, "implication_expansion": 0.14937499999359716, "prompt_composition": 3.650000144261867e-05, "group_display": 0.04632819999824278}, "total_s": 17.122792900001514, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-06T21:33:29Z", "stages_s": {"preprocess": 9.789998875930905e-05, "rewrite": 7.193461999995634, "structural": 3.3999676816165447e-06, "probe": 0.9721586999949068, "retrieval": 2.3267829000251368, "selection": 1.0979214000399224, "implication_expansion": 0.2668229000410065, "prompt_composition": 3.819999983534217e-05, "group_display": 0.08292249997612089}, "total_s": 11.945365399995353, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-06T21:38:28Z", "stages_s": {"preprocess": 1.9300030544400215e-05, "rewrite": 1.5391526999883354, "structural": 0.5504020000225864, "probe": 0.2567070999648422, "retrieval": 0.5546861999901012, "selection": 10.549223000009079, "implication_expansion": 3.8300000596791506e-05, "prompt_composition": 3.0399998649954796e-05, "group_display": 0.025673600030131638}, "total_s": 13.487254999985453, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-06T22:33:39Z", "stages_s": {"preprocess": 0.00016080000204965472, "rewrite": 1.9639222000259906, "structural": 0.7869719000300393, "probe": 0.503746600006707, "retrieval": 2.3870767999906093, "selection": 1.7242823000415228, "implication_expansion": 0.2691484999959357, "prompt_composition": 4.0899962186813354e-05, "group_display": 0.07882960001006722}, "total_s": 7.719753100012895, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-06T22:44:28Z", "stages_s": {"preprocess": 7.639999967068434e-05, "rewrite": 2.190264799981378, "structural": 0.5781105000060052, "probe": 0.23918199999025092, "retrieval": 2.4492038000025786, "selection": 0.4502618000260554, "implication_expansion": 0.1491194000118412, "prompt_composition": 3.100000321865082e-05, "group_display": 0.08213570003863424}, "total_s": 6.144094199989922, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-06T22:45:12Z", "stages_s": {"preprocess": 7.840001489967108e-05, "rewrite": 3.431444200046826, "structural": 3.400025889277458e-06, "probe": 1.3075993999955244, "retrieval": 2.425993000040762, "selection": 6.9358377999742515, "implication_expansion": 0.14671080000698566, "prompt_composition": 3.8300000596791506e-05, "group_display": 0.0784414000227116}, "total_s": 14.331572700000834, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-07T01:26:11Z", "stages_s": {"preprocess": 0.00019039999460801482, "rewrite": 1.972552299965173, "structural": 0.45487100002355874, "probe": 0.5801937999785878, "retrieval": 2.885647799994331, "selection": 2.3159614999894984, "implication_expansion": 0.27620089997071773, "prompt_composition": 3.2800016924738884e-05, "group_display": 0.07873340003425255}, "total_s": 8.597678899997845, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-07T01:37:49Z", "stages_s": {"preprocess": 0.00017070001922547817, "rewrite": 2.3397521000006236, "structural": 0.2748573999851942, "probe": 0.9656308999983594, "retrieval": 2.379494299995713, "selection": 1.9972827999736182, "implication_expansion": 0.26782700000330806, "prompt_composition": 2.889998722821474e-05, "group_display": 0.0790697000338696}, "total_s": 8.337543100002222, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-07T02:36:05Z", "stages_s": {"preprocess": 0.00019479996990412474, "rewrite": 2.3221199000254273, "structural": 0.8951652999967337, "probe": 0.9059996000141837, "retrieval": 7.6194937999825925, "selection": 10.099894999992102, "implication_expansion": 0.27516779996221885, "prompt_composition": 3.519997699186206e-05, "group_display": 0.08530839998275042}, "total_s": 22.24463780003134, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-07T02:37:16Z", "stages_s": {"preprocess": 2.0799983758479357e-05, "rewrite": 4.862703899969347, "structural": 3.8000289350748062e-06, "probe": 2.00001522898674e-06, "retrieval": 0.49216449999948964, "selection": 7.8598584000137635, "implication_expansion": 2.9799994081258774e-05, "prompt_composition": 2.4500011932104826e-05, "group_display": 0.03247090004151687}, "total_s": 13.258490999985952, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-07T02:59:49Z", "stages_s": {"preprocess": 8.230004459619522e-05, "rewrite": 1.6606152999447659, "structural": 0.6319172999938019, "probe": 0.32008590002078563, "retrieval": 2.676332700008061, "selection": 2.007969399972353, "implication_expansion": 0.26375650003319606, "prompt_composition": 3.090000245720148e-05, "group_display": 0.08008919999701902}, "total_s": 7.673184300016146, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-07T03:01:39Z", "stages_s": {"preprocess": 1.6900012269616127e-05, "rewrite": 1.713694000034593, "structural": 5.799985956400633e-06, "probe": 0.049874700023792684, "retrieval": 0.35970670002279803, "selection": 0.9267913000076078, "implication_expansion": 3.909994848072529e-05, "prompt_composition": 3.7299992982298136e-05, "group_display": 0.026757099956739694}, "total_s": 3.089661000005435, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-07T03:09:53Z", "stages_s": {"preprocess": 0.00012510002125054598, "rewrite": 2.249713899975177, "structural": 0.5107482000021264, "probe": 3.300025127828121e-06, "retrieval": 2.3757353999535553, "selection": 2.9089593999669887, "implication_expansion": 0.2682994999922812, "prompt_composition": 3.070000093430281e-05, "group_display": 0.07982710003852844}, "total_s": 8.42714020004496, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
+{"timestamp_utc": "2026-03-07T03:37:54Z", "stages_s": {"preprocess": 0.00011760002234950662, "rewrite": 1.968222199997399, "structural": 1.1845426999498159, "probe": 2.214354399999138, "retrieval": 2.452574900002219, "selection": 0.8585481999907643, "implication_expansion": 0.27041040000040084, "prompt_composition": 3.319996176287532e-05, "group_display": 0.07736879994627088}, "total_s": 9.059251800004859, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}

scripts/analyze_hybrid_category_assignment.py ADDED Viewed

	@@ -0,0 +1,502 @@

+"""Analyze hybrid category assignment for uncategorized tags.
+Implements an offline analysis pipeline (no registry mutation):
+1) TF-IDF centroid scoring over current active categories.
+2) Wiki-link graph scoring from raw wiki pages.
+3) Weighted fusion of TF-IDF and wiki signals.
+4) Optional multi-category auto-assignment when top-2 fused probabilities are strong.
+Outputs:
+  - data/analysis/hybrid_category_assignment_preview.json (default; overwritten)
+"""
+from __future__ import annotations
+import argparse
+import csv
+import json
+import random
+import re
+import sys
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Sequence, Set, Tuple
+import numpy as np
+REPO = Path(__file__).resolve().parents[1]
+if str(REPO) not in sys.path:
+    sys.path.insert(0, str(REPO))
+from psq_rag.retrieval.state import get_tfidf_tag_vectors
+# Category registry CSV read by load_registry().
+REGISTRY_CSV = REPO / "data" / "analysis" / "category_registry.csv"
+# Wiki pages CSV scanned by build_wiki_votes(); resolved directly under REPO.
+WIKI_PAGES_CSV = REPO / "wiki_pages-2023-08-08.csv"
+# Default report path; can be overridden with --out-json.
+OUT_JSON = REPO / "data" / "analysis" / "hybrid_category_assignment_preview.json"
+# category_status values whose rows are never used as active category seeds.
+SKIP_STATUSES = {"excluded", "review_bucket", "special_exclusion"}
+# Wiki links written as [[target|label]]; captures the target part.
+LINK_PIPE_RE = re.compile(r"\[\[([^\]|]+)\|[^\]]+\]\]")
+# Wiki links written as [[target]]; captures the target.
+LINK_PLAIN_RE = re.compile(r"\[\[([^\]|]+)\]\]")
+@dataclass
+class TagScoreRow:
+    """Scoring outcome for one uncategorized tag, as built by score_tags()."""
+    tag: str  # normalized tag name
+    count: int  # count looked up in the registry tag counts (0 when missing)
+    signal: str  # evidence available: "both", "tfidf_only", "wiki_only" or "none"
+    assignment: str  # decision bucket: "single", "multi" or "hold"
+    assigned_categories: List[str]  # chosen categories; empty for "hold"
+    top_fused: List[Tuple[str, float]]  # top (category, score) pairs of the fused scores
+    top_tfidf: List[Tuple[str, float]]  # top pairs of the TF-IDF scores; [] without a vector
+    top_wiki: List[Tuple[str, float]]  # top pairs of the wiki vote shares; [] without usable votes
+    wiki_vote_count: int  # total category votes gathered from the tag's wiki links
+    wiki_link_count: int  # number of distinct links extracted from the tag's wiki page
+def _normalize_tag(tok: str) -> str:
+    return (tok or "").strip().lower().replace(" ", "_")
+def _is_enabled(v: str) -> bool:
+    return str(v).strip().lower() in {"1", "true", "yes"}
+def _softmax(values: np.ndarray, temperature: float) -> np.ndarray:
+    if values.size == 0:
+        return values
+    t = max(1e-6, float(temperature))
+    shifted = (values - float(np.max(values))) / t
+    expv = np.exp(shifted)
+    denom = float(np.sum(expv))
+    if denom <= 0.0:
+        return np.zeros_like(values, dtype=np.float32)
+    return (expv / denom).astype(np.float32)
+def _topk_with_names(names: Sequence[str], arr: np.ndarray, k: int) -> List[Tuple[str, float]]:
+    if arr.size == 0:
+        return []
+    order = np.argsort(arr)[::-1][: max(1, int(k))]
+    return [(names[int(i)], float(arr[int(i)])) for i in order]
+def load_registry(
+    path: Path,
+) -> Tuple[Dict[str, Set[str]], Dict[str, Set[str]], Dict[str, int], Set[str]]:
+    """Return (active_category_to_tags, tag_to_active_categories, tag_counts, uncategorized_tags)."""
+    active_category_tags: Dict[str, Set[str]] = defaultdict(set)
+    tag_to_active_categories: Dict[str, Set[str]] = defaultdict(set)
+    tag_counts: Dict[str, int] = {}
+    uncategorized: Set[str] = set()
+    with path.open("r", encoding="utf-8", newline="") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            tag = _normalize_tag(row.get("tag") or "")
+            category = (row.get("category_name") or "").strip()
+            status = (row.get("category_status") or "").strip().lower()
+            enabled = _is_enabled(row.get("category_enabled") or "")
+            if not tag or not category:
+                continue
+            try:
+                cnt = int(float(row.get("tag_fluffyrock_count") or "0"))
+            except Exception:
+                cnt = 0
+            if tag not in tag_counts or cnt > tag_counts[tag]:
+                tag_counts[tag] = cnt
+            if category == "uncategorized_review":
+                uncategorized.add(tag)
+                continue
+            if status in SKIP_STATUSES or not enabled:
+                continue
+            active_category_tags[category].add(tag)
+            tag_to_active_categories[tag].add(category)
+    return active_category_tags, tag_to_active_categories, tag_counts, uncategorized
+def build_centroids(
+    active_category_tags: Dict[str, Set[str]],
+    tag_to_row: Dict[str, int],
+    vectors_norm: np.ndarray,
+) -> Tuple[List[str], np.ndarray, Dict[str, int]]:
+    categories: List[str] = []
+    centroids: List[np.ndarray] = []
+    seed_sizes: Dict[str, int] = {}
+    for category in sorted(active_category_tags.keys()):
+        seeds = active_category_tags[category]
+        idxs = [tag_to_row[t] for t in seeds if t in tag_to_row]
+        if len(idxs) < 2:
+            continue
+        mat = vectors_norm[idxs]
+        c = mat.mean(axis=0)
+        n = float(np.linalg.norm(c))
+        if n <= 1e-12:
+            continue
+        categories.append(category)
+        centroids.append((c / n).astype(np.float32))
+        seed_sizes[category] = len(idxs)
+    if not categories:
+        return [], np.zeros((0, 0), dtype=np.float32), {}
+    return categories, np.vstack(centroids).astype(np.float32), seed_sizes
+def _extract_links_from_body(body: str) -> List[str]:
+    links: List[str] = []
+    for tok in LINK_PIPE_RE.findall(body):
+        tag = _normalize_tag(tok)
+        if not tag or tag.startswith(("http://", "https://", "help:", "e621:", "tag_group:", "#")):
+            continue
+        links.append(tag)
+    for tok in LINK_PLAIN_RE.findall(body):
+        tag = _normalize_tag(tok)
+        if not tag or tag.startswith(("http://", "https://", "help:", "e621:", "tag_group:", "#")):
+            continue
+        links.append(tag)
+    seen: Set[str] = set()
+    deduped: List[str] = []
+    for tag in links:
+        if tag in seen:
+            continue
+        seen.add(tag)
+        deduped.append(tag)
+    return deduped
+def build_wiki_votes(
+    wiki_csv: Path,
+    uncategorized_tags: Set[str],
+    tag_to_active_categories: Dict[str, Set[str]],
+) -> Tuple[Dict[str, Counter], Set[str], Dict[str, int]]:
+    """Return (wiki_votes_by_tag, tags_with_wiki_page, wiki_link_count_by_tag)."""
+    wiki_votes: Dict[str, Counter] = {}
+    has_page: Set[str] = set()
+    link_counts: Dict[str, int] = {}
+    with wiki_csv.open("r", encoding="utf-8", newline="") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            title = _normalize_tag(row.get("title") or "")
+            if title not in uncategorized_tags:
+                continue
+            has_page.add(title)
+            body = row.get("body") or ""
+            links = _extract_links_from_body(body)
+            link_counts[title] = len(links)
+            if not links:
+                continue
+            votes = Counter()
+            for linked_tag in links:
+                cats = tag_to_active_categories.get(linked_tag)
+                if not cats:
+                    continue
+                for c in cats:
+                    votes[c] += 1
+            if votes:
+                wiki_votes[title] = votes
+    return wiki_votes, has_page, link_counts
+def score_tags(
+    *,
+    uncategorized_tags: Set[str],
+    tag_counts: Dict[str, int],
+    categories: List[str],
+    centroid_matrix: np.ndarray,
+    tag_to_row: Dict[str, int],
+    vectors_norm: np.ndarray,
+    wiki_votes: Dict[str, Counter],
+    wiki_link_counts: Dict[str, int],
+    tfidf_weight: float,
+    wiki_weight: float,
+    tfidf_temp: float,
+    single_top1_min: float,
+    single_margin_min: float,
+    single_top2_max: float,
+    multi_top1_min: float,
+    multi_top2_min: float,
+    multi_pair_min: float,
+) -> List[TagScoreRow]:
+    out: List[TagScoreRow] = []
+    cat_to_idx = {c: i for i, c in enumerate(categories)}
+    tfidf_w = max(0.0, float(tfidf_weight))
+    wiki_w = max(0.0, float(wiki_weight))
+    for tag in sorted(uncategorized_tags):
+        count = int(tag_counts.get(tag, 0))
+        tfidf_arr = None
+        wiki_arr = None
+        wiki_vote_count = 0
+        row_idx = tag_to_row.get(tag)
+        if row_idx is not None and centroid_matrix.size > 0:
+            sims = centroid_matrix @ vectors_norm[int(row_idx)]
+            tfidf_arr = _softmax(sims.astype(np.float32), temperature=tfidf_temp)
+        votes = wiki_votes.get(tag)
+        if votes:
+            wiki_vote_count = int(sum(votes.values()))
+            wiki_arr = np.zeros(len(categories), dtype=np.float32)
+            for c, n in votes.items():
+                idx = cat_to_idx.get(c)
+                if idx is not None:
+                    wiki_arr[idx] += float(n)
+            s = float(np.sum(wiki_arr))
+            if s > 0.0:
+                wiki_arr /= s
+            else:
+                wiki_arr = None
+        if tfidf_arr is not None and wiki_arr is not None:
+            fused = tfidf_w * tfidf_arr + wiki_w * wiki_arr
+            denom = float(np.sum(fused))
+            if denom > 0.0:
+                fused /= denom
+            signal = "both"
+        elif tfidf_arr is not None:
+            fused = tfidf_arr
+            signal = "tfidf_only"
+        elif wiki_arr is not None:
+            fused = wiki_arr
+            signal = "wiki_only"
+        else:
+            fused = np.zeros(len(categories), dtype=np.float32)
+            signal = "none"
+        top_fused = _topk_with_names(categories, fused, 3)
+        top_tfidf = _topk_with_names(categories, tfidf_arr, 3) if tfidf_arr is not None else []
+        top_wiki = _topk_with_names(categories, wiki_arr, 3) if wiki_arr is not None else []
+        if len(top_fused) == 0:
+            assignment = "hold"
+            assigned: List[str] = []
+        else:
+            c1, p1 = top_fused[0]
+            c2, p2 = top_fused[1] if len(top_fused) > 1 else ("", 0.0)
+            if p1 >= multi_top1_min and p2 >= multi_top2_min and (p1 + p2) >= multi_pair_min:
+                assignment = "multi"
+                assigned = [c1, c2]
+            elif p1 >= single_top1_min and ((p1 - p2) >= single_margin_min or p2 <= single_top2_max):
+                assignment = "single"
+                assigned = [c1]
+            else:
+                assignment = "hold"
+                assigned = []
+        out.append(
+            TagScoreRow(
+                tag=tag,
+                count=count,
+                signal=signal,
+                assignment=assignment,
+                assigned_categories=assigned,
+                top_fused=top_fused,
+                top_tfidf=top_tfidf,
+                top_wiki=top_wiki,
+                wiki_vote_count=wiki_vote_count,
+                wiki_link_count=int(wiki_link_counts.get(tag, 0)),
+            )
+        )
+    return out
+def summarize_rows(
+    rows: List[TagScoreRow],
+    *,
+    n_uncat_total: int,
+    n_has_tfidf: int,
+    n_wiki_page: int,
+    n_wiki_votes: int,
+    sample_size: int,
+    seed: int,
+) -> Dict[str, object]:
+    assign_counts = Counter(r.assignment for r in rows)
+    signal_counts = Counter(r.signal for r in rows)
+    remaining_uncategorized = int(assign_counts.get("hold", 0))
+    newly_categorized = int(assign_counts.get("single", 0) + assign_counts.get("multi", 0))
+    multi_category_additions = int(sum(len(r.assigned_categories) for r in rows if r.assignment == "multi"))
+    single_by_category = Counter(
+        r.assigned_categories[0] for r in rows if r.assignment == "single" and r.assigned_categories
+    )
+    multi_pairs = Counter(
+        tuple(sorted(r.assigned_categories[:2])) for r in rows if r.assignment == "multi" and len(r.assigned_categories) >= 2
+    )
+    rng = random.Random(int(seed))
+    def sample_assignment(kind: str) -> List[Dict[str, object]]:
+        pool = [r for r in rows if r.assignment == kind]
+        if not pool:
+            return []
+        n = min(int(sample_size), len(pool))
+        picks = rng.sample(pool, n)
+        out: List[Dict[str, object]] = []
+        for r in sorted(picks, key=lambda x: (-x.count, x.tag)):
+            out.append(
+                {
+                    "tag": r.tag,
+                    "count": r.count,
+                    "signal": r.signal,
+                    "assigned_categories": r.assigned_categories,
+                    "top_fused": [(c, round(p, 4)) for c, p in r.top_fused],
+                    "top_tfidf": [(c, round(p, 4)) for c, p in r.top_tfidf],
+                    "top_wiki": [(c, round(p, 4)) for c, p in r.top_wiki],
+                    "wiki_vote_count": r.wiki_vote_count,
+                    "wiki_link_count": r.wiki_link_count,
+                }
+            )
+        return out
+    return {
+        "counts": {
+            "uncategorized_total": int(n_uncat_total),
+            "scored_rows": int(len(rows)),
+            "has_tfidf_vector": int(n_has_tfidf),
+            "has_wiki_page": int(n_wiki_page),
+            "has_wiki_category_votes": int(n_wiki_votes),
+            "signals": dict(signal_counts),
+            "assignments": dict(assign_counts),
+            "newly_categorized": newly_categorized,
+            "remaining_uncategorized": remaining_uncategorized,
+            "multi_category_additions": multi_category_additions,
+        },
+        "top_single_categories": single_by_category.most_common(20),
+        "top_multi_category_pairs": [
+            {"categories": list(pair), "count": int(cnt)} for pair, cnt in multi_pairs.most_common(20)
+        ],
+        "samples": {
+            "single": sample_assignment("single"),
+            "multi": sample_assignment("multi"),
+            "hold": sample_assignment("hold"),
+        },
+    }
+def parse_args() -> argparse.Namespace:
+    ap = argparse.ArgumentParser(description="Analyze hybrid TF-IDF + wiki category assignment for uncategorized tags.")
+    ap.add_argument("--tfidf-weight", type=float, default=0.6, help="Weight for TF-IDF centroid probabilities.")
+    ap.add_argument("--wiki-weight", type=float, default=0.4, help="Weight for wiki-link graph probabilities.")
+    ap.add_argument("--tfidf-temp", type=float, default=0.08, help="Softmax temperature for TF-IDF similarities.")
+    ap.add_argument("--single-top1-min", type=float, default=0.55, help="Single-assign threshold: top1 probability min.")
+    ap.add_argument("--single-margin-min", type=float, default=0.18, help="Single-assign threshold: top1-top2 margin min.")
+    ap.add_argument("--single-top2-max", type=float, default=0.35, help="Single-assign threshold: top2 probability max.")
+    ap.add_argument("--multi-top1-min", type=float, default=0.42, help="Multi-assign threshold: top1 probability min.")
+    ap.add_argument("--multi-top2-min", type=float, default=0.30, help="Multi-assign threshold: top2 probability min.")
+    ap.add_argument("--multi-pair-min", type=float, default=0.78, help="Multi-assign threshold: (top1+top2) min.")
+    ap.add_argument("--sample-size", type=int, default=20, help="Random examples per assignment bucket.")
+    ap.add_argument("--seed", type=int, default=42, help="Random seed for examples.")
+    ap.add_argument(
+        "--out-json",
+        type=Path,
+        default=OUT_JSON,
+        help="Output JSON report (overwritten each run).",
+    )
+    return ap.parse_args()
+def main() -> None:
+    args = parse_args()
+    active_category_tags, tag_to_active_categories, tag_counts, uncategorized = load_registry(REGISTRY_CSV)
+    vectors = get_tfidf_tag_vectors()
+    vectors_norm = vectors["reduced_matrix_norm"]
+    tag_to_row = vectors["tag_to_row_index"]
+    categories, centroid_matrix, seed_sizes = build_centroids(active_category_tags, tag_to_row, vectors_norm)
+    if not categories:
+        raise RuntimeError("No centroids available from active categories. Check category registry content.")
+    wiki_votes, has_wiki_page, wiki_link_counts = build_wiki_votes(
+        WIKI_PAGES_CSV,
+        uncategorized_tags=uncategorized,
+        tag_to_active_categories=tag_to_active_categories,
+    )
+    rows = score_tags(
+        uncategorized_tags=uncategorized,
+        tag_counts=tag_counts,
+        categories=categories,
+        centroid_matrix=centroid_matrix,
+        tag_to_row=tag_to_row,
+        vectors_norm=vectors_norm,
+        wiki_votes=wiki_votes,
+        wiki_link_counts=wiki_link_counts,
+        tfidf_weight=args.tfidf_weight,
+        wiki_weight=args.wiki_weight,
+        tfidf_temp=args.tfidf_temp,
+        single_top1_min=args.single_top1_min,
+        single_margin_min=args.single_margin_min,
+        single_top2_max=args.single_top2_max,
+        multi_top1_min=args.multi_top1_min,
+        multi_top2_min=args.multi_top2_min,
+        multi_pair_min=args.multi_pair_min,
+    )
+    n_has_tfidf = sum(1 for t in uncategorized if t in tag_to_row)
+    summary = summarize_rows(
+        rows,
+        n_uncat_total=len(uncategorized),
+        n_has_tfidf=n_has_tfidf,
+        n_wiki_page=len(has_wiki_page),
+        n_wiki_votes=len(wiki_votes),
+        sample_size=args.sample_size,
+        seed=args.seed,
+    )
+    report = {
+        "config": {
+            "tfidf_weight": args.tfidf_weight,
+            "wiki_weight": args.wiki_weight,
+            "tfidf_temp": args.tfidf_temp,
+            "single_top1_min": args.single_top1_min,
+            "single_margin_min": args.single_margin_min,
+            "single_top2_max": args.single_top2_max,
+            "multi_top1_min": args.multi_top1_min,
+            "multi_top2_min": args.multi_top2_min,
+            "multi_pair_min": args.multi_pair_min,
+            "sample_size": args.sample_size,
+            "seed": args.seed,
+        },
+        "inputs": {
+            "registry_csv": str(REGISTRY_CSV),
+            "wiki_pages_csv": str(WIKI_PAGES_CSV),
+            "uncategorized_tags": len(uncategorized),
+            "active_categories_for_centroids": len(categories),
+            "centroid_seed_sizes": seed_sizes,
+        },
+        "summary": summary,
+    }
+    args.out_json.parent.mkdir(parents=True, exist_ok=True)
+    args.out_json.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
+    counts = summary["counts"]
+    print("Hybrid category assignment analysis complete")
+    print(f"Active categories (centroids): {len(categories)}")
+    print(
+        "Signals: "
+        f"tfidf={counts['has_tfidf_vector']} "
+        f"wiki_page={counts['has_wiki_page']} "
+        f"wiki_votes={counts['has_wiki_category_votes']}"
+    )
+    print(f"Assignments: {counts['assignments']}")
+    print(f"Remaining uncategorized: {counts['remaining_uncategorized']}")
+    print(f"Wrote: {args.out_json}")
+if __name__ == "__main__":
+    main()