Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Claude committed on Feb 10

Commit

349b999

1 Parent(s): df66964

Add diagnostic eval metrics, why-distribution tracking, and generic character filter

- select.py: return_metadata option exposes per-tag 'why' rationale from LLM
- select.py: Route generic character-category tags (fan_character, viewer, etc.)
to general pipeline instead of entity pipeline to reduce false positives
- eval_pipeline.py: New metrics — retrieval precision, selection-given-retrieval,
over-selection ratio, and why distribution breakdown in summary output
- eval_pipeline.py: All new metrics saved to JSONL output for analysis

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

Files changed (2) hide show

psq_rag/llm/select.py +29 -2
scripts/eval_pipeline.py +53 -1

psq_rag/llm/select.py CHANGED Viewed

@@ -23,6 +23,19 @@ from rapidfuzz import fuzz
 from psq_rag.retrieval.psq_retrieval import Candidate  # Candidate(tag, score_*, count, sources)
 from psq_rag.retrieval.state import get_tag_type_name, get_tag2aliases
 WHY_ENUM = ["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]
@@ -334,11 +347,18 @@ def _split_candidates_by_type(
     unknown_count = 0
     copyright_count = 0
     for idx, cand in enumerate(candidates):
         type_name = get_tag_type_name(cand.tag)
         if type_name == "character":
-            entity_with_idx.append((idx, cand))
         elif type_name == "copyright":
             # Filter out copyright/series tags - too broad for image generation
             copyright_count += 1
@@ -355,6 +375,7 @@ def _split_candidates_by_type(
             f"general={len(general_with_idx)} "
             f"entity={len(entity_with_idx)} "
             f"copyright_filtered={copyright_count} "
             f"unknown_type={unknown_count}"
         )
@@ -462,7 +483,8 @@ def llm_select_indices(
     per_phrase_k: int = 2,                 # per-call budget = per_phrase_k * phrases_in_call
     temperature: float = 0.0,
     max_tokens: int = 512,
-) -> List[int]:
     """Return indices into the ORIGINAL candidates list (legacy interface).
     This implementation uses LangChain ONLY.
@@ -704,8 +726,13 @@ def llm_select_indices(
     # Map back to original indices
     out_idx: List[int] = []
     for t in ordered_tags:
         if t in tag_to_first_index:
             out_idx.append(tag_to_first_index[t])
     return out_idx

 from psq_rag.retrieval.psq_retrieval import Candidate  # Candidate(tag, score_*, count, sources)
 from psq_rag.retrieval.state import get_tag_type_name, get_tag2aliases
+# Character-typed tags that are generic categories, not actual named characters.
+# These leak through the alias filter because they match common words in captions.
+# They are excluded from the entity pipeline and instead routed to general selection.
+_GENERIC_CHARACTER_TAGS = frozenset({
+    "fan_character",
+    "background_character",
+    "unnamed_character",
+    "unknown_character",
+    "anonymous_character",
+    "viewer",
+    "original_character",
+})
 WHY_ENUM = ["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]
     unknown_count = 0
     copyright_count = 0
+    generic_char_count = 0
     for idx, cand in enumerate(candidates):
         type_name = get_tag_type_name(cand.tag)
         if type_name == "character":
+            if cand.tag in _GENERIC_CHARACTER_TAGS:
+                # Route generic character-category tags to general selection
+                general_with_idx.append((idx, cand))
+                generic_char_count += 1
+            else:
+                entity_with_idx.append((idx, cand))
         elif type_name == "copyright":
             # Filter out copyright/series tags - too broad for image generation
             copyright_count += 1
             f"general={len(general_with_idx)} "
             f"entity={len(entity_with_idx)} "
             f"copyright_filtered={copyright_count} "
+            f"generic_char_to_general={generic_char_count} "
             f"unknown_type={unknown_count}"
         )
     per_phrase_k: int = 2,                 # per-call budget = per_phrase_k * phrases_in_call
     temperature: float = 0.0,
     max_tokens: int = 512,
+    return_metadata: bool = False,
+) -> Union[List[int], Tuple[List[int], Dict[str, str]]]:
     """Return indices into the ORIGINAL candidates list (legacy interface).
     This implementation uses LangChain ONLY.
     # Map back to original indices
     out_idx: List[int] = []
+    tag_why: Dict[str, str] = {}
     for t in ordered_tags:
         if t in tag_to_first_index:
             out_idx.append(tag_to_first_index[t])
+            tag_why[t] = best[t][1]  # why string
+    if return_metadata:
+        return out_idx, tag_why
     return out_idx

scripts/eval_pipeline.py CHANGED Viewed

@@ -127,6 +127,12 @@ class SampleResult:
     general_precision: float = 0.0
     general_recall: float = 0.0
     general_f1: float = 0.0
     # Timing
     stage1_time: float = 0.0
     stage2_time: float = 0.0
@@ -233,7 +239,7 @@ def _process_one_sample(
         # --- Stage 3: LLM Selection ---
         t0 = time.time()
-        picked_indices = llm_select_indices(
             query_text=caption,
             candidates=candidates,
             max_pick=0,
@@ -243,17 +249,34 @@ def _process_one_sample(
             per_phrase_k=per_phrase_k,
             temperature=temperature,
             max_tokens=max_tokens,
         )
         result.stage3_time = time.time() - t0
         result.selected_tags = {candidates[idx].tag for idx in picked_indices} if picked_indices else set()
         # Overall selection metrics
         p, r, f1 = _compute_metrics(result.selected_tags, gt_tags)
         result.selection_precision = p
         result.selection_recall = r
         result.selection_f1 = f1
         # Split ground-truth and selected tags by type
         gt_char, gt_gen = _classify_tags(gt_tags, get_tag_type_name)
         sel_char, sel_gen = _classify_tags(result.selected_tags, get_tag_type_name)
@@ -456,6 +479,11 @@ def print_summary(results: List[SampleResult]) -> None:
     print("Stage 2 - Retrieval:")
     print(f"  Avg recall@300:       {avg_retrieval_recall:.4f}")
     print(f"  Avg candidates:       {avg_retrieved:.1f}")
     print()
     print("Stage 3 - Selection (ALL tags):")
     print(f"  Avg precision:        {avg_sel_precision:.4f}")
@@ -463,6 +491,25 @@ def print_summary(results: List[SampleResult]) -> None:
     print(f"  Avg F1:               {avg_sel_f1:.4f}")
     print(f"  Avg selected tags:    {avg_selected:.1f}")
     print(f"  Avg ground-truth tags:{avg_gt:.1f}")
     # --- Character tag breakdown ---
     # Only include samples that actually have character tags in ground truth
@@ -678,6 +725,11 @@ def main(argv=None) -> int:
                 "general_precision": round(r.general_precision, 4),
                 "general_recall": round(r.general_recall, 4),
                 "general_f1": round(r.general_f1, 4),
                 # Timing
                 "stage1_time": round(r.stage1_time, 3),
                 "stage2_time": round(r.stage2_time, 3),

     general_precision: float = 0.0
     general_recall: float = 0.0
     general_f1: float = 0.0
+    # New diagnostic metrics
+    retrieval_precision: float = 0.0       # |retrieved ∩ gt| / |retrieved|
+    selection_given_retrieval: float = 0.0  # |selected ∩ gt| / |retrieved ∩ gt|
+    over_selection_ratio: float = 0.0       # |selected| / |gt|
+    # Why distribution (from Stage 3 LLM)
+    why_counts: Dict[str, int] = field(default_factory=dict)
     # Timing
     stage1_time: float = 0.0
     stage2_time: float = 0.0
         # --- Stage 3: LLM Selection ---
         t0 = time.time()
+        picked_indices, tag_why = llm_select_indices(
             query_text=caption,
             candidates=candidates,
             max_pick=0,
             per_phrase_k=per_phrase_k,
             temperature=temperature,
             max_tokens=max_tokens,
+            return_metadata=True,
         )
         result.stage3_time = time.time() - t0
         result.selected_tags = {candidates[idx].tag for idx in picked_indices} if picked_indices else set()
+        # Why distribution
+        why_counts: Dict[str, int] = {}
+        for w in tag_why.values():
+            why_counts[w] = why_counts.get(w, 0) + 1
+        result.why_counts = why_counts
         # Overall selection metrics
         p, r, f1 = _compute_metrics(result.selected_tags, gt_tags)
         result.selection_precision = p
         result.selection_recall = r
         result.selection_f1 = f1
+        # New diagnostic metrics
+        retrieved_and_gt = result.retrieved_tags & gt_tags
+        selected_and_gt = result.selected_tags & gt_tags
+        if result.retrieved_tags:
+            result.retrieval_precision = len(retrieved_and_gt) / len(result.retrieved_tags)
+        if retrieved_and_gt:
+            result.selection_given_retrieval = len(selected_and_gt) / len(retrieved_and_gt)
+        if gt_tags:
+            result.over_selection_ratio = len(result.selected_tags) / len(gt_tags)
         # Split ground-truth and selected tags by type
         gt_char, gt_gen = _classify_tags(gt_tags, get_tag_type_name)
         sel_char, sel_gen = _classify_tags(result.selected_tags, get_tag_type_name)
     print("Stage 2 - Retrieval:")
     print(f"  Avg recall@300:       {avg_retrieval_recall:.4f}")
     print(f"  Avg candidates:       {avg_retrieved:.1f}")
+    avg_retrieval_precision = _safe_avg([r.retrieval_precision for r in valid])
+    avg_sel_given_ret = _safe_avg([r.selection_given_retrieval for r in valid
+                                   if (r.retrieved_tags & r.ground_truth_tags)])
+    avg_over_sel = _safe_avg([r.over_selection_ratio for r in valid])
     print()
     print("Stage 3 - Selection (ALL tags):")
     print(f"  Avg precision:        {avg_sel_precision:.4f}")
     print(f"  Avg F1:               {avg_sel_f1:.4f}")
     print(f"  Avg selected tags:    {avg_selected:.1f}")
     print(f"  Avg ground-truth tags:{avg_gt:.1f}")
+    print()
+    print("Diagnostic Metrics:")
+    print(f"  Retrieval precision:  {avg_retrieval_precision:.4f}  (|ret∩gt|/|ret|, noise level fed to Stage 3)")
+    print(f"  Sel-given-retrieval:  {avg_sel_given_ret:.4f}  (of gt tags retrieved, fraction kept by Stage 3)")
+    print(f"  Over-selection ratio: {avg_over_sel:.2f}x  (|selected|/|gt|, ideal ~1.0)")
+    # Why distribution across all samples
+    total_why: Dict[str, int] = {}
+    for r in valid:
+        for w, cnt in r.why_counts.items():
+            total_why[w] = total_why.get(w, 0) + cnt
+    if total_why:
+        total_selections = sum(total_why.values())
+        print()
+        print("Why Distribution (Stage 3 LLM rationale):")
+        for w in ["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]:
+            cnt = total_why.get(w, 0)
+            pct = 100 * cnt / total_selections if total_selections else 0
+            print(f"  {w:20s} {cnt:4d}  ({pct:5.1f}%)")
     # --- Character tag breakdown ---
     # Only include samples that actually have character tags in ground truth
                 "general_precision": round(r.general_precision, 4),
                 "general_recall": round(r.general_recall, 4),
                 "general_f1": round(r.general_f1, 4),
+                # Diagnostic metrics
+                "retrieval_precision": round(r.retrieval_precision, 4),
+                "selection_given_retrieval": round(r.selection_given_retrieval, 4),
+                "over_selection_ratio": round(r.over_selection_ratio, 2),
+                "why_counts": r.why_counts,
                 # Timing
                 "stage1_time": round(r.stage1_time, 3),
                 "stage2_time": round(r.stage2_time, 3),