Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Claude committed on Feb 14

Commit

0ed7e94

1 Parent(s): 7261188

Add ranking metrics infrastructure to eval pipeline

Implements Recall@K, Precision@K, and MRR for categorized suggestions.

Changes to eval_pipeline.py:
- Add categorized_suggestions field to SampleResult
- Generate categorized suggestions after LLM selection
- Store top-20 suggestions per category in detail results

Changes to eval_categorized.py:
- Compute ranking metrics from categorized_suggestions
- Show Recall@K, Precision@K, MRR alongside P/R/F1
- Track coverage (how many GT tags appear in suggestions)
- Gracefully handle old results without categorized_suggestions

Ranking metrics:
- Recall@K: Fraction of ground truth tags found in top-K suggestions
- Precision@K: Fraction of top-K suggestions that are correct
- MRR: Mean reciprocal rank (1/rank) of the GT tags found in the suggestion list, averaged over samples with at least one hit
- Coverage: Number of GT tags that appear anywhere in the suggestions, reported as found/total

To generate results with ranking metrics:
python scripts/eval_pipeline.py --n 50 --expand-implications

To evaluate with ranking metrics:
python scripts/eval_categorized.py --results <results.jsonl> --k 5

https://claude.ai/code/session_015ZwE7a5E6YVTrMpuB2pXX7

Files changed (2) hide show

scripts/eval_categorized.py +61 -13
scripts/eval_pipeline.py +30 -0

scripts/eval_categorized.py CHANGED Viewed

@@ -109,9 +109,11 @@ class CategoryMetrics:
     # Ranking metrics (for suggestions)
     total_gt_tags: int = 0  # Total ground truth tags across all samples
-    recall_at_k: float = 0.0  # How many GT tags found in top-K
-    precision_at_k: float = 0.0  # How many top-K are correct
     mrr: float = 0.0  # Mean reciprocal rank
     @property
     def precision(self) -> float:
@@ -217,13 +219,42 @@ def compute_category_metrics(
             cat_metric.total_gt_tags += len(gt_cat_tags)
-            # TODO: Ranking metrics (Recall@K, Precision@K, MRR)
-            # These require ranked suggestions per category in the eval results.
-            # Current eval_pipeline.py only outputs binary predictions (selected_tags).
-            # To add ranking metrics, we need to:
-            # 1. Modify eval_pipeline to generate categorized suggestions
-            # 2. Store top-K suggestions per category in results
-            # 3. Compute rank of each GT tag in the suggestion list
     return metrics
@@ -231,9 +262,17 @@ def compute_category_metrics(
 def print_category_metrics(
     metrics: Dict[str, CategoryMetrics],
     categories: Dict[str, TagCategory],
 ):
     """
     Print metrics organized by importance.
     """
     # Group by importance level
     by_importance = defaultdict(list)
@@ -257,15 +296,24 @@ def print_category_metrics(
             print(f"  Constraint: {category.constraint.value}")
             print(f"  Ground truth tags: {cat_metric.total_gt_tags}")
-            # For EXACTLY_ONE, show accuracy
             if category.constraint.value == "exactly_one":
                 print(f"  Accuracy:  {cat_metric.accuracy:.3f}")
-            # For others, show P/R/F1
             print(f"  Precision: {cat_metric.precision:.3f}")
             print(f"  Recall:    {cat_metric.recall:.3f}")
             print(f"  F1:        {cat_metric.f1:.3f}")
             # Show raw counts for debugging
             print(f"  (TP={cat_metric.tp}, FP={cat_metric.fp}, FN={cat_metric.fn}, TN={cat_metric.tn})")
@@ -374,7 +422,7 @@ def main():
     )
     # Print results
-    print_category_metrics(metrics, categories)
 if __name__ == "__main__":

     # Ranking metrics (for suggestions)
     total_gt_tags: int = 0  # Total ground truth tags across all samples
+    found_in_suggestions: int = 0  # GT tags that appear anywhere in suggestions
+    recall_at_k: float = 0.0  # Fraction of GT tags found in top-K
+    precision_at_k: float = 0.0  # Fraction of top-K that are correct
     mrr: float = 0.0  # Mean reciprocal rank
+    mrr_count: int = 0  # Number of GT tags used for MRR calculation
     @property
     def precision(self) -> float:
             cat_metric.total_gt_tags += len(gt_cat_tags)
+            # Ranking metrics (if categorized_suggestions are available)
+            categorized_suggestions = result.get('categorized_suggestions', {})
+            cat_suggestions = categorized_suggestions.get(cat_name, [])
+            if cat_suggestions and gt_cat_tags:
+                # Convert to dict for easier lookup: {tag: rank}
+                # Suggestions are already sorted by score, so index = rank (0-indexed)
+                suggestion_ranks = {tag: rank for rank, (tag, score) in enumerate(cat_suggestions)}
+                # Count how many GT tags appear in suggestions (at any rank)
+                found_count = sum(1 for gt_tag in gt_cat_tags if gt_tag in suggestion_ranks)
+                cat_metric.found_in_suggestions += found_count
+                # Recall@K: fraction of GT tags in top-K
+                top_k_tags = {tag for tag, score in cat_suggestions[:k]}
+                recall_at_k_count = len(gt_cat_tags & top_k_tags)
+                # Precision@K: fraction of top-K that are in GT
+                if len(top_k_tags) > 0:
+                    precision_at_k_count = len(top_k_tags & gt_cat_tags)
+                else:
+                    precision_at_k_count = 0
+                # MRR: mean of 1/rank for each GT tag found in suggestions
+                reciprocal_ranks = []
+                for gt_tag in gt_cat_tags:
+                    if gt_tag in suggestion_ranks:
+                        rank = suggestion_ranks[gt_tag]
+                        reciprocal_ranks.append(1.0 / (rank + 1))  # +1 because rank is 0-indexed
+                # Accumulate for averaging later
+                cat_metric.recall_at_k += recall_at_k_count / len(gt_cat_tags) if gt_cat_tags else 0
+                cat_metric.precision_at_k += precision_at_k_count / min(k, len(cat_suggestions)) if cat_suggestions else 0
+                if reciprocal_ranks:
+                    cat_metric.mrr += sum(reciprocal_ranks) / len(reciprocal_ranks)
+                    cat_metric.mrr_count += 1
     return metrics
 def print_category_metrics(
     metrics: Dict[str, CategoryMetrics],
     categories: Dict[str, TagCategory],
+    n_samples: int,
+    k: int,
 ):
     """
     Print metrics organized by importance.
+    Args:
+        metrics: Category metrics
+        categories: Category definitions
+        n_samples: Number of samples evaluated
+        k: Top-K for ranking metrics
     """
     # Group by importance level
     by_importance = defaultdict(list)
             print(f"  Constraint: {category.constraint.value}")
             print(f"  Ground truth tags: {cat_metric.total_gt_tags}")
+            # Binary prediction metrics
             if category.constraint.value == "exactly_one":
                 print(f"  Accuracy:  {cat_metric.accuracy:.3f}")
             print(f"  Precision: {cat_metric.precision:.3f}")
             print(f"  Recall:    {cat_metric.recall:.3f}")
             print(f"  F1:        {cat_metric.f1:.3f}")
+            # Ranking metrics (averaged across samples)
+            if cat_metric.mrr_count > 0:
+                avg_recall_at_k = cat_metric.recall_at_k / n_samples if n_samples > 0 else 0
+                avg_precision_at_k = cat_metric.precision_at_k / n_samples if n_samples > 0 else 0
+                avg_mrr = cat_metric.mrr / cat_metric.mrr_count
+                print(f"  Recall@{k}:    {avg_recall_at_k:.3f}  (GT tags found in top-{k})")
+                print(f"  Precision@{k}: {avg_precision_at_k:.3f}  (top-{k} that are correct)")
+                print(f"  MRR:          {avg_mrr:.3f}  (mean reciprocal rank)")
+                print(f"  Coverage:     {cat_metric.found_in_suggestions}/{cat_metric.total_gt_tags}  (GT tags in suggestions)")
             # Show raw counts for debugging
             print(f"  (TP={cat_metric.tp}, FP={cat_metric.fp}, FN={cat_metric.fn}, TN={cat_metric.tn})")
     )
     # Print results
+    print_category_metrics(metrics, categories, len(eval_results), args.k)
 if __name__ == "__main__":

scripts/eval_pipeline.py CHANGED Viewed

@@ -166,6 +166,8 @@ class SampleResult:
     stage2_time: float = 0.0
     stage3_time: float = 0.0
     stage3s_time: float = 0.0
     # Errors
     error: Optional[str] = None
@@ -327,6 +329,33 @@ def _process_one_sample(
             result.selected_tags = expanded
             log(f"Implications: +{len(implied_only)} tags")
         # Remove eval-excluded tags from predictions before scoring
         result.selected_tags -= _EVAL_EXCLUDED_TAGS
         result.retrieved_tags -= _EVAL_EXCLUDED_TAGS
@@ -915,6 +944,7 @@ def main(argv=None) -> int:
                 "selected_tags": sorted(r.selected_tags),
                 "implied_tags": sorted(r.implied_tags),
                 "structural_tags": r.structural_tags,
                 "why_counts": r.why_counts,
                 "tag_evidence": r.tag_evidence,
                 "gt_character_tags": sorted(r.gt_character_tags),

     stage2_time: float = 0.0
     stage3_time: float = 0.0
     stage3s_time: float = 0.0
+    # Categorized suggestions (for ranking metrics)
+    categorized_suggestions: Dict[str, List[Tuple[str, float]]] = field(default_factory=dict)
     # Errors
     error: Optional[str] = None
             result.selected_tags = expanded
             log(f"Implications: +{len(implied_only)} tags")
+        # Generate categorized suggestions (for ranking metrics)
+        try:
+            from psq_rag.tagging.categorized_suggestions import (
+                generate_categorized_suggestions,
+                get_category_suggestions_dict,
+            )
+            # Use selected tags to generate category-wise ranked suggestions
+            categorized = generate_categorized_suggestions(
+                selected_tags=list(result.selected_tags),
+                allow_nsfw_tags=allow_nsfw,
+                top_n_per_category=20,  # Get top 20 per category for eval
+                top_n_other=50,
+            )
+            # Convert to simple dict format: category -> [(tag, score), ...]
+            result.categorized_suggestions = {}
+            for cat_name, cat_sugg in categorized.by_category.items():
+                result.categorized_suggestions[cat_name] = cat_sugg.suggestions
+            # Also store "other" suggestions
+            result.categorized_suggestions['other'] = categorized.other_suggestions
+            log(f"Categorized: {len(result.categorized_suggestions)} categories")
+        except Exception as e:
+            log(f"Warning: Failed to generate categorized suggestions: {e}")
         # Remove eval-excluded tags from predictions before scoring
         result.selected_tags -= _EVAL_EXCLUDED_TAGS
         result.retrieved_tags -= _EVAL_EXCLUDED_TAGS
                 "selected_tags": sorted(r.selected_tags),
                 "implied_tags": sorted(r.implied_tags),
                 "structural_tags": r.structural_tags,
+                "categorized_suggestions": r.categorized_suggestions,
                 "why_counts": r.why_counts,
                 "tag_evidence": r.tag_evidence,
                 "gt_character_tags": sorted(r.gt_character_tags),