Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Claude committed on Feb 10

Commit

133d74c

1 Parent(s): 6909d06

Improve eval harness: shuffle samples, always write results

- Samples are now randomly shuffled using --seed (default 42): runs with the
same seed are reproducible, and changing the seed varies the sample set
- Results JSONL always saved to data/eval_results/ with auto-generated
timestamp filename (or custom path with -o)
- First line of the results JSONL file is run metadata (settings, timestamp, error count)
- Default caption field is caption_cogvlm (vision model, not tag-derived)
- Added --no-shuffle flag for sequential sample order
- Added data/eval_results/ to .gitignore

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

Files changed (2) hide show

.gitignore +1 -0
scripts/eval_pipeline.py +81 -34

.gitignore CHANGED Viewed

@@ -10,3 +10,4 @@ tf_idf_files_420.joblib
 e621FastTextModel010Replacement_small.bin
 tfidf_hnsw_artists.bin
 tfidf_hnsw_tags.bin

 e621FastTextModel010Replacement_small.bin
 tfidf_hnsw_artists.bin
 tfidf_hnsw_tags.bin
+data/eval_results/

scripts/eval_pipeline.py CHANGED Viewed

@@ -10,14 +10,20 @@ Metrics computed:
     selected tags match the ground truth
 Usage:
-    # Full end-to-end (Stage 1 + 2 + 3):
     python scripts/eval_pipeline.py --n 20
-    # Skip Stage 1 LLM rewrite, use ground-truth tags as retrieval input:
     python scripts/eval_pipeline.py --n 20 --skip-rewrite
-    # Use a specific caption field:
-    python scripts/eval_pipeline.py --n 20 --caption-field caption_cogvlm
 Requires:
     - OPENROUTER_API_KEY env var (for Stage 1 rewrite and Stage 3 selection)
@@ -30,9 +36,11 @@ from __future__ import annotations
 import argparse
 import json
 import os
 import sys
 import time
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple
@@ -110,6 +118,8 @@ def run_eval(
     temperature: float = 0.0,
     max_tokens: int = 512,
     verbose: bool = False,
 ) -> List[SampleResult]:
     from psq_rag.llm.rewrite import llm_rewrite_prompt
@@ -125,11 +135,9 @@ def run_eval(
         print(f"ERROR: Eval data not found: {EVAL_DATA_PATH}")
         sys.exit(1)
-    samples = []
     with EVAL_DATA_PATH.open("r", encoding="utf-8") as f:
         for line in f:
-            if len(samples) >= n_samples:
-                break
             row = json.loads(line)
             caption = row.get(caption_field, "")
             if not caption or not caption.strip():
@@ -137,14 +145,20 @@ def run_eval(
             gt_tags = _flatten_ground_truth_tags(row.get("tags_ground_truth_categorized", ""))
             if not gt_tags:
                 continue
-            samples.append({
-                "id": row.get("id", row.get("row_id", len(samples))),
                 "caption": caption.strip(),
                 "gt_tags": gt_tags,
             })
-    print(f"Loaded {len(samples)} samples (caption_field={caption_field})")
-    print(f"skip_rewrite={skip_rewrite}, allow_nsfw={allow_nsfw}, mode={mode}")
     print()
     results: List[SampleResult] = []
@@ -340,7 +354,13 @@ def main(argv=None) -> int:
     ap.add_argument("--max-tokens", type=int, default=512)
     ap.add_argument("--verbose", "-v", action="store_true", help="Show per-call Stage 3 logs")
     ap.add_argument("--output", "-o", type=str, default=None,
-                    help="Save detailed results as JSONL to this path")
     args = ap.parse_args(list(argv) if argv is not None else None)
@@ -355,34 +375,61 @@ def main(argv=None) -> int:
         temperature=args.temperature,
         max_tokens=args.max_tokens,
         verbose=args.verbose,
     )
     print_summary(results)
-    # Optionally save detailed results
     if args.output:
         out_path = Path(args.output)
-        out_path.parent.mkdir(parents=True, exist_ok=True)
-        with out_path.open("w", encoding="utf-8") as f:
-            for r in results:
-                row = {
-                    "sample_id": r.sample_id,
-                    "caption": r.caption,
-                    "ground_truth_tags": sorted(r.ground_truth_tags),
-                    "rewrite_phrases": r.rewrite_phrases,
-                    "retrieved_tags": sorted(r.retrieved_tags),
-                    "selected_tags": sorted(r.selected_tags),
-                    "retrieval_recall": round(r.retrieval_recall, 4),
-                    "selection_precision": round(r.selection_precision, 4),
-                    "selection_recall": round(r.selection_recall, 4),
-                    "selection_f1": round(r.selection_f1, 4),
-                    "stage1_time": round(r.stage1_time, 3),
-                    "stage2_time": round(r.stage2_time, 3),
-                    "stage3_time": round(r.stage3_time, 3),
-                    "error": r.error,
-                }
-                f.write(json.dumps(row, ensure_ascii=False) + "\n")
-        print(f"\nDetailed results saved to: {out_path}")
     return 0

     selected tags match the ground truth
 Usage:
+    # Full end-to-end (Stage 1 + 2 + 3), 20 random samples:
     python scripts/eval_pipeline.py --n 20
+    # Reproducible run with specific seed:
+    python scripts/eval_pipeline.py --n 50 --seed 123
+    # Skip Stage 1 LLM rewrite (cheaper, tests Stage 2+3 only):
     python scripts/eval_pipeline.py --n 20 --skip-rewrite
+    # First N samples in file order (no shuffle):
+    python scripts/eval_pipeline.py --n 20 --no-shuffle
+Results are always saved as JSONL to data/eval_results/ (auto-named by timestamp)
+or to a custom path with -o.
 Requires:
     - OPENROUTER_API_KEY env var (for Stage 1 rewrite and Stage 3 selection)
 import argparse
 import json
 import os
+import random
 import sys
 import time
 from dataclasses import dataclass, field
+from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple
     temperature: float = 0.0,
     max_tokens: int = 512,
     verbose: bool = False,
+    shuffle: bool = True,
+    seed: int = 42,
 ) -> List[SampleResult]:
     from psq_rag.llm.rewrite import llm_rewrite_prompt
         print(f"ERROR: Eval data not found: {EVAL_DATA_PATH}")
         sys.exit(1)
+    all_samples = []
     with EVAL_DATA_PATH.open("r", encoding="utf-8") as f:
         for line in f:
             row = json.loads(line)
             caption = row.get(caption_field, "")
             if not caption or not caption.strip():
             gt_tags = _flatten_ground_truth_tags(row.get("tags_ground_truth_categorized", ""))
             if not gt_tags:
                 continue
+            all_samples.append({
+                "id": row.get("id", row.get("row_id", len(all_samples))),
                 "caption": caption.strip(),
                 "gt_tags": gt_tags,
             })
+    if shuffle:
+        rng = random.Random(seed)
+        rng.shuffle(all_samples)
+    samples = all_samples[:n_samples]
+    print(f"Loaded {len(samples)}/{len(all_samples)} samples (caption_field={caption_field})")
+    print(f"shuffle={shuffle}, seed={seed}, skip_rewrite={skip_rewrite}, allow_nsfw={allow_nsfw}, mode={mode}")
     print()
     results: List[SampleResult] = []
     ap.add_argument("--max-tokens", type=int, default=512)
     ap.add_argument("--verbose", "-v", action="store_true", help="Show per-call Stage 3 logs")
     ap.add_argument("--output", "-o", type=str, default=None,
+                    help="Save detailed results as JSONL (default: auto-generated in data/eval_results/)")
+    ap.add_argument("--shuffle", action="store_true", default=True,
+                    help="Randomly shuffle samples before selecting (default: True)")
+    ap.add_argument("--no-shuffle", dest="shuffle", action="store_false",
+                    help="Use samples in file order (first N)")
+    ap.add_argument("--seed", type=int, default=42,
+                    help="Random seed for shuffle (default: 42)")
     args = ap.parse_args(list(argv) if argv is not None else None)
         temperature=args.temperature,
         max_tokens=args.max_tokens,
         verbose=args.verbose,
+        shuffle=args.shuffle,
+        seed=args.seed,
     )
     print_summary(results)
+    # Always save detailed results
     if args.output:
         out_path = Path(args.output)
+    else:
+        results_dir = _REPO_ROOT / "data" / "eval_results"
+        results_dir.mkdir(parents=True, exist_ok=True)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        out_path = results_dir / f"eval_{args.caption_field}_n{args.n}_seed{args.seed}_{timestamp}.jsonl"
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    # Write run metadata as first line
+    meta = {
+        "_meta": True,
+        "timestamp": datetime.now().isoformat(),
+        "n_samples": len(results),
+        "caption_field": args.caption_field,
+        "skip_rewrite": args.skip_rewrite,
+        "allow_nsfw": args.allow_nsfw,
+        "mode": args.mode,
+        "chunk_size": args.chunk_size,
+        "per_phrase_k": args.per_phrase_k,
+        "temperature": args.temperature,
+        "shuffle": args.shuffle,
+        "seed": args.seed,
+        "n_errors": sum(1 for r in results if r.error),
+    }
+    with out_path.open("w", encoding="utf-8") as f:
+        f.write(json.dumps(meta, ensure_ascii=False) + "\n")
+        for r in results:
+            row = {
+                "sample_id": r.sample_id,
+                "caption": r.caption,
+                "ground_truth_tags": sorted(r.ground_truth_tags),
+                "rewrite_phrases": r.rewrite_phrases,
+                "retrieved_tags": sorted(r.retrieved_tags),
+                "selected_tags": sorted(r.selected_tags),
+                "retrieval_recall": round(r.retrieval_recall, 4),
+                "selection_precision": round(r.selection_precision, 4),
+                "selection_recall": round(r.selection_recall, 4),
+                "selection_f1": round(r.selection_f1, 4),
+                "stage1_time": round(r.stage1_time, 3),
+                "stage2_time": round(r.stage2_time, 3),
+                "stage3_time": round(r.stage3_time, 3),
+                "error": r.error,
+            }
+            f.write(json.dumps(row, ensure_ascii=False) + "\n")
+    print(f"\nDetailed results saved to: {out_path}")
     return 0