Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

App Files Files Community

Food Desert committed on Feb 24

Commit

3c18372

1 Parent(s): 73f56cf

Simplify Stage3 chunking to interleave-only and add eval diagnostics

Browse files

Files changed (2) hide show

psq_rag/llm/select.py +88 -14
scripts/eval_pipeline.py +129 -71

psq_rag/llm/select.py CHANGED Viewed

@@ -253,6 +253,13 @@ def _interleave_round_robin(cands: Sequence[Candidate]) -> List[Candidate]:
     return out
 def _display_tag(tag: str) -> str:
     # Display tags with spaces for the LLM, but keep canonical underscores internally.
     return tag.replace("_", " ")
@@ -494,8 +501,13 @@ def llm_select_indices(
     temperature: float = 0.0,
     max_tokens: int = 512,
     return_metadata: bool = False,
     min_why: Optional[str] = "strong_implied",
-) -> Union[List[int], Tuple[List[int], Dict[str, str]]]:
     """Return indices into the ORIGINAL candidates list (legacy interface).
     min_why: if set, only keep tags whose 'why' is at or above this confidence
@@ -586,6 +598,42 @@ def llm_select_indices(
     # Global union: tag -> best (score, why)
     best: Dict[str, Tuple[float, str]] = {}
     def run_call(call_cands: Sequence[Candidate], label: str, system_template: str) -> None:
         # Create chain with the provided system template
@@ -598,9 +646,10 @@ def llm_select_indices(
         )
         chain = prompt | llm | parser
-        ordered = _interleave_round_robin(call_cands)
         candidate_lines, idx_to_tag, idx_to_candidate = _format_candidates_local(ordered)
         N_local = len(idx_to_tag)
         phrases = _phrases_in_call(call_cands)
         per_call_budget = max(1, per_phrase_k * phrases) if phrases > 0 else per_phrase_k
@@ -618,6 +667,7 @@ def llm_select_indices(
         # Invoke LangChain chain (templating fills {N} and other vars)
         for att in range(retries + 1):
             try:
                 if log:
                     log(
                         f"Stage3 {label}: "
@@ -637,6 +687,16 @@ def llm_select_indices(
                     }
                 )
                 selected, diag = _parse_validate_map(parsed, idx_to_tag, per_call_budget=per_call_budget)
                 if log:
                     log(f"Stage3 {label}: attempt {att+1} diag={diag}")
                     if not summary_logged and (selected or att == retries):
@@ -662,6 +722,7 @@ def llm_select_indices(
                         log(f"Stage3 {label} selections: (none)")
                 if selected:
                     for s in selected:
                         prev = best.get(s.tag)
                         if prev is None or s.score > prev[0]:
@@ -669,11 +730,14 @@ def llm_select_indices(
                     return
             except Exception as e:
                 if log:
                     log(f"Stage3 {label}: attempt {att+1} error: {e}")
         if log:
             log(f"Stage3 {label}: gave up after {retries+1} attempts")
     # Split candidates by type (general vs entity)
     general_with_idx, entity_with_idx = _split_candidates_by_type(norm, log)
@@ -687,12 +751,9 @@ def llm_select_indices(
         if mode == "single_shot":
             run_call(general_cands, "general_single_shot", SELECT_SYSTEM_TEMPLATE)
         else:
-            for start in range(0, len(general_cands), chunk_size):
-                run_call(
-                    general_cands[start:start + chunk_size],
-                    f"general_chunk_{start//chunk_size}",
-                    SELECT_SYSTEM_TEMPLATE
-                )
     # Process entity candidates (characters only) with alias-based pre-filtering
     if entity_cands:
@@ -725,12 +786,9 @@ def llm_select_indices(
             if mode == "single_shot":
                 run_call(filtered_entity_cands, "entity_single_shot", ENTITY_SYSTEM_TEMPLATE)
             else:
-                for start in range(0, len(filtered_entity_cands), chunk_size):
-                    run_call(
-                        filtered_entity_cands[start:start + chunk_size],
-                        f"entity_chunk_{start//chunk_size}",
-                        ENTITY_SYSTEM_TEMPLATE
-                    )
     # Apply why threshold: drop tags below the minimum confidence level.
     if min_why is not None:
@@ -757,7 +815,23 @@ def llm_select_indices(
             out_idx.append(tag_to_first_index[t])
             tag_why[t] = best[t][1]  # why string
     if return_metadata:
         return out_idx, tag_why
     return out_idx

     return out
+def _build_chunks(cands: Sequence[Candidate], chunk_size: int) -> List[List[Candidate]]:
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be > 0, got {chunk_size}")
+    ordered = _interleave_round_robin(cands)
+    return [ordered[i:i + chunk_size] for i in range(0, len(ordered), chunk_size)]
 def _display_tag(tag: str) -> str:
     # Display tags with spaces for the LLM, but keep canonical underscores internally.
     return tag.replace("_", " ")
     temperature: float = 0.0,
     max_tokens: int = 512,
     return_metadata: bool = False,
+    return_diagnostics: bool = False,
     min_why: Optional[str] = "strong_implied",
+) -> Union[
+    List[int],
+    Tuple[List[int], Dict[str, str]],
+    Tuple[List[int], Dict[str, str], Dict[str, Any]],
+]:
     """Return indices into the ORIGINAL candidates list (legacy interface).
     min_why: if set, only keep tags whose 'why' is at or above this confidence
     # Global union: tag -> best (score, why)
     best: Dict[str, Tuple[float, str]] = {}
+    diagnostics: Dict[str, Any] = {
+        "mode": mode,
+        "chunk_strategy": "interleave",
+        "chunk_passes": 1,
+        "chunk_shuffle_within_call": False,
+        "calls_total": 0,
+        "calls_with_selection": 0,
+        "calls_exhausted_retries": 0,
+        "attempts_total": 0,
+        "attempt_errors": 0,
+        "attempt_parse_fail": 0,
+        "attempt_parse_ok": 0,
+        "invalid_items_total": 0,
+        "oob_indices_total": 0,
+        "dupe_indices_total": 0,
+        "kept_total": 0,
+        "attempts_by_n_local": {},
+    }
+    def _record_attempt_for_n(n_local: int, *, parse_ok: bool, error: bool) -> None:
+        """Tally one attempt in ``diagnostics["attempts_by_n_local"]``.
+
+        Attempts are grouped under the key ``str(n_local)``. Every attempt
+        bumps ``attempts`` and exactly one outcome counter: ``errors`` when
+        ``error`` is set, otherwise ``parse_ok`` or ``parse_fail`` depending
+        on ``parse_ok``.
+        """
+        bucket = diagnostics["attempts_by_n_local"].setdefault(
+            str(n_local),
+            {"attempts": 0, "parse_ok": 0, "parse_fail": 0, "errors": 0},
+        )
+        bucket["attempts"] += 1
+        # ``error`` takes precedence over the parse outcome.
+        if error:
+            outcome = "errors"
+        elif parse_ok:
+            outcome = "parse_ok"
+        else:
+            outcome = "parse_fail"
+        bucket[outcome] += 1
     def run_call(call_cands: Sequence[Candidate], label: str, system_template: str) -> None:
         # Create chain with the provided system template
         )
         chain = prompt | llm | parser
+        ordered = _interleave_round_robin(call_cands) if mode == "single_shot" else list(call_cands)
         candidate_lines, idx_to_tag, idx_to_candidate = _format_candidates_local(ordered)
         N_local = len(idx_to_tag)
+        diagnostics["calls_total"] += 1
         phrases = _phrases_in_call(call_cands)
         per_call_budget = max(1, per_phrase_k * phrases) if phrases > 0 else per_phrase_k
         # Invoke LangChain chain (templating fills {N} and other vars)
         for att in range(retries + 1):
             try:
+                diagnostics["attempts_total"] += 1
                 if log:
                     log(
                         f"Stage3 {label}: "
                     }
                 )
                 selected, diag = _parse_validate_map(parsed, idx_to_tag, per_call_budget=per_call_budget)
+                diagnostics["invalid_items_total"] += int(diag.get("invalid_items", 0))
+                diagnostics["oob_indices_total"] += int(diag.get("oob_indices", 0))
+                diagnostics["dupe_indices_total"] += int(diag.get("dupe_indices", 0))
+                diagnostics["kept_total"] += int(diag.get("kept", 0))
+                if bool(diag.get("parse_ok", False)):
+                    diagnostics["attempt_parse_ok"] += 1
+                    _record_attempt_for_n(N_local, parse_ok=True, error=False)
+                else:
+                    diagnostics["attempt_parse_fail"] += 1
+                    _record_attempt_for_n(N_local, parse_ok=False, error=False)
                 if log:
                     log(f"Stage3 {label}: attempt {att+1} diag={diag}")
                     if not summary_logged and (selected or att == retries):
                         log(f"Stage3 {label} selections: (none)")
                 if selected:
+                    diagnostics["calls_with_selection"] += 1
                     for s in selected:
                         prev = best.get(s.tag)
                         if prev is None or s.score > prev[0]:
                     return
             except Exception as e:
+                diagnostics["attempt_errors"] += 1
+                _record_attempt_for_n(N_local, parse_ok=False, error=True)
                 if log:
                     log(f"Stage3 {label}: attempt {att+1} error: {e}")
         if log:
             log(f"Stage3 {label}: gave up after {retries+1} attempts")
+        diagnostics["calls_exhausted_retries"] += 1
     # Split candidates by type (general vs entity)
     general_with_idx, entity_with_idx = _split_candidates_by_type(norm, log)
         if mode == "single_shot":
             run_call(general_cands, "general_single_shot", SELECT_SYSTEM_TEMPLATE)
         else:
+            base_chunks = _build_chunks(general_cands, chunk_size)
+            for chunk_idx, chunk in enumerate(base_chunks):
+                run_call(chunk, f"general_chunk_{chunk_idx}", SELECT_SYSTEM_TEMPLATE)
     # Process entity candidates (characters only) with alias-based pre-filtering
     if entity_cands:
             if mode == "single_shot":
                 run_call(filtered_entity_cands, "entity_single_shot", ENTITY_SYSTEM_TEMPLATE)
             else:
+                base_chunks = _build_chunks(filtered_entity_cands, chunk_size)
+                for chunk_idx, chunk in enumerate(base_chunks):
+                    run_call(chunk, f"entity_chunk_{chunk_idx}", ENTITY_SYSTEM_TEMPLATE)
     # Apply why threshold: drop tags below the minimum confidence level.
     if min_why is not None:
             out_idx.append(tag_to_first_index[t])
             tag_why[t] = best[t][1]  # why string
+    if diagnostics["attempts_total"] > 0:
+        diagnostics["attempt_failure_rate"] = (
+            diagnostics["attempt_parse_fail"] + diagnostics["attempt_errors"]
+        ) / diagnostics["attempts_total"]
+    else:
+        diagnostics["attempt_failure_rate"] = 0.0
+    if diagnostics["calls_total"] > 0:
+        diagnostics["call_exhaustion_rate"] = (
+            diagnostics["calls_exhausted_retries"] / diagnostics["calls_total"]
+        )
+    else:
+        diagnostics["call_exhaustion_rate"] = 0.0
     if return_metadata:
+        if return_diagnostics:
+            return out_idx, tag_why, diagnostics
         return out_idx, tag_why
     return out_idx

scripts/eval_pipeline.py CHANGED Viewed

@@ -162,7 +162,8 @@ class SampleResult:
     selection_given_retrieval: float = 0.0  # |selected ∩ gt| / |retrieved ∩ gt|
     over_selection_ratio: float = 0.0       # |selected| / |gt|
     # Why distribution (from Stage 3 LLM)
-    why_counts: Dict[str, int] = field(default_factory=dict)
     # Tag implications
     implied_tags: Set[str] = field(default_factory=set)  # tags added via implications (not LLM-selected)
     # Structural inference tags (solo/duo/male/female/anthro/biped etc.)
@@ -216,12 +217,12 @@ def _process_one_sample(
     per_phrase_final_k: int,
     temperature: float,
     max_tokens: int,
-    verbose: bool,
-    print_lock: threading.Lock,
-    min_why: Optional[str] = None,
-    expand_implications: bool = False,
-    infer_structural: bool = False,
-) -> SampleResult:
     """Process a single eval sample through the full pipeline. Thread-safe."""
     from psq_rag.llm.rewrite import llm_rewrite_prompt
     from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases
@@ -300,20 +301,22 @@ def _process_one_sample(
         # --- Stage 3: LLM Selection ---
         t0 = time.time()
-        picked_indices, tag_why = llm_select_indices(
-            query_text=caption,
-            candidates=candidates,
-            max_pick=0,
-            log=log,
-            mode=mode,
-            chunk_size=chunk_size,
-            per_phrase_k=per_phrase_k,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            return_metadata=True,
-            min_why=min_why,
-        )
-        result.stage3_time = time.time() - t0
         result.selected_tags = {candidates[idx].tag for idx in picked_indices} if picked_indices else set()
         result.stage3_selected_tags = set(result.selected_tags)
@@ -497,10 +500,11 @@ def run_eval(
     max_tokens: int = 512,
     verbose: bool = False,
     shuffle: bool = True,
-    seed: int = 42,
-    workers: int = 1,
-    min_why: Optional[str] = "strong_implied",
-    expand_implications: bool = False,
     infer_structural: bool = False,
 ) -> List[SampleResult]:
     expand_gt = expand_implications
@@ -508,20 +512,26 @@ def run_eval(
         from psq_rag.retrieval.state import expand_tags_via_implications as _expand_gt_tags
     # Load eval samples — prefer expanded file, fall back to raw
-    eval_path = EVAL_DATA_PATH
-    if not eval_path.is_file():
-        eval_path = EVAL_DATA_PATH_RAW
-        if not eval_path.is_file():
-            print(f"ERROR: Eval data not found: {EVAL_DATA_PATH}")
-            sys.exit(1)
-        print(f"WARNING: Expanded eval data not found, falling back to raw: {eval_path}")
-        print("  Run: python scripts/preprocess_eval_data.py")
-    all_samples = []
-    using_expanded = False
-    with eval_path.open("r", encoding="utf-8") as f:
-        for line in f:
-            row = json.loads(line)
             caption = row.get(caption_field, "")
             if not caption or not caption.strip():
                 continue
@@ -543,19 +553,20 @@ def run_eval(
                 "caption": caption.strip(),
                 "gt_tags": gt_tags,
             })
-    if using_expanded:
-        print("Using implication-expanded ground truth")
     if shuffle:
         rng = random.Random(seed)
         rng.shuffle(all_samples)
     samples = all_samples[:n_samples]
-    print(f"Loaded {len(samples)}/{len(all_samples)} samples (caption_field={caption_field})")
-    print(f"shuffle={shuffle}, seed={seed}, skip_rewrite={skip_rewrite}, allow_nsfw={allow_nsfw}, mode={mode}")
-    print(f"workers={workers}")
-    print()
     # Pre-warm shared retrieval assets before spawning threads
     _prewarm_retrieval_assets()
@@ -572,10 +583,11 @@ def run_eval(
                 sample, i, total,
                 skip_rewrite, allow_nsfw, mode, chunk_size,
                 per_phrase_k, per_phrase_final_k, temperature, max_tokens, verbose,
-                print_lock, min_why, expand_implications,
                 infer_structural,
             )
-            results.append(result)
     else:
         # Parallel mode
         print(f"Processing {total} samples with {workers} parallel workers...")
@@ -584,12 +596,13 @@ def run_eval(
         results_by_index: Dict[int, SampleResult] = {}
         with ThreadPoolExecutor(max_workers=workers) as executor:
             futures = {
-                executor.submit(
                     _process_one_sample,
                     sample, i, total,
                     skip_rewrite, allow_nsfw, mode, chunk_size,
                     per_phrase_k, per_phrase_final_k, temperature, max_tokens, verbose,
-                    print_lock, min_why, expand_implications,
                     infer_structural,
                 ): i
                 for i, sample in enumerate(samples)
@@ -688,13 +701,52 @@ def print_summary(results: List[SampleResult]) -> None:
     print(f"  Avg leaf ground-truth:{avg_leaf_gt:.1f}")
     print()
-    print("Diagnostic Metrics:")
-    print(f"  Retrieval precision:  {avg_retrieval_precision:.4f}  (|ret∩gt|/|ret|, noise level fed to Stage 3)")
-    print(f"  Sel-given-retrieval:  {avg_sel_given_ret:.4f}  (of gt tags retrieved, fraction kept by Stage 3)")
-    print(f"  Over-selection ratio: {avg_over_sel:.2f}x  (|selected|/|gt|, ideal ~1.0)")
-    # Why distribution across all samples
-    total_why: Dict[str, int] = {}
     for r in valid:
         for w, cnt in r.why_counts.items():
             total_why[w] = total_why.get(w, 0) + cnt
@@ -830,8 +882,8 @@ def main(argv=None) -> int:
     ap.add_argument("--skip-rewrite", action="store_true",
                     help="Skip Stage 1 LLM rewrite; split caption directly into phrases")
     ap.add_argument("--allow-nsfw", action="store_true", help="Allow NSFW tags")
-    ap.add_argument("--mode", default="chunked_map_union",
-                    choices=["single_shot", "chunked_map_union"])
     ap.add_argument("--chunk-size", type=int, default=60)
     ap.add_argument("--per-phrase-k", type=int, default=2)
     ap.add_argument("--per-phrase-final-k", type=int, default=10,
@@ -847,8 +899,10 @@ def main(argv=None) -> int:
                     help="Use samples in file order (first N)")
     ap.add_argument("--seed", type=int, default=42,
                     help="Random seed for shuffle (default: 42)")
-    ap.add_argument("--workers", "-w", type=int, default=4,
-                    help="Number of parallel workers (default: 4, use 1 for sequential)")
     ap.add_argument("--min-why", default="strong_implied",
                     choices=["explicit", "strong_implied", "weak_implied", "style_or_meta", "other", "none"],
                     help="Minimum 'why' confidence to keep (default: strong_implied). Use 'none' to disable filtering.")
@@ -875,12 +929,13 @@ def main(argv=None) -> int:
         max_tokens=args.max_tokens,
         verbose=args.verbose,
         shuffle=args.shuffle,
-        seed=args.seed,
-        workers=args.workers,
-        min_why=min_why_val,
-        expand_implications=args.expand_implications,
-        infer_structural=args.infer_structural,
-    )
     print_summary(results)
@@ -907,9 +962,10 @@ def main(argv=None) -> int:
         "n_samples": len(results),
         "caption_field": args.caption_field,
         "skip_rewrite": args.skip_rewrite,
-        "allow_nsfw": args.allow_nsfw,
         "mode": args.mode,
         "chunk_size": args.chunk_size,
         "per_phrase_k": args.per_phrase_k,
         "per_phrase_final_k": args.per_phrase_final_k,
         "temperature": args.temperature,
@@ -953,7 +1009,8 @@ def main(argv=None) -> int:
                 "ret_P": round(r.retrieval_precision, 4),
                 "sel_given_ret": round(r.selection_given_retrieval, 4),
                 "over_sel": round(r.over_selection_ratio, 2),
-                "why": r.why_counts,
                 # Character metrics (compact)
                 "n_gt_char": len(r.gt_character_tags),
                 "n_sel_char": len(r.selected_character_tags),
@@ -1005,8 +1062,9 @@ def main(argv=None) -> int:
                 "implied_tags": sorted(r.implied_tags),
                 "structural_tags": r.structural_tags,
                 "categorized_suggestions": r.categorized_suggestions,
-                "why_counts": r.why_counts,
-                "tag_evidence": r.tag_evidence,
                 "gt_character_tags": sorted(r.gt_character_tags),
                 "selected_character_tags": sorted(r.selected_character_tags),
                 "gt_general_tags": sorted(r.gt_general_tags),

     selection_given_retrieval: float = 0.0  # |selected ∩ gt| / |retrieved ∩ gt|
     over_selection_ratio: float = 0.0       # |selected| / |gt|
     # Why distribution (from Stage 3 LLM)
+    why_counts: Dict[str, int] = field(default_factory=dict)
+    stage3_diag: Dict[str, Any] = field(default_factory=dict)
     # Tag implications
     implied_tags: Set[str] = field(default_factory=set)  # tags added via implications (not LLM-selected)
     # Structural inference tags (solo/duo/male/female/anthro/biped etc.)
     per_phrase_final_k: int,
     temperature: float,
     max_tokens: int,
+    verbose: bool,
+    print_lock: threading.Lock,
+    min_why: Optional[str] = None,
+    expand_implications: bool = False,
+    infer_structural: bool = False,
+) -> SampleResult:
     """Process a single eval sample through the full pipeline. Thread-safe."""
     from psq_rag.llm.rewrite import llm_rewrite_prompt
     from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases
         # --- Stage 3: LLM Selection ---
         t0 = time.time()
+        picked_indices, tag_why, stage3_diag = llm_select_indices(
+            query_text=caption,
+            candidates=candidates,
+            max_pick=0,
+            log=log,
+            mode=mode,
+            chunk_size=chunk_size,
+            per_phrase_k=per_phrase_k,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            return_metadata=True,
+            return_diagnostics=True,
+            min_why=min_why,
+        )
+        result.stage3_time = time.time() - t0
+        result.stage3_diag = stage3_diag or {}
         result.selected_tags = {candidates[idx].tag for idx in picked_indices} if picked_indices else set()
         result.stage3_selected_tags = set(result.selected_tags)
     max_tokens: int = 512,
     verbose: bool = False,
     shuffle: bool = True,
+    seed: int = 42,
+    workers: int = 1,
+    min_why: Optional[str] = "strong_implied",
+    eval_path: Optional[str] = None,
+    expand_implications: bool = False,
     infer_structural: bool = False,
 ) -> List[SampleResult]:
     expand_gt = expand_implications
         from psq_rag.retrieval.state import expand_tags_via_implications as _expand_gt_tags
     # Load eval samples — prefer expanded file, fall back to raw
+    eval_path_obj = Path(eval_path) if eval_path else EVAL_DATA_PATH
+    if not eval_path_obj.is_absolute():
+        eval_path_obj = (_REPO_ROOT / eval_path_obj).resolve()
+    if not eval_path_obj.is_file() and eval_path is None:
+        eval_path_obj = EVAL_DATA_PATH_RAW
+        if not eval_path_obj.is_file():
+            print(f"ERROR: Eval data not found: {EVAL_DATA_PATH}")
+            sys.exit(1)
+        print(f"WARNING: Expanded eval data not found, falling back to raw: {eval_path_obj}")
+        print("  Run: python scripts/preprocess_eval_data.py")
+    elif not eval_path_obj.is_file():
+        print(f"ERROR: Eval data not found: {eval_path_obj}")
+        sys.exit(1)
+    all_samples = []
+    using_expanded = False
+    with eval_path_obj.open("r", encoding="utf-8") as f:
+        for line in f:
+            row = json.loads(line)
             caption = row.get(caption_field, "")
             if not caption or not caption.strip():
                 continue
                 "caption": caption.strip(),
                 "gt_tags": gt_tags,
             })
+    if using_expanded:
+        print("Using implication-expanded ground truth")
     if shuffle:
         rng = random.Random(seed)
         rng.shuffle(all_samples)
     samples = all_samples[:n_samples]
+    print(f"Loaded {len(samples)}/{len(all_samples)} samples (caption_field={caption_field})")
+    print(f"eval_path={eval_path_obj}")
+    print(f"shuffle={shuffle}, seed={seed}, skip_rewrite={skip_rewrite}, allow_nsfw={allow_nsfw}, mode={mode}")
+    print(f"workers={workers}")
+    print()
     # Pre-warm shared retrieval assets before spawning threads
     _prewarm_retrieval_assets()
                 sample, i, total,
                 skip_rewrite, allow_nsfw, mode, chunk_size,
                 per_phrase_k, per_phrase_final_k, temperature, max_tokens, verbose,
+                print_lock, min_why,
+                expand_implications,
                 infer_structural,
             )
+            results.append(result)
     else:
         # Parallel mode
         print(f"Processing {total} samples with {workers} parallel workers...")
         results_by_index: Dict[int, SampleResult] = {}
         with ThreadPoolExecutor(max_workers=workers) as executor:
             futures = {
+                executor.submit(
                     _process_one_sample,
                     sample, i, total,
                     skip_rewrite, allow_nsfw, mode, chunk_size,
                     per_phrase_k, per_phrase_final_k, temperature, max_tokens, verbose,
+                    print_lock, min_why,
+                    expand_implications,
                     infer_structural,
                 ): i
                 for i, sample in enumerate(samples)
     print(f"  Avg leaf ground-truth:{avg_leaf_gt:.1f}")
     print()
+    print("Diagnostic Metrics:")
+    print(f"  Retrieval precision:  {avg_retrieval_precision:.4f}  (|ret∩gt|/|ret|, noise level fed to Stage 3)")
+    print(f"  Sel-given-retrieval:  {avg_sel_given_ret:.4f}  (of gt tags retrieved, fraction kept by Stage 3)")
+    print(f"  Over-selection ratio: {avg_over_sel:.2f}x  (|selected|/|gt|, ideal ~1.0)")
+    stage3_diag_rows = [r.stage3_diag for r in valid if r.stage3_diag]
+    if stage3_diag_rows:
+        calls_total = sum(int(d.get("calls_total", 0)) for d in stage3_diag_rows)
+        calls_exhausted = sum(int(d.get("calls_exhausted_retries", 0)) for d in stage3_diag_rows)
+        attempts_total = sum(int(d.get("attempts_total", 0)) for d in stage3_diag_rows)
+        attempts_parse_fail = sum(int(d.get("attempt_parse_fail", 0)) for d in stage3_diag_rows)
+        attempts_errors = sum(int(d.get("attempt_errors", 0)) for d in stage3_diag_rows)
+        print()
+        print("Stage 3 Structured Output Reliability:")
+        print(f"  Calls total:          {calls_total}")
+        print(f"  Calls exhausted:      {calls_exhausted} ({(100 * calls_exhausted / calls_total) if calls_total else 0:.1f}%)")
+        print(f"  Attempts total:       {attempts_total}")
+        print(f"  Parse/schema failures:{attempts_parse_fail} ({(100 * attempts_parse_fail / attempts_total) if attempts_total else 0:.1f}%)")
+        print(f"  Call errors/exc:      {attempts_errors} ({(100 * attempts_errors / attempts_total) if attempts_total else 0:.1f}%)")
+        by_n_agg: Dict[int, Dict[str, int]] = {}
+        for d in stage3_diag_rows:
+            for n_str, n_stats in d.get("attempts_by_n_local", {}).items():
+                try:
+                    n_local = int(n_str)
+                except Exception:
+                    continue
+                cur = by_n_agg.setdefault(n_local, {"attempts": 0, "parse_fail": 0, "errors": 0})
+                cur["attempts"] += int(n_stats.get("attempts", 0))
+                cur["parse_fail"] += int(n_stats.get("parse_fail", 0))
+                cur["errors"] += int(n_stats.get("errors", 0))
+        if by_n_agg:
+            print("  Failure by call size (N_local):")
+            for n_local in sorted(by_n_agg.keys()):
+                s = by_n_agg[n_local]
+                fail = s["parse_fail"] + s["errors"]
+                rate = (100 * fail / s["attempts"]) if s["attempts"] else 0.0
+                print(
+                    f"    N={n_local:3d} attempts={s['attempts']:4d} "
+                    f"fail={fail:4d} ({rate:5.1f}%)"
+                )
+    # Why distribution across all samples
+    total_why: Dict[str, int] = {}
     for r in valid:
         for w, cnt in r.why_counts.items():
             total_why[w] = total_why.get(w, 0) + cnt
     ap.add_argument("--skip-rewrite", action="store_true",
                     help="Skip Stage 1 LLM rewrite; split caption directly into phrases")
     ap.add_argument("--allow-nsfw", action="store_true", help="Allow NSFW tags")
+    ap.add_argument("--mode", default="chunked_map_union",
+                    choices=["single_shot", "chunked_map_union"])
     ap.add_argument("--chunk-size", type=int, default=60)
     ap.add_argument("--per-phrase-k", type=int, default=2)
     ap.add_argument("--per-phrase-final-k", type=int, default=10,
                     help="Use samples in file order (first N)")
     ap.add_argument("--seed", type=int, default=42,
                     help="Random seed for shuffle (default: 42)")
+    ap.add_argument("--workers", "-w", type=int, default=4,
+                    help="Number of parallel workers (default: 4, use 1 for sequential)")
+    ap.add_argument("--eval-path", type=str, default=None,
+                    help="Optional path to eval JSONL (defaults to expanded 1000-sample set).")
     ap.add_argument("--min-why", default="strong_implied",
                     choices=["explicit", "strong_implied", "weak_implied", "style_or_meta", "other", "none"],
                     help="Minimum 'why' confidence to keep (default: strong_implied). Use 'none' to disable filtering.")
         max_tokens=args.max_tokens,
         verbose=args.verbose,
         shuffle=args.shuffle,
+        seed=args.seed,
+        workers=args.workers,
+        min_why=min_why_val,
+        eval_path=args.eval_path,
+        expand_implications=args.expand_implications,
+        infer_structural=args.infer_structural,
+    )
     print_summary(results)
         "n_samples": len(results),
         "caption_field": args.caption_field,
         "skip_rewrite": args.skip_rewrite,
+        "allow_nsfw": args.allow_nsfw,
         "mode": args.mode,
         "chunk_size": args.chunk_size,
+        "eval_path": args.eval_path,
         "per_phrase_k": args.per_phrase_k,
         "per_phrase_final_k": args.per_phrase_final_k,
         "temperature": args.temperature,
                 "ret_P": round(r.retrieval_precision, 4),
                 "sel_given_ret": round(r.selection_given_retrieval, 4),
                 "over_sel": round(r.over_selection_ratio, 2),
+                "why": r.why_counts,
+                "stage3_diag": r.stage3_diag,
                 # Character metrics (compact)
                 "n_gt_char": len(r.gt_character_tags),
                 "n_sel_char": len(r.selected_character_tags),
                 "implied_tags": sorted(r.implied_tags),
                 "structural_tags": r.structural_tags,
                 "categorized_suggestions": r.categorized_suggestions,
+                "why_counts": r.why_counts,
+                "stage3_diag": r.stage3_diag,
+                "tag_evidence": r.tag_evidence,
                 "gt_character_tags": sorted(r.gt_character_tags),
                 "selected_character_tags": sorted(r.selected_character_tags),
                 "gt_general_tags": sorted(r.gt_general_tags),