Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Claude committed on Feb 12

Commit

a16e111

1 Parent(s): 4968635

Add structural tag inference (Stage 3s) and compact eval output

Stage 3s: New LLM step that infers structural tags (solo/duo/male/female/
anthro/biped/feral/humanoid/quadruped/intersex/ambiguous_gender/zero_pictured/
trio/group) via natural-language statement agreement instead of retrieval.
These tags are almost never stated in captions but are structurally obvious.

Compact eval output: Eval pipeline now writes two files:
- Compact metrics JSONL (tracked in git, small) with counts + diff sets
- Full detail JSONL (gitignored, large) with complete tag lists for analysis

Also: .gitignore updated to exclude *_detail.jsonl files.

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

Files changed (4) hide show

.gitignore +2 -0
app.py +13 -2
psq_rag/llm/select.py +175 -0
scripts/eval_pipeline.py +102 -39

.gitignore CHANGED Viewed

@@ -10,3 +10,5 @@ tf_idf_files_420.joblib
 e621FastTextModel010Replacement_small.bin
 tfidf_hnsw_artists.bin
 tfidf_hnsw_tags.bin

 e621FastTextModel010Replacement_small.bin
 tfidf_hnsw_artists.bin
 tfidf_hnsw_tags.bin
+# Full detail eval files (large) — only compact metrics tracked in git
+*_detail.jsonl

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from typing import List
 from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
 from psq_rag.llm.rewrite import llm_rewrite_prompt
 from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
-from psq_rag.llm.select import llm_select_indices
 from psq_rag.retrieval.state import expand_tags_via_implications
@@ -224,7 +224,18 @@ def rag_pipeline_ui(user_prompt: str):
         selected_tags = [candidates[i].tag for i in picked_indices] if picked_indices else []
-        log("Step 3b: Expand via tag implications")
         tag_set = set(selected_tags)
         expanded, implied_only = expand_tags_via_implications(tag_set)
         if implied_only:

 from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
 from psq_rag.llm.rewrite import llm_rewrite_prompt
 from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
+from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags
 from psq_rag.retrieval.state import expand_tags_via_implications
         selected_tags = [candidates[i].tag for i in picked_indices] if picked_indices else []
+        log("Step 3b: Structural tag inference (solo/duo/gender/body plan)")
+        structural_tags = llm_infer_structural_tags(prompt_in, log=log)
+        if structural_tags:
+            # Add structural tags that aren't already selected
+            existing = {t for t in selected_tags}
+            new_structural = [t for t in structural_tags if t not in existing]
+            selected_tags.extend(new_structural)
+            log(f"  Added {len(new_structural)} structural tags: {', '.join(new_structural)}")
+        else:
+            log("  No structural tags inferred")
+        log("Step 3c: Expand via tag implications")
         tag_set = set(selected_tags)
         expanded, implied_only = expand_tags_via_implications(tag_set)
         if implied_only:

psq_rag/llm/select.py CHANGED Viewed

@@ -760,3 +760,178 @@ def llm_select_indices(
         return out_idx, tag_why
     return out_idx

         return out_idx, tag_why
     return out_idx
+# ---------------------------------------------------------------------------
+# Stage 3s: Structural tag inference (solo/duo/male/female/anthro/biped …)
+# ---------------------------------------------------------------------------
+# Each statement maps to exactly one tag.  The LLM picks statement numbers.
+_STRUCTURAL_STATEMENTS: List[Tuple[str, str]] = [
+    # Character count
+    ("The image contains zero characters (no living beings depicted)", "zero_pictured"),
+    ("The image contains exactly one character", "solo"),
+    ("The image contains exactly two characters", "duo"),
+    ("The image contains exactly three characters", "trio"),
+    ("The image contains four or more characters", "group"),
+    # Body plan
+    ("A character is a regular (non-anthropomorphic) animal", "feral"),
+    ("A character is an anthropomorphic animal (walks upright, has human-like posture)", "anthro"),
+    ("A character is a human or human-like being", "humanoid"),
+    ("A character stands or walks on two legs", "biped"),
+    ("A character stands or walks on four legs", "quadruped"),
+    # Gender
+    ("The image contains a male character", "male"),
+    ("The image contains a female character", "female"),
+    ("A character's gender is ambiguous or unspecified", "ambiguous_gender"),
+    ("The image contains an intersex character", "intersex"),
+]
+STRUCTURAL_SYSTEM_TEMPLATE = """You are given a description of an image and a numbered list of statements.
+Select EVERY statement that is true about the described image.
+Return JSON ONLY matching this schema:
+{{
+  "selections": [
+    {{"i": <int>}},
+    ...
+  ]
+}}
+Rules:
+- Choose ONLY from indices 1..{N}.
+- A statement is true if the description clearly supports it OR it is very strongly implied.
+- Select ALL true statements, not just one per category.
+- For example, if the image has two anthropomorphic characters (one male, one female),
+  you would select: "exactly two characters", "anthropomorphic animal", "stands on two legs",
+  "male character", and "female character".
+- When no characters are visible, select only the "zero characters" statement.
+- Do NOT guess when the description provides no evidence.
+"""
+STRUCTURAL_USER_TEMPLATE = """IMAGE DESCRIPTION:
+{image_description}
+STATEMENTS (select all that are true by index):
+{statement_lines}
+"""
+class StructuralSelectionItem(BaseModel):
+    i: int = Field(..., description="1-based index into the statement list.")
+class StructuralSelectionResponse(BaseModel):
+    selections: List[StructuralSelectionItem] = Field(default_factory=list)
+def _build_structural_response_format() -> Dict[str, Any]:
+    schema = {
+        "type": "object",
+        "properties": {
+            "selections": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "i": {"type": "integer"},
+                    },
+                    "required": ["i"],
+                    "additionalProperties": False,
+                },
+            }
+        },
+        "required": ["selections"],
+        "additionalProperties": False,
+    }
+    return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "structural_selection",
+            "strict": True,
+            "schema": schema,
+        },
+    }
+def llm_infer_structural_tags(
+    query_text: str,
+    log=None,
+    *,
+    temperature: float = 0.0,
+    max_tokens: int = 256,
+    retries: int = 2,
+) -> List[str]:
+    """Infer structural tags (solo/duo/male/female/anthro/biped/…) via LLM.
+    Instead of retrieving these from a candidate list, we ask the LLM to agree
+    with natural-language statements about the image.  This handles tags that
+    are almost never stated in captions but are visually/structurally obvious.
+    Returns a list of e621 tag strings (e.g. ["solo", "anthro", "male", "biped"]).
+    """
+    if log:
+        log("Stage3s (structural): inferring structural tags via statement agreement")
+    statements = _STRUCTURAL_STATEMENTS
+    lines = [f"{j}. {stmt}" for j, (stmt, _tag) in enumerate(statements, 1)]
+    statement_lines = "\n".join(lines)
+    N = len(statements)
+    response_format = _build_structural_response_format()
+    llm = _get_llm(temperature=temperature, max_tokens=max_tokens,
+                    response_format=response_format)
+    model_name = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")
+    parser = PydanticOutputParser(pydantic_object=StructuralSelectionResponse)
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            ("system", STRUCTURAL_SYSTEM_TEMPLATE),
+            ("human", STRUCTURAL_USER_TEMPLATE),
+        ],
+        template_format="f-string",
+    )
+    chain = prompt | llm | parser
+    if log:
+        log(f"Stage3s: model={model_name} statements={N}")
+    for att in range(retries + 1):
+        try:
+            parsed = chain.invoke({
+                "N": N,
+                "image_description": query_text,
+                "statement_lines": statement_lines,
+            })
+            if isinstance(parsed, BaseModel):
+                parsed = parsed.model_dump() if hasattr(parsed, "model_dump") else parsed.dict()
+            sels = parsed.get("selections", []) if isinstance(parsed, dict) else []
+            chosen_tags: List[str] = []
+            seen = set()
+            for item in sels:
+                idx = item.get("i") if isinstance(item, dict) else None
+                if not isinstance(idx, int) or idx < 1 or idx > N:
+                    continue
+                tag = statements[idx - 1][1]
+                if tag not in seen:
+                    chosen_tags.append(tag)
+                    seen.add(tag)
+            if log:
+                tag_str = ", ".join(chosen_tags) if chosen_tags else "(none)"
+                log(f"Stage3s: attempt {att+1} selected {len(chosen_tags)} tags: {tag_str}")
+            return chosen_tags
+        except Exception as e:
+            if log:
+                log(f"Stage3s: attempt {att+1} error: {e}")
+    if log:
+        log(f"Stage3s: gave up after {retries+1} attempts")
+    return []

scripts/eval_pipeline.py CHANGED Viewed

@@ -151,6 +151,8 @@ class SampleResult:
     why_counts: Dict[str, int] = field(default_factory=dict)
     # Tag implications
     implied_tags: Set[str] = field(default_factory=set)  # tags added via implications (not LLM-selected)
     # Leaf-only metrics (strips implied ancestors from both sides)
     leaf_precision: float = 0.0
     leaf_recall: float = 0.0
@@ -161,6 +163,7 @@ class SampleResult:
     stage1_time: float = 0.0
     stage2_time: float = 0.0
     stage3_time: float = 0.0
     # Errors
     error: Optional[str] = None
@@ -196,11 +199,12 @@ def _process_one_sample(
     print_lock: threading.Lock,
     min_why: Optional[str] = None,
     expand_implications: bool = False,
 ) -> SampleResult:
     """Process a single eval sample through the full pipeline. Thread-safe."""
     from psq_rag.llm.rewrite import llm_rewrite_prompt
     from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases
-    from psq_rag.llm.select import llm_select_indices
     from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
     def log(msg: str) -> None:
@@ -288,6 +292,19 @@ def _process_one_sample(
             why_counts[w] = why_counts.get(w, 0) + 1
         result.why_counts = why_counts
         # Tag implication expansion (post-Stage 3)
         if expand_implications and result.selected_tags:
             expanded, implied_only = expand_tags_via_implications(result.selected_tags)
@@ -355,11 +372,12 @@ def _process_one_sample(
         if gt_char:
             char_info = f" char[gt={len(gt_char)} sel={len(sel_char)} P={cp:.2f} R={cr:.2f}]"
         impl_info = f" (+{len(result.implied_tags)} implied)" if result.implied_tags else ""
         with print_lock:
             print(
                 f"  [{index+1}] retrieval_recall={result.retrieval_recall:.3f} "
                 f"sel_P={p:.3f} sel_R={r:.3f} sel_F1={f1:.3f} "
-                f"selected={len(result.selected_tags)}{impl_info}{char_info} "
                 f"t1={result.stage1_time:.1f}s t2={result.stage2_time:.1f}s t3={result.stage3_time:.1f}s"
             )
@@ -404,6 +422,7 @@ def run_eval(
     workers: int = 1,
     min_why: Optional[str] = "strong_implied",
     expand_implications: bool = False,
 ) -> List[SampleResult]:
     # Load eval samples — prefer expanded file, fall back to raw
@@ -469,6 +488,7 @@ def run_eval(
                 skip_rewrite, allow_nsfw, mode, chunk_size,
                 per_phrase_k, temperature, max_tokens, verbose,
                 print_lock, min_why, expand_implications,
             )
             results.append(result)
     else:
@@ -485,6 +505,7 @@ def run_eval(
                     skip_rewrite, allow_nsfw, mode, chunk_size,
                     per_phrase_k, temperature, max_tokens, verbose,
                     print_lock, min_why, expand_implications,
                 ): i
                 for i, sample in enumerate(samples)
             }
@@ -553,6 +574,7 @@ def print_summary(results: List[SampleResult]) -> None:
     avg_over_sel = _safe_avg([r.over_selection_ratio for r in valid])
     avg_implied = sum(len(r.implied_tags) for r in valid) / n
     print()
     print("Stage 3 - Selection (ALL tags):")
@@ -562,6 +584,8 @@ def print_summary(results: List[SampleResult]) -> None:
     print(f"  Avg selected tags:    {avg_selected:.1f}")
     if avg_implied > 0:
         print(f"  Avg implied tags:     {avg_implied:.1f}  (added via tag implications)")
     print(f"  Avg ground-truth tags:{avg_gt:.1f}")
     # Leaf-only metrics
@@ -673,11 +697,14 @@ def print_summary(results: List[SampleResult]) -> None:
     print()
     print("-" * 70)
     print("Timing (avg per sample):")
     print(f"  Stage 1 (rewrite):    {avg_t1:.2f}s")
     print(f"  Stage 2 (retrieval):  {avg_t2:.2f}s")
     print(f"  Stage 3 (selection):  {avg_t3:.2f}s")
-    print(f"  Total:                {avg_t1 + avg_t2 + avg_t3:.2f}s")
     print()
     # Show worst and best F1 samples
@@ -739,6 +766,8 @@ def main(argv=None) -> int:
                     help="Minimum 'why' confidence to keep (default: strong_implied). Use 'none' to disable filtering.")
     ap.add_argument("--expand-implications", action="store_true", default=False,
                     help="Expand selected tags via tag implication chains (e.g. fox→canine→canid→mammal)")
     args = ap.parse_args(list(argv) if argv is not None else None)
@@ -761,18 +790,24 @@ def main(argv=None) -> int:
         workers=args.workers,
         min_why=min_why_val,
         expand_implications=args.expand_implications,
     )
     print_summary(results)
-    # Always save detailed results
     if args.output:
         out_path = Path(args.output)
     else:
-        results_dir = _REPO_ROOT / "data" / "eval_results"
-        results_dir.mkdir(parents=True, exist_ok=True)
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        out_path = results_dir / f"eval_{args.caption_field}_n{args.n}_seed{args.seed}_{timestamp}.jsonl"
     out_path.parent.mkdir(parents=True, exist_ok=True)
@@ -793,10 +828,65 @@ def main(argv=None) -> int:
         "workers": args.workers,
         "min_why": args.min_why,
         "expand_implications": args.expand_implications,
         "n_errors": sum(1 for r in results if r.error),
     }
     with out_path.open("w", encoding="utf-8") as f:
         f.write(json.dumps(meta, ensure_ascii=False) + "\n")
         for r in results:
             row = {
@@ -806,44 +896,17 @@ def main(argv=None) -> int:
                 "rewrite_phrases": r.rewrite_phrases,
                 "retrieved_tags": sorted(r.retrieved_tags),
                 "selected_tags": sorted(r.selected_tags),
-                "retrieval_recall": round(r.retrieval_recall, 4),
-                "selection_precision": round(r.selection_precision, 4),
-                "selection_recall": round(r.selection_recall, 4),
-                "selection_f1": round(r.selection_f1, 4),
-                # Character tag breakdown
                 "gt_character_tags": sorted(r.gt_character_tags),
                 "selected_character_tags": sorted(r.selected_character_tags),
-                "retrieved_character_tags": sorted(r.retrieved_character_tags),
-                "char_retrieval_recall": round(r.char_retrieval_recall, 4),
-                "char_precision": round(r.char_precision, 4),
-                "char_recall": round(r.char_recall, 4),
-                "char_f1": round(r.char_f1, 4),
-                # General tag breakdown
                 "gt_general_tags": sorted(r.gt_general_tags),
                 "selected_general_tags": sorted(r.selected_general_tags),
-                "general_precision": round(r.general_precision, 4),
-                "general_recall": round(r.general_recall, 4),
-                "general_f1": round(r.general_f1, 4),
-                # Diagnostic metrics
-                "retrieval_precision": round(r.retrieval_precision, 4),
-                "selection_given_retrieval": round(r.selection_given_retrieval, 4),
-                "over_selection_ratio": round(r.over_selection_ratio, 2),
-                "why_counts": r.why_counts,
-                "implied_tags": sorted(r.implied_tags),
-                # Leaf metrics
-                "leaf_precision": round(r.leaf_precision, 4),
-                "leaf_recall": round(r.leaf_recall, 4),
-                "leaf_f1": round(r.leaf_f1, 4),
-                "leaf_selected_count": r.leaf_selected_count,
-                "leaf_gt_count": r.leaf_gt_count,
-                # Timing
-                "stage1_time": round(r.stage1_time, 3),
-                "stage2_time": round(r.stage2_time, 3),
-                "stage3_time": round(r.stage3_time, 3),
                 "error": r.error,
             }
             f.write(json.dumps(row, ensure_ascii=False) + "\n")
-    print(f"\nDetailed results saved to: {out_path}")
     return 0

     why_counts: Dict[str, int] = field(default_factory=dict)
     # Tag implications
     implied_tags: Set[str] = field(default_factory=set)  # tags added via implications (not LLM-selected)
+    # Structural inference tags (solo/duo/male/female/anthro/biped etc.)
+    structural_tags: List[str] = field(default_factory=list)
     # Leaf-only metrics (strips implied ancestors from both sides)
     leaf_precision: float = 0.0
     leaf_recall: float = 0.0
     stage1_time: float = 0.0
     stage2_time: float = 0.0
     stage3_time: float = 0.0
+    stage3s_time: float = 0.0
     # Errors
     error: Optional[str] = None
     print_lock: threading.Lock,
     min_why: Optional[str] = None,
     expand_implications: bool = False,
+    infer_structural: bool = False,
 ) -> SampleResult:
     """Process a single eval sample through the full pipeline. Thread-safe."""
     from psq_rag.llm.rewrite import llm_rewrite_prompt
     from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases
+    from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags
     from psq_rag.retrieval.state import get_tag_type_name, expand_tags_via_implications, get_leaf_tags
     def log(msg: str) -> None:
             why_counts[w] = why_counts.get(w, 0) + 1
         result.why_counts = why_counts
+        # Structural tag inference (solo/duo/male/female/anthro/biped etc.)
+        if infer_structural:
+            t0s = time.time()
+            structural = llm_infer_structural_tags(
+                caption, log=log, temperature=temperature,
+            )
+            result.stage3s_time = time.time() - t0s
+            result.structural_tags = structural
+            # Add structural tags not already selected
+            for st in structural:
+                result.selected_tags.add(st)
+            log(f"Structural: {structural}")
         # Tag implication expansion (post-Stage 3)
         if expand_implications and result.selected_tags:
             expanded, implied_only = expand_tags_via_implications(result.selected_tags)
         if gt_char:
             char_info = f" char[gt={len(gt_char)} sel={len(sel_char)} P={cp:.2f} R={cr:.2f}]"
         impl_info = f" (+{len(result.implied_tags)} implied)" if result.implied_tags else ""
+        struct_info = f" (+{len(result.structural_tags)} structural)" if result.structural_tags else ""
         with print_lock:
             print(
                 f"  [{index+1}] retrieval_recall={result.retrieval_recall:.3f} "
                 f"sel_P={p:.3f} sel_R={r:.3f} sel_F1={f1:.3f} "
+                f"selected={len(result.selected_tags)}{impl_info}{struct_info}{char_info} "
                 f"t1={result.stage1_time:.1f}s t2={result.stage2_time:.1f}s t3={result.stage3_time:.1f}s"
             )
     workers: int = 1,
     min_why: Optional[str] = "strong_implied",
     expand_implications: bool = False,
+    infer_structural: bool = False,
 ) -> List[SampleResult]:
     # Load eval samples — prefer expanded file, fall back to raw
                 skip_rewrite, allow_nsfw, mode, chunk_size,
                 per_phrase_k, temperature, max_tokens, verbose,
                 print_lock, min_why, expand_implications,
+                infer_structural,
             )
             results.append(result)
     else:
                     skip_rewrite, allow_nsfw, mode, chunk_size,
                     per_phrase_k, temperature, max_tokens, verbose,
                     print_lock, min_why, expand_implications,
+                    infer_structural,
                 ): i
                 for i, sample in enumerate(samples)
             }
     avg_over_sel = _safe_avg([r.over_selection_ratio for r in valid])
     avg_implied = sum(len(r.implied_tags) for r in valid) / n
+    avg_structural = sum(len(r.structural_tags) for r in valid) / n
     print()
     print("Stage 3 - Selection (ALL tags):")
     print(f"  Avg selected tags:    {avg_selected:.1f}")
     if avg_implied > 0:
         print(f"  Avg implied tags:     {avg_implied:.1f}  (added via tag implications)")
+    if avg_structural > 0:
+        print(f"  Avg structural tags:  {avg_structural:.1f}  (inferred via statement agreement)")
     print(f"  Avg ground-truth tags:{avg_gt:.1f}")
     # Leaf-only metrics
     print()
     print("-" * 70)
+    avg_t3s = sum(r.stage3s_time for r in valid) / n
     print("Timing (avg per sample):")
     print(f"  Stage 1 (rewrite):    {avg_t1:.2f}s")
     print(f"  Stage 2 (retrieval):  {avg_t2:.2f}s")
     print(f"  Stage 3 (selection):  {avg_t3:.2f}s")
+    if avg_t3s > 0:
+        print(f"  Stage 3s (structural):{avg_t3s:.2f}s")
+    print(f"  Total:                {avg_t1 + avg_t2 + avg_t3 + avg_t3s:.2f}s")
     print()
     # Show worst and best F1 samples
                     help="Minimum 'why' confidence to keep (default: strong_implied). Use 'none' to disable filtering.")
     ap.add_argument("--expand-implications", action="store_true", default=False,
                     help="Expand selected tags via tag implication chains (e.g. fox→canine→canid→mammal)")
+    ap.add_argument("--infer-structural", action="store_true", default=False,
+                    help="Infer structural tags (solo/duo/male/female/anthro/biped) via LLM statement agreement")
     args = ap.parse_args(list(argv) if argv is not None else None)
         workers=args.workers,
         min_why=min_why_val,
         expand_implications=args.expand_implications,
+        infer_structural=args.infer_structural,
     )
     print_summary(results)
+    # Save results in two formats:
+    # 1. Compact metrics JSONL (small, for git / LLM reading)
+    # 2. Full detail JSONL (large, for analysis scripts, gitignored)
+    results_dir = _REPO_ROOT / "data" / "eval_results"
+    results_dir.mkdir(parents=True, exist_ok=True)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    base_name = f"eval_{args.caption_field}_n{args.n}_seed{args.seed}_{timestamp}"
     if args.output:
         out_path = Path(args.output)
     else:
+        out_path = results_dir / f"{base_name}.jsonl"
+    detail_path = results_dir / f"{base_name}_detail.jsonl"
     out_path.parent.mkdir(parents=True, exist_ok=True)
         "workers": args.workers,
         "min_why": args.min_why,
         "expand_implications": args.expand_implications,
+        "infer_structural": args.infer_structural,
         "n_errors": sum(1 for r in results if r.error),
     }
     with out_path.open("w", encoding="utf-8") as f:
+        f.write(json.dumps(meta, ensure_ascii=False) + "\n")
+        for r in results:
+            # Compact format: metrics + counts + small diff sets (not full tag lists)
+            missed_tags = sorted(r.ground_truth_tags - r.selected_tags)
+            extra_tags = sorted(r.selected_tags - r.ground_truth_tags)
+            row = {
+                "id": r.sample_id,
+                # Counts (not full lists)
+                "n_gt": len(r.ground_truth_tags),
+                "n_retrieved": len(r.retrieved_tags),
+                "n_selected": len(r.selected_tags),
+                "n_implied": len(r.implied_tags),
+                "n_structural": len(r.structural_tags),
+                # Overall metrics
+                "ret_R": round(r.retrieval_recall, 4),
+                "P": round(r.selection_precision, 4),
+                "R": round(r.selection_recall, 4),
+                "F1": round(r.selection_f1, 4),
+                # Leaf metrics
+                "leaf_P": round(r.leaf_precision, 4),
+                "leaf_R": round(r.leaf_recall, 4),
+                "leaf_F1": round(r.leaf_f1, 4),
+                "n_leaf_sel": r.leaf_selected_count,
+                "n_leaf_gt": r.leaf_gt_count,
+                # Diagnostic
+                "ret_P": round(r.retrieval_precision, 4),
+                "sel_given_ret": round(r.selection_given_retrieval, 4),
+                "over_sel": round(r.over_selection_ratio, 2),
+                "why": r.why_counts,
+                # Character metrics (compact)
+                "n_gt_char": len(r.gt_character_tags),
+                "n_sel_char": len(r.selected_character_tags),
+                "char_F1": round(r.char_f1, 4),
+                # General metrics (compact)
+                "gen_P": round(r.general_precision, 4),
+                "gen_R": round(r.general_recall, 4),
+                "gen_F1": round(r.general_f1, 4),
+                # Diff sets (small — only the errors, not the full lists)
+                "missed": missed_tags,
+                "extra": extra_tags,
+                # Structural tags inferred
+                "structural": r.structural_tags,
+                # Timing
+                "t1": round(r.stage1_time, 2),
+                "t2": round(r.stage2_time, 2),
+                "t3": round(r.stage3_time, 2),
+                "t3s": round(r.stage3s_time, 2),
+                "err": r.error,
+            }
+            f.write(json.dumps(row, ensure_ascii=False) + "\n")
+    print(f"\nCompact results saved to: {out_path}")
+    # Write full detail file (for analysis scripts)
+    with detail_path.open("w", encoding="utf-8") as f:
         f.write(json.dumps(meta, ensure_ascii=False) + "\n")
         for r in results:
             row = {
                 "rewrite_phrases": r.rewrite_phrases,
                 "retrieved_tags": sorted(r.retrieved_tags),
                 "selected_tags": sorted(r.selected_tags),
+                "implied_tags": sorted(r.implied_tags),
+                "structural_tags": r.structural_tags,
+                "why_counts": r.why_counts,
                 "gt_character_tags": sorted(r.gt_character_tags),
                 "selected_character_tags": sorted(r.selected_character_tags),
                 "gt_general_tags": sorted(r.gt_general_tags),
                 "selected_general_tags": sorted(r.selected_general_tags),
                 "error": r.error,
             }
             f.write(json.dumps(row, ensure_ascii=False) + "\n")
+    print(f"Detail results saved to: {detail_path}")
     return 0