#!/usr/bin/env python3
"""debug_pipeline.py — Diagnose why CBR returns 0 templates.

Walks a handful of random judgments (``--n``, default 5) through the
pipeline step-by-step, printing the OUTPUT of each stage so we can see
exactly where the failure is.
"""
from __future__ import annotations

import sys
from pathlib import Path

# Make the project importable when this file is run directly as a script.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

_RULE = "=" * 78

# Steps 3 and 4 are verbose, so they only look at the first few documents.
_DETAIL_DOCS = 3

# Section id -> side of the dispute (None = side not determinable from the id).
_SIDE_MAP = {
    "arguments_plaintiff": "claimant",
    "arguments_claimant": "claimant",
    "arguments_defendant": "respondent",
    "arguments_respondent": "respondent",
    "arguments_general": None,
}
# Section ids this script treats as argument-bearing.
_ARG_SECTIONS = set(_SIDE_MAP) | {"discussion"}

# Score cut-off used by this trace.
# NOTE(review): presumably mirrors the extractor's own threshold — confirm.
_SCORE_THRESHOLD = 0.20


def _banner(title: str, *, leading_blank: bool = True) -> None:
    """Print *title* framed by two horizontal rules."""
    print(("\n" if leading_blank else "") + _RULE)
    print(title)
    print(_RULE)


def _count_argument_paragraphs(sections, ext) -> tuple[int, int, int, int]:
    """Replay the section-based extraction path and count survivors.

    Args:
        sections: iterable of section dicts; this function reads their
            ``"id"`` and ``"text"`` keys.
        ext: extractor exposing ``min_len``, ``_split_paragraphs`` and
            ``_classify_paragraph`` (the latter returning a mapping with
            a ``"score"`` entry).

    Returns:
        ``(n_sections, n_paragraphs, n_long_enough, n_score_passing)``.
    """
    n_sections = n_paragraphs = n_long_enough = n_score_passing = 0
    for sec in sections:
        if sec.get("id", "") not in _ARG_SECTIONS:
            continue
        n_sections += 1
        paragraphs = ext._split_paragraphs(sec.get("text", "") or "")
        n_paragraphs += len(paragraphs)
        for para in paragraphs:
            if len(para) < ext.min_len:
                continue
            n_long_enough += 1
            # NOTE(review): only paragraphs that passed the length filter
            # are scored here — confirm this nesting matches the extractor.
            if ext._classify_paragraph(para)["score"] >= _SCORE_THRESHOLD:
                n_score_passing += 1
    return n_sections, n_paragraphs, n_long_enough, n_score_passing


def main(argv: list[str] | None = None) -> None:
    """Run the four diagnostic steps and print each stage's output.

    Args:
        argv: command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: raised by argparse when the arguments are invalid
            (e.g. ``--parquet`` is missing).
    """
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--parquet", required=True)
    ap.add_argument("--n", type=int, default=5)
    args = ap.parse_args(argv)

    # Project imports stay inside main() so argument errors are reported
    # before any project module has to be importable.
    _banner(f"STEP 1: load {args.n} random judgments", leading_blank=False)
    from tau_rag.scripts.benchmark_pipeline import _load_random_docs

    docs = _load_random_docs(args.parquet, n=args.n, seed=42)
    print(f"Loaded {len(docs)} docs\n")

    _banner(
        "STEP 2: run detect_outcome on each — show what matches",
        leading_blank=False,
    )
    from tau_rag.scripts.build_polarity_lexicon import (
        detect_outcome,
        _ACCEPT_RE,
        _REJECT_RE,
    )

    for num, (doc_id, text) in enumerate(docs, start=1):
        print(
            f"\n--- Doc {num} ({doc_id[:30]}) — text length: {len(text)} ---"
        )
        outcome = detect_outcome(text)
        print(f" detected_outcome: {outcome}")
        # The regexes below run on the last 500 chars (operative section);
        # only the first 300 of those are echoed to keep the output short.
        tail = text[-500:]
        print(" TAIL (last 500 chars):")
        print(f" {tail[:300]}...")
        # What did the regex match? Search once and reuse the match.
        for label, pattern in (("ACCEPT", _ACCEPT_RE), ("REJECT", _REJECT_RE)):
            match = pattern.search(tail)
            if match:
                print(f" {label} regex match: {match.group(0)!r}")

    _banner("STEP 3: run judgment_structurer on each — show section IDs")
    try:
        from tau_rag.judgment_structurer import structure_judgment
    except Exception as e:
        # Step 4 needs the structurer as well, so stop here.
        print(f" CANNOT IMPORT judgment_structurer: {e}")
        return

    for num, (doc_id, text) in enumerate(docs[:_DETAIL_DOCS], start=1):
        print(f"\n--- Doc {num} ({doc_id[:30]}) ---")
        try:
            sections = structure_judgment(text).get("sections", []) or []
            print(f" n_sections: {len(sections)}")
            for sec in sections:
                txt_len = len(sec.get("text") or "")
                print(f" id={sec.get('id', '')!r:30s} text_len={txt_len}")
        except Exception as e:
            print(f" STRUCTURER FAILED: {e}")

    _banner(
        "STEP 4: run CaseBasedArgumentExtractor on each doc, "
        "tracing the SECTION path (not fallback)"
    )
    from tau_rag.intelligence.case_based_arguments import (
        CaseBasedArgumentExtractor,
    )

    ext = CaseBasedArgumentExtractor()
    print(f" ext.min_len = {ext.min_len}")
    for num, (doc_id, text) in enumerate(docs[:_DETAIL_DOCS], start=1):
        print(f"\n--- Doc {num} ({doc_id[:30]}) ---")
        # Walk the section path manually so we see EXACTLY what runs.
        try:
            sections = structure_judgment(text).get("sections", []) or []
        except Exception as e:
            # Report the failure instead of hiding it: an all-zero trace
            # caused by a crash must not look like "no sections found".
            print(f" STRUCTURER FAILED: {e}")
            sections = []

        (
            n_section_passes,
            n_paragraphs,
            n_min_len,
            n_score_passing,
        ) = _count_argument_paragraphs(sections, ext)
        print(f" arg-bearing sections found: {n_section_passes}")
        print(f" paragraphs from those sections: {n_paragraphs}")
        print(f" paragraphs ≥{ext.min_len} chars: {n_min_len}")
        print(
            f" paragraphs with score ≥{_SCORE_THRESHOLD:.2f}: "
            f"{n_score_passing}"
        )

        templates = ext._extract_arguments_from_one_case(doc_id, text)
        print(f" n_templates_extracted (actual): {len(templates)}")
        if templates:
            for t in templates[:3]:
                print(f" side={t.side} score={t.confidence:.2f}")
                print(f" thesis: {t.thesis[:90]}...")


if __name__ == "__main__":
    main()