#!/usr/bin/env python3
"""debug_pipeline.py — Diagnose why CBR returns 0 templates.

Walks a random sample of judgments (``--n``, default 5) through the pipeline
step-by-step, printing the OUTPUT of each stage so we can see exactly where
the failure is. Steps 3 and 4 inspect only the first three sampled documents.
"""
from __future__ import annotations

import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))


def _banner(title: str, *, leading_blank: bool = False) -> None:
    """Print a step heading framed by two 78-character rules of '='.

    Args:
        title: Heading text printed between the two rules.
        leading_blank: When true, emit an empty line before the first rule.
    """
    rule = "=" * 78
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def main() -> None:
    """Run the four diagnostic steps and print each stage's output.

    Command-line arguments:
        --parquet  (required) passed through to ``_load_random_docs``.
        --n        number of documents requested from the loader (default 5).

    Steps:
        1. Load the sample via ``_load_random_docs``.
        2. Run ``detect_outcome`` on every document and show which of the
           accept/reject regexes match the last 500 characters.
        3. Run ``structure_judgment`` on the first three documents and list
           the section ids and text lengths it produced.
        4. For the first three documents, re-trace the section filter by hand
           (section id -> paragraphs -> min length -> score threshold) and
           compare with what the extractor actually returns.

    Project imports are done lazily, step by step, so that an import problem
    surfaces under the heading of the step that needs it.
    """
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument("--parquet", required=True)
    ap.add_argument("--n", type=int, default=5)
    args = ap.parse_args()

    # Steps 3 and 4 only look at the first few documents to keep output short.
    detail_limit = 3
    # Score cut-off used for the manual trace in step 4.
    # NOTE(review): presumably mirrors the extractor's own threshold — confirm.
    min_score = 0.20

    _banner(f"STEP 1: load {args.n} random judgments")
    from tau_rag.scripts.benchmark_pipeline import _load_random_docs
    # Fixed seed — presumably makes the sample repeatable between runs.
    docs = _load_random_docs(args.parquet, n=args.n, seed=42)
    print(f"Loaded {len(docs)} docs\n")

    _banner("STEP 2: run detect_outcome on each — show what matches")
    from tau_rag.scripts.build_polarity_lexicon import (
        detect_outcome, _ACCEPT_RE, _REJECT_RE,
    )
    outcome_patterns = (("ACCEPT", _ACCEPT_RE), ("REJECT", _REJECT_RE))
    for i, (doc_id, text) in enumerate(docs, start=1):
        # str() so that non-string ids (e.g. integer keys) can be truncated.
        print(f"\n--- Doc {i} ({str(doc_id)[:30]}) — "
              f"text length: {len(text)} ---")
        outcome = detect_outcome(text)
        print(f"  detected_outcome: {outcome}")
        # Show last 500 chars (operative section); only the first 300 of
        # those are echoed, but the regexes below see the full 500.
        tail = text[-500:]
        print("  TAIL (last 500 chars):")
        print(f"    {tail[:300]}...")
        # What did the regex match? Search once and reuse the match object.
        for label, pattern in outcome_patterns:
            match = pattern.search(tail)
            if match:
                print(f"  {label} regex match: {match.group(0)!r}")

    _banner("STEP 3: run judgment_structurer on each — show section IDs",
            leading_blank=True)
    try:
        from tau_rag.judgment_structurer import structure_judgment
    except Exception as e:
        print(f"  CANNOT IMPORT judgment_structurer: {e}")
        return

    for i, (doc_id, text) in enumerate(docs[:detail_limit], start=1):
        print(f"\n--- Doc {i} ({str(doc_id)[:30]}) ---")
        try:
            s = structure_judgment(text)
            # `or []` also covers a "sections" key that is present but None.
            sections = s.get("sections") or []
            print(f"  n_sections: {len(sections)}")
            for sec in sections:
                txt_len = len(sec.get("text") or "")
                print(f"    id={sec.get('id', '')!r:30s} text_len={txt_len}")
        except Exception as e:
            print(f"  STRUCTURER FAILED: {e}")

    _banner("STEP 4: run CaseBasedArgumentExtractor on each doc, "
            "tracing the SECTION path (not fallback)",
            leading_blank=True)
    from tau_rag.intelligence.case_based_arguments import (
        CaseBasedArgumentExtractor,
    )
    ext = CaseBasedArgumentExtractor()
    print(f"  ext.min_len = {ext.min_len}")

    # Section ids treated as argument-bearing. Only the keys are used here;
    # the side values are kept for reference.
    # NOTE(review): presumably copied from the extractor — verify they match.
    side_map = {
        "arguments_plaintiff": "claimant",
        "arguments_claimant": "claimant",
        "arguments_defendant": "respondent",
        "arguments_respondent": "respondent",
        "arguments_general": None,
    }
    arg_sections = set(side_map) | {"discussion"}

    for i, (doc_id, text) in enumerate(docs[:detail_limit], start=1):
        print(f"\n--- Doc {i} ({str(doc_id)[:30]}) ---")
        # Walk the section path manually so we see EXACTLY what runs.
        try:
            struct = structure_judgment(text)
            sections = struct.get("sections") or []
        except Exception as e:
            # Report instead of silently continuing: an empty section list
            # here would otherwise look like "no argument sections found".
            print(f"    STRUCTURER FAILED (section trace will be empty): {e}")
            sections = []

        n_section_passes = 0
        n_paragraphs = 0
        n_min_len = 0
        n_score_passing = 0
        for sec in sections:
            if sec.get("id", "") not in arg_sections:
                continue
            n_section_passes += 1
            paras = ext._split_paragraphs(sec.get("text", "") or "")
            n_paragraphs += len(paras)
            for p in paras:
                if len(p) < ext.min_len:
                    continue
                n_min_len += 1
                tags = ext._classify_paragraph(p)
                if tags["score"] >= min_score:
                    n_score_passing += 1
        print(f"    arg-bearing sections found:    {n_section_passes}")
        print(f"    paragraphs from those sections: {n_paragraphs}")
        print(f"    paragraphs ≥{ext.min_len} chars:           "
              f"{n_min_len}")
        print(f"    paragraphs with score ≥{min_score:.2f}:    "
              f"{n_score_passing}")

        # Compare the manual trace with what the extractor really produces.
        templates = ext._extract_arguments_from_one_case(doc_id, text)
        print(f"  n_templates_extracted (actual): {len(templates)}")
        for t in templates[:3]:
            print(f"    side={t.side} score={t.confidence:.2f}")
            print(f"      thesis: {t.thesis[:90]}...")

# Run the diagnostic only when executed as a script, not when imported.
if __name__ == "__main__":
    main()