#!/usr/bin/env python3
r"""eval_graph_arguments.py — quality snapshot of the graph-driven argument pipeline.

Runs a fixed set of canonical Hebrew legal questions through
``/v1/lawyer/ask`` (or, with ``--via hgraph``, ``/v1/hgraph/argument``) and
reports, per question, whether the bundle produced the expected doctrine,
whether ``arguments[0]`` came from the graph (vs. legacy
verbatim_from_precedent), and a few sanity counters.

Useful as:
  • Regression check — re-run after clustering / retriever changes to
    confirm no doctrine routing has shifted unexpectedly.
  • Diagnostic — when a corpus is added, see which questions newly route
    to a cluster (vs. fall through to the legacy path).
  • Snapshot — diff the ``--json`` output across runs to track quality
    over time.

Usage:
    python -m tau_rag.scripts.eval_graph_arguments
    python -m tau_rag.scripts.eval_graph_arguments \
        --base-url http://localhost:8000 \
        --json /tmp/eval_$(date +%s).json

The process exits non-zero when any question errors out or is graded FAIL.
"""
from __future__ import annotations

import argparse
import json
import sys
import time
import urllib.error
import urllib.request
from typing import Any, Dict, List, Optional

# ──────────────────────────────────────────────────────────────────────
# Canonical question set — covers the major Israeli civil-law doctrines
# that should be in any reasonable corpus. Each entry carries:
#   • question — the user-facing query
#   • expect_anchor_substring — a string that should appear in the
#     bundle's anchor_label when routing succeeded. None = no specific
#     expectation (we only check that the system produced an answer).
#   • expect_quote_keywords — list of Hebrew terms; ALL must appear in
#     the resulting anchor_quote (case-insensitive). Catches the failure
#     mode where we promote a bundle but the quote is actually about a
#     different topic. Optional.
#   • expect_no_promotion — when True, PASS only if the bundle did NOT
#     promote (out-of-scope queries / sanity checks).
# ──────────────────────────────────────────────────────────────────────
QUESTIONS: List[Dict[str, Any]] = [
    # ── Contract law — apropim doctrine ─────────────────────────────
    {
        "question": "פרשנות תכליתית של חוזה לפי הלכת אפרופים",
        "expect_anchor_substring": "אפרופים",
        "expect_quote_keywords": ["פירוש", "תכלית"],
    },
    {
        "question": "חובת תום לב במשא ומתן לקראת חוזה",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["תום לב"],
    },
    {
        "question": "פיצויים מוסכמים שאינם פרופורציונליים לנזק",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["פיצוי"],
    },
    {
        "question": "תרופות בשל הפרת חוזה",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["תרופ", "חוזה"],
    },
    {
        "question": "אכיפת חוזה לפי החוק",
        "expect_anchor_substring": None,
    },
    # ── Tort law ─────────────────────────────────────────────────────
    {
        "question": "אחריות מעוולים יחד לנזיקין",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["נזיק"],
    },
    {
        "question": "מבחן הצפיות בעבירה של רשלנות",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["רשלנות"],
    },
    {
        "question": "פיצוי על נזק לא ממוני בנזיקין",
        "expect_anchor_substring": None,
    },
    # ── Labor / employment ───────────────────────────────────────────
    {
        "question": "פיצויי פיטורים לעובד שפוטר ללא שימוע",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["פיטור"],
    },
    {
        "question": "זכויות עובד בעת מחלה לפי חוק דמי מחלה",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["מחלה"],
    },
    {
        "question": "שעות עבודה ומנוחה לפי החוק",
        "expect_anchor_substring": None,
    },
    {
        "question": "שוויון הזדמנויות בעבודה והפליה",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["הפלי"],
    },
    # ── Health & insurance ───────────────────────────────────────────
    {
        "question": "זכויות חולה לקבלת מידע רפואי",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["חולה"],
    },
    {
        "question": "ביטוח בריאות ממלכתי וזכאות",
        "expect_anchor_substring": None,
    },
    {
        "question": "ילד נכה ביטוח לאומי קצבה",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["ילד נכה"],
    },
    # ── Out-of-scope / sanity ────────────────────────────────────────
    {
        "question": "חוקי טראפיק באוקלהומה משנת 1985",
        "expect_anchor_substring": None,
        "expect_no_promotion": True,
    },
    {
        "question": "כיצד לאפות עוגת שוקולד עם ביצים וקמח",
        "expect_anchor_substring": None,
        "expect_no_promotion": True,
    },
    # ── Phase 3.1 expansion: bring eval set to 50 ────────────────────
    # Goal per PRODUCTION_PLAN.md: ≥85% PASS, 0 FAIL on this expanded
    # set. Keywords are kept conservative (single Hebrew root) to avoid
    # false WEAKs on legitimate paraphrases.
    # Contract law — 9 new
    {
        "question": "סיכול חוזה לאור נסיבות בלתי צפויות",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["סיכול"],
    },
    {
        "question": "טעות בכריתת חוזה ועילת ביטול",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["טעות"],
    },
    {
        "question": "הטעייה בעת כריתת חוזה",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["הטעי"],
    },
    {
        "question": "כפייה והשפעה בלתי הוגנת בכריתת חוזה",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["כפי"],
    },
    {
        "question": "תניה מקפחת בחוזה אחיד",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["מקפח"],
    },
    {
        "question": "ויתור על זכויות חוזיות",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["ויתור"],
    },
    {
        "question": "עשיית עושר ולא במשפט",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["עושר"],
    },
    {
        "question": "חוזה למראית עין",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["מראית"],
    },
    {
        "question": "ערבות לחיוב חוזי",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["ערב"],
    },
    # Tort — 9 new
    {
        "question": "אחריות מחזיק במקרקעין כלפי מבקרים",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["מקרק"],
    },
    {
        "question": "גרימת מטרד לשכן",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["מטרד"],
    },
    {
        "question": "חובת הקטנת הנזק על הניזוק",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["הקטנ"],
    },
    {
        "question": "נטל הראיה בתביעת רשלנות",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["נטל"],
    },
    {
        "question": "רשלנות רפואית של רופא מטפל",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["רשלנות"],
    },
    {
        "question": "פגיעה בפרטיות בעידן הדיגיטלי",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["פרטיות"],
    },
    {
        "question": "אחריות יצרן למוצר פגום",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["אחריות"],
    },
    {
        "question": "רישיון מרצון בעוולת הסגת גבול",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["רישיון"],
    },
    {
        "question": "עוולת תרמית בנזיקין",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["תרמית"],
    },
    # Employment — 8 new
    {
        "question": "תשלום שעות נוספות לעובד",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["נוספות"],
    },
    {
        "question": "שכר מינימום לעובד יומי",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["מינימום"],
    },
    {
        "question": "דמי הבראה לעובד שנתי",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["הבראה"],
    },
    {
        "question": "תחולת הסכם קיבוצי כללי",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["קיבוצי"],
    },
    {
        "question": "הטרדה מינית במקום העבודה",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["הטרד"],
    },
    {
        "question": "הודעה מוקדמת בעת פיטורים",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["הודעה"],
    },
    {
        "question": "התפטרות בדין מפוטר",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["התפט"],
    },
    {
        "question": "הפליה בעבודה על רקע מין או גיל",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["הפלי"],
    },
    # Health — 4 new
    {
        "question": "מינוי אפוטרופוס על קטין",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["אפוטרופ"],
    },
    {
        "question": "הסכמה מדעת לטיפול רפואי",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["הסכמ"],
    },
    {
        "question": "סודיות רפואית וזכות לעיין בתיק",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["סודיות"],
    },
    {
        "question": "סל שירותי הבריאות הממלכתי",
        "expect_anchor_substring": None,
        "expect_quote_keywords": ["סל"],
    },
    # Out-of-scope — 3 new (false-positive control)
    {
        "question": "מתכון לעוגת לימון עם קצפת",
        "expect_anchor_substring": None,
        "expect_no_promotion": True,
    },
    {
        "question": "הוראות הרכבת רהיט מאיקאה",
        "expect_anchor_substring": None,
        "expect_no_promotion": True,
    },
    {
        "question": "תוצאות מבחני בגרות במתמטיקה",
        "expect_anchor_substring": None,
        "expect_no_promotion": True,
    },
]


# ──────────────────────────────────────────────────────────────────────
# HTTP — stdlib only so the script runs anywhere
# ──────────────────────────────────────────────────────────────────────
def _post_json(url: str, body: Dict[str, Any],
               timeout: int = 30) -> Dict[str, Any]:
    """POST a JSON body and return the parsed response as a dict.

    Never raises: failures are reported in-band so that one bad question
    cannot abort the whole evaluation run.

    Args:
        url: Full endpoint URL.
        body: JSON-serialisable request payload.
        timeout: Socket timeout in seconds, passed to ``urlopen``.

    Returns:
        The decoded JSON object on success. On failure, one of:
          • ``{"_http_error": <status>, "_body": <text>}`` for an HTTP
            error status;
          • ``{"_error": "<ExceptionType>: <message>"}`` for anything
            else (connection refused, timeout, invalid JSON, or a JSON
            document that is not an object).
    """
    data = json.dumps(body).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            parsed = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error page must not raise from
        # inside the handler and defeat the "never raises" contract.
        return {
            "_http_error": e.code,
            "_body": e.read().decode("utf-8", errors="replace"),
        }
    except Exception as e:  # deliberate best-effort boundary, see docstring
        return {"_error": f"{type(e).__name__}: {e}"}
    if not isinstance(parsed, dict):
        # Callers use dict methods on the payload; a bare list / null /
        # scalar response is reported as an error instead of crashing them.
        return {
            "_error": "unexpected JSON payload type: "
                      f"{type(parsed).__name__}"
        }
    return parsed


# ──────────────────────────────────────────────────────────────────────
# Per-question evaluation
# ──────────────────────────────────────────────────────────────────────
def _as_float(value: Any, default: float = 0.0) -> float:
    """Coerce *value* to float, falling back to *default* when it is
    falsy or not convertible."""
    try:
        return float(value or default)
    except (TypeError, ValueError):
        return default


def _verdict(
    *,
    expect_no_promo: bool,
    has_anchor_expectation: bool,
    anchor_match: bool,
    promoted: bool,
    quote_keywords_ok: bool,
) -> str:
    """Map the per-question signals onto PASS / WEAK / FAIL.

    Verdict ladder:
      FAIL — expectation explicitly violated (out-of-scope promoted,
             required substring missing, or required keywords missing
             while an anchor was expected)
      PASS — promoted (when expected) AND all assertions held
      WEAK — system produced something but didn't fully meet expectations
    """
    if expect_no_promo:
        return "FAIL" if promoted else "PASS"
    if not anchor_match:
        # Only reachable when an anchor substring was expected.
        return "FAIL"
    if not promoted:
        return "WEAK"
    if not quote_keywords_ok:
        # Promoted but content is off: hard failure only when a specific
        # anchor was expected, otherwise merely weak.
        return "FAIL" if has_anchor_expectation else "WEAK"
    return "PASS"


def evaluate(
    base_url: str,
    q: Dict[str, Any],
    timeout: int,
    *,
    via: str = "lawyer",
) -> Dict[str, Any]:
    """Run one question and extract the quality signals.

    `via` selects which endpoint drives the eval:
      • "lawyer" (default) — /v1/lawyer/ask, which runs the full
        synthesizer + promotes the bundle to arguments[0]. Slow on large
        corpora because of the synthesizer.
      • "hgraph" — /v1/hgraph/argument, the bundle endpoint directly.
        Bypasses the synthesizer; use this to evaluate clustering quality
        on large corpora without the synthesizer overhead. "Promoted" is
        then derived from the bundle's own can_promote signature
        (cluster_score ≥ 0.5 AND non-empty anchor_quote).

    Args:
        base_url: Server base URL; a trailing slash is tolerated.
        q: One entry of ``QUESTIONS``.
        timeout: Per-request timeout in seconds.
        via: Endpoint selector, see above.

    Returns:
        A result row. On a transport/API failure the row has
        ``ok=False`` plus ``error``; otherwise ``ok=True``, a ``verdict``
        and the diagnostic counters read from the response.
    """
    root = base_url.rstrip("/")  # avoid "//v1/..." when given ".../"
    started = time.monotonic()
    if via == "hgraph":
        payload = _post_json(
            f"{root}/v1/hgraph/argument",
            {"user_facts": q["question"], "retrieval_k": 20},
            timeout=timeout,
        )
    else:
        payload = _post_json(
            f"{root}/v1/lawyer/ask",
            {"question": q["question"]},
            timeout=timeout,
        )
    elapsed_ms = int((time.monotonic() - started) * 1000)

    # Network / API failure — return early with the error
    if "_error" in payload or "_http_error" in payload:
        return {
            "question": q["question"],
            "ok": False,
            "elapsed_ms": elapsed_ms,
            "error": payload.get("_error") or payload.get("_http_error"),
        }

    bundle = payload.get("bundle") or {}
    if via == "hgraph":
        # Synthesize a stand-in arguments[0] from the bundle's own
        # can_promote logic so the verdict ladder works unchanged.
        score = _as_float(bundle.get("cluster_score"))
        has_quote = bool((bundle.get("anchor_quote") or "").strip())
        args = ([{"polish_method": "graph_bundle"}]
                if score >= 0.5 and has_quote else [])
    else:
        args = payload.get("arguments") or []
    arg0 = args[0] if args else {}

    expected: Optional[str] = q.get("expect_anchor_substring")
    expect_no_promo = bool(q.get("expect_no_promotion"))
    expect_kws = q.get("expect_quote_keywords") or []

    anchor_label = bundle.get("anchor_label") or ""
    anchor_quote = bundle.get("anchor_quote") or ""
    anchor_match = (
        (expected is None)
        or (expected.lower() in anchor_label.lower())
    )

    # Quote-content check — every required keyword must appear in the
    # anchor_quote. Catches the failure mode where we promote a bundle
    # but the quote is from a different topic than the question.
    quote_lc = anchor_quote.lower()
    missing_kws = [kw for kw in expect_kws if kw.lower() not in quote_lc]
    quote_keywords_ok = not missing_kws

    # Did the graph promote? (arguments[0].polish_method == 'graph_bundle')
    promoted = arg0.get("polish_method") == "graph_bundle"

    verdict = _verdict(
        expect_no_promo=expect_no_promo,
        has_anchor_expectation=expected is not None,
        anchor_match=anchor_match,
        promoted=promoted,
        quote_keywords_ok=quote_keywords_ok,
    )

    return {
        "question": q["question"],
        "ok": True,
        "elapsed_ms": elapsed_ms,
        "verdict": verdict,
        "tier": payload.get("confidence"),
        "domain": payload.get("domain"),
        "cluster_id": bundle.get("cluster_id"),
        "anchor_label": anchor_label,
        "anchor_label_match": anchor_match,
        "expected_substring": expected,
        "expect_quote_keywords": expect_kws,
        "missing_keywords": missing_kws,
        "quote_keywords_ok": quote_keywords_ok,
        "promoted_to_arguments": promoted,
        "polish_method": arg0.get("polish_method"),
        "cluster_score": bundle.get("cluster_score"),
        "coverage": bundle.get("coverage"),
        "n_total_applications": bundle.get("n_total_applications"),
        "n_total_origins": bundle.get("n_total_origins"),
        "n_alternatives": len(
            ((bundle.get("diagnostic") or {}).get("alternative_clusters"))
            or []
        ),
        "is_virtual_anchor":
            (bundle.get("anchor_id") or "").startswith("virtual:"),
        "anchor_quote_chars": len(anchor_quote),
    }


# ──────────────────────────────────────────────────────────────────────
# Reporting
# ──────────────────────────────────────────────────────────────────────
# ANSI colour prefix per verdict; the label itself is padded separately so
# the escape codes never count towards the column width.
_VERDICT_COLORS: Dict[str, str] = {
    "PASS": "\033[32m",
    "FAIL": "\033[31m",
    "WEAK": "\033[33m",
}
_ANSI_RESET = "\033[0m"


def print_table(rows: List[Dict[str, Any]]) -> None:
    """Pretty-print a one-line-per-question summary.

    Column widths are shared by the header, the data rows and the ERROR
    rows, so the three line shapes stay aligned. Rows with ``ok`` falsy
    are printed as ERROR followed by the error text; for graded rows an
    extra warning line is printed when the anchor label did not match or
    required keywords were missing.

    Args:
        rows: Result rows as produced by ``evaluate``.
    """
    print()
    header = (
        f"{'#':>2} {'verdict':7s} {'tier':10s} {'score':>5} "
        f"{'cov':>4} {'apps':>4} {'alts':>4} {'method':18s} question"
    )
    print(header)
    print("─" * len(header))
    for i, r in enumerate(rows, 1):
        if not r.get("ok"):
            print(f"{i:>2} {'ERROR':7s} {'':10s} {'':5s} {'':4s} {'':4s} "
                  f"{'':4s} {'':18s} {r['question'][:60]}")
            print(f" → {r.get('error')}")
            continue

        score = r.get("cluster_score")
        score_str = f"{score:.2f}" if isinstance(score, (int, float)) else "—"
        cov = r.get("coverage")
        cov_str = (f"{int(cov * 100)}%"
                   if isinstance(cov, (int, float)) else "—")
        apps = r.get("n_total_applications") or 0
        alts = r.get("n_alternatives") or 0
        # str(): tier / method come straight from the server response and
        # are not guaranteed to be strings (a numeric confidence would
        # otherwise fail on slicing).
        tier = str(r.get("tier") or "—")[:10]
        method = str(r.get("polish_method") or "—")[:18]

        verdict = str(r["verdict"])
        color = _VERDICT_COLORS.get(verdict)
        verdict_cell = (f"{color}{verdict:7s}{_ANSI_RESET}"
                        if color else f"{verdict:7s}")

        print(f"{i:>2} {verdict_cell} {tier:10s} "
              f"{score_str:>5} {cov_str:>4} {apps:>4} {alts:>4} "
              f"{method:18s} {r['question'][:50]}")
        if not r["anchor_label_match"]:
            print(f" ⚠ expected '{r['expected_substring']}' in anchor; "
                  f"got '{r['anchor_label'][:50]}'")
        if r.get("missing_keywords"):
            print(f" ⚠ missing keyword(s) in anchor_quote: "
                  f"{r['missing_keywords']}")


def print_summary(rows: List[Dict[str, Any]]) -> None:
    """Print aggregate PASS / FAIL / WEAK / ERR counts and mean latency.

    Prints nothing for an empty ``rows``. Latency and promotion counts are
    computed over the rows that completed (``ok`` truthy) only.
    """
    total = len(rows)
    if total == 0:
        return
    valid = [r for r in rows if r.get("ok")]
    n_pass = sum(1 for r in valid if r.get("verdict") == "PASS")
    n_fail = sum(1 for r in valid if r.get("verdict") == "FAIL")
    n_weak = sum(1 for r in valid if r.get("verdict") == "WEAK")
    n_err = total - len(valid)
    n_promoted = sum(1 for r in valid if r.get("promoted_to_arguments"))
    # max(1, …) keeps the division defined when every request failed.
    avg_ms = (sum(r.get("elapsed_ms", 0) for r in valid)
              / max(1, len(valid)))
    print()
    print("─" * 60)
    print(f"Total: {total} questions · PASS: {n_pass} FAIL: {n_fail} "
          f"WEAK: {n_weak} ERR: {n_err}")
    print(f"Promoted to arguments[0]: {n_promoted}/{len(valid)} "
          f"· avg latency: {avg_ms:.0f}ms")


# ──────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────
def main() -> int:
    """CLI entry point.

    Runs every entry of ``QUESTIONS`` sequentially, prints the table and
    the summary, and optionally dumps the raw rows as JSON.

    Returns:
        Process exit code: 1 if any row errored or was graded FAIL,
        otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description="Quality-check the graph-driven argument pipeline."
    )
    parser.add_argument(
        "--base-url",
        default="http://127.0.0.1:8000",
        help="tau-rag server base URL",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=30,
        help="per-request timeout in seconds",
    )
    parser.add_argument(
        "--json",
        default=None,
        help="if set, also write the full result rows to this JSON file",
    )
    parser.add_argument(
        "--via",
        choices=("lawyer", "hgraph"),
        default="lawyer",
        help="which endpoint to evaluate: lawyer/ask (default, full path) "
             "or hgraph/argument (bundle-only, fast on large corpora)",
    )
    args = parser.parse_args()

    print(f"# Running {len(QUESTIONS)} questions against {args.base_url} "
          f"via {args.via}")
    rows: List[Dict[str, Any]] = [
        evaluate(args.base_url, q, args.timeout, via=args.via)
        for q in QUESTIONS
    ]

    print_table(rows)
    print_summary(rows)

    if args.json:
        with open(args.json, "w", encoding="utf-8") as f:
            json.dump(rows, f, ensure_ascii=False, indent=2)
        print(f"\nFull results written to: {args.json}")

    # Exit non-zero if any FAIL or ERR
    bad = sum(1 for r in rows
              if not r.get("ok") or r.get("verdict") == "FAIL")
    return 1 if bad > 0 else 0


if __name__ == "__main__":
    sys.exit(main())