Spaces:

Legal-i
/

legal-eye

Running

App Files Files Community

legal-eye / tau_rag /scripts /eval_graph_arguments.py

Legal-i

Initial deploy: legal-eye Hebrew legal RAG (17K corpus, verbatim-from-precedent)

3be54c6 verified about 1 month ago

raw

history blame contribute delete

24 kB

	#!/usr/bin/env python3
	"""eval_graph_arguments.py — quality snapshot of the graph-driven
	argument pipeline.

	Runs a fixed set of canonical Hebrew legal questions through
	``/v1/lawyer/ask`` and reports, per question, whether the bundle
	produced the expected doctrine, whether ``arguments[0]`` came from
	the graph (vs. legacy verbatim_from_precedent), and a few sanity
	counters.

	Useful as:
	• Regression check — re-run after clustering / retriever changes
	to confirm no doctrine routing has shifted unexpectedly.
	• Diagnostic — when a corpus is added, see which questions newly
	route to a cluster (vs. fall through to the legacy path).
	• Snapshot — diff the ``--json`` output across runs to track
	quality over time.

	Usage:
	python -m tau_rag.scripts.eval_graph_arguments
	python -m tau_rag.scripts.eval_graph_arguments \
	--base-url http://localhost:8000 \
	--json /tmp/eval_$(date +%s).json
	"""
	from __future__ import annotations

	import argparse
	import json
	import sys
	import time
	import urllib.error
	import urllib.request
	from typing import Any, Dict, List, Optional


	# ──────────────────────────────────────────────────────────────────────
	# Canonical question set — covers the major Israeli civil-law doctrines
	# that should be in any reasonable corpus. Each entry is a dict consumed
	# by evaluate() and carries:
	# • question — the user-facing query sent to the endpoint.
	# • expect_anchor_substring — a string that must appear in the bundle's
	#   anchor_label; a mismatch is a FAIL. None = no anchor expectation:
	#   the entry then PASSes only when the bundle was promoted and every
	#   quote keyword was found, and is WEAK otherwise (never FAIL).
	# • expect_quote_keywords — optional list of Hebrew terms; ALL must
	#   appear as substrings of the resulting anchor_quote (both sides are
	#   lower-cased before comparing). Catches the failure mode where we
	#   promote a bundle but the quote is actually about a different
	#   topic. On a promoted bundle a missing keyword is a FAIL when an
	#   anchor substring is also expected, and a WEAK when it is not.
	# • expect_no_promotion — when True, PASS only if the bundle did NOT
	#   promote and FAIL if it did (out-of-scope queries / sanity checks).
	#   This flag is checked first; the other expectations are then ignored.
	# ──────────────────────────────────────────────────────────────────────
	QUESTIONS: List[Dict[str, Any]] = [
	# ── Contract law (the first entry targets the Apropim doctrine) ──
	{
	"question": "פרשנות תכליתית של חוזה לפי הלכת אפרופים",
	"expect_anchor_substring": "אפרופים",
	"expect_quote_keywords": ["פירוש", "תכלית"],
	},
	{
	"question": "חובת תום לב במשא ומתן לקראת חוזה",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["תום לב"],
	},
	{
	"question": "פיצויים מוסכמים שאינם פרופורציונליים לנזק",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["פיצוי"],
	},
	{
	"question": "תרופות בשל הפרת חוזה",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["תרופ", "חוזה"],
	},
	{
	"question": "אכיפת חוזה לפי החוק",
	"expect_anchor_substring": None,
	},
	# ── Tort law ─────────────────────────────────────────────────────
	{
	"question": "אחריות מעוולים יחד לנזיקין",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["נזיק"],
	},
	{
	"question": "מבחן הצפיות בעבירה של רשלנות",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["רשלנות"],
	},
	{
	"question": "פיצוי על נזק לא ממוני בנזיקין",
	"expect_anchor_substring": None,
	},
	# ── Labor / employment ───────────────────────────────────────────
	{
	"question": "פיצויי פיטורים לעובד שפוטר ללא שימוע",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["פיטור"],
	},
	{
	"question": "זכויות עובד בעת מחלה לפי חוק דמי מחלה",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["מחלה"],
	},
	{
	"question": "שעות עבודה ומנוחה לפי החוק",
	"expect_anchor_substring": None,
	},
	{
	"question": "שוויון הזדמנויות בעבודה והפליה",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["הפלי"],
	},
	# ── Health & insurance ───────────────────────────────────────────
	{
	"question": "זכויות חולה לקבלת מידע רפואי",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["חולה"],
	},
	{
	"question": "ביטוח בריאות ממלכתי וזכאות",
	"expect_anchor_substring": None,
	},
	{
	"question": "ילד נכה ביטוח לאומי קצבה",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["ילד נכה"],
	},
	# ── Out-of-scope / sanity (must NOT be promoted) ─────────────────
	{
	"question": "חוקי טראפיק באוקלהומה משנת 1985",
	"expect_anchor_substring": None,
	"expect_no_promotion": True,
	},
	{
	"question": "כיצד לאפות עוגת שוקולד עם ביצים וקמח",
	"expect_anchor_substring": None,
	"expect_no_promotion": True,
	},
	# ── Phase 3.1 expansion: bring eval set to 50 ────────────────────
	# Goal per PRODUCTION_PLAN.md: ≥85% PASS, 0 FAIL on this expanded
	# set. Keywords are kept conservative (single Hebrew root) to avoid
	# false WEAKs on legitimate paraphrases.
	# Contract law — 9 new
	{
	"question": "סיכול חוזה לאור נסיבות בלתי צפויות",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["סיכול"],
	},
	{
	"question": "טעות בכריתת חוזה ועילת ביטול",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["טעות"],
	},
	{
	"question": "הטעייה בעת כריתת חוזה",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["הטעי"],
	},
	{
	"question": "כפייה והשפעה בלתי הוגנת בכריתת חוזה",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["כפי"],
	},
	{
	"question": "תניה מקפחת בחוזה אחיד",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["מקפח"],
	},
	{
	"question": "ויתור על זכויות חוזיות",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["ויתור"],
	},
	{
	"question": "עשיית עושר ולא במשפט",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["עושר"],
	},
	{
	"question": "חוזה למראית עין",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["מראית"],
	},
	{
	"question": "ערבות לחיוב חוזי",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["ערב"],
	},
	# Tort — 9 new
	{
	"question": "אחריות מחזיק במקרקעין כלפי מבקרים",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["מקרק"],
	},
	{
	"question": "גרימת מטרד לשכן",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["מטרד"],
	},
	{
	"question": "חובת הקטנת הנזק על הניזוק",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["הקטנ"],
	},
	{
	"question": "נטל הראיה בתביעת רשלנות",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["נטל"],
	},
	{
	"question": "רשלנות רפואית של רופא מטפל",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["רשלנות"],
	},
	{
	"question": "פגיעה בפרטיות בעידן הדיגיטלי",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["פרטיות"],
	},
	{
	"question": "אחריות יצרן למוצר פגום",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["אחריות"],
	},
	{
	"question": "רישיון מרצון בעוולת הסגת גבול",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["רישיון"],
	},
	{
	"question": "עוולת תרמית בנזיקין",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["תרמית"],
	},
	# Employment — 8 new
	{
	"question": "תשלום שעות נוספות לעובד",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["נוספות"],
	},
	{
	"question": "שכר מינימום לעובד יומי",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["מינימום"],
	},
	{
	"question": "דמי הבראה לעובד שנתי",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["הבראה"],
	},
	{
	"question": "תחולת הסכם קיבוצי כללי",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["קיבוצי"],
	},
	{
	"question": "הטרדה מינית במקום העבודה",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["הטרד"],
	},
	{
	"question": "הודעה מוקדמת בעת פיטורים",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["הודעה"],
	},
	{
	"question": "התפטרות בדין מפוטר",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["התפט"],
	},
	{
	"question": "הפליה בעבודה על רקע מין או גיל",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["הפלי"],
	},
	# Health — 4 new
	{
	"question": "מינוי אפוטרופוס על קטין",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["אפוטרופ"],
	},
	{
	"question": "הסכמה מדעת לטיפול רפואי",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["הסכמ"],
	},
	{
	"question": "סודיות רפואית וזכות לעיין בתיק",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["סודיות"],
	},
	{
	"question": "סל שירותי הבריאות הממלכתי",
	"expect_anchor_substring": None,
	"expect_quote_keywords": ["סל"],
	},
	# Out-of-scope — 3 new (false-positive control)
	{
	"question": "מתכון לעוגת לימון עם קצפת",
	"expect_anchor_substring": None,
	"expect_no_promotion": True,
	},
	{
	"question": "הוראות הרכבת רהיט מאיקאה",
	"expect_anchor_substring": None,
	"expect_no_promotion": True,
	},
	{
	"question": "תוצאות מבחני בגרות במתמטיקה",
	"expect_anchor_substring": None,
	"expect_no_promotion": True,
	},
	]


	# ──────────────────────────────────────────────────────────────────────
	# HTTP — stdlib only so the script runs anywhere
	# ──────────────────────────────────────────────────────────────────────

	def _post_json(url: str, body: Dict[str, Any], timeout: int = 30) -> Dict[str, Any]:
	    """POST ``body`` as JSON to ``url`` and return the decoded reply.

	    Request failures are reported through the returned dict rather than
	    raised, so one bad request cannot abort a whole evaluation run:

	    * success — the JSON object parsed from the response body;
	    * HTTP error status — ``{"_http_error": <status>, "_body": <text>}``;
	    * any other failure (malformed URL, unserialisable body, connection
	      error, timeout, invalid JSON, or a JSON reply that is not an
	      object) — ``{"_error": "<ExceptionName>: <message>"}``.

	    ``timeout`` is the per-request timeout in seconds.
	    """
	    try:
	        # Serialising the body and building the Request both happen inside
	        # the ``try``: Request() raises ValueError for a URL without a
	        # scheme, and that must become an ``_error`` row like any other
	        # failure instead of crashing the caller.
	        data = json.dumps(body).encode("utf-8")
	        req = urllib.request.Request(
	            url, data=data,
	            headers={"Content-Type": "application/json"},
	            method="POST",
	        )
	        with urllib.request.urlopen(req, timeout=timeout) as resp:
	            parsed = json.loads(resp.read().decode("utf-8"))
	    except urllib.error.HTTPError as e:
	        # Error pages are not guaranteed to be valid UTF-8; never let the
	        # diagnostic body itself raise.
	        return {
	            "_http_error": e.code,
	            "_body": e.read().decode("utf-8", errors="replace"),
	        }
	    except Exception as e:
	        return {"_error": f"{type(e).__name__}: {e}"}

	    # Callers use dict methods on the result, so a JSON array / scalar is
	    # reported as an error instead of being passed through.
	    if not isinstance(parsed, dict):
	        return {
	            "_error": (
	                "TypeError: expected a JSON object in the response, "
	                f"got {type(parsed).__name__}"
	            )
	        }
	    return parsed


	# ──────────────────────────────────────────────────────────────────────
	# Per-question evaluation
	# ──────────────────────────────────────────────────────────────────────

	def evaluate(
	    base_url: str, q: Dict[str, Any], timeout: int, *, via: str = "lawyer",
	) -> Dict[str, Any]:
	    """Run one question and extract the quality signals.

	    Args:
	        base_url: server base URL; a trailing ``/`` is tolerated.
	        q: one QUESTIONS-style entry. ``question`` is required;
	            ``expect_anchor_substring``, ``expect_quote_keywords`` and
	            ``expect_no_promotion`` are optional expectations.
	        timeout: per-request timeout in seconds.
	        via: which endpoint drives the eval:
	            • "lawyer" (default) — /v1/lawyer/ask, which runs the full
	              synthesizer + promotes the bundle to arguments[0]. Slow on
	              large corpora because of the synthesizer.
	            • "hgraph" — /v1/hgraph/argument, the bundle endpoint
	              directly. Bypasses the synthesizer; use this to evaluate
	              clustering quality on large corpora without the
	              synthesizer overhead. "Promoted" is then derived from the
	              bundle's own can_promote signature (cluster_score ≥ 0.5
	              AND non-empty anchor_quote).
	            Any value other than "hgraph" selects the lawyer endpoint.

	    Returns:
	        A result row. On a transport / HTTP failure it holds only
	        ``question``, ``ok=False``, ``elapsed_ms`` and ``error``.
	        Otherwise ``ok=True`` plus the verdict ("PASS" / "WEAK" /
	        "FAIL") and the diagnostic fields read from the response.
	    """
	    # Normalise the base so "http://host:8000/" does not yield a
	    # "//v1/..." request path.
	    root = base_url.rstrip("/")

	    started = time.monotonic()
	    if via == "hgraph":
	        payload = _post_json(
	            f"{root}/v1/hgraph/argument",
	            {"user_facts": q["question"], "retrieval_k": 20},
	            timeout=timeout,
	        )
	    else:
	        payload = _post_json(
	            f"{root}/v1/lawyer/ask",
	            {"question": q["question"]},
	            timeout=timeout,
	        )
	    elapsed_ms = int((time.monotonic() - started) * 1000)

	    # Network / API failure — return early with the error
	    if "_error" in payload or "_http_error" in payload:
	        return {
	            "question": q["question"],
	            "ok": False,
	            "elapsed_ms": elapsed_ms,
	            "error": payload.get("_error") or payload.get("_http_error"),
	        }

	    bundle = payload.get("bundle") or {}
	    if via == "hgraph":
	        # Synthesize a stand-in arguments[0] from the bundle's own
	        # can_promote logic so the verdict ladder below works unchanged.
	        cluster_score = float(bundle.get("cluster_score") or 0.0)
	        anchor_quote = (bundle.get("anchor_quote") or "").strip()
	        promoted_synthetic = cluster_score >= 0.5 and bool(anchor_quote)
	        args = ([{"polish_method": "graph_bundle"}]
	                if promoted_synthetic else [])
	    else:
	        args = payload.get("arguments") or []
	    arg0 = args[0] if args else {}

	    expected = q.get("expect_anchor_substring")
	    expect_no_promo = bool(q.get("expect_no_promotion"))
	    expect_kws = q.get("expect_quote_keywords") or []
	    anchor_label = bundle.get("anchor_label") or ""
	    # Re-read unstripped: the reported quote length and the keyword
	    # search use the quote exactly as the server returned it.
	    anchor_quote = bundle.get("anchor_quote") or ""
	    anchor_match = (
	        (expected is None) or (expected.lower() in anchor_label.lower())
	    )
	    # Quote-content check — every required keyword must appear in the
	    # anchor_quote. Catches the failure mode where we promote a bundle
	    # but the quote is from a different topic than the question.
	    quote_lc = anchor_quote.lower()
	    missing_kws = [kw for kw in expect_kws if kw.lower() not in quote_lc]
	    quote_keywords_ok = (not expect_kws) or (not missing_kws)

	    # Did the graph promote? (arguments[0].polish_method == 'graph_bundle')
	    promoted = arg0.get("polish_method") == "graph_bundle"

	    # Verdict ladder:
	    # FAIL — expectation explicitly violated (out-of-scope promoted,
	    #        required substring missing, or required keywords missing
	    #        on an entry that also pins the anchor)
	    # PASS — promoted (when expected) AND all assertions held
	    # WEAK — system produced something but didn't fully meet expectations
	    if expect_no_promo:
	        verdict = "PASS" if not promoted else "FAIL"
	    elif expected is not None:
	        if not anchor_match:
	            verdict = "FAIL"
	        elif not promoted:
	            verdict = "WEAK"
	        elif not quote_keywords_ok:
	            verdict = "FAIL"  # promoted but content is wrong
	        else:
	            verdict = "PASS"
	    else:
	        # No specific anchor expectation — content keywords still apply
	        if not promoted:
	            verdict = "WEAK"
	        elif not quote_keywords_ok:
	            verdict = "WEAK"  # promoted, no anchor expected, but content off
	        else:
	            verdict = "PASS"

	    return {
	        "question": q["question"],
	        "ok": True,
	        "elapsed_ms": elapsed_ms,
	        "verdict": verdict,
	        "tier": payload.get("confidence"),
	        "domain": payload.get("domain"),
	        "cluster_id": bundle.get("cluster_id"),
	        "anchor_label": anchor_label,
	        "anchor_label_match": anchor_match,
	        "expected_substring": expected,
	        "expect_quote_keywords": expect_kws,
	        "missing_keywords": missing_kws,
	        "quote_keywords_ok": quote_keywords_ok,
	        "promoted_to_arguments": promoted,
	        "polish_method": arg0.get("polish_method"),
	        "cluster_score": bundle.get("cluster_score"),
	        "coverage": bundle.get("coverage"),
	        "n_total_applications": bundle.get("n_total_applications"),
	        "n_total_origins": bundle.get("n_total_origins"),
	        "n_alternatives": len(
	            ((bundle.get("diagnostic") or {}).get("alternative_clusters")) or []
	        ),
	        "is_virtual_anchor": (bundle.get("anchor_id") or "").startswith("virtual:"),
	        "anchor_quote_chars": len(anchor_quote),
	    }


	# ──────────────────────────────────────────────────────────────────────
	# Reporting
	# ──────────────────────────────────────────────────────────────────────

	def print_table(rows: List[Dict[str, Any]]) -> None:
	    """Pretty-print a one-line-per-question summary to stdout.

	    ``rows`` are result dicts as produced by ``evaluate``. A row whose
	    ``ok`` flag is falsy is rendered as an ``ERROR`` line followed by its
	    ``error`` value. Every other row shows its verdict (ANSI-coloured),
	    tier, cluster score, coverage, application / alternative counts,
	    polish method and the question truncated to 50 characters. A warning
	    line follows a row whose anchor label did not match or whose quote
	    lacked expected keywords.
	    """
	    print()
	    # The score column is labelled "score" so the label fits the
	    # 5-character field the data rows use; a longer label pushes every
	    # later header cell out of line with the rows.
	    header = (
	        f"{'#':>2} {'verdict':7s} {'tier':10s} {'score':>5} "
	        f"{'cov':>3} {'apps':>4} {'alts':>4} {'method':18s} question"
	    )
	    print(header)
	    print("─" * len(header))
	    for i, r in enumerate(rows, 1):
	        if not r.get("ok"):
	            print(f"{i:>2} ERROR {'':10s} {'':5s} {'':3s} {'':4s} "
	                  f"{'':4s} {'':18s} {r['question'][:60]}")
	            print(f" → {r.get('error')}")
	            continue
	        score = r.get("cluster_score")
	        score_str = f"{score:.2f}" if isinstance(score, (int, float)) else "—"
	        cov = r.get("coverage")
	        cov_str = f"{int((cov or 0) * 100):>2}%" if cov is not None else "—"
	        apps = r.get("n_total_applications") or 0
	        alts = r.get("n_alternatives") or 0
	        # str() first: these values come straight from the server reply,
	        # and a non-string (e.g. a numeric confidence) cannot be sliced.
	        tier = str(r.get("tier") or "—")[:10]
	        method = str(r.get("polish_method") or "—")[:18]
	        verdict_color = {
	            "PASS": "\033[32mPASS \033[0m",
	            "FAIL": "\033[31mFAIL \033[0m",
	            "WEAK": "\033[33mWEAK \033[0m",
	        }.get(r["verdict"], r["verdict"])
	        print(f"{i:>2} {verdict_color} {tier:10s} "
	              f"{score_str:>5} {cov_str:>3} {apps:>4} {alts:>4} "
	              f"{method:18s} {r['question'][:50]}")
	        if not r["anchor_label_match"]:
	            print(f" ⚠ expected '{r['expected_substring']}' in anchor; "
	                  f"got '{r['anchor_label'][:50]}'")
	        if r.get("missing_keywords"):
	            print(f" ⚠ missing keyword(s) in anchor_quote: "
	                  f"{r['missing_keywords']}")


	def print_summary(rows: List[Dict[str, Any]]) -> None:
	    """Print the aggregate totals for a finished run.

	    Nothing is printed for an empty ``rows`` list. Otherwise two lines
	    follow a rule: the per-verdict counts (rows without a truthy ``ok``
	    count as ERR), then how many scored rows were promoted and their
	    mean ``elapsed_ms``.
	    """
	    if not rows:
	        return

	    scored = [row for row in rows if row.get("ok")]
	    tally = {
	        name: sum(1 for row in scored if row.get("verdict") == name)
	        for name in ("PASS", "FAIL", "WEAK")
	    }
	    error_count = len(rows) - len(scored)
	    promoted_count = sum(
	        1 for row in scored if row.get("promoted_to_arguments")
	    )
	    # max(1, …) keeps the mean defined when every row errored out.
	    mean_latency = (
	        sum(row.get("elapsed_ms", 0) for row in scored) / max(1, len(scored))
	    )

	    totals_line = (
	        f"Total: {len(rows)} questions · PASS: {tally['PASS']} "
	        f"FAIL: {tally['FAIL']} "
	        f"WEAK: {tally['WEAK']} ERR: {error_count}"
	    )
	    promo_line = (
	        f"Promoted to arguments[0]: {promoted_count}/{len(scored)} "
	        f"· avg latency: {mean_latency:.0f}ms"
	    )

	    print()
	    print("─" * 60)
	    print(totals_line)
	    print(promo_line)


	# ──────────────────────────────────────────────────────────────────────
	# Main
	# ──────────────────────────────────────────────────────────────────────

	def main() -> int:
	    """Command-line entry point: run every question and report.

	    Parses ``--base-url``, ``--timeout``, ``--json`` and ``--via``,
	    evaluates each QUESTIONS entry in order, prints the table and the
	    summary, and optionally writes the raw rows to the ``--json`` path.

	    Returns:
	        1 when at least one row errored or has a FAIL verdict, else 0.
	    """
	    cli = argparse.ArgumentParser(
	        description="Quality-check the graph-driven argument pipeline."
	    )
	    cli.add_argument(
	        "--base-url",
	        default="http://127.0.0.1:8000",
	        help="tau-rag server base URL",
	    )
	    cli.add_argument(
	        "--timeout",
	        type=int,
	        default=30,
	        help="per-request timeout in seconds",
	    )
	    cli.add_argument(
	        "--json",
	        default=None,
	        help="if set, also write the full result rows to this JSON file",
	    )
	    cli.add_argument(
	        "--via",
	        choices=("lawyer", "hgraph"),
	        default="lawyer",
	        help="which endpoint to evaluate: lawyer/ask (default, full path) "
	             "or hgraph/argument (bundle-only, fast on large corpora)",
	    )
	    opts = cli.parse_args()

	    print(f"# Running {len(QUESTIONS)} questions against {opts.base_url} "
	          f"via {opts.via}")
	    # Questions are evaluated sequentially, in declaration order.
	    rows: List[Dict[str, Any]] = [
	        evaluate(opts.base_url, question, opts.timeout, via=opts.via)
	        for question in QUESTIONS
	    ]

	    print_table(rows)
	    print_summary(rows)

	    if opts.json:
	        # ensure_ascii=False keeps the Hebrew text readable in the file.
	        with open(opts.json, "w", encoding="utf-8") as sink:
	            json.dump(rows, sink, ensure_ascii=False, indent=2)
	        print(f"\nFull results written to: {opts.json}")

	    # Exit non-zero if any FAIL or ERR
	    any_bad = any(
	        (not row.get("ok")) or row.get("verdict") == "FAIL"
	        for row in rows
	    )
	    return 1 if any_bad else 0


	if __name__ == "__main__":
	    # Hand main()'s status code to the shell when run as a script.
	    sys.exit(main())