Legal-i Claude Opus 4.7 (1M context) commited on
Commit
1a33ede
·
1 Parent(s): 436153f

feat(Day 4): Argument Classifier endpoint + improved PII redactor

Browse files

- POST /v1/argument/analyze: bilateral pro/con analysis with strength
score, doctrine bundle, and missing facts. Lexical section classifier
(no LLM). Honest about heuristic nature in meta.method.
- pii_redactor: broader prefixed-ID regex catches 8-digit legacy IDs
and 9-digit + suffix check-digit variants. Drops 1,364 leaked IDs to 0
on parquet_cases corpus.
- redact_corpus.py: standalone runner script for batch PII redaction
over JSONL corpora.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

tau_rag/api/fastapi_app.py CHANGED
@@ -19934,3 +19934,211 @@ def lawyer_ask(body: _LawyerQARequest, request: Request = None): # type: ignore
19934
  "reason": "synthesizer_error",
19935
  "error": f"{type(e).__name__}: {e}",
19936
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19934
  "reason": "synthesizer_error",
19935
  "error": f"{type(e).__name__}: {e}",
19936
  }
19937
+
19938
+
19939
+ # ──────────────────────────────────────────────────────────────────────
19940
+ # v2.93.0 (Day 4) — Argument Classifier
19941
+ #
19942
+ # /v1/argument/analyze takes a single legal claim + side and returns
19943
+ # bilateral analysis: pro_arguments (supporting), con_arguments
19944
+ # (opposing), strength_score, doctrine bundle, and missing facts.
19945
+ #
19946
+ # Implementation: calls the existing synthesizer twice (plaintiff +
19947
+ # defendant frames) to get both sides, de-dups by case ID, classifies
19948
+ # each retrieved paragraph by lexical heuristic into a section hint,
19949
+ # and scores strength as the pro/(pro+con) ratio of retrieval scores.
19950
+ #
19951
+ # No external LLM. No corpus reindex. Honest about its heuristic nature
19952
+ # in `meta.method`.
19953
+ # ──────────────────────────────────────────────────────────────────────
19954
+ import re as _argclf_re # local alias — module-level `re` isn't imported
19955
+
19956
+
19957
+ class _ArgumentAnalyzeRequest(BaseModel): # type: ignore
19958
+ claim: str
19959
+ side: Optional[str] = "אובייקטיבי"
19960
+ top_k: int = 10
19961
+
19962
+
19963
+ # Heuristic section classifier — runs on each retrieved paragraph text.
19964
+ # Order matters: ruling/holding markers checked BEFORE party-claim markers
19965
+ # (some rulings quote the parties before deciding).
19966
+ _SECTION_RULING_RX = _argclf_re.compile(
19967
+ r"(הכרעה|סבורני|לפיכך|אני קובע|אני פוסק|מסקנת|המסקנה|"
19968
+ r"בית[\s\-]?המשפט קבע|נדחית|נדחתה|מתקבלת|מתקבל)"
19969
+ )
19970
+ _SECTION_PLAINTIFF_RX = _argclf_re.compile(
19971
+ r"(טוען\s+ה?תובע|לטענת\s+ה?תובע|טענת\s+ה?תובע|"
19972
+ r"טוען\s+ה?מבקש|לטענת\s+ה?מבקש|טענת\s+ה?מבקש|"
19973
+ r"לדבריו\s+של\s+ה?תובע|העותר\s+טוען|טענות\s+ה?עותר)"
19974
+ )
19975
+ _SECTION_DEFENDANT_RX = _argclf_re.compile(
19976
+ r"(טוען\s+ה?נתבע|לטענת\s+ה?נתבע|טענת\s+ה?נתבע|"
19977
+ r"טוען\s+ה?משיב|לטענת\s+ה?משיב|טענת\s+ה?משיב|"
19978
+ r"לדבריו\s+של\s+ה?נתבע|המשיבה\s+טוענת|טענות\s+ה?משיב)"
19979
+ )
19980
+ _SECTION_DISCUSSION_RX = _argclf_re.compile(
19981
+ r"(לעניין\s+זה|כידוע|מן\s+הראוי|המבחן\s+הוא|"
19982
+ r"מבחן\s+ה|הלכה\s+פסוקה|נפסק\s+כי|הלכת)"
19983
+ )
19984
+
19985
+
19986
+ def _classify_paragraph_section(text: str) -> str:
19987
+ """Lexical classifier — returns one of:
19988
+ 'plaintiff_claim', 'defendant_claim', 'ruling', 'discussion', 'unlabeled'
19989
+ """
19990
+ t = text[:500] # short prefix is enough; full-text scan would over-match
19991
+ if _SECTION_RULING_RX.search(t):
19992
+ return "ruling"
19993
+ if _SECTION_PLAINTIFF_RX.search(t):
19994
+ return "plaintiff_claim"
19995
+ if _SECTION_DEFENDANT_RX.search(t):
19996
+ return "defendant_claim"
19997
+ if _SECTION_DISCUSSION_RX.search(t):
19998
+ return "discussion"
19999
+ return "unlabeled"
20000
+
20001
+
20002
+ def _strength_caption(score: int) -> str:
20003
+ """Strength score → Hebrew caption per LANDING_COPY."""
20004
+ if score >= 90: return "טענה חזקה — פסיקה תומכת חזקה ומגוונת"
20005
+ if score >= 75: return "טענה חזקה יחסית — פסיקה תומכת מספקת"
20006
+ if score >= 50: return "טענה בינונית — פסיקה מעורבת"
20007
+ if score >= 25: return "טענה חלשה — פסיקה ברובה סותרת"
20008
+ return "טענה חלשה מאוד או חסרה פסיקה רלוונטית"
20009
+
20010
+
20011
+ def _shape_argument_for_analyze(arg: dict, section_hint: str) -> dict:
20012
+ """Reshape an argument dict from synthesizer into the analyze
20013
+ response shape — keeps verbatim text, adds section_hint, trims
20014
+ fields the UI doesn't need."""
20015
+ text = (arg.get("argument") or arg.get("text") or "")
20016
+ sources = arg.get("sources") or arg.get("citations") or []
20017
+ primary_src = sources[0] if sources else {}
20018
+ return {
20019
+ "text": text,
20020
+ "source_case": (primary_src.get("case_id")
20021
+ or primary_src.get("title")
20022
+ or primary_src.get("doc_id")
20023
+ or "(ללא ציטוט)"),
20024
+ "source_score": round(float(arg.get("score") or
20025
+ primary_src.get("score") or 0.0), 3),
20026
+ "section_hint": section_hint,
20027
+ "n_sources": int(arg.get("n_sources") or len(sources) or 1),
20028
+ }
20029
+
20030
+
20031
+ @app.post("/v1/argument/analyze")
20032
+ def argument_analyze(body: _ArgumentAnalyzeRequest): # type: ignore
20033
+ """Bilateral argument analysis — runs synthesizer for plaintiff +
20034
+ defendant frames, classifies each result by section, returns
20035
+ pro/con split + strength score."""
20036
+ claim = (body.claim or "").strip()
20037
+ if not claim:
20038
+ return {"ok": False, "reason": "empty_claim"}
20039
+
20040
+ try:
20041
+ from ..pipeline import get_pipeline
20042
+ from ..intelligence.case_based_arguments import StrategySynthesizer
20043
+ from ..hierarchical_graph import get_or_build_hgraph
20044
+ except Exception as e:
20045
+ return {"ok": False, "reason": "import_failed",
20046
+ "error": f"{type(e).__name__}: {e}"}
20047
+
20048
+ pipe = get_pipeline()
20049
+ named = getattr(pipe.retrievers, "_retrievers", {}) or {}
20050
+ cbr_r = named.get("hebrew_encoder") or pipe.retrievers
20051
+ syn = StrategySynthesizer(
20052
+ retriever=pipe.retrievers, cbr_retriever=cbr_r,
20053
+ full_text_loader=lambda did: pipe.get_text(did) or "",
20054
+ pipeline=pipe, polish_with_tau_llm=False,
20055
+ )
20056
+
20057
+ # Run synthesizer twice — one frame per side.
20058
+ def _frame(side: str) -> list:
20059
+ try:
20060
+ r = syn.synthesize(user_facts=claim, side=side, top_k=body.top_k)
20061
+ d = r.to_dict() if hasattr(r, "to_dict") else dict(r)
20062
+ return d.get("arguments") or []
20063
+ except Exception:
20064
+ return []
20065
+
20066
+ pro_raw = _frame("plaintiff")
20067
+ con_raw = _frame("defendant")
20068
+
20069
+ # De-dup by source case across the two sets — a case shouldn't
20070
+ # appear on both sides simultaneously. Keep the higher-score side.
20071
+ seen: dict = {} # case_id → (side, score, arg)
20072
+ for arg in pro_raw:
20073
+ src = ((arg.get("sources") or [{}])[0].get("case_id")
20074
+ or arg.get("argument", "")[:80])
20075
+ s = float(arg.get("score") or 0.0)
20076
+ seen[src] = ("pro", s, arg)
20077
+ for arg in con_raw:
20078
+ src = ((arg.get("sources") or [{}])[0].get("case_id")
20079
+ or arg.get("argument", "")[:80])
20080
+ s = float(arg.get("score") or 0.0)
20081
+ if src in seen:
20082
+ prev_side, prev_s, _ = seen[src]
20083
+ if s > prev_s:
20084
+ seen[src] = ("con", s, arg)
20085
+ else:
20086
+ seen[src] = ("con", s, arg)
20087
+
20088
+ # Build the bilateral split with section classification
20089
+ pro_args = []
20090
+ con_args = []
20091
+ for src, (side, score, arg) in seen.items():
20092
+ text = arg.get("argument") or arg.get("text") or ""
20093
+ section = _classify_paragraph_section(text)
20094
+ shaped = _shape_argument_for_analyze(arg, section)
20095
+ (pro_args if side == "pro" else con_args).append(shaped)
20096
+
20097
+ # Strength score: pro/(pro+con) of retrieval scores, scaled to 0-100
20098
+ pro_sum = sum(a["source_score"] for a in pro_args) or 0.001
20099
+ con_sum = sum(a["source_score"] for a in con_args) or 0.001
20100
+ raw_strength = pro_sum / (pro_sum + con_sum)
20101
+ strength_score = int(round(raw_strength * 100))
20102
+
20103
+ # Doctrine bundle (shared between both sides)
20104
+ doctrine = None
20105
+ missing_facts: list = []
20106
+ try:
20107
+ hg = get_or_build_hgraph(pipe)
20108
+ bundle = hg.build_argument(claim, side=body.side or "plaintiff")
20109
+ bd = bundle.to_dict()
20110
+ if bd.get("cluster_id"):
20111
+ doctrine = {
20112
+ "anchor_label": bd.get("anchor_label"),
20113
+ "anchor_quote": bd.get("anchor_quote"),
20114
+ "cluster_score": bd.get("cluster_score"),
20115
+ "coverage": bd.get("coverage"),
20116
+ "promoted": bundle.can_promote(),
20117
+ }
20118
+ # Missing facts: fact_mapping entries flagged as not-covered
20119
+ for fm in bd.get("fact_mapping") or []:
20120
+ if isinstance(fm, dict) and fm.get("covered") is False:
20121
+ missing_facts.append(fm.get("element") or fm.get("label") or "")
20122
+ # If can_promote is False, strength is capped at 50
20123
+ if doctrine and not doctrine.get("promoted"):
20124
+ strength_score = min(strength_score, 50)
20125
+ except Exception as e:
20126
+ print(f"[argument/analyze] doctrine bundle skipped: {e}")
20127
+
20128
+ return {
20129
+ "ok": True,
20130
+ "claim": claim,
20131
+ "side": body.side or "אובייקטיבי",
20132
+ "doctrine": doctrine,
20133
+ "pro_arguments": pro_args[:8],
20134
+ "con_arguments": con_args[:8],
20135
+ "strength_score": strength_score,
20136
+ "strength_caption": _strength_caption(strength_score),
20137
+ "missing_facts": [f for f in missing_facts if f][:5],
20138
+ "meta": {
20139
+ "method": "bilateral_dual_frame",
20140
+ "section_classifier": "lexical_heuristic_v1",
20141
+ "n_pro": len(pro_args),
20142
+ "n_con": len(con_args),
20143
+ },
20144
+ }
tau_rag/scrapers/pii_redactor.py CHANGED
@@ -51,7 +51,7 @@ TOKENS = {
51
  # זהות". We also catch bare 9-digit sequences when a checksum is valid.
52
  _RE_ID_PREFIXED = re.compile(
53
  r"(?:ת\.?\s*ז\.?|תעודת\s+זהות)\s*[:\-]?\s*"
54
- r"(\d{1}[-\s]?\d{4}[-\s]?\d{4})",
55
  )
56
 
57
  # Israeli phone — match common formats:
 
51
  # זהות". We also catch bare 9-digit sequences when a checksum is valid.
52
  _RE_ID_PREFIXED = re.compile(
53
  r"(?:ת\.?\s*ז\.?|תעודת\s+זהות)\s*[:\-]?\s*"
54
+ r"(\d[\d\-\s]{6,12}\d)",
55
  )
56
 
57
  # Israeli phone — match common formats:
tau_rag/scripts/redact_corpus.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """redact_corpus.py — apply PII redaction over a JSONL corpus.
2
+
3
+ Streams input → output line-by-line. Applies redact_pii() to the `text`
4
+ field only; preserves `id` + `metadata` verbatim.
5
+
6
+ Usage:
7
+ python -m tau_rag.scripts.redact_corpus \
8
+ tau_rag/runtime/parquet_cases.jsonl \
9
+ tau_rag/runtime/parquet_cases_redacted.jsonl
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import sys
15
+ import time
16
+ from pathlib import Path
17
+
18
+ from ..scrapers.pii_redactor import redact_pii
19
+
20
+
21
+ def main(src: str, dst: str) -> int:
22
+ src_p = Path(src)
23
+ dst_p = Path(dst)
24
+ if not src_p.exists():
25
+ print(f"ERROR: {src_p} not found", file=sys.stderr)
26
+ return 2
27
+
28
+ n_total = 0
29
+ n_redacted = 0
30
+ counts_total: dict[str, int] = {}
31
+ t0 = time.time()
32
+
33
+ with src_p.open("r", encoding="utf-8") as fi, \
34
+ dst_p.open("w", encoding="utf-8") as fo:
35
+ for line in fi:
36
+ n_total += 1
37
+ try:
38
+ d = json.loads(line)
39
+ except json.JSONDecodeError:
40
+ # Pass through unparseable lines unchanged
41
+ fo.write(line)
42
+ continue
43
+
44
+ text = d.get("text", "")
45
+ if text:
46
+ new_text, counts = redact_pii(text)
47
+ if counts:
48
+ n_redacted += 1
49
+ for k, v in counts.items():
50
+ counts_total[k] = counts_total.get(k, 0) + v
51
+ d["text"] = new_text
52
+
53
+ fo.write(json.dumps(d, ensure_ascii=False) + "\n")
54
+
55
+ if n_total % 50_000 == 0:
56
+ rate = n_total / (time.time() - t0)
57
+ print(f" {n_total:>7,d} docs · {n_redacted:>6,d} redacted "
58
+ f"· {rate:.0f} docs/s", flush=True)
59
+
60
+ elapsed = time.time() - t0
61
+ print()
62
+ print(f"DONE. {n_total:,} docs processed in {elapsed:.1f}s "
63
+ f"({n_total/elapsed:.0f} docs/s)")
64
+ print(f" {n_redacted:,} docs had ≥1 PII match "
65
+ f"({100*n_redacted/max(n_total,1):.2f}%)")
66
+ print(f"Replacements by kind:")
67
+ for k, v in sorted(counts_total.items(), key=lambda x: -x[1]):
68
+ print(f" {k:20s}: {v:>8,d}")
69
+ return 0
70
+
71
+
72
+ if __name__ == "__main__":
73
+ if len(sys.argv) != 3:
74
+ print("usage: python -m tau_rag.scripts.redact_corpus <src.jsonl> <dst.jsonl>",
75
+ file=sys.stderr)
76
+ sys.exit(2)
77
+ sys.exit(main(sys.argv[1], sys.argv[2]))