Spaces:

Legal-i
/

legal-eye

Running

Legal-i Claude Opus 4.7 (1M context) committed 29 days ago

Commit

1a33ede

1 Parent(s): 436153f

feat(Day 4): Argument Classifier endpoint + improved PII redactor

- POST /v1/argument/analyze: bilateral pro/con analysis with strength
score, doctrine bundle, and missing facts. Lexical section classifier
(no LLM). Honest about heuristic nature in meta.method.
- pii_redactor: broader prefixed-ID regex catches 8-digit legacy IDs
and 9-digit + suffix check-digit variants. Drops 1,364 leaked IDs to 0
on parquet_cases corpus.
- redact_corpus.py: standalone runner script for batch PII redaction
over JSONL corpora.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (3) hide show

tau_rag/api/fastapi_app.py +208 -0
tau_rag/scrapers/pii_redactor.py +1 -1
tau_rag/scripts/redact_corpus.py +77 -0

tau_rag/api/fastapi_app.py CHANGED Viewed

@@ -19934,3 +19934,211 @@ def lawyer_ask(body: _LawyerQARequest, request: Request = None):  # type: ignore
             "reason": "synthesizer_error",
             "error": f"{type(e).__name__}: {e}",
         }

             "reason": "synthesizer_error",
             "error": f"{type(e).__name__}: {e}",
         }
+# ──────────────────────────────────────────────────────────────────────
+# v2.93.0 (Day 4) — Argument Classifier
+#
+# /v1/argument/analyze takes a single legal claim + side and returns
+# bilateral analysis: pro_arguments (supporting), con_arguments
+# (opposing), strength_score, doctrine bundle, and missing facts.
+#
+# Implementation: calls the existing synthesizer twice (plaintiff +
+# defendant frames) to get both sides, de-dups by case ID, classifies
+# each retrieved paragraph by lexical heuristic into a section hint,
+# and scores strength as the pro/(pro+con) ratio of retrieval scores.
+#
+# No external LLM. No corpus reindex. The response is honest about its
+# heuristic nature: see `meta.method` and `meta.section_classifier`.
+# ──────────────────────────────────────────────────────────────────────
+import re as _argclf_re  # local alias — module-level `re` isn't imported
+# Request body for POST /v1/argument/analyze.
+class _ArgumentAnalyzeRequest(BaseModel):  # type: ignore
+    # The legal claim to analyze. The endpoint strips it and answers with
+    # reason "empty_claim" when nothing is left.
+    claim: str
+    # Requested perspective. It is echoed back in the response and forwarded
+    # to the doctrine-bundle builder; the pro/con retrieval always runs both
+    # the "plaintiff" and the "defendant" frame regardless of this value.
+    # The default literal is Hebrew for "objective".
+    side: Optional[str] = "אובייקטיבי"
+    # Forwarded to the synthesizer as `top_k` for each of the two frames.
+    top_k: int = 10
+# Heuristic section classifier — runs on each retrieved paragraph text.
+# Order matters: ruling/holding markers checked BEFORE party-claim markers
+# (some rulings quote the parties before deciding).
+#
+# All four patterns are unanchored and are applied with `.search()`, so a
+# marker anywhere in the scanned text counts as a hit.
+#
+# Ruling markers, roughly: "decision", "I am of the opinion", "therefore",
+# "I determine", "I rule", "conclusion", "the court held", and the
+# "dismissed" / "accepted" verb forms.
+# NOTE(review): "לפיכך" ("therefore") is a general connective — presumably it
+# also appears outside operative rulings; confirm the false-positive rate.
+_SECTION_RULING_RX = _argclf_re.compile(
+    r"(הכרעה|סבורני|לפיכך|אני קובע|אני פוסק|מסקנת|המסקנה|"
+    r"בית[\s\-]?המשפט קבע|נדחית|נדחתה|מתקבלת|מתקבל)"
+)
+# Plaintiff-side markers: "claims" / "according to the claim of" /
+# "the claim of" / "according to the words of", followed by plaintiff,
+# applicant or petitioner. `ה?` makes the Hebrew definite article optional.
+_SECTION_PLAINTIFF_RX = _argclf_re.compile(
+    r"(טוען\s+ה?תובע|לטענת\s+ה?תובע|טענת\s+ה?תובע|"
+    r"טוען\s+ה?מבקש|לטענת\s+ה?מבקש|טענת\s+ה?מבקש|"
+    r"לדבריו\s+של\s+ה?תובע|העותר\s+טוען|טענות\s+ה?עותר)"
+)
+# Defendant-side markers: the same verb frames as above, followed by
+# defendant or respondent (including one feminine form, "the respondent
+# claims").
+_SECTION_DEFENDANT_RX = _argclf_re.compile(
+    r"(טוען\s+ה?נתבע|לטענת\s+ה?נתבע|טענת\s+ה?נתבע|"
+    r"טוען\s+ה?משיב|לטענת\s+ה?משיב|טענת\s+ה?משיב|"
+    r"לדבריו\s+של\s+ה?נתבע|המשיבה\s+טוענת|טענות\s+ה?משיב)"
+)
+# Discussion markers, roughly: "in this regard", "as is known", "it is
+# appropriate", "the test is", "the test of the…", "settled case law",
+# "it was held that", "the rule of…".
+_SECTION_DISCUSSION_RX = _argclf_re.compile(
+    r"(לעניין\s+זה|כידוע|מן\s+הראוי|המבחן\s+הוא|"
+    r"מבחן\s+ה|הלכה\s+פסוקה|נפסק\s+כי|הלכת)"
+)
+def _classify_paragraph_section(text: str, max_chars: int = 500) -> str:
+    """Label a paragraph with a coarse section hint using lexical markers.
+
+    Only the first ``max_chars`` characters are scanned: a short prefix is
+    enough, and a full-text scan would over-match. The patterns are tried
+    in a fixed priority order — ruling, plaintiff claim, defendant claim,
+    discussion — and the first one that matches decides the label. Ruling
+    markers come first because rulings may quote the parties before
+    deciding.
+
+    Args:
+        text: Paragraph text. ``None``, an empty string, or any non-string
+            value is treated as carrying no markers.
+        max_chars: Length of the prefix that is scanned.
+
+    Returns:
+        One of 'ruling', 'plaintiff_claim', 'defendant_claim',
+        'discussion', 'unlabeled'.
+    """
+    # Guard first: slicing None (or another non-string) would raise.
+    if not isinstance(text, str) or not text:
+        return "unlabeled"
+    head = text[:max_chars]
+    prioritized = (
+        ("ruling", _SECTION_RULING_RX),
+        ("plaintiff_claim", _SECTION_PLAINTIFF_RX),
+        ("defendant_claim", _SECTION_DEFENDANT_RX),
+        ("discussion", _SECTION_DISCUSSION_RX),
+    )
+    for label, pattern in prioritized:
+        if pattern.search(head):
+            return label
+    return "unlabeled"
+def _strength_caption(score: int) -> str:
+    """Map a strength score to its Hebrew caption (per LANDING_COPY).
+
+    The bands are checked from the highest floor down; the first floor the
+    score reaches selects the caption. Anything below the lowest floor gets
+    the "very weak or no relevant case law" caption.
+    """
+    bands = (
+        (90, "טענה חזקה — פסיקה תומכת חזקה ומגוונת"),
+        (75, "טענה חזקה יחסית — פסיקה תומכת מספקת"),
+        (50, "טענה בינונית — פסיקה מעורבת"),
+        (25, "טענה חלשה — פסיקה ברובה סותרת"),
+    )
+    for floor, caption in bands:
+        if score >= floor:
+            return caption
+    return "טענה חלשה מאוד או חסרה פסיקה רלוונטית"
+def _shape_argument_for_analyze(arg: dict, section_hint: str) -> dict:
+    """Reshape a synthesizer argument dict into the analyze response shape.
+
+    Keeps the verbatim text, attaches ``section_hint``, and drops every
+    field the UI does not need.
+
+    Args:
+        arg: Argument dict from the synthesizer. Text is read from
+            ``argument`` or ``text``; sources from ``sources`` or
+            ``citations``.
+        section_hint: Label to attach as-is.
+
+    Returns:
+        A dict with the keys ``text``, ``source_case``, ``source_score``
+        (rounded to 3 decimals), ``section_hint`` and ``n_sources``.
+    """
+    text = arg.get("argument") or arg.get("text") or ""
+    sources = arg.get("sources") or arg.get("citations") or []
+    # Tolerate a single source given without a surrounding list.
+    if not isinstance(sources, (list, tuple)):
+        sources = [sources]
+    first = sources[0] if sources else None
+    # Only a dict source carries case_id/title/doc_id/score fields; calling
+    # .get() on a bare citation string would raise AttributeError.
+    primary_src = first if isinstance(first, dict) else {}
+    label = (primary_src.get("case_id")
+             or primary_src.get("title")
+             or primary_src.get("doc_id"))
+    if not label and isinstance(first, str) and first.strip():
+        # A bare citation string is itself the best available label.
+        label = first.strip()
+    try:
+        score = float(arg.get("score") or primary_src.get("score") or 0.0)
+    except (TypeError, ValueError):
+        score = 0.0  # non-numeric score: treat as "no score"
+    try:
+        n_sources = int(arg.get("n_sources") or len(sources) or 1)
+    except (TypeError, ValueError):
+        n_sources = len(sources) or 1
+    return {
+        "text": text,
+        "source_case": label or "(ללא ציטוט)",
+        "source_score": round(score, 3),
+        "section_hint": section_hint,
+        "n_sources": n_sources,
+    }
+def _argclf_key_and_score(arg: dict) -> tuple:
+    """Return ``(dedup_key, score)`` for one synthesizer argument dict.
+
+    The key is the first source's ``case_id`` when present, otherwise the
+    first 80 characters of the argument text. The score is the argument's
+    own ``score``, then the first source's ``score``, then 0.0.
+    """
+    sources = arg.get("sources") or []
+    first = sources[0] if isinstance(sources, (list, tuple)) and sources else None
+    if not isinstance(first, dict):
+        first = {}
+    key = first.get("case_id") or str(arg.get("argument") or arg.get("text") or "")[:80]
+    try:
+        score = float(arg.get("score") or first.get("score") or 0.0)
+    except (TypeError, ValueError):
+        score = 0.0
+    return key, score
+def _argclf_merge_sides(pro_raw: list, con_raw: list) -> dict:
+    """De-duplicate arguments across both frames, keyed by source case.
+
+    For every key the highest-scoring argument wins, whichever side it
+    came from; on a tie the earlier entry is kept (pro is scanned before
+    con). Returns ``{key: (side, score, arg)}`` in first-seen order.
+    """
+    best: dict = {}
+    for side, args in (("pro", pro_raw), ("con", con_raw)):
+        for arg in args:
+            if not isinstance(arg, dict):
+                continue
+            key, score = _argclf_key_and_score(arg)
+            prev = best.get(key)
+            if prev is None or score > prev[1]:
+                best[key] = (side, score, arg)
+    return best
+@app.post("/v1/argument/analyze")
+def argument_analyze(body: _ArgumentAnalyzeRequest):  # type: ignore
+    """Bilateral argument analysis for a single legal claim.
+
+    Runs the synthesizer once per frame ("plaintiff" and "defendant"),
+    de-duplicates the results by source case, classifies each kept
+    argument with the lexical section classifier, and derives a 0-100
+    strength score as the pro share of the summed retrieval scores.
+    A doctrine bundle and missing facts are attached on a best-effort
+    basis.
+
+    Returns:
+        On success a dict with ``ok: True`` plus ``claim``, ``side``,
+        ``doctrine``, ``pro_arguments`` / ``con_arguments`` (at most 8
+        each), ``strength_score``, ``strength_caption``, ``missing_facts``
+        (at most 5) and ``meta``. On failure ``{"ok": False, "reason": ...}``
+        with reason ``empty_claim``, ``import_failed`` or
+        ``pipeline_init_failed``.
+    """
+    claim = (body.claim or "").strip()
+    if not claim:
+        return {"ok": False, "reason": "empty_claim"}
+    try:
+        from ..pipeline import get_pipeline
+        from ..intelligence.case_based_arguments import StrategySynthesizer
+        from ..hierarchical_graph import get_or_build_hgraph
+    except Exception as e:
+        return {"ok": False, "reason": "import_failed",
+                "error": f"{type(e).__name__}: {e}"}
+    # Pipeline/synthesizer construction gets the same structured failure as
+    # the imports above instead of escaping as an unhandled exception.
+    try:
+        pipe = get_pipeline()
+        named = getattr(pipe.retrievers, "_retrievers", {}) or {}
+        cbr_r = named.get("hebrew_encoder") or pipe.retrievers
+        syn = StrategySynthesizer(
+            retriever=pipe.retrievers, cbr_retriever=cbr_r,
+            full_text_loader=lambda did: pipe.get_text(did) or "",
+            pipeline=pipe, polish_with_tau_llm=False,
+        )
+    except Exception as e:
+        return {"ok": False, "reason": "pipeline_init_failed",
+                "error": f"{type(e).__name__}: {e}"}
+    # Run synthesizer twice — one frame per side.
+    def _frame(side: str) -> list:
+        try:
+            r = syn.synthesize(user_facts=claim, side=side, top_k=body.top_k)
+            d = r.to_dict() if hasattr(r, "to_dict") else dict(r)
+            found = d.get("arguments")
+            return list(found) if isinstance(found, (list, tuple)) else []
+        except Exception as e:
+            # Best-effort: a failed frame yields no arguments, but the
+            # failure is logged rather than silently swallowed.
+            print(f"[argument/analyze] {side} frame failed: "
+                  f"{type(e).__name__}: {e}")
+            return []
+    pro_raw = _frame("plaintiff")
+    con_raw = _frame("defendant")
+    # De-dup by source case across the two sets — a case shouldn't
+    # appear on both sides simultaneously. Keep the higher-score entry.
+    merged = _argclf_merge_sides(pro_raw, con_raw)
+    # Build the bilateral split with section classification
+    pro_args = []
+    con_args = []
+    for side, _score, arg in merged.values():
+        text = arg.get("argument") or arg.get("text") or ""
+        section = _classify_paragraph_section(text)
+        shaped = _shape_argument_for_analyze(arg, section)
+        (pro_args if side == "pro" else con_args).append(shaped)
+    # Strength score: pro/(pro+con) of retrieval scores, scaled to 0-100.
+    if pro_args or con_args:
+        # Negative scores are clamped to 0 so the ratio stays within 0..1
+        # and the denominator can never reach zero.
+        pro_sum = sum(max(a["source_score"], 0.0) for a in pro_args) or 0.001
+        con_sum = sum(max(a["source_score"], 0.0) for a in con_args) or 0.001
+        strength_score = int(round(pro_sum / (pro_sum + con_sum) * 100))
+    else:
+        # Nothing retrieved on either side: report 0 ("no relevant case
+        # law") instead of an artificial 50 ("mixed case law").
+        strength_score = 0
+    # Doctrine bundle (shared between both sides)
+    doctrine = None
+    missing_facts: list = []
+    try:
+        hg = get_or_build_hgraph(pipe)
+        # NOTE(review): body.side defaults to a Hebrew label while the
+        # synthesizer frames use "plaintiff"/"defendant" — confirm that
+        # build_argument accepts both vocabularies.
+        bundle = hg.build_argument(claim, side=body.side or "plaintiff")
+        bd = bundle.to_dict()
+        if bd.get("cluster_id"):
+            doctrine = {
+                "anchor_label": bd.get("anchor_label"),
+                "anchor_quote": bd.get("anchor_quote"),
+                "cluster_score": bd.get("cluster_score"),
+                "coverage": bd.get("coverage"),
+                "promoted": bundle.can_promote(),
+            }
+        # Missing facts: fact_mapping entries flagged as not-covered
+        for fm in bd.get("fact_mapping") or []:
+            if isinstance(fm, dict) and fm.get("covered") is False:
+                missing_facts.append(fm.get("element") or fm.get("label") or "")
+        # If can_promote is False, strength is capped at 50
+        if doctrine and not doctrine.get("promoted"):
+            strength_score = min(strength_score, 50)
+    except Exception as e:
+        # Deliberately best-effort: the analysis is still returned
+        # without a doctrine bundle.
+        print(f"[argument/analyze] doctrine bundle skipped: {e}")
+    return {
+        "ok": True,
+        "claim": claim,
+        "side": body.side or "אובייקטיבי",
+        "doctrine": doctrine,
+        "pro_arguments": pro_args[:8],
+        "con_arguments": con_args[:8],
+        "strength_score": strength_score,
+        "strength_caption": _strength_caption(strength_score),
+        "missing_facts": [f for f in missing_facts if f][:5],
+        "meta": {
+            "method": "bilateral_dual_frame",
+            "section_classifier": "lexical_heuristic_v1",
+            # Counts are taken before the 8-per-side truncation above.
+            "n_pro": len(pro_args),
+            "n_con": len(con_args),
+        },
+    }

tau_rag/scrapers/pii_redactor.py CHANGED Viewed

@@ -51,7 +51,7 @@ TOKENS = {
 # זהות". We also catch bare 9-digit sequences when a checksum is valid.
 _RE_ID_PREFIXED = re.compile(
     r"(?:ת\.?\s*ז\.?|תעודת\s+זהות)\s*[:\-]?\s*"
-    r"(\d{1}[-\s]?\d{4}[-\s]?\d{4})",
 )
 # Israeli phone — match common formats:

 # זהות". We also catch bare 9-digit sequences when a checksum is valid.
 _RE_ID_PREFIXED = re.compile(
     r"(?:ת\.?\s*ז\.?|תעודת\s+זהות)\s*[:\-]?\s*"
+    # Captured group: a digit, then 6-12 characters that may each be a
+    # digit, hyphen or whitespace, then a closing digit — 8 to 14
+    # characters in total.
+    # NOTE(review): `\s` also matches line breaks, so the capture can run
+    # past the end of the line into digits that start the next one —
+    # confirm this over-redaction is acceptable for the corpus.
+    r"(\d[\d\-\s]{6,12}\d)",
 )
 # Israeli phone — match common formats:

tau_rag/scripts/redact_corpus.py ADDED Viewed

	@@ -0,0 +1,77 @@

+"""redact_corpus.py — apply PII redaction over a JSONL corpus.
+Streams input → output line-by-line. Applies redact_pii() to the `text`
+field only; all other fields (e.g. `id`, `metadata`) are carried over
+unchanged. Lines that are not valid JSON are copied through as-is.
+Usage:
+    python -m tau_rag.scripts.redact_corpus \
+        tau_rag/runtime/parquet_cases.jsonl \
+        tau_rag/runtime/parquet_cases_redacted.jsonl
+"""
+from __future__ import annotations
+import json
+import sys
+import time
+from pathlib import Path
+from ..scrapers.pii_redactor import redact_pii
+def main(src: str, dst: str) -> int:
+    """Redact PII from the `text` field of every record in a JSONL file.
+
+    The input is streamed line by line into ``dst``. Lines that are not
+    valid JSON, or whose JSON value is not an object, are copied through
+    unchanged. Progress is printed every 50,000 lines and a summary of
+    replacement counts is printed at the end.
+
+    Args:
+        src: Path of the input JSONL corpus.
+        dst: Path of the output JSONL file (created or overwritten).
+
+    Returns:
+        Process exit code: 0 on success, 2 when ``src`` does not exist or
+        ``dst`` refers to the same file as ``src``.
+    """
+    src_p = Path(src)
+    dst_p = Path(dst)
+    if not src_p.exists():
+        print(f"ERROR: {src_p} not found", file=sys.stderr)
+        return 2
+    # Opening dst in "w" mode truncates it immediately; if it is the same
+    # file as src the corpus would be wiped before a single line is read.
+    if dst_p.exists() and src_p.samefile(dst_p):
+        print(f"ERROR: {dst_p} is the input file; refusing to overwrite it",
+              file=sys.stderr)
+        return 2
+    n_total = 0
+    n_redacted = 0
+    counts_total: dict[str, int] = {}
+    # perf_counter is monotonic; the max(..., 1e-9) guards below keep the
+    # throughput figures from dividing by zero on a very fast run.
+    t0 = time.perf_counter()
+    with src_p.open("r", encoding="utf-8") as fi, \
+         dst_p.open("w", encoding="utf-8") as fo:
+        for line in fi:
+            n_total += 1
+            try:
+                d = json.loads(line)
+            except json.JSONDecodeError:
+                d = None
+            if not isinstance(d, dict):
+                # Pass through unparseable / non-object lines unchanged
+                fo.write(line)
+                continue
+            text = d.get("text", "")
+            # Only a non-empty string is handed to the redactor.
+            if text and isinstance(text, str):
+                new_text, counts = redact_pii(text)
+                if counts:
+                    n_redacted += 1
+                    for k, v in counts.items():
+                        counts_total[k] = counts_total.get(k, 0) + v
+                    d["text"] = new_text
+            fo.write(json.dumps(d, ensure_ascii=False) + "\n")
+            if n_total % 50_000 == 0:
+                rate = n_total / max(time.perf_counter() - t0, 1e-9)
+                print(f"  {n_total:>7,d} docs · {n_redacted:>6,d} redacted "
+                      f"· {rate:.0f} docs/s", flush=True)
+    elapsed = max(time.perf_counter() - t0, 1e-9)
+    print()
+    print(f"DONE. {n_total:,} docs processed in {elapsed:.1f}s "
+          f"({n_total/elapsed:.0f} docs/s)")
+    print(f"      {n_redacted:,} docs had ≥1 PII match "
+          f"({100*n_redacted/max(n_total,1):.2f}%)")
+    print("Replacements by kind:")
+    for k, v in sorted(counts_total.items(), key=lambda x: -x[1]):
+        print(f"  {k:20s}: {v:>8,d}")
+    return 0
+# Script entry point: expects exactly two positional arguments (source and
+# destination JSONL paths) and exits with main()'s return code.
+if __name__ == "__main__":
+    _cli_args = sys.argv[1:]
+    if len(_cli_args) == 2:
+        sys.exit(main(*_cli_args))
+    print("usage: python -m tau_rag.scripts.redact_corpus <src.jsonl> <dst.jsonl>",
+          file=sys.stderr)
+    sys.exit(2)