"""auto_labeler.py — Weak-supervision auto-labeling of Hebrew legal paragraphs. Uses citation patterns + linguistic markers in the text to produce noisy training labels at scale. Replaces manual labeling for the *bulk* corpus — manual review then becomes a small validation sample, not the bottleneck. Signals extracted (all detected from the paragraph text alone): • Statute citations: סעיף N לחוק X, תקנה N לתקנות Y • Case citations: ע"א 1234/19, בג"צ 5678/00, רע"א 1111/15, etc. • Side markers: התובע/המערער/העותר טוען / הנתבע/המשיב/הנאשם גורס • Court verbs: אני קובע / אני סבור / נקבע / לדעתי / לעמדתי • Acceptance markers: אכן / מקובל עלי / יש לקבל / נכון / הצדק עם • Rejection markers: אין לקבל / איני מקבל / נדחה / אין מקום • Equity markers: נסיבות / צדק / הוגן / מן הצדק • Procedural markers: תקנה / סדר דין / סדרי דין • Policy markers: אינטרס הציבור / מדיניות / שיקולי Each label dimension gets an independent confidence score. The aggregated record looks like: { "id": "::", "text": "...", "is_argument": True, "is_argument_confidence": 0.85, "outcome": "accepted", "outcome_confidence": 0.72, "side": "plaintiff", "side_confidence": 0.66, "arg_type": "legal", "arg_type_confidence": 0.81, "signals": { "n_statute_citations": 1, "n_case_citations": 0, "has_acceptance_marker": True, "has_rejection_marker": False, "has_court_voice": True, ... }, "overall_confidence": 0.74, } Quality target: >85% precision on labels above confidence threshold of 0.6. The classifier trained on these labels will denoise + generalize. """ from __future__ import annotations import re from dataclasses import dataclass, field from typing import Any, Dict, List, Optional, Tuple # ============================================================================ # Regex patterns — Hebrew legal citations # ============================================================================ # Statute citation: "סעיף N לחוק X" or "סעיפים 39-40 לחוק" # Captures the section number(s) and the law name STATUTE_PATTERN = re.compile( r"(?:סעיף|סעיפים|הוראות?\s+סעיף|תקנה|תקנות)\s*" r"(\d+(?:[-–]\d+)?(?:\([א-ת]\))?)\s*" r"(?:ל?(חוק|תקנות|פקודת|חוק\s+יסוד)?[^\n]{0,80})?" ) # Case citation patterns — Israeli court reference codes # ע"א, בג"צ, ע"פ, רע"א, רע"פ, בש"א, ת"א, etc. # NOTE: NO leading \b — Hebrew prefix letters (ב, ל, ה, ו, מ, ש, כ) are # attached directly: "בע\"א 1234/19" means "in case CA 1234/19". # The pattern matches the canonical court abbreviation by anchoring on the # specific 2-letter combos followed by quotation mark + final letter. CASE_PATTERN = re.compile( r"(" r"(?:ע[\"'״][אא]|" # ע"א r"בג[\"'״][צץ]|" # בג"צ r"ע[\"'״][פפ]|" # ע"פ r"רע[\"'״][אא]|" # רע"א r"רע[\"'״][פפ]|" # רע"פ r"בש[\"'״][אא]|" # בש"א r"ת[\"'״][אא]|" # ת"א r"ה[\"'״]פ|" # ה"פ r"דנ[\"'״][אאפ]|" # דנ"א, דנ"פ r"בר[\"'״][עצ]|" # בר"ע, בר"צ r"בע[\"'״][אא]" # בע"א (less common) r")\s*" r"\d{1,5}/\d{2,4}" # 1234/19 r")" ) # ============================================================================ # Hebrew linguistic markers — voice + outcome # ============================================================================ PARTY_PATTERNS = { "plaintiff": [ "התובע", "התובעת", "התובעים", "המערער", "המערערת", "המערערים", "העותר", "העותרת", "העותרים", ], "defendant": [ "הנתבע", "הנתבעת", "הנתבעים", "המשיב", "המשיבה", "המשיבים", "הנאשם", "הנאשמת", "הנאשמים", ], } # Verbs that signal "this party makes a claim" (claim-attribution) CLAIM_VERBS = [ "טוען", "טוענת", "טוענים", "גורס", "גורסת", "גורסים", "סבור", "סבורה", "סבורים", "מבקש", "מבקשת", "מבקשים", "טען", "טענה", "טענו", "התריע", "טען בכתב", "לטענת", "לעמדת", "לדעת", "לפי טענת", "כטענת", "מסר", "הצהיר", "הצהירה", "טוענים כי", ] # Verbs/phrases that signal court speaking (court voice) COURT_VOICE_MARKERS = [ "אני סבור", "סבורני", "אני מוצא", "אני קובע", "לדעתי", "לעמדתי", "אני מקבל", "אני דוחה", "לא ניתן לקבל", "יש לקבוע", "יש לראות", "יש לקבל", "יש לדחות", "מקובל עלי", "אינני מקבל", "איני מקבל", "נראה לי", "מן הראוי", "אני בדעה", "בית המשפט", "לעניות דעתי", ] # Court accepts a claim ACCEPTANCE_MARKERS = [ "אכן", "מקובל עלי", "מקובלת עלי", "יש לקבל", "התביעה מתקבלת", "הערעור מתקבל", "הצדק עם", "אני מקבל", "המבקש זכאי", "התובע זכאי", "צודק", "צודקת", "צדק", "אני בדעה כי", "נכונה הטענה", "טענה זו צודקת", "כדין", "לטענה זו יש בסיס", "הטענה התקבלה", "ראוי לקבל", "אין מנוס מקבלת", ] # Court rejects a claim REJECTION_MARKERS = [ "אין לקבל", "איני מקבל", "אינני מקבל", "אין מקום לקבל", "הטענה נדחית", "טענה זו נדחית", "התביעה נדחית", "הערעור נדחה", "הבקשה נדחית", "אין בסיס", "אין יסוד", "אין ממש", "אין כל ממש", "לא הוכח", "לא הוכחה", "לא עלה בידי", "כשל", "נכשל", "כשלה", "נכשלה", "אין לראות", "אין מקום לראות", "אין מקום", "טענה זו אין בה ממש", "לא נמצא", ] # Argument-type heuristics EQUITY_MARKERS = [ "צדק", "הוגן", "הגינות", "תום לב", "חוסר תום לב", "נסיבות העניין", "נסיבות המקרה", "מן הצדק", "שיקולי צדק", "תוצאה הוגנת", ] PROCEDURAL_MARKERS = [ "תקנה", "תקנות", "סדר דין", "סדרי דין", "פרוצדור", "טכני", "פגם פרוצדורלי", "סמכות", ] POLICY_MARKERS = [ "אינטרס הציבור", "מדיניות", "תקנת הציבור", "שיקולי מדיניות", "הרתעה", "ודאות מסחרית", "אינטרס ציבורי", "השלכות רוחב", ] ARGUMENT_MARKER_GENERIC = [ "טוען", "טענה", "סבור", "גורס", "טענות", "לטעמ", "לעמדת", "נטען", ] # ============================================================================ # Auto-labeler # ============================================================================ @dataclass class LabelSignals: """Raw extracted signals for transparency / debugging.""" n_statute_citations: int = 0 n_case_citations: int = 0 statute_matches: List[str] = field(default_factory=list) case_matches: List[str] = field(default_factory=list) plaintiff_mentions: int = 0 defendant_mentions: int = 0 has_claim_verb: bool = False has_court_voice: bool = False has_acceptance: bool = False has_rejection: bool = False acceptance_marker: str = "" rejection_marker: str = "" has_equity: bool = False has_procedural: bool = False has_policy: bool = False def to_dict(self) -> Dict[str, Any]: return { "n_statute_citations": self.n_statute_citations, "n_case_citations": self.n_case_citations, "plaintiff_mentions": self.plaintiff_mentions, "defendant_mentions": self.defendant_mentions, "has_claim_verb": self.has_claim_verb, "has_court_voice": self.has_court_voice, "has_acceptance": self.has_acceptance, "has_rejection": self.has_rejection, "has_equity": self.has_equity, "has_procedural": self.has_procedural, "has_policy": self.has_policy, "statute_matches": self.statute_matches[:5], "case_matches": self.case_matches[:5], "acceptance_marker": self.acceptance_marker, "rejection_marker": self.rejection_marker, } class AutoLabeler: """Rule-based auto-labeler for Hebrew legal paragraphs. Produces noisy labels at high precision (>85% on the strong-confidence subset). Use the output to train a classifier that generalizes + denoises. """ def __init__(self, min_paragraph_len: int = 60): self.min_len = min_paragraph_len # -------------------------------------------------------- signal extraction def extract_signals(self, paragraph: str) -> LabelSignals: """Run all detectors on the paragraph.""" sig = LabelSignals() text = paragraph # Citations for m in STATUTE_PATTERN.finditer(text): sig.statute_matches.append(m.group(0)[:60]) sig.n_statute_citations = len(sig.statute_matches) for m in CASE_PATTERN.finditer(text): sig.case_matches.append(m.group(0)) sig.n_case_citations = len(sig.case_matches) # Party mentions (count occurrences for weighting) for term in PARTY_PATTERNS["plaintiff"]: sig.plaintiff_mentions += text.count(term) for term in PARTY_PATTERNS["defendant"]: sig.defendant_mentions += text.count(term) # Voice sig.has_claim_verb = any(v in text for v in CLAIM_VERBS) sig.has_court_voice = any(v in text for v in COURT_VOICE_MARKERS) # Outcome — find the actual marker for transparency for marker in ACCEPTANCE_MARKERS: if marker in text: sig.has_acceptance = True sig.acceptance_marker = marker break for marker in REJECTION_MARKERS: if marker in text: sig.has_rejection = True sig.rejection_marker = marker break # Argument type signals sig.has_equity = any(m in text for m in EQUITY_MARKERS) sig.has_procedural = any(m in text for m in PROCEDURAL_MARKERS) sig.has_policy = any(m in text for m in POLICY_MARKERS) return sig # -------------------------------------------------------- decision rules def decide_is_argument(self, sig: LabelSignals, text: str) -> Tuple[bool, float]: """Decide if a paragraph contains a legal argument, with confidence. A "legal argument" can take many shapes — claim by a party, court's analysis, citation-supported reasoning, equity/policy reasoning. We sum independent signals; any 2-3 of them indicates an argument. """ score = 0.0 # Citations — strong, near-binary signal of legal reasoning if sig.n_statute_citations >= 1: score += 0.35 if sig.n_case_citations >= 1: score += 0.35 # Claim attribution if sig.has_claim_verb: score += 0.30 # Court is reasoning (not just narrating) if sig.has_court_voice: score += 0.30 # Outcome verbs — strong signal even alone (court is ruling) if sig.has_acceptance or sig.has_rejection: score += 0.25 # Doctrinal categories — alone they hint, combined they confirm if sig.has_equity: score += 0.15 if sig.has_procedural: score += 0.20 if sig.has_policy: score += 0.18 # Generic arg verbs if any(m in text for m in ARGUMENT_MARKER_GENERIC): score += 0.10 # Length scaling if len(text) < 100: score *= 0.55 elif len(text) > 250: score *= 1.05 is_arg = score >= 0.40 confidence = min(0.95, score) return is_arg, confidence def decide_outcome(self, sig: LabelSignals) -> Tuple[str, float]: """Decide if the paragraph reports an accepted/rejected claim. Only meaningful when court voice is detected. """ if not sig.has_court_voice and not (sig.has_acceptance or sig.has_rejection): return "unknown", 0.30 if sig.has_acceptance and sig.has_rejection: # Both — paragraph likely discusses both sides; lower confidence return "partial", 0.55 if sig.has_acceptance: conf = 0.75 if sig.has_court_voice else 0.55 return "accepted", conf if sig.has_rejection: conf = 0.78 if sig.has_court_voice else 0.55 return "rejected", conf # Court is speaking but no clear marker — unknown return "unknown", 0.40 def decide_side(self, sig: LabelSignals, text: str) -> Tuple[str, float]: """Determine whose argument this paragraph is about. Strategy: • If court is speaking and citing one party's claim → side = that party • If only one party is mentioned with claim verbs → that party • If both parties mentioned heavily → "court" (it's a ruling discussion) • Else → unknown """ p, d = sig.plaintiff_mentions, sig.defendant_mentions if sig.has_court_voice and (sig.has_acceptance or sig.has_rejection): # Court is ruling on a side's claim. Find which party is closer # to the acceptance/rejection marker. marker = sig.acceptance_marker or sig.rejection_marker if marker: # Look at 200 chars around the marker for party mentions idx = text.find(marker) if idx >= 0: window = text[max(0, idx - 200): idx + 200] p_near = sum(window.count(t) for t in PARTY_PATTERNS["plaintiff"]) d_near = sum(window.count(t) for t in PARTY_PATTERNS["defendant"]) if p_near > d_near and p_near > 0: return "plaintiff", 0.70 if d_near > p_near and d_near > 0: return "defendant", 0.70 if sig.has_court_voice and not sig.has_claim_verb: return "court", 0.65 if p > 0 and d == 0: return "plaintiff", 0.70 if d > 0 and p == 0: return "defendant", 0.70 if p > d * 1.5 and p > 1: return "plaintiff", 0.55 if d > p * 1.5 and d > 1: return "defendant", 0.55 return "unknown", 0.35 def decide_arg_type(self, sig: LabelSignals) -> Tuple[str, float]: """Argument type (legal/factual/procedural/policy/equitable). Priority — strongest objective signals first: 1. Procedural (specific tag, rare and clear) 2. Legal — when the paragraph cites at least one statute or case, this beats softer markers like equity. A paragraph reasoning from a statute is a "legal" argument even if it also mentions fairness considerations. 3. Policy — explicit policy/public-interest language 4. Equity — fairness without legal anchor 5. Factual — claim with no doctrinal anchor """ if sig.has_procedural: return "procedural", 0.70 if sig.n_statute_citations >= 1 or sig.n_case_citations >= 1: return "legal", 0.78 if sig.has_policy: return "policy", 0.65 if sig.has_equity: return "equitable", 0.62 if sig.has_claim_verb: return "factual", 0.55 return "unknown", 0.30 # -------------------------------------------------------- main entry def label(self, paragraph: str) -> Optional[Dict[str, Any]]: """Auto-label a single paragraph. Returns None if too short to label.""" text = (paragraph or "").strip() if len(text) < self.min_len: return None sig = self.extract_signals(text) is_arg, is_arg_conf = self.decide_is_argument(sig, text) outcome, outcome_conf = self.decide_outcome(sig) side, side_conf = self.decide_side(sig, text) arg_type, arg_type_conf = self.decide_arg_type(sig) # If not an argument, the other labels are noise — set to unknown if not is_arg: outcome = "unknown" outcome_conf = 0.0 side = "unknown" side_conf = 0.0 arg_type = "unknown" arg_type_conf = 0.0 # Overall confidence — geometric mean of component confidences # (penalizes any single weak dimension) comp = [is_arg_conf] if is_arg: comp.extend([outcome_conf, side_conf, arg_type_conf]) overall = ( (sum(c for c in comp) / max(1, len(comp))) # arithmetic mean (robust) if comp else 0.0 ) return { "is_argument": is_arg, "is_argument_confidence": round(is_arg_conf, 3), "outcome": outcome, "outcome_confidence": round(outcome_conf, 3), "side": side, "side_confidence": round(side_conf, 3), "arg_type": arg_type, "arg_type_confidence": round(arg_type_conf, 3), "signals": sig.to_dict(), "overall_confidence": round(overall, 3), "auto_labeled": True, } # ============================================================================ # Citation extraction (utility for downstream — not core to labeling) # ============================================================================ def extract_statute_citations(text: str) -> List[str]: return [m.group(0).strip() for m in STATUTE_PATTERN.finditer(text or "")] def extract_case_citations(text: str) -> List[str]: return [m.group(0).strip() for m in CASE_PATTERN.finditer(text or "")] __all__ = [ "AutoLabeler", "LabelSignals", "extract_statute_citations", "extract_case_citations", "STATUTE_PATTERN", "CASE_PATTERN", ]