"""auto_labeler.py — Weak-supervision auto-labeling of Hebrew legal paragraphs.

Uses citation patterns + linguistic markers in the text to produce noisy
training labels at scale. Replaces manual labeling for the *bulk* corpus —
manual review then becomes a small validation sample, not the bottleneck.

Signals extracted (all detected from the paragraph text alone):

  • Statute citations:  סעיף N לחוק X, תקנה N לתקנות Y
  • Case citations:     ע"א 1234/19, בג"צ 5678/00, רע"א 1111/15, etc.
  • Side markers:       התובע/המערער/העותר טוען / הנתבע/המשיב/הנאשם גורס
  • Court verbs:        אני קובע / אני סבור / נקבע / לדעתי / לעמדתי
  • Acceptance markers: אכן / מקובל עלי / יש לקבל / נכון / הצדק עם
  • Rejection markers:  אין לקבל / איני מקבל / נדחה / אין מקום
  • Equity markers:     נסיבות / צדק / הוגן / מן הצדק
  • Procedural markers: תקנה / סדר דין / סדרי דין
  • Policy markers:     אינטרס הציבור / מדיניות / שיקולי

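Each label dimension gets an independent confidence score. The aggregated
record looks like the example below. Note that `AutoLabeler.label()` itself
does not set "id" or "text", and it additionally sets "auto_labeled": True: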

    {
        "id": "<case_id>::<para_idx>",
        "text": "...",
        "is_argument": True,
        "is_argument_confidence": 0.85,
        "outcome": "accepted",
        "outcome_confidence": 0.72,
        "side": "plaintiff",
        "side_confidence": 0.66,
        "arg_type": "legal",
        "arg_type_confidence": 0.81,
        "signals": {
            "n_statute_citations": 1,
            "n_case_citations": 0,
            "has_acceptance": True,
            "has_rejection": False,
            "has_court_voice": True,
            ...
        },
        "overall_confidence": 0.74,
    }

Quality target: >85% precision on labels above confidence threshold of 0.6.
The classifier trained on these labels will denoise + generalize.
"""
from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple


# ============================================================================
# Regex patterns — Hebrew legal citations
# ============================================================================

# Statute citation: "סעיף N לחוק X" or "סעיפים 39-40 לחוק"
#   group(1) — the section number or range, with an optional "(letter)" suffix
#   group(2) — the kind of instrument that follows (חוק / חוק יסוד / תקנות /
#              פקודת), or None when no such keyword directly follows
# The rest of the match is up to 80 characters of trailing context on the
# same line (typically the name of the law).
#
# The trailing context stops right before the next "<citation word> <digit>"
# so that two citations in one sentence ("סעיף 12 לחוק א וסעיף 39 לחוק ב")
# yield two matches instead of the first one swallowing the second.
_STATUTE_HEAD = r"(?:הוראות?\s+סעיף|סעיפים|סעיף|תקנות|תקנה)"
STATUTE_PATTERN = re.compile(
    _STATUTE_HEAD
    + r"\s*"
    + r"(\d+(?:[-–]\d+)?(?:\([א-ת]\))?)\s*"
    # "חוק יסוד" must be tried before the bare "חוק", otherwise it can never win.
    + r"(?:ל?(חוק\s+יסוד|חוק|תקנות|פקודת)?"
    + r"(?:(?!" + _STATUTE_HEAD + r"\s*\d)[^\n]){0,80})?"
)

# Case citations — Israeli court reference codes followed by "number/year",
# e.g. ע"א 1234/19, בג"צ 5678/00, רע"א 1111/15.
#
# There is deliberately no leading \b: Hebrew prefix letters
# (ב, ל, ה, ו, מ, ש, כ) attach directly to the abbreviation, so
# "בע\"א 1234/19" ("in CA 1234/19") must still match. The pattern anchors on
# the abbreviation itself: stem letters, a quotation mark, and a final letter.
_CASE_QUOTE = r"[\"'״]"          # straight double quote, apostrophe, or gershayim
_CASE_CODES = (
    ("ע", "[אא]"),               # ע"א
    ("בג", "[צץ]"),              # בג"צ
    ("ע", "[פפ]"),               # ע"פ
    ("רע", "[אא]"),              # רע"א
    ("רע", "[פפ]"),              # רע"פ
    ("בש", "[אא]"),              # בש"א
    ("ת", "[אא]"),               # ת"א
    ("ה", "פ"),                  # ה"פ
    ("דנ", "[אאפ]"),             # דנ"א, דנ"פ
    ("בר", "[עצ]"),              # בר"ע, בר"צ
    ("בע", "[אא]"),              # בע"א (less common)
)
CASE_PATTERN = re.compile(
    "((?:"
    + "|".join(stem + _CASE_QUOTE + final for stem, final in _CASE_CODES)
    + r")\s*\d{1,5}/\d{2,4})"    # optional space, then e.g. 1234/19
)

# ============================================================================
# Hebrew linguistic markers — voice + outcome
# ============================================================================

# Party designations, keyed by the side label used in the output records.
# "plaintiff" groups the initiating party (plaintiff / appellant / petitioner
# forms); "defendant" groups the responding party (defendant / respondent /
# accused forms). Every entry carries the definite article "ה".
# NOTE(review): several entries are prefixes of longer entries in the same
# list (e.g. "התובע" is a prefix of "התובעת" and "התובעים"), so summing
# independent per-term substring counts counts such words more than once.
PARTY_PATTERNS = {
    "plaintiff": [
        "התובע", "התובעת", "התובעים", "המערער", "המערערת",
        "המערערים", "העותר", "העותרת", "העותרים",
    ],
    "defendant": [
        "הנתבע", "הנתבעת", "הנתבעים", "המשיב", "המשיבה",
        "המשיבים", "הנאשם", "הנאשמת", "הנאשמים",
    ],
}

# Verbs that signal "this party makes a claim" (claim-attribution)
# NOTE(review): some entries are substrings of first-person court-voice
# markers in COURT_VOICE_MARKERS — "לדעת" / "לעמדת" are prefixes of
# "לדעתי" / "לעמדתי", and "סבור" occurs inside "אני סבור" and "סבורני" —
# so a plain substring test cannot tell the two voices apart on its own.
# Short stems such as "מסר" can likewise occur inside unrelated words.
CLAIM_VERBS = [
    "טוען", "טוענת", "טוענים", "גורס", "גורסת", "גורסים",
    "סבור", "סבורה", "סבורים", "מבקש", "מבקשת", "מבקשים",
    "טען", "טענה", "טענו", "התריע", "טען בכתב",
    "לטענת", "לעמדת", "לדעת", "לפי טענת", "כטענת",
    "מסר", "הצהיר", "הצהירה", "טוענים כי",
]

# Verbs/phrases that signal court speaking (court voice)
# Mostly first-person judicial phrasing ("I find", "in my opinion") and
# impersonal "one should ..." constructions.
# NOTE(review): "בית המשפט" ("the court") is listed as well, so any mention
# of a court — presumably including a party referring to a lower court —
# counts as court voice; confirm this is intended.
COURT_VOICE_MARKERS = [
    "אני סבור", "סבורני", "אני מוצא", "אני קובע", "לדעתי",
    "לעמדתי", "אני מקבל", "אני דוחה", "לא ניתן לקבל",
    "יש לקבוע", "יש לראות", "יש לקבל", "יש לדחות",
    "מקובל עלי", "אינני מקבל", "איני מקבל",
    "נראה לי", "מן הראוי", "אני בדעה",
    "בית המשפט", "לעניות דעתי",
]

# Court accepts a claim
# Order matters: AutoLabeler.extract_signals records the first entry of this
# list that occurs in the paragraph as the acceptance marker.
# NOTE(review): "צדק" is also listed in EQUITY_MARKERS, so any mention of
# "justice" counts as acceptance; "כדין" is contained in "שלא כדין"
# (the opposite meaning); "צודקת" can never be the recorded marker because
# the earlier entry "צודק" is a prefix of it.
ACCEPTANCE_MARKERS = [
    "אכן", "מקובל עלי", "מקובלת עלי", "יש לקבל",
    "התביעה מתקבלת", "הערעור מתקבל", "הצדק עם",
    "אני מקבל", "המבקש זכאי", "התובע זכאי",
    "צודק", "צודקת", "צדק", "אני בדעה כי",
    "נכונה הטענה", "טענה זו צודקת", "כדין",
    "לטענה זו יש בסיס", "הטענה התקבלה",
    "ראוי לקבל", "אין מנוס מקבלת",
]

# Court rejects a claim
# Order matters: AutoLabeler.extract_signals records the first entry of this
# list that occurs in the paragraph as the rejection marker. Longer
# "אין מקום ..." phrases are therefore listed before the bare "אין מקום".
# NOTE(review): "כשל" is contained in the later entries "נכשל" / "כשלה" /
# "נכשלה" (which can thus never be the recorded marker) and is also a prefix
# of the common word "כשלעצמו", which a plain substring test will match.
REJECTION_MARKERS = [
    "אין לקבל", "איני מקבל", "אינני מקבל", "אין מקום לקבל",
    "הטענה נדחית", "טענה זו נדחית", "התביעה נדחית",
    "הערעור נדחה", "הבקשה נדחית", "אין בסיס",
    "אין יסוד", "אין ממש", "אין כל ממש",
    "לא הוכח", "לא הוכחה", "לא עלה בידי",
    "כשל", "נכשל", "כשלה", "נכשלה",
    "אין לראות", "אין מקום לראות", "אין מקום",
    "טענה זו אין בה ממש", "לא נמצא",
]

# Argument-type heuristics
# Each list feeds one boolean signal (has_equity / has_procedural /
# has_policy); ARGUMENT_MARKER_GENERIC adds a small bonus to the
# is-argument score in AutoLabeler.decide_is_argument.

# Fairness / good-faith vocabulary.
# NOTE(review): "צדק" also appears in ACCEPTANCE_MARKERS, and "תום לב" is
# contained in "חוסר תום לב", so good faith and bad faith are not told apart.
EQUITY_MARKERS = [
    "צדק", "הוגן", "הגינות", "תום לב", "חוסר תום לב",
    "נסיבות העניין", "נסיבות המקרה", "מן הצדק",
    "שיקולי צדק", "תוצאה הוגנת",
]
# Procedure / jurisdiction vocabulary.
# NOTE(review): "תקנה" and "תקנות" are also the words STATUTE_PATTERN accepts
# at the head of a regulation citation, so citing any regulation sets the
# procedural signal too.
PROCEDURAL_MARKERS = [
    "תקנה", "תקנות", "סדר דין", "סדרי דין", "פרוצדור",
    "טכני", "פגם פרוצדורלי", "סמכות",
]
# Public-interest / policy vocabulary.
POLICY_MARKERS = [
    "אינטרס הציבור", "מדיניות", "תקנת הציבור",
    "שיקולי מדיניות", "הרתעה", "ודאות מסחרית",
    "אינטרס ציבורי", "השלכות רוחב",
]
# Generic "someone is arguing" vocabulary; "לטעמ" is a truncated stem so that
# suffixed forms match as substrings.
ARGUMENT_MARKER_GENERIC = [
    "טוען", "טענה", "סבור", "גורס", "טענות", "לטעמ",
    "לעמדת", "נטען",
]


# ============================================================================
# Auto-labeler
# ============================================================================

@dataclass
class LabelSignals:
    """Container for everything the detectors found in one paragraph.

    Kept separate from the final labels so that a label can always be traced
    back to the evidence that produced it.
    """
    # Citation evidence: counts plus the matched text itself.
    n_statute_citations: int = 0
    n_case_citations: int = 0
    statute_matches: List[str] = field(default_factory=list)
    case_matches: List[str] = field(default_factory=list)

    # How often each side's party designations occur.
    plaintiff_mentions: int = 0
    defendant_mentions: int = 0

    # Who is speaking: a party making a claim and/or the court itself.
    has_claim_verb: bool = False
    has_court_voice: bool = False

    # Outcome evidence and the concrete marker phrase that triggered it.
    has_acceptance: bool = False
    has_rejection: bool = False
    acceptance_marker: str = ""
    rejection_marker: str = ""

    # Argument-type evidence.
    has_equity: bool = False
    has_procedural: bool = False
    has_policy: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the signals.

        Scalar fields are copied as-is; the two match lists are truncated to
        their first five entries.
        """
        leading_keys = (
            "n_statute_citations",
            "n_case_citations",
            "plaintiff_mentions",
            "defendant_mentions",
            "has_claim_verb",
            "has_court_voice",
            "has_acceptance",
            "has_rejection",
            "has_equity",
            "has_procedural",
            "has_policy",
        )
        payload: Dict[str, Any] = {key: getattr(self, key) for key in leading_keys}
        payload["statute_matches"] = self.statute_matches[:5]
        payload["case_matches"] = self.case_matches[:5]
        payload["acceptance_marker"] = self.acceptance_marker
        payload["rejection_marker"] = self.rejection_marker
        return payload


class AutoLabeler:
    """Rule-based auto-labeler for Hebrew legal paragraphs.

    ``extract_signals`` scans a paragraph for citation patterns and marker
    phrases; the ``decide_*`` methods turn those signals into four labels
    (is_argument, outcome, side, arg_type), each with its own heuristic
    confidence; ``label`` assembles the final record.

    The labels are noisy by design (precision target: >85% on the
    strong-confidence subset). Use the output to train a classifier that
    generalizes + denoises.
    """

    def __init__(self, min_paragraph_len: int = 60):
        """Create a labeler.

        Args:
            min_paragraph_len: Paragraphs shorter than this many characters
                (after stripping whitespace) are not labeled at all.
        """
        self.min_len = min_paragraph_len

    # -------------------------------------------------------- helpers
    @staticmethod
    def _count_mentions(text: str, terms: List[str]) -> int:
        """Count non-overlapping occurrences of any of *terms* in *text*.

        Longer terms are tried first, so a word such as "התובעת" is counted
        once, not once for itself and once more for its prefix "התובע".
        """
        candidates = sorted((t for t in terms if t), key=len, reverse=True)
        if not candidates:
            # An empty alternation would match at every position.
            return 0
        pattern = "|".join(re.escape(t) for t in candidates)
        return len(re.findall(pattern, text))

    @staticmethod
    def _without_markers(text: str, markers: List[str]) -> str:
        """Return *text* with every occurrence of each marker blanked out."""
        for marker in markers:
            if marker:
                text = text.replace(marker, " ")
        return text

    # -------------------------------------------------------- signal extraction
    def extract_signals(self, paragraph: str) -> LabelSignals:
        """Run all detectors on the paragraph.

        Args:
            paragraph: The paragraph text to scan.

        Returns:
            A ``LabelSignals`` holding citation matches, party-mention counts
            and the boolean marker flags.
        """
        sig = LabelSignals()
        text = paragraph

        # Citations (statute matches are clipped to 60 chars for readability)
        for m in STATUTE_PATTERN.finditer(text):
            sig.statute_matches.append(m.group(0)[:60])
        sig.n_statute_citations = len(sig.statute_matches)

        for m in CASE_PATTERN.finditer(text):
            sig.case_matches.append(m.group(0))
        sig.n_case_citations = len(sig.case_matches)

        # Party mentions (count occurrences for weighting). Counted without
        # overlap: feminine/plural forms extend the masculine form, and
        # per-term str.count would count each of them twice.
        sig.plaintiff_mentions = self._count_mentions(text, PARTY_PATTERNS["plaintiff"])
        sig.defendant_mentions = self._count_mentions(text, PARTY_PATTERNS["defendant"])

        # Voice. Court-voice phrases are blanked out before looking for claim
        # verbs: "לדעתי" contains the claim marker "לדעת" and "אני סבור"
        # contains "סבור", so the court speaking in the first person would
        # otherwise always be read as a party making a claim as well.
        sig.has_court_voice = any(v in text for v in COURT_VOICE_MARKERS)
        party_text = self._without_markers(text, COURT_VOICE_MARKERS)
        sig.has_claim_verb = any(v in party_text for v in CLAIM_VERBS)

        # Outcome — record the first marker (in list order) for transparency
        for marker in ACCEPTANCE_MARKERS:
            if marker in text:
                sig.has_acceptance = True
                sig.acceptance_marker = marker
                break
        for marker in REJECTION_MARKERS:
            if marker in text:
                sig.has_rejection = True
                sig.rejection_marker = marker
                break

        # Argument type signals
        sig.has_equity = any(m in text for m in EQUITY_MARKERS)
        sig.has_procedural = any(m in text for m in PROCEDURAL_MARKERS)
        sig.has_policy = any(m in text for m in POLICY_MARKERS)

        return sig

    # -------------------------------------------------------- decision rules
    def decide_is_argument(self, sig: LabelSignals, text: str) -> Tuple[bool, float]:
        """Decide if a paragraph contains a legal argument, with confidence.

        A "legal argument" can take many shapes — claim by a party, court's
        analysis, citation-supported reasoning, equity/policy reasoning.
        We sum independent signals; any 2-3 of them indicates an argument.

        Args:
            sig: Signals extracted from the paragraph.
            text: The paragraph text (used for generic markers and length).

        Returns:
            ``(is_argument, confidence)``. The paragraph is an argument when
            the length-scaled score reaches 0.40; the confidence is that
            score capped at 0.95.

        NOTE(review): because the confidence is the raw score, a negative
        decision always carries a confidence below 0.40, so "not an argument"
        labels can never pass a 0.6 confidence filter. Confirm downstream
        code treats this value as "argument-ness" rather than certainty.
        """
        score = 0.0
        # Citations — strong, near-binary signal of legal reasoning
        if sig.n_statute_citations >= 1:
            score += 0.35
        if sig.n_case_citations >= 1:
            score += 0.35
        # Claim attribution
        if sig.has_claim_verb:
            score += 0.30
        # Court is reasoning (not just narrating)
        if sig.has_court_voice:
            score += 0.30
        # Outcome verbs — strong signal even alone (court is ruling)
        if sig.has_acceptance or sig.has_rejection:
            score += 0.25
        # Doctrinal categories — alone they hint, combined they confirm
        if sig.has_equity:
            score += 0.15
        if sig.has_procedural:
            score += 0.20
        if sig.has_policy:
            score += 0.18
        # Generic arg verbs
        if any(m in text for m in ARGUMENT_MARKER_GENERIC):
            score += 0.10
        # Length scaling: short paragraphs are discounted, long ones get a
        # small boost
        if len(text) < 100:
            score *= 0.55
        elif len(text) > 250:
            score *= 1.05

        is_arg = score >= 0.40
        confidence = min(0.95, score)
        return is_arg, confidence

    def decide_outcome(self, sig: LabelSignals) -> Tuple[str, float]:
        """Decide if the paragraph reports an accepted/rejected claim.

        Court voice raises the confidence of an outcome marker; with neither
        court voice nor any marker the outcome is unknown.

        Args:
            sig: Signals extracted from the paragraph.

        Returns:
            ``(outcome, confidence)`` where outcome is one of "accepted",
            "rejected", "partial" or "unknown".
        """
        if not sig.has_court_voice and not (sig.has_acceptance or sig.has_rejection):
            return "unknown", 0.30

        if sig.has_acceptance and sig.has_rejection:
            # Both — paragraph likely discusses both sides; lower confidence
            return "partial", 0.55

        if sig.has_acceptance:
            conf = 0.75 if sig.has_court_voice else 0.55
            return "accepted", conf

        if sig.has_rejection:
            conf = 0.78 if sig.has_court_voice else 0.55
            return "rejected", conf

        # Court is speaking but no clear marker — unknown
        return "unknown", 0.40

    def decide_side(self, sig: LabelSignals, text: str) -> Tuple[str, float]:
        """Determine whose argument this paragraph is about.

        Strategy, in order:
          • Court voice plus an outcome marker → the party mentioned more
            often near that marker, if one clearly dominates
          • Court voice without any party claim verb → "court"
          • Only one side mentioned at all → that side
          • One side mentioned more than 1.5x as often (and more than once)
            → that side, at lower confidence
          • Else → unknown

        Args:
            sig: Signals extracted from the paragraph.
            text: The paragraph text (searched around the outcome marker).

        Returns:
            ``(side, confidence)`` where side is one of "plaintiff",
            "defendant", "court" or "unknown".
        """
        p, d = sig.plaintiff_mentions, sig.defendant_mentions

        if sig.has_court_voice and (sig.has_acceptance or sig.has_rejection):
            # Court is ruling on a side's claim. Find which party is closer
            # to the acceptance/rejection marker (acceptance takes
            # precedence when both are present).
            marker = sig.acceptance_marker or sig.rejection_marker
            if marker:
                # Look at 200 chars either side of where the marker starts
                idx = text.find(marker)
                if idx >= 0:
                    window = text[max(0, idx - 200): idx + 200]
                    p_near = self._count_mentions(window, PARTY_PATTERNS["plaintiff"])
                    d_near = self._count_mentions(window, PARTY_PATTERNS["defendant"])
                    if p_near > d_near and p_near > 0:
                        return "plaintiff", 0.70
                    if d_near > p_near and d_near > 0:
                        return "defendant", 0.70

        if sig.has_court_voice and not sig.has_claim_verb:
            return "court", 0.65

        if p > 0 and d == 0:
            return "plaintiff", 0.70
        if d > 0 and p == 0:
            return "defendant", 0.70
        if p > d * 1.5 and p > 1:
            return "plaintiff", 0.55
        if d > p * 1.5 and d > 1:
            return "defendant", 0.55

        return "unknown", 0.35

    def decide_arg_type(self, sig: LabelSignals) -> Tuple[str, float]:
        """Argument type (legal/factual/procedural/policy/equitable).

        Priority — strongest objective signals first:
          1. Procedural (specific tag, rare and clear)
          2. Legal — when the paragraph cites at least one statute or case,
             this beats softer markers like equity. A paragraph reasoning
             from a statute is a "legal" argument even if it also mentions
             fairness considerations.
          3. Policy — explicit policy/public-interest language
          4. Equity — fairness without legal anchor
          5. Factual — claim with no doctrinal anchor

        Args:
            sig: Signals extracted from the paragraph.

        Returns:
            ``(arg_type, confidence)``; "unknown" when no rule applies.
        """
        if sig.has_procedural:
            return "procedural", 0.70
        if sig.n_statute_citations >= 1 or sig.n_case_citations >= 1:
            return "legal", 0.78
        if sig.has_policy:
            return "policy", 0.65
        if sig.has_equity:
            return "equitable", 0.62
        if sig.has_claim_verb:
            return "factual", 0.55
        return "unknown", 0.30

    # -------------------------------------------------------- main entry
    def label(self, paragraph: str) -> Optional[Dict[str, Any]]:
        """Auto-label a single paragraph.

        Args:
            paragraph: Paragraph text; None or empty is treated as "".

        Returns:
            None if the stripped text is shorter than ``min_len``; otherwise
            a dict with the four labels, their confidences, the raw signals,
            an overall confidence and ``"auto_labeled": True``.
        """
        text = (paragraph or "").strip()
        if len(text) < self.min_len:
            return None

        sig = self.extract_signals(text)

        is_arg, is_arg_conf = self.decide_is_argument(sig, text)
        outcome, outcome_conf = self.decide_outcome(sig)
        side, side_conf = self.decide_side(sig, text)
        arg_type, arg_type_conf = self.decide_arg_type(sig)

        # If not an argument, the other labels are noise — set to unknown
        if not is_arg:
            outcome = "unknown"
            outcome_conf = 0.0
            side = "unknown"
            side_conf = 0.0
            arg_type = "unknown"
            arg_type_conf = 0.0

        # Overall confidence — arithmetic mean of the component confidences.
        # For non-arguments only the is_argument confidence takes part, so
        # the zeroed-out dimensions do not drag it down.
        components = [is_arg_conf]
        if is_arg:
            components.extend([outcome_conf, side_conf, arg_type_conf])
        overall = sum(components) / len(components)

        return {
            "is_argument": is_arg,
            "is_argument_confidence": round(is_arg_conf, 3),
            "outcome": outcome,
            "outcome_confidence": round(outcome_conf, 3),
            "side": side,
            "side_confidence": round(side_conf, 3),
            "arg_type": arg_type,
            "arg_type_confidence": round(arg_type_conf, 3),
            "signals": sig.to_dict(),
            "overall_confidence": round(overall, 3),
            "auto_labeled": True,
        }


# ============================================================================
# Citation extraction (utility for downstream — not core to labeling)
# ============================================================================

def extract_statute_citations(text: str) -> List[str]:
    """Return every statute citation found in *text*, whitespace-trimmed.

    A falsy *text* (empty string or None) yields an empty list.
    """
    found: List[str] = []
    for hit in STATUTE_PATTERN.finditer(text or ""):
        found.append(hit.group(0).strip())
    return found


def extract_case_citations(text: str) -> List[str]:
    """Return every case citation found in *text*, whitespace-trimmed.

    A falsy *text* (empty string or None) yields an empty list.
    """
    found: List[str] = []
    for hit in CASE_PATTERN.finditer(text or ""):
        found.append(hit.group(0).strip())
    return found


# Names exported by a star-import of this module: the labeler and its signal
# container, the two citation-extraction helpers, and the compiled citation
# regexes. The marker word lists are not exported.
__all__ = [
    "AutoLabeler",
    "LabelSignals",
    "extract_statute_citations",
    "extract_case_citations",
    "STATUTE_PATTERN",
    "CASE_PATTERN",
]