#!/usr/bin/env python3
"""
Scrape Israeli statutes from he.wikisource.org via the MediaWiki API.

Strategy:
  - For each law, fetch the raw wikitext via /w/api.php?action=parse&prop=wikitext
  - Israeli laws on Wikisource are structured with "== סעיף N - title ==" headers
  - Strip the wiki markup, then accumulate text under each heading as a separate doc
  - Output JSONL in the same format as our existing corpus

Run from any directory; the script writes to the same path as our other
corpora so start_server.sh picks it up automatically.

Usage:
  python3 tau_rag/scripts/scrape_israeli_statutes.py
  python3 tau_rag/scripts/scrape_israeli_statutes.py --laws "חוק_הגנת_הצרכן"
  python3 tau_rag/scripts/scrape_israeli_statutes.py --dry-run
"""
from __future__ import annotations

import argparse
import json
import re
import sys
import time
import urllib.parse
import urllib.request
from pathlib import Path

# ─────────────────────────────────────────────────────────────────────────
# 22 statutes covering the benchmark gaps (labor / consumer / tort / etc.)
# Format: (page_title_on_wikisource, short_law_code, full_name_for_metadata)
#   - page_title: sent as the `page` parameter of the API request and matched
#     against the values given to the --laws CLI option.
#   - short_law_code: used in document ids ("heb_law/<code>/section_<n>") and
#     in the "[<code>§<n>]" prefix tag of each document's text.
#   - full_name: stored in metadata["law"] and embedded in each document's text.
# ─────────────────────────────────────────────────────────────────────────
STATUTES = [
    # Labor (12)
    ("חוק_עבודת_נשים", "EN",
     "חוק עבודת נשים, התשי\"ד-1954"),
    ("חוק_שעות_עבודה_ומנוחה", "WHR",
     "חוק שעות עבודה ומנוחה, התשי\"א-1951"),
    ("חוק_שכר_מינימום", "MW",
     "חוק שכר מינימום, התשמ\"ז-1987"),
    ("חוק_פיצויי_פיטורים", "SP",
     "חוק פיצויי פיטורים, התשכ\"ג-1963"),
    ("חוק_הודעה_מוקדמת_לפיטורים_ולהתפטרות", "PN",
     "חוק הודעה מוקדמת לפיטורים ולהתפטרות, התשס\"א-2001"),
    ("חוק_חופשה_שנתית", "AL",
     "חוק חופשה שנתית, התשי\"א-1951"),
    ("חוק_דמי_מחלה", "SL",
     "חוק דמי מחלה, התשל\"ו-1976"),
    ("חוק_שכר_שווה_לעובדת_ולעובד", "EQ",
     "חוק שכר שווה לעובדת ולעובד, התשנ\"ו-1996"),
    ("חוק_עובדים_זרים", "FW",
     "חוק עובדים זרים, התשנ\"א-1991"),
    ("חוק_הסכמים_קיבוציים", "CB",
     "חוק הסכמים קיבוציים, התשי\"ז-1957"),
    ("חוק_הגנת_השכר", "WP",
     "חוק הגנת השכר, התשי\"ח-1958"),
    ("חוק_שוויון_ההזדמנויות_בעבודה", "EE",
     "חוק שוויון ההזדמנויות בעבודה, התשמ\"ח-1988"),
    # Consumer
    ("חוק_הגנת_הצרכן", "CP",
     "חוק הגנת הצרכן, התשמ\"א-1981"),
    # Tort
    ("פקודת_הנזיקין", "TT",
     "פקודת הנזיקין [נוסח חדש]"),
    ("חוק_פיצויים_לנפגעי_תאונות_דרכים", "RTA",
     "חוק פיצויים לנפגעי תאונות דרכים, התשל\"ה-1975"),
    # Defamation
    ("חוק_איסור_לשון_הרע", "DEF",
     "חוק איסור לשון הרע, התשכ\"ה-1965"),
    # Banking / consumer-finance
    ("חוק_הבנקאות_(שירות_ללקוח)", "BNK",
     "חוק הבנקאות (שירות ללקוח), התשמ\"א-1981"),
    # Insurance / contract specifics
    ("חוק_חוזה_הביטוח", "INS",
     "חוק חוזה הביטוח, התשמ\"א-1981"),
    # Family / succession
    ("חוק_הירושה", "INH",
     "חוק הירושה, התשכ\"ה-1965"),
    ("חוק_יחסי_ממון_בין_בני_זוג", "MR",
     "חוק יחסי ממון בין בני זוג, התשל\"ג-1973"),
    # Companies (commercial)
    ("חוק_החברות", "CO",
     "חוק החברות, התשנ\"ט-1999"),
    # Procedural — small but important for procedure questions
    ("חוק_בתי_המשפט", "CR",
     "חוק בתי המשפט [נוסח משולב], התשמ\"ד-1984"),
]

# MediaWiki API endpoint of Hebrew Wikisource; every request goes through it.
API = "https://he.wikisource.org/w/api.php"
# Sent as the User-Agent header on each API request.
USER_AGENT = "tau-rag-scraper/1.0 (research; contact: avribarzel@gmail.com)"


def fetch_wikitext(page_title: str) -> str | None:
    """Download the raw wikitext of one page through the MediaWiki API.

    Issues an ``action=parse`` / ``prop=wikitext`` request (following
    redirects) with a 30-second timeout.

    Args:
        page_title: Title of the page to request.

    Returns:
        The wikitext string, or ``None`` when the request fails, the API
        reports an error, or the response carries no wikitext. Failures are
        reported on stdout rather than raised.
    """
    query = urllib.parse.urlencode(
        {
            "action": "parse",
            "page": page_title,
            "prop": "wikitext",
            "format": "json",
            "redirects": "1",
        },
        safe=":/",
    )
    request = urllib.request.Request(
        f"{API}?{query}", headers={"User-Agent": USER_AGENT}
    )

    # Best-effort boundary: any network or decoding problem skips this page.
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except Exception as exc:
        print(f"   ⚠️  fetch failed: {exc}")
        return None

    if "error" in payload:
        info = payload['error'].get('info','?')
        print(f"   ⚠️  API error: {info}")
        return None

    # The wikitext sits under parse → wikitext → "*" in the JSON response.
    parsed = payload.get("parse", {})
    wikitext = parsed.get("wikitext", {})
    return wikitext.get("*")


# Wikitext cleanup helpers
TEMPLATE_RE = re.compile(r"\{\{[^{}]*\}\}", re.DOTALL)
LINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]")
TAG_RE = re.compile(r"<[^>]+>")
HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
BOLD_ITALIC = re.compile(r"'{2,}")


def clean_wiki(text: str) -> str:
    """Reduce wikitext to plain text while preserving its line structure.

    Removes, in order: HTML comments, ``{{templates}}``, ``[[link]]``
    brackets (keeping the display text), HTML-style tags and runs of two or
    more apostrophes. Each line then has its spaces and tabs collapsed and
    its ends trimmed; newlines are left in place.
    """

    def _link_label(match):
        # [[Page|display]] → display, [[Page]] → Page
        return match.group(2) or match.group(1)

    text = HTML_COMMENT_RE.sub("", text)

    # TEMPLATE_RE only matches innermost templates, so peel nested ones
    # layer by layer, giving up after 8 passes.
    for _ in range(8):
        text, removed = TEMPLATE_RE.subn("", text)
        if not removed:
            break

    text = LINK_RE.sub(_link_label, text)
    text = BOLD_ITALIC.sub("", TAG_RE.sub("", text))

    return "\n".join(
        re.sub(r"[ \t]+", " ", raw_line).strip()
        for raw_line in text.split("\n")
    )


# Match Wikisource section heading: "== סעיף 1 - הגדרות ==" or "==סעיף 5(א)=="
HEADING_RE = re.compile(
    r"^={2,}\s*סעיף\s*(\S+?)\s*(?:[-–—]\s*([^=]+?))?\s*={2,}\s*$"
)
# Any wiki heading line (chapter, part, "see also", ...). Used to keep
# headings that are not section headings out of section bodies.
ANY_HEADING_RE = re.compile(r"^={2,}[^=].*={2,}\s*$")
# Sections whose body is shorter than this many characters are dropped.
MIN_BODY_CHARS = 20


def _heading_level(line: str) -> int:
    """Return the number of leading '=' characters of a heading line."""
    return len(line) - len(line.lstrip("="))


def _build_section_doc(law_code, full_name, short_name, num, title, body_lines):
    """Build the corpus document for one section, or None if its body is a stub."""
    body = "\n".join(body_lines).strip()
    if len(body) < MIN_BODY_CHARS:
        return None

    # Build the same prefix-tag style as our existing corpus
    tags = [
        f"[{law_code}§{num}]",
        f"[סעיף {num} ל{short_name}]",
        f"[סעיף {num}]",
        f"[סעיף{num}]",
    ]
    title_part = f" — סעיף {num}"
    if title:
        tags.append(f"[{title}]")
        title_part += f" — {title}"
    prefix = " ".join(tags)

    return {
        "id": f"heb_law/{law_code}/section_{num}",
        "text": f"{prefix} {full_name}{title_part}: {body}",
        "metadata": {
            "law": full_name,
            "law_code": law_code,
            "section": num,
            "title": title,
            "verbatim": True,
            "source": "wikisource.org",
            "language": "he",
        },
    }


def split_sections(wikitext: str, law_code: str, full_name: str) -> list[dict]:
    """Split a statute's wikitext into one document per section.

    The wikitext is cleaned with ``clean_wiki`` and scanned line by line.
    A line matching ``HEADING_RE`` opens a new section; the non-blank lines
    that follow form its body. Text before the first section heading is
    ignored, and sections whose body is shorter than ``MIN_BODY_CHARS`` are
    dropped.

    Headings that are not section headings are handled by level:
      * same or higher level than the open section (e.g. a chapter
        heading): the section ends there, so the heading and the text
        under it are not attributed to the previous section;
      * deeper level (a sub-heading inside the section): its text is kept
        in the body without the ``=`` markup.

    Args:
        wikitext: Raw wikitext of the statute page.
        law_code: Short code used in document ids and prefix tags.
        full_name: Full statute name, stored in metadata and in the text.

    Returns:
        Documents in page order, each with ``id``, ``text`` and ``metadata``.

    NOTE(review): a section number that appears twice on a page yields two
    documents with the same id — confirm how the indexer treats that.
    """
    cleaned = clean_wiki(wikitext)
    # Name used in the "[סעיף N ל<law>]" tag: everything before the first
    # comma / opening parenthesis of the full name. Constant per law.
    short_name = full_name.split(",")[0].split("(")[0].strip()

    # Pass 1: collect (number, title, body_lines) per section heading.
    sections: list[tuple[str, str, list[str]]] = []
    current_body = None  # None while we are outside any section
    current_level = 0
    for line in cleaned.split("\n"):
        heading = HEADING_RE.match(line)
        if heading:
            current_body = []
            current_level = _heading_level(line)
            sections.append(
                (heading.group(1), (heading.group(2) or "").strip(), current_body)
            )
            continue
        if current_body is None or not line.strip():
            continue
        if ANY_HEADING_RE.match(line):
            if _heading_level(line) <= current_level:
                # Chapter / sibling heading: the current section is over.
                current_body = None
            else:
                # Sub-heading: keep its text, drop the '=' markup.
                label = line.strip("= ")
                if label:
                    current_body.append(label)
            continue
        current_body.append(line)

    # Pass 2: turn the collected sections into corpus documents.
    docs = []
    for num, title, body_lines in sections:
        doc = _build_section_doc(
            law_code, full_name, short_name, num, title, body_lines
        )
        if doc is not None:
            docs.append(doc)
    return docs


def main() -> int:
    """Command-line entry point: scrape the selected statutes and write JSONL.

    Options:
        --laws     Comma-separated page titles from ``STATUTES`` (default all).
                   Surrounding whitespace is ignored and spaces are treated
                   as underscores; unknown titles are reported and skipped.
        --out      Output path (default ``<parent of this script's dir>/
                   runtime/uploads/heb_statutes_extra.jsonl``).
        --dry-run  List what would be scraped; no requests, nothing written.
        --sleep    Seconds to wait between API requests.

    Returns:
        Process exit code: 0 on success or dry run; 1 when no requested
        statute is known or when nothing could be scraped. In the latter
        case an existing output file is left untouched.
    """
    from collections import Counter

    parser = argparse.ArgumentParser()
    parser.add_argument("--laws", default=None,
                        help="Comma-separated list of page titles (default: all 22)")
    parser.add_argument("--out", default=None,
                        help="Output JSONL path (default: tau_rag/runtime/uploads/heb_statutes_extra.jsonl)")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--sleep", type=float, default=1.0,
                        help="Seconds between requests (default 1.0)")
    args = parser.parse_args()

    laws = STATUTES
    if args.laws:
        # Tolerate "a, b" and titles typed with spaces instead of underscores.
        wanted = {
            title.strip().replace(" ", "_")
            for title in args.laws.split(",")
            if title.strip()
        }
        laws = [s for s in STATUTES if s[0] in wanted]
        if not laws:
            print(f"❌ No matching statutes for {wanted}")
            return 1
        unknown = wanted - {s[0] for s in laws}
        if unknown:
            print(f"⚠️  Ignoring unknown statutes: {sorted(unknown)}")

    here = Path(__file__).resolve().parent.parent
    out_path = Path(args.out) if args.out else (
        here / "runtime" / "uploads" / "heb_statutes_extra.jsonl")

    print(f"▸ Will scrape {len(laws)} statutes → {out_path}")
    print()

    delay = max(0.0, args.sleep)  # time.sleep rejects negative values
    all_docs = []
    for i, (page, code, full_name) in enumerate(laws, 1):
        print(f"[{i}/{len(laws)}] {page}")
        if args.dry_run:
            continue
        # Throttle before every request after the first, so a failed fetch
        # is not followed by an immediate retry against the API.
        if i > 1:
            time.sleep(delay)
        wt = fetch_wikitext(page)
        if not wt:
            print("   ✗ skipped (no wikitext)")
            continue
        docs = split_sections(wt, code, full_name)
        print(f"   ✓ {len(docs)} sections")
        all_docs.extend(docs)

    if args.dry_run:
        print(f"\nDry run — would have written to {out_path}")
        return 0

    if not all_docs:
        # Opening the file with "w" would truncate a previously good corpus.
        print(f"\n❌ No sections scraped — not overwriting {out_path}")
        return 1

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        for d in all_docs:
            f.write(json.dumps(d, ensure_ascii=False) + "\n")
    print(f"\n✅ Wrote {len(all_docs):,} sections → {out_path}")

    # Quick stats per law
    print("\nDocs per law:")
    per_law = Counter(d["metadata"]["law_code"] for d in all_docs)
    for code, n in per_law.most_common():
        print(f"  {n:>3}  {code}")
    return 0


if __name__ == "__main__":
    sys.exit(main())