#!/usr/bin/env python3
"""
Scrape Israeli statutes from he.wikisource.org via the MediaWiki API.

Strategy:
  - For each law, fetch the raw wikitext via /w/api.php?action=parse&prop=wikitext
  - Israeli laws on Wikisource are structured with "== סעיף N - title ==" headers
  - Strip the wiki markup, then accumulate the text under each heading as a
    separate doc
  - Output JSONL in the same format as our existing corpus

Run from any directory; the script writes to the same path as our other
corpora so start_server.sh picks it up automatically.

Usage:
  python3 tau_rag/scripts/scrape_israeli_statutes.py
  python3 tau_rag/scripts/scrape_israeli_statutes.py --laws "חוק_הגנת_הצרכן"
  python3 tau_rag/scripts/scrape_israeli_statutes.py --dry-run
"""
from __future__ import annotations

import argparse
import http.client
import json
import re
import sys
import time
import urllib.parse
import urllib.request
from collections import Counter
from pathlib import Path

# ─────────────────────────────────────────────────────────────────────────
# 22 statutes covering the benchmark gaps (labor / consumer / tort / etc.)
# Format: (page_title_on_wikisource, short_law_code, full_name_for_metadata)
# ─────────────────────────────────────────────────────────────────────────
STATUTES = [
    # Labor (12)
    ("חוק_עבודת_נשים", "EN", "חוק עבודת נשים, התשי\"ד-1954"),
    ("חוק_שעות_עבודה_ומנוחה", "WHR", "חוק שעות עבודה ומנוחה, התשי\"א-1951"),
    ("חוק_שכר_מינימום", "MW", "חוק שכר מינימום, התשמ\"ז-1987"),
    ("חוק_פיצויי_פיטורים", "SP", "חוק פיצויי פיטורים, התשכ\"ג-1963"),
    ("חוק_הודעה_מוקדמת_לפיטורים_ולהתפטרות", "PN", "חוק הודעה מוקדמת לפיטורים ולהתפטרות, התשס\"א-2001"),
    ("חוק_חופשה_שנתית", "AL", "חוק חופשה שנתית, התשי\"א-1951"),
    ("חוק_דמי_מחלה", "SL", "חוק דמי מחלה, התשל\"ו-1976"),
    ("חוק_שכר_שווה_לעובדת_ולעובד", "EQ", "חוק שכר שווה לעובדת ולעובד, התשנ\"ו-1996"),
    ("חוק_עובדים_זרים", "FW", "חוק עובדים זרים, התשנ\"א-1991"),
    ("חוק_הסכמים_קיבוציים", "CB", "חוק הסכמים קיבוציים, התשי\"ז-1957"),
    ("חוק_הגנת_השכר", "WP", "חוק הגנת השכר, התשי\"ח-1958"),
    ("חוק_שוויון_ההזדמנויות_בעבודה", "EE", "חוק שוויון ההזדמנויות בעבודה, התשמ\"ח-1988"),
    # Consumer
    ("חוק_הגנת_הצרכן", "CP", "חוק הגנת הצרכן, התשמ\"א-1981"),
    # Tort
    ("פקודת_הנזיקין", "TT", "פקודת הנזיקין [נוסח חדש]"),
    ("חוק_פיצויים_לנפגעי_תאונות_דרכים", "RTA", "חוק פיצויים לנפגעי תאונות דרכים, התשל\"ה-1975"),
    # Defamation
    ("חוק_איסור_לשון_הרע", "DEF", "חוק איסור לשון הרע, התשכ\"ה-1965"),
    # Banking / consumer-finance
    ("חוק_הבנקאות_(שירות_ללקוח)", "BNK", "חוק הבנקאות (שירות ללקוח), התשמ\"א-1981"),
    # Insurance / contract specifics
    ("חוק_חוזה_הביטוח", "INS", "חוק חוזה הביטוח, התשמ\"א-1981"),
    # Family / succession
    ("חוק_הירושה", "INH", "חוק הירושה, התשכ\"ה-1965"),
    ("חוק_יחסי_ממון_בין_בני_זוג", "MR", "חוק יחסי ממון בין בני זוג, התשל\"ג-1973"),
    # Companies (commercial)
    ("חוק_החברות", "CO", "חוק החברות, התשנ\"ט-1999"),
    # Procedural — small but important for procedure questions
    ("חוק_בתי_המשפט", "CR", "חוק בתי המשפט [נוסח משולב], התשמ\"ד-1984"),
]

API = "https://he.wikisource.org/w/api.php"
USER_AGENT = "tau-rag-scraper/1.0 (research; contact: avribarzel@gmail.com)"

# Sections whose cleaned body is shorter than this many characters are dropped.
MIN_BODY_CHARS = 20


def fetch_wikitext(page_title: str) -> str | None:
    """Fetch the raw wikitext of a page via the MediaWiki API.

    Args:
        page_title: Wikisource page title (underscores instead of spaces).

    Returns:
        The page's wikitext, or ``None`` when the request fails, the API
        reports an error, or the response carries no wikitext. Failures are
        reported on stdout rather than raised, so one bad page does not abort
        the whole scrape.
    """
    params = {
        "action": "parse",
        "page": page_title,
        "prop": "wikitext",
        "format": "json",
        "redirects": "1",
    }
    url = API + "?" + urllib.parse.urlencode(params, safe=":/")
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except (OSError, ValueError, http.client.HTTPException) as e:
        # OSError covers URLError/HTTPError/timeouts; ValueError covers JSON
        # and UTF-8 decoding problems; HTTPException covers truncated replies.
        print(f" ⚠️ fetch failed: {e}")
        return None
    if not isinstance(data, dict):
        print(" ⚠️ fetch failed: unexpected response shape")
        return None
    if "error" in data:
        print(f" ⚠️ API error: {data['error'].get('info','?')}")
        return None
    # The wikitext string is read from the "*" key of the "wikitext" object.
    return data.get("parse", {}).get("wikitext", {}).get("*")


# Wikitext cleanup helpers
TEMPLATE_RE = re.compile(r"\{\{[^{}]*\}\}", re.DOTALL)
LINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]")
TAG_RE = re.compile(r"<[^>]+>")
# Must run before TAG_RE: a comment containing ">" would otherwise be cut at
# the first ">" and leak its tail into the output.
HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
BOLD_ITALIC = re.compile(r"'{2,}")
_INLINE_SPACE_RE = re.compile(r"[ \t]+")


def clean_wiki(text: str) -> str:
    """Strip wiki/HTML markup, keeping the line structure intact.

    Args:
        text: Raw wikitext.

    Returns:
        The text with HTML comments, templates, HTML tags and bold/italic
        quote runs removed, ``[[Page|display]]`` links reduced to their
        display text, and runs of spaces/tabs collapsed. Newlines are
        preserved so headings can still be matched line by line.
    """
    text = HTML_COMMENT_RE.sub("", text)
    # TEMPLATE_RE only matches innermost templates, so peel nested ones off
    # layer by layer (bounded, in case of unbalanced braces).
    for _ in range(8):
        stripped = TEMPLATE_RE.sub("", text)
        if stripped == text:
            break
        text = stripped
    # [[Page|display]] → display, [[Page]] → Page
    text = LINK_RE.sub(lambda m: m.group(2) or m.group(1), text)
    text = TAG_RE.sub("", text)
    text = BOLD_ITALIC.sub("", text)
    return "\n".join(
        _INLINE_SPACE_RE.sub(" ", line).strip() for line in text.split("\n")
    )


# Match Wikisource section heading: "== סעיף 1 - הגדרות ==" or "==סעיף 5(א)=="
HEADING_RE = re.compile(
    r"^={2,}\s*סעיף\s*(\S+?)\s*(?:[-–—]\s*([^=]+?))?\s*={2,}\s*$"
)


def _build_doc(
    law_code: str,
    full_name: str,
    section: str,
    title: str,
    body_lines: list[str],
) -> dict | None:
    """Build one corpus doc for a section, or ``None`` if its body is too short."""
    body = "\n".join(body_lines).strip()
    if len(body) < MIN_BODY_CHARS:
        return None
    title = title.strip()
    # Law name without the year suffix or any parenthesised qualifier.
    short_name = full_name.split(",")[0].split("(")[0].strip()
    # Build the same prefix-tag style as our existing corpus
    tags = [
        f"[{law_code}§{section}]",
        f"[סעיף {section} ל{short_name}]",
        f"[סעיף {section}]",
        f"[סעיף{section}]",
    ]
    title_part = f" — סעיף {section}"
    if title:
        tags.append(f"[{title}]")
        title_part += f" — {title}"
    prefix = " ".join(tags)
    return {
        "id": f"heb_law/{law_code}/section_{section}",
        "text": f"{prefix} {full_name}{title_part}: {body}",
        "metadata": {
            "law": full_name,
            "law_code": law_code,
            "section": section,
            "title": title,
            "verbatim": True,
            "source": "wikisource.org",
            "language": "he",
        },
    }


def split_sections(wikitext: str, law_code: str, full_name: str) -> list[dict]:
    """Split wikitext into per-section docs.

    The text is cleaned with ``clean_wiki`` and then scanned line by line.
    Every line matching ``HEADING_RE`` opens a new section; non-blank lines
    after it form that section's body. Text before the first heading is
    ignored, and sections whose body is shorter than ``MIN_BODY_CHARS``
    characters are dropped.

    Args:
        wikitext: Raw wikitext of one law.
        law_code: Short law code used in ids and tags (e.g. ``"CP"``).
        full_name: Full law name stored in the metadata and the doc text.

    Returns:
        One dict per kept section with ``id``, ``text`` and ``metadata`` keys.
    """
    docs: list[dict] = []
    section: str | None = None
    title = ""
    body_lines: list[str] = []

    def flush() -> None:
        if section is None:
            return
        doc = _build_doc(law_code, full_name, section, title, body_lines)
        if doc is not None:
            docs.append(doc)

    for line in clean_wiki(wikitext).split("\n"):
        heading = HEADING_RE.match(line)
        if heading:
            flush()
            section = heading.group(1)
            title = (heading.group(2) or "").strip()
            body_lines = []
        elif section is not None and line.strip():
            body_lines.append(line)
    flush()
    return docs


def main(argv: list[str] | None = None) -> int:
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` (the default) parses
            ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, 1 when no statute matches ``--laws``
        or when nothing could be scraped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--laws", default=None,
                        help="Comma-separated list of page titles (default: all 22)")
    parser.add_argument("--out", default=None,
                        help="Output JSONL path (default: tau_rag/runtime/uploads/heb_statutes_extra.jsonl)")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--sleep", type=float, default=1.0,
                        help="Seconds between requests (default 1.0)")
    args = parser.parse_args(argv)

    laws = STATUTES
    if args.laws:
        # Tolerate "a, b" and titles typed with spaces instead of underscores.
        wanted = {
            t.strip().replace(" ", "_")
            for t in args.laws.split(",")
            if t.strip()
        }
        laws = [s for s in STATUTES if s[0] in wanted]
        if not laws:
            print(f"❌ No matching statutes for {wanted}")
            return 1

    here = Path(__file__).resolve().parent.parent
    out_path = Path(args.out) if args.out else (
        here / "runtime" / "uploads" / "heb_statutes_extra.jsonl")

    print(f"▸ Will scrape {len(laws)} statutes → {out_path}")
    print()

    delay = max(0.0, args.sleep)  # time.sleep() rejects negative values
    all_docs: list[dict] = []
    for i, (page, code, full_name) in enumerate(laws, 1):
        print(f"[{i}/{len(laws)}] {page}")
        if args.dry_run:
            continue
        if i > 1:
            # Pause before every request except the first, so a failed fetch
            # is still followed by a delay and no time is wasted at the end.
            time.sleep(delay)
        wt = fetch_wikitext(page)
        if not wt:
            print(" ✗ skipped (no wikitext)")
            continue
        docs = split_sections(wt, code, full_name)
        print(f" ✓ {len(docs)} sections")
        all_docs.extend(docs)

    if args.dry_run:
        print(f"\nDry run — would have written to {out_path}")
        return 0

    if not all_docs:
        # Opening the output with "w" would truncate an existing corpus file.
        print(f"\n❌ No sections scraped; leaving {out_path} untouched")
        return 1

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(d, ensure_ascii=False) + "\n" for d in all_docs
        )

    print(f"\n✅ Wrote {len(all_docs):,} sections → {out_path}")
    # Quick stats per law
    print("\nDocs per law:")
    per_law = Counter(d["metadata"]["law_code"] for d in all_docs)
    for code, n in per_law.most_common():
        print(f" {n:>3} {code}")
    return 0


if __name__ == "__main__":
    sys.exit(main())