#!/usr/bin/env python3
"""Pull Hebrew judgments from local parquet files into the tau-rag corpus.
Sources (auto-detected — first-found wins):
• LawDBHeb/cases_clean.parquet — 750k clean Hebrew judgments
• LawDBHeb/cases_law14_waiver.parquet — 10 judgments with rich metadata
• TAU_API/.../uploads/cases_all.parquet (alt)
Output: tau_rag/runtime/parquet_cases.jsonl (one JSON record per line)
The autoload hook in fastapi_app.py will pick this file up on startup,
classify the domain of each doc, and inject into the live pipeline so
they're searchable + structured-viewable just like the rest of the corpus.
Usage:
python3 -m tau_rag.scripts.ingest_parquet_cases [--n 1000] [--source PATH]
Default: 1000 random samples from cases_clean + all 10 from waiver.
"""
from __future__ import annotations
import argparse
import json
import random
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
import pyarrow.parquet as pq
# ============================================================================
# Locate parquet sources
# ============================================================================
# Auto-discovery: search common iCloud / Drive / local locations.
# The user's machine may have any of these, so we probe in order.
# Home directory of the current user; anchor for the per-user locations below.
HOME: Path = Path.home()

# Directories probed for the parquet sources. The list is walked front to
# back by the discovery helper, so earlier entries take precedence.
CANDIDATE_DIRS: List[Path] = [
    # iCloud Drive on macOS
    HOME.joinpath("Library/Mobile Documents/com~apple~CloudDocs/LawDBHeb"),
    HOME.joinpath("Library/Mobile Documents/com~apple~CloudDocs"),
    # Sandbox mount (CI / Cowork)
    Path("/sessions/kind-affectionate-mccarthy/mnt/com~apple~CloudDocs/LawDBHeb"),
    Path("/sessions/kind-affectionate-mccarthy/mnt/com~apple~CloudDocs"),
    # Further per-user folders commonly found on a Mac
    HOME.joinpath("Documents/LawDBHeb"),
    HOME.joinpath("Downloads"),
    HOME.joinpath("Documents"),
    # Linux / generic data mounts
    Path("/data/LawDBHeb"),
    Path("/data"),
]

# File names accepted for the large "clean" corpus, in priority order.
CLEAN_FILENAMES: List[str] = ["cases_clean.parquet", "cases_all.parquet"]

# File names accepted for the small rich-metadata set.
RICH_FILENAMES: List[str] = ["cases_law14_waiver.parquet"]
def _find_first(
    filenames: List[str],
    search_dirs: Optional[List[Path]] = None,
) -> Optional[Path]:
    """Return the first path matching any of *filenames*, or ``None``.

    Every directory in *search_dirs* is probed in order. Within a directory:

    1. A direct child wins; *filenames* are tried in the order given.
    2. Otherwise the directory is searched recursively (``**`` glob, no depth
       limit) for each filename in turn. Among several hits for one name,
       paths containing ``LawDBHeb`` are preferred, then the shortest path.

    Directories that do not exist or cannot be read are skipped rather than
    aborting the search.

    Args:
        filenames: Candidate file names, in priority order.
        search_dirs: Directories to probe. Defaults to the module-level
            ``CANDIDATE_DIRS`` when omitted.

    Returns:
        The matching path, or ``None`` when nothing was found.
    """
    # Materialise once: the names are iterated several times per directory.
    names = list(filenames)
    roots = CANDIDATE_DIRS if search_dirs is None else search_dirs

    for base in roots:
        # Stage 1: direct hit. exists() may re-raise errors such as EACCES on
        # some Python versions, so an unreadable location is treated like a
        # missing one instead of crashing the caller.
        try:
            if not base.exists():
                continue
            for name in names:
                candidate = base / name
                if candidate.exists():
                    return candidate
        except OSError:
            continue

        # Stage 2: recursive search, e.g. LawDBHeb/cases_clean.parquet below
        # the iCloud root. The tree is walked once per name so that an
        # earlier name keeps priority over a later one.
        for name in names:
            try:
                hits = list(base.glob(f"**/{name}"))
            except OSError:
                continue
            if hits:
                # min() returns the first minimal element, i.e. the same
                # winner as a stable sort followed by [0].
                return min(
                    hits,
                    key=lambda hit: ("LawDBHeb" not in str(hit), len(str(hit))),
                )
    return None
# Resolved source locations. Each list holds exactly one path: the first
# match reported by _find_first(), or a bare relative file name when the
# search found nothing.
CANDIDATE_CLEAN: List[Path] = [
    _find_first(CLEAN_FILENAMES) or Path("cases_clean.parquet"),
]
CANDIDATE_RICH: List[Path] = [
    _find_first(RICH_FILENAMES) or Path("cases_law14_waiver.parquet"),
]

# JSONL output file: runtime/parquet_cases.jsonl, two directory levels above
# the directory containing this script.
OUT_PATH: Path = Path(__file__).resolve().parent.parent.joinpath(
    "runtime", "parquet_cases.jsonl"
)
# ============================================================================
# Cleaning
# ============================================================================
# Strip HTML comments / Word CSS / inline style blocks that pollute many of
# the rows (especially in cases_law14_waiver). We're permissive: anything
# that looks structural and Hebrew-rare gets flattened.
HTML_NOISE = re.compile(
r"" # HTML comments
r"|" # inline