#!/usr/bin/env python3 """Pull Hebrew judgments from local parquet files into the tau-rag corpus. Sources (auto-detected — first-found wins): • LawDBHeb/cases_clean.parquet — 750k clean Hebrew judgments • LawDBHeb/cases_law14_waiver.parquet — 10 judgments with rich metadata • TAU_API/.../uploads/cases_all.parquet (alt) Output: tau_rag/runtime/parquet_cases.jsonl (one JSON record per line) The autoload hook in fastapi_app.py will pick this file up on startup, classify the domain of each doc, and inject into the live pipeline so they're searchable + structured-viewable just like the rest of the corpus. Usage: python3 -m tau_rag.scripts.ingest_parquet_cases [--n 1000] [--source PATH] Default: 1000 random samples from cases_clean + all 10 from waiver. """ from __future__ import annotations import argparse import json import random import re import sys from pathlib import Path from typing import Any, Dict, List, Optional import pyarrow.parquet as pq # ============================================================================ # Locate parquet sources # ============================================================================ # Auto-discovery: search common iCloud / Drive / local locations. # The user's machine may have any of these, so we probe in order. HOME = Path.home() CANDIDATE_DIRS = [ # macOS iCloud Drive HOME / "Library/Mobile Documents/com~apple~CloudDocs/LawDBHeb", HOME / "Library/Mobile Documents/com~apple~CloudDocs", # Sandbox path (CI / Cowork) Path("/sessions/kind-affectionate-mccarthy/mnt/com~apple~CloudDocs/LawDBHeb"), Path("/sessions/kind-affectionate-mccarthy/mnt/com~apple~CloudDocs"), # Other common Mac locations HOME / "Documents/LawDBHeb", HOME / "Downloads", HOME / "Documents", # Linux / generic Path("/data/LawDBHeb"), Path("/data"), ] CLEAN_FILENAMES = ["cases_clean.parquet", "cases_all.parquet"] RICH_FILENAMES = ["cases_law14_waiver.parquet"] def _find_first(filenames): """Walk CANDIDATE_DIRS looking for any of the given filenames. Recurses one level into each so iCloud subfolders are caught.""" for base in CANDIDATE_DIRS: if not base.exists(): continue # Direct hit for f in filenames: p = base / f if p.exists(): return p # Glob one level deep — catches LawDBHeb/cases_clean.parquet when # base is iCloudDocs root, etc. for f in filenames: try: hits = list(base.glob(f"**/{f}")) # Prefer paths under LawDBHeb/ hits.sort(key=lambda p: ("LawDBHeb" not in str(p), len(str(p)))) if hits: return hits[0] except (OSError, PermissionError): continue return None CANDIDATE_CLEAN = [_find_first(CLEAN_FILENAMES) or Path("cases_clean.parquet")] CANDIDATE_RICH = [_find_first(RICH_FILENAMES) or Path("cases_law14_waiver.parquet")] OUT_PATH = (Path(__file__).resolve().parent.parent / "runtime" / "parquet_cases.jsonl") # ============================================================================ # Cleaning # ============================================================================ # Strip HTML comments / Word CSS / inline style blocks that pollute many of # the rows (especially in cases_law14_waiver). We're permissive: anything # that looks structural and Hebrew-rare gets flattened. HTML_NOISE = re.compile( r"" # HTML comments r"|]*>.*?" # inline