#!/usr/bin/env python3
"""Tiny HTTP server for labeling Hebrew legal paragraphs.

Run with:
    python3 -m tau_rag.scripts.labeling_server --pool data/paragraphs_to_label.jsonl

Then open http://localhost:8765 in your browser. Each labeled paragraph
is appended to `<pool>.labels.jsonl` immediately — your work is saved
keystroke-by-keystroke, never lost.

Output format (per labeled record):
    {"id":"...", "text":"...", "is_argument":true, "outcome":"accepted",
     "side":"plaintiff", "arg_type":"factual"}

Keyboard shortcuts inside the browser:
    1 / 2           → is_argument: yes / no
    a / r / p / u   → outcome: accepted / rejected / partial / unknown
    t / d / c / x   → side: plaintiff / defendant / court / unknown
    f / l / g / m   → arg_type: factual / legal / procedural / policy
    Enter           → save current + load next
    ← →             → previous / next
    s               → skip without labeling

Approximate throughput: ~120-150 paragraphs/hour with shortcuts.
"""
from __future__ import annotations

import argparse
import json
import sys
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import Dict, List, Optional
from urllib.parse import urlparse, parse_qs

# Page served at "/" and "/index.html".
# NOTE(review): this literal contains no HTML markup or script, although it is
# served as text/html and the module docstring describes in-browser keyboard
# shortcuts — it looks like the tags were stripped; confirm against version
# control before shipping.
HTML_PAGE = r""" tau-rag labeling
תייגתי 0 / 0 פסקאות

1. האם זה טיעון משפטי?

2. מה התוצאה? (אם זה טיעון)

3. של איזה צד?

4. סוג הטיעון

קיצורי מקלדת: 1/2 טיעון · A/R/P/U תוצאה · T/D/C/X צד · F/L/G/M/E/Z סוג · שמור · S דלג · חזור
"""


class LabelingState:
    """In-memory state of the labeling session — backed by JSONL files.

    Attributes:
        pool_path: JSONL file holding the candidate paragraphs.
        out_path: JSONL file that labels are appended to.
        items: every record of the pool, in file order.
        labeled_ids: ids found in ``out_path`` plus ids saved this session.
        idx: index into ``items`` of the record currently being shown;
            ``len(items)`` once the pool is exhausted.
        history: indices visited via save/skip, used by :meth:`go_prev`.
    """

    def __init__(self, pool_path: Path, out_path: Path):
        """Load the pool and any existing labels, then seek to the first
        record that has not been labeled yet.

        Raises:
            OSError: if ``pool_path`` cannot be read.
            json.JSONDecodeError: if a non-blank pool line is not valid JSON.
            KeyError: if a pool record that must be inspected has no ``"id"``.
        """
        self.pool_path = pool_path
        self.out_path = out_path
        self.items: List[Dict] = self._load_pool(pool_path)
        self.labeled_ids: set = self._load_labeled_ids(out_path)
        self.idx = 0
        self.history: List[int] = []
        self._advance_to_unlabeled()

    @staticmethod
    def _load_pool(pool_path: Path) -> List[Dict]:
        """Parse every non-blank line of the pool file as one JSON record."""
        records: List[Dict] = []
        with pool_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    records.append(json.loads(line))
        return records

    @staticmethod
    def _load_labeled_ids(out_path: Path) -> set:
        """Collect the ids already present in the label file (best effort).

        Lines that are not valid JSON, are not objects, or carry no usable
        ``"id"`` are ignored so that one damaged line cannot block a session,
        but the number of ignored lines is reported on stderr instead of
        being dropped silently.
        """
        labeled: set = set()
        if not out_path.exists():
            return labeled
        ignored = 0
        with out_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    labeled.add(json.loads(line)["id"])
                except (ValueError, KeyError, TypeError):
                    # ValueError: malformed JSON; KeyError: no "id";
                    # TypeError: record is not an object or id is unhashable.
                    ignored += 1
        if ignored:
            print(
                f"warning: ignored {ignored} unreadable line(s) in {out_path}",
                file=sys.stderr,
            )
        return labeled

    def _advance_to_unlabeled(self) -> None:
        """Move ``idx`` forward until it rests on an unlabeled record or
        runs off the end of the pool."""
        while (
            self.idx < len(self.items)
            and self.items[self.idx]["id"] in self.labeled_ids
        ):
            self.idx += 1

    def _step_forward(self) -> None:
        """Remember the current position and move to the next unlabeled one.

        Does nothing once the pool is exhausted, so repeated calls cannot
        pile up out-of-range entries in ``history``.
        """
        if self.idx < len(self.items):
            self.history.append(self.idx)
            self.idx += 1
        self._advance_to_unlabeled()

    def current(self) -> Optional[Dict]:
        """Return a copy of the current record plus progress fields
        (``idx``, ``total``, ``labeled_count``, ``done``), or ``None`` when
        there is nothing left to show."""
        if self.idx >= len(self.items):
            return None
        rec = dict(self.items[self.idx])
        rec["idx"] = self.idx
        rec["total"] = len(self.items)
        rec["labeled_count"] = len(self.labeled_ids)
        rec["done"] = False
        return rec

    def save_label(self, label_record: Dict) -> None:
        """Append a label to the output JSONL and advance to the next item.

        The file is opened, written and closed per call, so each saved label
        is on disk (as far as the OS buffers go) before the method returns.

        Raises:
            KeyError: if ``label_record`` has no ``"id"``; nothing is written
                in that case.
        """
        # Read the id first so a record without one is never persisted.
        item_id = label_record["id"]
        with self.out_path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(label_record, ensure_ascii=False) + "\n")
        self.labeled_ids.add(item_id)
        self._step_forward()

    def skip(self, item_id: str) -> None:
        """Move past the current record without labeling it.

        ``item_id`` is accepted for interface compatibility but not used: the
        record skipped is always the one at ``idx``.
        """
        self._step_forward()

    def go_prev(self) -> None:
        """Return to the most recently saved/skipped position, if any."""
        if self.history:
            self.idx = self.history.pop()


class LabelingHandler(BaseHTTPRequestHandler):
    """HTTP endpoints of the labeling UI.

    GET  /, /index.html → the HTML page
    GET  /next          → current record, or ``{"done": true, ...}``
    GET  /skip?id=...   → skip the current record
    GET  /prev          → step back one position
    POST /save          → persist a label (JSON object body)
    """

    state: Optional[LabelingState] = None  # injected via class attr

    def log_message(self, *args, **kwargs):
        """Suppress the default per-request logging."""
        pass  # quiet

    def _send_json(self, payload, status=200):
        """Serialize ``payload`` as UTF-8 JSON and send it with ``status``."""
        body = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _send_not_found(self):
        """Send a bodiless 404 response."""
        self.send_response(404)
        self.end_headers()

    def do_GET(self):
        """Dispatch GET requests by path (see the class docstring)."""
        parsed = urlparse(self.path)
        path = parsed.path
        if path in ("/", "/index.html"):
            body = HTML_PAGE.encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)
        elif path == "/next":
            cur = self.state.current()
            if cur is None:
                self._send_json({
                    "done": True,
                    "labeled_count": len(self.state.labeled_ids),
                    "out_path": str(self.state.out_path),
                })
            else:
                self._send_json(cur)
        elif path == "/skip":
            item_id = parse_qs(parsed.query).get("id", [""])[0]
            self.state.skip(item_id)
            self._send_json({"ok": True})
        elif path == "/prev":
            self.state.go_prev()
            self._send_json({"ok": True})
        else:
            self._send_not_found()

    def do_POST(self):
        """Handle ``POST /save``: validate the JSON body and persist it.

        Responds 400 for an unusable Content-Length, an undecodable or
        non-JSON body, a body that is not a JSON object, missing ``id`` /
        ``is_argument`` fields, or an ``id`` that is itself a container.
        """
        if urlparse(self.path).path != "/save":
            self._send_not_found()
            return

        try:
            length = int(self.headers.get("Content-Length", "0"))
        except ValueError:
            length = -1
        if length < 0:
            # A negative size would make rfile.read() block until EOF.
            self._send_json({"error": "bad content-length"}, status=400)
            return

        raw = self.rfile.read(length)
        try:
            # UnicodeDecodeError and JSONDecodeError are both ValueErrors.
            label = json.loads(raw.decode("utf-8"))
        except ValueError:
            self._send_json({"error": "bad json"}, status=400)
            return

        # Minimal validation
        if not isinstance(label, dict):
            self._send_json({"error": "label must be a JSON object"}, status=400)
            return
        if "id" not in label or "is_argument" not in label:
            self._send_json({"error": "missing fields"}, status=400)
            return
        if isinstance(label["id"], (dict, list)):
            # Ids are kept in a set, so they have to be hashable scalars.
            self._send_json({"error": "bad id"}, status=400)
            return

        self.state.save_label(label)
        self._send_json({"ok": True, "labeled_count": len(self.state.labeled_ids)})


def main():
    """Parse CLI arguments, load the session state and serve until Ctrl+C."""
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--pool", required=True,
                    help="JSONL with paragraphs to label "
                         "(produced by extract_paragraphs_for_labeling.py)")
    ap.add_argument("--out", default=None,
                    help="output JSONL of labels (default: <pool>.labels.jsonl)")
    ap.add_argument("--port", type=int, default=8765)
    ap.add_argument("--host", default="127.0.0.1")
    args = ap.parse_args()

    pool_path = Path(args.pool)
    if not pool_path.exists():
        sys.exit(f"pool not found: {pool_path}")
    # with_suffix swaps the pool's last extension for ".labels.jsonl".
    out_path = Path(args.out) if args.out else pool_path.with_suffix(".labels.jsonl")

    state = LabelingState(pool_path, out_path)
    LabelingHandler.state = state

    print("\n 📝 tau-rag labeling tool")
    print(f" pool: {pool_path} ({len(state.items):,} paragraphs)")
    print(f" output: {out_path} ({len(state.labeled_ids):,} already labeled)")
    print(f" open: http://{args.host}:{args.port}\n")
    print(" Ctrl+C to stop. Your work is saved continuously.\n")

    # The context manager closes the listening socket on the way out.
    with HTTPServer((args.host, args.port), LabelingHandler) as httpd:
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            print(f"\n ✓ stopped. {len(state.labeled_ids):,} labels saved to {out_path}")


if __name__ == "__main__":
    main()