# legal-eye/tau_rag/scripts/redact_corpus.py
# Commit 1a33ede — feat(Day 4): Argument Classifier endpoint + improved PII redactor
"""redact_corpus.py — apply PII redaction over a JSONL corpus.

Streams input → output line-by-line. Applies redact_pii() to the `text`
field only; every other field (e.g. `id`, `metadata`) is carried over
with its value unchanged. Lines that are not valid JSON are copied to the
output as-is.

Usage:
    python -m tau_rag.scripts.redact_corpus \
        tau_rag/runtime/parquet_cases.jsonl \
        tau_rag/runtime/parquet_cases_redacted.jsonl
"""
from __future__ import annotations
import json
import sys
import time
from pathlib import Path
from ..scrapers.pii_redactor import redact_pii
def main(src: str, dst: str) -> int:
    """Redact PII from the ``text`` field of every record in a JSONL file.

    Reads *src* one line at a time and writes exactly one line to *dst* per
    input line, so the whole corpus is never held in memory. For each line
    that parses to a JSON object with a non-empty string ``text`` field,
    ``redact_pii(text)`` is called; it is expected to return a
    ``(new_text, counts)`` pair where ``counts`` maps a replacement kind to
    the number of replacements made. All other fields of the record are
    re-serialized with their values unchanged.

    Lines that are not valid JSON, or that are valid JSON but not an object,
    are copied to the output unchanged and reported on stderr at the end,
    because they have NOT been redacted.

    Args:
        src: Path of the input JSONL file.
        dst: Path of the output JSONL file; created or overwritten.

    Returns:
        0 on success; 2 if *src* does not exist or if *src* and *dst* refer
        to the same file.
    """
    src_p = Path(src)
    dst_p = Path(dst)
    if not src_p.exists():
        print(f"ERROR: {src_p} not found", file=sys.stderr)
        return 2
    # Opening dst in "w" mode truncates it before the first read, so running
    # in place would silently destroy the input corpus.
    if dst_p.exists() and src_p.samefile(dst_p):
        print(f"ERROR: {dst_p} is the same file as {src_p}; "
              f"refusing to overwrite the input", file=sys.stderr)
        return 2

    n_total = 0
    n_redacted = 0
    n_passthrough = 0  # non-blank lines copied through without redaction
    counts_total: dict[str, int] = {}
    # perf_counter is monotonic, so the elapsed time cannot go negative if
    # the wall clock is adjusted mid-run.
    t0 = time.perf_counter()
    with src_p.open("r", encoding="utf-8") as fi, \
            dst_p.open("w", encoding="utf-8") as fo:
        for line in fi:
            n_total += 1
            try:
                d = json.loads(line)
            except json.JSONDecodeError:
                d = None
            if not isinstance(d, dict):
                # Unparseable or non-object lines (lists, scalars, null) are
                # passed through unchanged rather than crashing on d.get().
                fo.write(line)
                if line.strip():
                    n_passthrough += 1
                continue
            text = d.get("text", "")
            # Only non-empty strings are redacted; other value types are
            # left as they are instead of being handed to redact_pii().
            if isinstance(text, str) and text:
                new_text, counts = redact_pii(text)
                if counts:
                    n_redacted += 1
                    for k, v in counts.items():
                        counts_total[k] = counts_total.get(k, 0) + v
                d["text"] = new_text
            fo.write(json.dumps(d, ensure_ascii=False) + "\n")
            if n_total % 50_000 == 0:
                so_far = time.perf_counter() - t0
                rate = n_total / so_far if so_far > 0 else 0.0
                print(f" {n_total:>7,d} docs · {n_redacted:>6,d} redacted "
                      f"· {rate:.0f} docs/s", flush=True)

    elapsed = time.perf_counter() - t0
    # Guard against a zero interval on coarse clocks / empty inputs.
    overall_rate = n_total / elapsed if elapsed > 0 else 0.0
    print()
    print(f"DONE. {n_total:,} docs processed in {elapsed:.1f}s "
          f"({overall_rate:.0f} docs/s)")
    print(f" {n_redacted:,} docs had ≥1 PII match "
          f"({100*n_redacted/max(n_total,1):.2f}%)")
    print("Replacements by kind:")
    for k, v in sorted(counts_total.items(), key=lambda x: -x[1]):
        print(f" {k:20s}: {v:>8,d}")
    if n_passthrough:
        print(f"WARNING: {n_passthrough:,} line(s) were not JSON objects and "
              f"were copied through WITHOUT redaction", file=sys.stderr)
    return 0
if __name__ == "__main__":
    # Command-line entry point: expects exactly two positional arguments,
    # the source JSONL path followed by the destination JSONL path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        print("usage: python -m tau_rag.scripts.redact_corpus <src.jsonl> <dst.jsonl>",
              file=sys.stderr)
        sys.exit(2)
    src_arg, dst_arg = cli_args
    # main()'s return value becomes the process exit status.
    sys.exit(main(src_arg, dst_arg))