"""Validate normalization seed data for the Pashto project.

Usage:
    python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
"""
|
|
| from __future__ import annotations |
|
|
| import csv |
| import sys |
| from pathlib import Path |
|
|
|
|
# Header names that must all be present in a seed file's header row.
REQUIRED_COLUMNS = (
    "id",
    "raw_text",
    "normalized_text",
    "note",
)
|
|
|
|
def detect_delimiter(first_line: str) -> str | None:
    """Guess the field delimiter from a file's first line.

    Args:
        first_line: The first line of the file (normally the header row).

    Returns:
        A tab if the line contains one, otherwise a comma if the line
        contains one, otherwise ``None``.
    """
    # Candidates are listed in priority order: a tab wins over a comma.
    candidates = ("\t", ",")
    return next((sep for sep in candidates if sep in first_line), None)
|
|
|
|
def _collect_row_errors(reader: csv.DictReader, errors: list[str]) -> int:
    """Check every data row of *reader*, appending problems to *errors*.

    Args:
        reader: A ``csv.DictReader`` whose header has already been read.
        errors: List that receives one message per problem found.

    Returns:
        The number of data rows that were read.
    """
    seen_ids: dict[str, int] = {}
    row_count = 0

    for row in reader:
        row_count += 1
        # DictReader skips blank rows and a quoted field may span several
        # physical lines, so a simple row counter drifts away from the real
        # file position. ``line_num`` counts the lines actually consumed
        # (for a multi-line record it is the record's last line).
        line_number = reader.line_num

        # Short rows yield None for missing fields, hence the ``or ""``.
        row_id = (row.get("id") or "").strip()
        if not row_id:
            errors.append(f"Line {line_number}: empty 'id'")
        elif row_id in seen_ids:
            errors.append(
                f"Line {line_number}: duplicate id '{row_id}' "
                f"(first seen at line {seen_ids[row_id]})"
            )
        else:
            seen_ids[row_id] = line_number

        for column in ("raw_text", "normalized_text"):
            if not (row.get(column) or "").strip():
                errors.append(f"Line {line_number}: empty '{column}'")

    return row_count


def validate_file(path: Path) -> list[str]:
    """Validate a normalization seed file and return the problems found.

    The file is read as UTF-8 (a leading BOM is tolerated). The delimiter is
    detected from the first line, the header must contain every name in
    ``REQUIRED_COLUMNS``, and each data row must have a non-empty, unique
    ``id`` plus non-empty ``raw_text`` and ``normalized_text``.

    Args:
        path: Location of the TSV/CSV file to check.

    Returns:
        A list of human-readable error messages; an empty list means the
        file is valid. Structural problems (missing file, empty file,
        unknown delimiter, missing columns) are reported alone because row
        checks cannot run without them. Unreadable paths, non-UTF-8 content
        and malformed delimited data are reported as errors rather than
        raised.
    """
    if not path.exists():
        return [f"File not found: {path}"]

    errors: list[str] = []
    try:
        with path.open("r", encoding="utf-8-sig", newline="") as handle:
            first_line = handle.readline()
            if not first_line:
                return [f"Empty file: {path}"]

            delimiter = detect_delimiter(first_line)
            if delimiter is None:
                return [
                    "Could not detect delimiter. Use TSV (preferred) or CSV with headers: "
                    + ", ".join(REQUIRED_COLUMNS)
                ]

            # Rewind so the csv reader consumes the header line itself.
            handle.seek(0)
            reader = csv.DictReader(handle, delimiter=delimiter)

            if reader.fieldnames is None:
                return [f"Missing header row in: {path}"]

            missing = [col for col in REQUIRED_COLUMNS if col not in reader.fieldnames]
            if missing:
                return [f"Missing required columns: {', '.join(missing)}"]

            if _collect_row_errors(reader, errors) == 0:
                errors.append("No data rows found.")
    except UnicodeDecodeError as exc:
        # Keep any row errors gathered before the undecodable bytes.
        errors.append(f"File is not valid UTF-8: {path} ({exc})")
    except csv.Error as exc:
        errors.append(f"Malformed delimited data in {path}: {exc}")
    except OSError as exc:
        # Covers directories, permission problems and files removed after
        # the existence check above.
        return [f"Could not read file: {path} ({exc})"]

    return errors
|
|
|
|
def main() -> int:
    """Run the validator from the command line.

    Expects exactly one command-line argument: the path of the file to
    validate.

    Returns:
        ``0`` when validation passes, ``1`` when ``validate_file`` reports
        errors, and ``2`` when the argument count is wrong.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        usage = (
            "Usage: python scripts/validate_normalization.py "
            "data/processed/normalization_seed_v0.1.tsv"
        )
        print(usage)
        return 2

    target = Path(cli_args[0])
    problems = validate_file(target)

    if not problems:
        print(f"Validation passed: {target}")
        return 0

    print("Validation failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1
|
|
|
|
# When executed as a script, use main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())
|
|