feat(data): add normalization starter dataset and validator

Files changed (4) hide show

data/README.md +10 -0
data/processed/normalization_seed_v0.1.tsv +21 -0
docs/pashto_normalization_v0.1.md +33 -0
scripts/validate_normalization.py +106 -0

data/README.md CHANGED Viewed

@@ -3,3 +3,13 @@
 - `raw/` incoming source files
 - `processed/` cleaned/aligned artifacts
 - `metadata/` manifests, speaker/dialect info, QA reports

 - `raw/` incoming source files
 - `processed/` cleaned/aligned artifacts
 - `metadata/` manifests, speaker/dialect info, QA reports
+## First Contribution (Normalization Starter)
+- `processed/normalization_seed_v0.1.tsv` starter normalization examples
+- `../docs/pashto_normalization_v0.1.md` baseline normalization policy
+- `../scripts/validate_normalization.py` basic file validator
+## Validate Seed File
+Run from the repository root (the paths below are relative to it, not to `data/`):
+```bash
+python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
+```

data/processed/normalization_seed_v0.1.tsv ADDED Viewed

	@@ -0,0 +1,21 @@

+id	raw_text	normalized_text	note
+n001	  زه  کور ته ځم. 	زه کور ته ځم.	Trim leading/trailing and repeated spaces
+n002	سلام   نړۍ	سلام نړۍ	Collapse multiple spaces
+n003	دا يو  ازموينيز  متن دی.	دا يو ازموينيز متن دی.	Whitespace normalization
+n004	ته څنګه يې ؟	ته څنګه يې؟	Fix spacing before question mark
+n005	موږ،افغانان يو.	موږ، افغانان يو.	Add space after comma
+n006	دا ښه ده !!	دا ښه ده!	Reduce repeated exclamation
+n007	ولې نه؟؟	ولې نه؟	Reduce repeated question mark
+n008	دا ـــ اوږد ټکي دي.	دا اوږد ټکي دي.	Remove tatweel
+n009	 نن  باران  دی 	نن باران دی	Trim and collapse spaces
+n010	د کابل،ښکلی ښار	د کابل، ښکلی ښار	Add space after comma
+n011	ایا  ته  راځې؟	ایا ته راځې؟	Collapse spaces
+n012	  دا   يو  مثال   دی  	دا يو مثال دی	Aggressive whitespace cleanup
+n013	مونږ؛خو  چمتو يو.	مونږ؛ خو چمتو يو.	Add space after semicolon
+n014	ستړی مشې !	ستړی مشې!	Remove space before punctuation
+n015	زما  نوم   احمد دی.	زما نوم احمد دی.	Collapse spaces
+n016	دلته،هلته،هرځای	دلته، هلته، هرځای	Comma spacing consistency
+n017	ژبه  مو  ژوندۍ  ده.	ژبه مو ژوندۍ ده.	Collapse spaces
+n018	دا, يو مخلوط نښه ده.	دا، يو مخلوط نښه ده.	Normalize comma symbol
+n019	"سلام"   وويل.	"سلام" وويل.	Collapse spaces after quote
+n020	يوه  بله   کرښه.	يوه بله کرښه.	Whitespace normalization

docs/pashto_normalization_v0.1.md ADDED Viewed

	@@ -0,0 +1,33 @@

+# Pashto Normalization Policy v0.1
+This starter policy defines simple, low-risk rules for text cleanup before
+training ASR/TTS/NLP baselines.
+## Scope
+- Applies to sentence-level text in this repository.
+- Prioritizes consistency over linguistic completeness.
+- Keeps semantic meaning unchanged.
+## Rules
+1. Trim leading and trailing whitespace.
+2. Collapse repeated internal spaces to a single space.
+3. Remove zero-width/invisible spacing characters.
+4. Remove elongation characters such as tatweel (`ـ`).
+5. Use Arabic punctuation consistently in Pashto text, replacing the ASCII equivalents (`,`, `?`, `;`):
+   - comma: `،`
+   - question mark: `؟`
+   - semicolon: `؛`
+6. Keep sentence-final punctuation as a single character (avoid `!!`, `؟؟`).
+7. Normalize quotation usage to one style per sentence (avoid mixed quote styles).
+8. Normalize digit style to one standard per dataset split.
+9. Preserve original word order and meaning; do not rewrite content.
+10. Keep dialect wording as spoken; normalize form, not dialect identity.
+## Non-goals (for v0.1)
+- No stemming or morphology rules.
+- No automatic transliteration.
+- No named-entity rewriting.
+## File Reference
+- Seed examples: `data/processed/normalization_seed_v0.1.tsv`
+- Validator: `scripts/validate_normalization.py`

scripts/validate_normalization.py ADDED Viewed

	@@ -0,0 +1,106 @@

+"""Validate normalization seed data for the Pashto project.
+Usage:
+    python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
+"""
+from __future__ import annotations
+import csv
+import sys
+from pathlib import Path
# Columns every seed file must provide in its header row.
REQUIRED_COLUMNS = ("id", "raw_text", "normalized_text", "note")


def detect_delimiter(first_line: str) -> str | None:
    """Guess the field delimiter from the header line.

    A tab takes precedence over a comma, so a TSV header that happens to
    contain a comma is still treated as TSV.

    Args:
        first_line: The first line of the file, as read from disk.

    Returns:
        The tab character, the comma character, or ``None`` when the line
        contains neither.
    """
    if "\t" in first_line:
        return "\t"
    if "," in first_line:
        return ","
    return None


def validate_file(path: Path) -> list[str]:
    """Check a normalization seed file and return every problem found.

    The file must be UTF-8 (a BOM is tolerated), tab- or comma-delimited,
    and start with a header containing all of ``REQUIRED_COLUMNS``. Each
    data row needs a unique, non-empty ``id``, non-empty ``raw_text`` and
    ``normalized_text``, and the same number of fields as the header.

    Args:
        path: Location of the seed file.

    Returns:
        Human-readable error messages; an empty list means the file is valid.
        Row-level messages are prefixed with the physical line number on
        which the offending record starts.
    """
    if not path.exists():
        return [f"File not found: {path}"]
    if not path.is_file():
        # e.g. a directory: opening it would raise instead of reporting.
        return [f"Not a regular file: {path}"]

    errors: list[str] = []
    seen_ids: dict[str, int] = {}

    try:
        with path.open("r", encoding="utf-8-sig", newline="") as handle:
            first_line = handle.readline()
            if not first_line:
                return [f"Empty file: {path}"]

            delimiter = detect_delimiter(first_line)
            if delimiter is None:
                return [
                    "Could not detect delimiter. Use TSV (preferred) or CSV with headers: "
                    + ", ".join(REQUIRED_COLUMNS)
                ]

            handle.seek(0)
            # TSV has no quoting convention: a literal '"' in the text must be
            # kept as data. With the csv default, a field starting with '"'
            # loses its quotes and an unbalanced one swallows following rows.
            quoting = csv.QUOTE_NONE if delimiter == "\t" else csv.QUOTE_MINIMAL
            rows = csv.reader(handle, delimiter=delimiter, quoting=quoting)

            header = next(rows, None)
            if header is None:
                return [f"Missing header row in: {path}"]

            missing = [col for col in REQUIRED_COLUMNS if col not in header]
            if missing:
                return [f"Missing required columns: {', '.join(missing)}"]

            row_count = 0
            # csv.reader.line_num counts physical lines consumed so far, so
            # the next record always starts one line after the previous end.
            # This stays correct across blank lines and multi-line CSV fields.
            previous_end = rows.line_num
            for fields in rows:
                line_number = previous_end + 1
                previous_end = rows.line_num
                if not fields:
                    # Blank line: not a data row, but it still occupies a line.
                    continue
                row_count += 1

                if len(fields) != len(header):
                    # Usually an unescaped delimiter inside the text, or a
                    # truncated row; either way the columns are misaligned.
                    errors.append(
                        f"Line {line_number}: expected {len(header)} fields, "
                        f"found {len(fields)}"
                    )

                row = dict(zip(header, fields))
                row_id = row.get("id", "").strip()
                raw_text = row.get("raw_text", "").strip()
                normalized_text = row.get("normalized_text", "").strip()

                if not row_id:
                    errors.append(f"Line {line_number}: empty 'id'")
                elif row_id in seen_ids:
                    errors.append(
                        f"Line {line_number}: duplicate id '{row_id}' "
                        f"(first seen at line {seen_ids[row_id]})"
                    )
                else:
                    seen_ids[row_id] = line_number

                if not raw_text:
                    errors.append(f"Line {line_number}: empty 'raw_text'")
                if not normalized_text:
                    errors.append(f"Line {line_number}: empty 'normalized_text'")

            if row_count == 0:
                errors.append("No data rows found.")
    except UnicodeDecodeError as exc:
        # Report bad encoding as a validation failure, not a traceback.
        errors.append(f"File is not valid UTF-8: {path} ({exc.reason})")
    except csv.Error as exc:
        errors.append(f"Could not parse {path}: {exc}")

    return errors
def main() -> int:
    """Run the validator from the command line.

    Expects exactly one argument: the path of the file to validate.

    Returns:
        0 when the file is valid, 1 when validation errors were reported,
        2 when the command line is malformed.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        usage = (
            "Usage: python scripts/validate_normalization.py "
            "data/processed/normalization_seed_v0.1.tsv"
        )
        print(usage)
        return 2

    target = Path(cli_args[0])
    problems = validate_file(target)

    # Happy path first; the failure report follows as the fall-through case.
    if not problems:
        print(f"Validation passed: {target}")
        return 0

    print("Validation failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())