musaw committed on
Commit
379266c
·
1 Parent(s): 1f304d8

feat(data): add normalization starter dataset and validator

Browse files
data/README.md CHANGED
@@ -3,3 +3,13 @@
3
  - `raw/` incoming source files
4
  - `processed/` cleaned/aligned artifacts
5
  - `metadata/` manifests, speaker/dialect info, QA reports
 
 
 
 
 
 
 
 
 
 
 
3
  - `raw/` incoming source files
4
  - `processed/` cleaned/aligned artifacts
5
  - `metadata/` manifests, speaker/dialect info, QA reports
6
+
7
+ ## First Contribution (Normalization Starter)
8
+ - `processed/normalization_seed_v0.1.tsv` starter normalization examples
9
+ - `../docs/pashto_normalization_v0.1.md` baseline normalization policy
10
+ - `../scripts/validate_normalization.py` basic file validator
11
+
12
+ ## Validate Seed File
13
+ ```bash
14
+ python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
15
+ ```
data/processed/normalization_seed_v0.1.tsv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ id raw_text normalized_text note
2
+ n001 زه کور ته ځم. زه کور ته ځم. Trim leading/trailing and repeated spaces
3
+ n002 سلام نړۍ سلام نړۍ Collapse multiple spaces
4
+ n003 دا يو ازموينيز متن دی. دا يو ازموينيز متن دی. Whitespace normalization
5
+ n004 ته څنګه يې ؟ ته څنګه يې؟ Fix spacing before question mark
6
+ n005 موږ،افغانان يو. موږ، افغانان يو. Add space after comma
7
+ n006 دا ښه ده !! دا ښه ده! Reduce repeated exclamation
8
+ n007 ولې نه؟؟ ولې نه؟ Reduce repeated question mark
9
+ n008 دا ـــ اوږد ټکي دي. دا اوږد ټکي دي. Remove tatweel
10
+ n009 نن باران دی نن باران دی Trim and collapse spaces
11
+ n010 د کابل،ښکلی ښار د کابل، ښکلی ښار Add space after comma
12
+ n011 ایا ته راځې؟ ایا ته راځې؟ Collapse spaces
13
+ n012 دا يو مثال دی دا يو مثال دی Aggressive whitespace cleanup
14
+ n013 مونږ؛خو چمتو يو. مونږ؛ خو چمتو يو. Add space after semicolon
15
+ n014 ستړی مشې ! ستړی مشې! Remove space before punctuation
16
+ n015 زما نوم احمد دی. زما نوم احمد دی. Collapse spaces
17
+ n016 دلته،هلته،هرځای دلته، هلته، هرځای Comma spacing consistency
18
+ n017 ژبه مو ژوندۍ ده. ژبه مو ژوندۍ ده. Collapse spaces
19
+ n018 دا, يو مخلوط نښه ده. دا، يو مخلوط نښه ده. Normalize comma symbol
20
+ n019 "سلام" وويل. "سلام" وويل. Collapse spaces after quote
21
+ n020 يوه بله کرښه. يوه بله کرښه. Whitespace normalization
docs/pashto_normalization_v0.1.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pashto Normalization Policy v0.1
2
+
3
+ This starter policy defines simple, low-risk rules for text cleanup before
4
+ training ASR/TTS/NLP baselines.
5
+
6
+ ## Scope
7
+ - Applies to sentence-level text in this repository.
8
+ - Prioritizes consistency over linguistic completeness.
9
+ - Keeps semantic meaning unchanged.
10
+
11
+ ## Rules
12
+ 1. Trim leading and trailing whitespace.
13
+ 2. Collapse repeated internal spaces to a single space.
14
+ 3. Remove zero-width/invisible spacing characters.
15
+ 4. Remove elongation characters such as tatweel (`ـ`).
16
+ 5. Use Arabic punctuation consistently in Pashto text:
17
+ - comma: `،`
18
+ - question mark: `؟`
19
+ - semicolon: `؛`
20
+ 6. Keep sentence-final punctuation as a single character (avoid `!!`, `؟؟`).
21
+ 7. Normalize quotation usage to one style per sentence (avoid mixed quote styles).
22
+ 8. Normalize digit style to one standard per dataset split.
23
+ 9. Preserve original word order and meaning; do not rewrite content.
24
+ 10. Keep dialect wording as spoken; normalize form, not dialect identity.
25
+
26
+ ## Non-goals (for v0.1)
27
+ - No stemming or morphology rules.
28
+ - No automatic transliteration.
29
+ - No named-entity rewriting.
30
+
31
+ ## File Reference
32
+ - Seed examples: `data/processed/normalization_seed_v0.1.tsv`
33
+ - Validator: `scripts/validate_normalization.py`
scripts/validate_normalization.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Validate normalization seed data for the Pashto project.
2
+
3
+ Usage:
4
+ python scripts/validate_normalization.py data/processed/normalization_seed_v0.1.tsv
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import csv
10
+ import sys
11
+ from pathlib import Path
12
+
13
+
14
# Header names that every seed file must provide (checked by validate_file).
REQUIRED_COLUMNS = (
    "id",
    "raw_text",
    "normalized_text",
    "note",
)
15
+
16
+
17
def detect_delimiter(first_line: str) -> str | None:
    """Guess the field delimiter from the first line of a file.

    Args:
        first_line: The first line of the file, typically the header row.

    Returns:
        ``"\\t"`` if the line contains a tab, otherwise ``","`` if it contains
        a comma, otherwise ``None``.
    """
    # Candidates are probed in priority order: a tab wins over a comma when
    # both characters appear in the line.
    for candidate in ("\t", ","):
        if candidate in first_line:
            return candidate
    return None
23
+
24
+
25
def validate_file(path: Path) -> list[str]:
    """Check a normalization seed file and return every problem found.

    The file is read as UTF-8 (a leading BOM is tolerated) delimited text whose
    header row must contain all of ``REQUIRED_COLUMNS``. Each data row needs a
    non-empty, unique ``id`` and non-empty ``raw_text`` and
    ``normalized_text``; values are stripped of surrounding whitespace before
    these checks.

    Args:
        path: Location of the TSV/CSV file to validate.

    Returns:
        Human-readable error messages; an empty list means the file passed.
        Problems with the path, delimiter, or header end validation
        immediately, while row-level problems are accumulated. A decoding or
        CSV parsing failure is reported as an error instead of being raised.
    """
    if not path.exists():
        return [f"File not found: {path}"]
    # A directory exists but cannot be opened as text; report it rather than
    # letting ``open`` raise.
    if not path.is_file():
        return [f"Not a regular file: {path}"]

    errors: list[str] = []
    seen_ids: dict[str, int] = {}  # id -> line number where it first appeared

    try:
        with path.open("r", encoding="utf-8-sig", newline="") as handle:
            first_line = handle.readline()
            if not first_line:
                return [f"Empty file: {path}"]

            delimiter = detect_delimiter(first_line)
            if delimiter is None:
                return [
                    "Could not detect delimiter. Use TSV (preferred) or CSV with headers: "
                    + ", ".join(REQUIRED_COLUMNS)
                ]

            # Rewind so the DictReader consumes the header line itself.
            handle.seek(0)
            reader = csv.DictReader(handle, delimiter=delimiter)

            if reader.fieldnames is None:
                return [f"Missing header row in: {path}"]

            missing = [col for col in REQUIRED_COLUMNS if col not in reader.fieldnames]
            if missing:
                return [f"Missing required columns: {', '.join(missing)}"]

            row_count = 0
            for row in reader:
                row_count += 1
                # Use the reader's own physical line counter: the reader skips
                # blank lines and a quoted field may span several lines, so a
                # simple row index would point at the wrong line. For a
                # multi-line record this is the line on which the record ends.
                line_number = reader.line_num

                row_id = (row.get("id") or "").strip()
                raw_text = (row.get("raw_text") or "").strip()
                normalized_text = (row.get("normalized_text") or "").strip()

                if not row_id:
                    errors.append(f"Line {line_number}: empty 'id'")
                elif row_id in seen_ids:
                    errors.append(
                        f"Line {line_number}: duplicate id '{row_id}' "
                        f"(first seen at line {seen_ids[row_id]})"
                    )
                else:
                    seen_ids[row_id] = line_number

                if not raw_text:
                    errors.append(f"Line {line_number}: empty 'raw_text'")
                if not normalized_text:
                    errors.append(f"Line {line_number}: empty 'normalized_text'")

            if row_count == 0:
                errors.append("No data rows found.")
    except UnicodeDecodeError as exc:
        # Keep any row errors collected so far and report the decoding failure.
        errors.append(f"File is not valid UTF-8: {path} ({exc})")
    except csv.Error as exc:
        errors.append(f"Could not parse {path}: {exc}")

    return errors
82
+
83
+
84
def main() -> int:
    """Run the validator from the command line.

    Expects exactly one command-line argument: the path of the file to check.

    Returns:
        ``0`` when validation passes, ``1`` when validation errors were found,
        and ``2`` when the argument count is wrong.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        usage = (
            "Usage: python scripts/validate_normalization.py "
            "data/processed/normalization_seed_v0.1.tsv"
        )
        print(usage)
        return 2

    input_path = Path(cli_args[0])
    problems = validate_file(input_path)

    # Success is the short path; everything below reports failures.
    if not problems:
        print(f"Validation passed: {input_path}")
        return 0

    print("Validation failed:")
    for problem in problems:
        print(f"- {problem}")
    return 1
103
+
104
+
105
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())