chore(resources): enforce Pashto-centric policy and prune reference-only entries

Files changed (14)

docs/resource_catalog.md +2 -1
docs/resource_cycle_runbook.md +2 -0
docs/search/resources.json +1 -333
resources/README.md +6 -6
resources/catalog/README.md +4 -0
resources/catalog/pending_candidates.json +0 -0
resources/catalog/resources.json +0 -360
resources/models/README.md +2 -6
resources/papers/README.md +0 -4
resources/projects/README.md +0 -1
resources/tools/README.md +1 -2
scripts/sync_resources.py +208 -65
scripts/validate_resource_catalog.py +24 -0
tests/test_validate_resource_catalog.py +18 -0

docs/resource_catalog.md CHANGED Viewed

@@ -6,7 +6,8 @@ This index points to validated Pashto-related resources tracked in structured fi
 ## Validation method
 - Verify source URL resolves to official page or canonical repository.
-- Verify explicit Pashto support markers (`Pashto`, `ps`, `ps_af`, `pus`, `pbt_Arab`) where possible.
 - Include only resources with practical use for this repository.
 ## Structured catalog

 ## Validation method
 - Verify source URL resolves to official page or canonical repository.
+- Verify explicit Pashto support markers (`Pashto`, `Pukhto`, `Pushto`, `Pakhto`, `پښتو`, `ps`, `ps_af`, `pus`, `pbt_Arab`) where possible.
+- Reject resources where Pashto is only mentioned in passing and the primary work is focused on another language.
 - Include only resources with practical use for this repository.
 ## Structured catalog

docs/resource_cycle_runbook.md CHANGED Viewed

@@ -42,4 +42,6 @@ After discovery, promote only approved resources:
 ## Guardrails
 - Do not auto-promote candidates without evidence and license review.
 - Keep `status: verified` only for reviewed entries.
 - Generated files must be committed after catalog updates.

 ## Guardrails
 - Do not auto-promote candidates without evidence and license review.
 - Keep `status: verified` only for reviewed entries.
+- Do not promote "reference-only" resources where Pashto is incidental; only Pashto-centric resources are eligible.
+- Treat spelling variants as valid Pashto markers during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `پښتو`).
 - Generated files must be committed after catalog updates.

docs/search/resources.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "generated_on": "2026-02-16T00:00:00Z",
-  "count": 77,
   "resources": [
     {
       "id": "dataset-common-voice-ps-v24",
@@ -190,144 +190,6 @@
         "Pashto"
       ]
     },
-    {
-      "id": "model-whisper-large-v3",
-      "title": "Whisper Large v3",
-      "url": "https://huggingface.co/openai/whisper-large-v3",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Strong multilingual ASR baseline suitable for Pashto bootstrapping.",
-      "primary_use": "ASR baseline and pseudo-labeling",
-      "tasks": [
-        "asr"
-      ],
-      "tags": [
-        "pashto",
-        "asr",
-        "whisper"
-      ],
-      "evidence_text": "Whisper tokenizer map includes ps language key.",
-      "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
-      "markers": [
-        "ps"
-      ]
-    },
-    {
-      "id": "model-mms-1b-all",
-      "title": "MMS 1B All",
-      "url": "https://huggingface.co/facebook/mms-1b-all",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Multilingual ASR model from MMS for low-resource transfer.",
-      "primary_use": "ASR transfer baseline",
-      "tasks": [
-        "asr"
-      ],
-      "tags": [
-        "pashto",
-        "asr",
-        "mms"
-      ],
-      "evidence_text": "MMS coverage table includes pus with ASR support.",
-      "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
-      "markers": [
-        "pus"
-      ]
-    },
-    {
-      "id": "model-mms-tts",
-      "title": "MMS TTS",
-      "url": "https://huggingface.co/facebook/mms-tts",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Multilingual TTS checkpoints useful for Pashto voice synthesis.",
-      "primary_use": "TTS baseline and transfer",
-      "tasks": [
-        "tts"
-      ],
-      "tags": [
-        "pashto",
-        "tts",
-        "mms"
-      ],
-      "evidence_text": "MMS coverage table includes pus with TTS support.",
-      "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
-      "markers": [
-        "pus"
-      ]
-    },
-    {
-      "id": "model-nllb-200-distilled-600m",
-      "title": "NLLB-200 Distilled 600M",
-      "url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "General multilingual translation model with Pashto script token support.",
-      "primary_use": "Pashto translation baseline",
-      "tasks": [
-        "mt"
-      ],
-      "tags": [
-        "pashto",
-        "mt",
-        "nllb"
-      ],
-      "evidence_text": "Model special token map includes pbt_Arab.",
-      "evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
-      "markers": [
-        "pbt_Arab"
-      ]
-    },
-    {
-      "id": "model-opus-mt-en-mul",
-      "title": "OPUS MT en-mul",
-      "url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Translation model that can route English into Pashto via multilingual set.",
-      "primary_use": "English to Pashto translation path",
-      "tasks": [
-        "mt"
-      ],
-      "tags": [
-        "pashto",
-        "mt",
-        "opus"
-      ],
-      "evidence_text": "Language list includes pus code.",
-      "evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
-      "markers": [
-        "pus"
-      ]
-    },
-    {
-      "id": "model-opus-mt-mul-en",
-      "title": "OPUS MT mul-en",
-      "url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Translation model for Pashto to English via multilingual encoder.",
-      "primary_use": "Pashto to English translation path",
-      "tasks": [
-        "mt"
-      ],
-      "tags": [
-        "pashto",
-        "mt",
-        "opus"
-      ],
-      "evidence_text": "Language list includes pus code.",
-      "evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
-      "markers": [
-        "pus"
-      ]
-    },
     {
       "id": "model-pashto-bert",
       "title": "PashtoBERT",
@@ -447,150 +309,6 @@
         "pbt_Arab"
       ]
     },
-    {
-      "id": "tool-faster-whisper",
-      "title": "Faster-Whisper",
-      "url": "https://github.com/SYSTRAN/faster-whisper",
-      "category": "tool",
-      "source": "github",
-      "status": "verified",
-      "summary": "Optimized Whisper inference runtime for faster Pashto ASR experiments.",
-      "primary_use": "ASR inference acceleration",
-      "tasks": [
-        "asr"
-      ],
-      "tags": [
-        "pashto",
-        "tooling",
-        "asr"
-      ],
-      "evidence_text": "Whisper tokenizer includes ps and tool runs Whisper models.",
-      "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
-      "markers": [
-        "ps"
-      ]
-    },
-    {
-      "id": "tool-coqui-tts",
-      "title": "Coqui TTS",
-      "url": "https://github.com/coqui-ai/TTS",
-      "category": "tool",
-      "source": "github",
-      "status": "verified",
-      "summary": "Open toolkit for TTS training and inference used for Pashto experiments.",
-      "primary_use": "TTS training and inference",
-      "tasks": [
-        "tts"
-      ],
-      "tags": [
-        "pashto",
-        "tooling",
-        "tts"
-      ],
-      "evidence_text": "Can be paired with Pashto-supporting MMS checkpoints.",
-      "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
-      "markers": [
-        "pus"
-      ]
-    },
-    {
-      "id": "paper-whisper-2212-04356",
-      "title": "Robust Speech Recognition via Large-Scale Weak Supervision",
-      "url": "https://arxiv.org/abs/2212.04356",
-      "category": "paper",
-      "source": "arxiv",
-      "status": "verified",
-      "summary": "Whisper paper used as a foundational ASR reference for Pashto baselines.",
-      "primary_use": "ASR methodology reference",
-      "tasks": [
-        "asr",
-        "research"
-      ],
-      "tags": [
-        "pashto",
-        "paper",
-        "asr"
-      ],
-      "evidence_text": "Paired with tokenizer language map containing ps.",
-      "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
-      "markers": [
-        "ps"
-      ]
-    },
-    {
-      "id": "paper-mms-2305-13516",
-      "title": "Scaling Speech Technology to 1,000+ Languages",
-      "url": "https://arxiv.org/abs/2305.13516",
-      "category": "paper",
-      "source": "arxiv",
-      "status": "verified",
-      "summary": "MMS paper covering multilingual speech scaling and low-resource transfer.",
-      "primary_use": "ASR and TTS transfer reference",
-      "tasks": [
-        "asr",
-        "tts",
-        "research"
-      ],
-      "tags": [
-        "pashto",
-        "paper",
-        "speech"
-      ],
-      "evidence_text": "Coverage table marks pus support in MMS release.",
-      "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
-      "markers": [
-        "pus"
-      ]
-    },
-    {
-      "id": "paper-nllb-2207-04672",
-      "title": "No Language Left Behind",
-      "url": "https://arxiv.org/abs/2207.04672",
-      "category": "paper",
-      "source": "arxiv",
-      "status": "verified",
-      "summary": "NLLB paper supporting multilingual MT strategy for Pashto integration.",
-      "primary_use": "MT research reference",
-      "tasks": [
-        "mt",
-        "research"
-      ],
-      "tags": [
-        "pashto",
-        "paper",
-        "mt"
-      ],
-      "evidence_text": "Model usage in repo references pbt_Arab token support.",
-      "evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
-      "markers": [
-        "pbt_Arab"
-      ]
-    },
-    {
-      "id": "paper-fleurs-2205-12446",
-      "title": "FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech",
-      "url": "https://arxiv.org/abs/2205.12446",
-      "category": "paper",
-      "source": "arxiv",
-      "status": "verified",
-      "summary": "FLEURS benchmark paper supporting multilingual speech evaluation including Pashto.",
-      "primary_use": "Speech benchmark methodology reference",
-      "tasks": [
-        "asr",
-        "benchmarking",
-        "research"
-      ],
-      "tags": [
-        "pashto",
-        "paper",
-        "benchmark"
-      ],
-      "evidence_text": "Dataset implementation includes ps_af language code.",
-      "evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
-      "markers": [
-        "ps_af"
-      ]
-    },
     {
       "id": "dataset-nexdata-99h-pashto-dialogue",
       "title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
@@ -972,32 +690,6 @@
         "asr"
       ]
     },
-    {
-      "id": "code-github-mrychlik-worldly-ocr",
-      "title": "worldly-ocr",
-      "url": "https://github.com/mrychlik/worldly-ocr",
-      "category": "code",
-      "source": "github",
-      "status": "verified",
-      "summary": "Open OCR code project that explicitly includes Pashto among target languages.",
-      "primary_use": "Pashto OCR code reference and experimentation",
-      "tasks": [
-        "ocr",
-        "tooling"
-      ],
-      "tags": [
-        "pashto",
-        "code",
-        "github",
-        "ocr"
-      ],
-      "evidence_text": "Repository description explicitly says OCR for Pashto and Chinese.",
-      "evidence_url": "https://api.github.com/repos/mrychlik/worldly-ocr",
-      "markers": [
-        "Pashto",
-        "OCR"
-      ]
-    },
     {
       "id": "paper-s2-psocr-lmm-pashto",
       "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
@@ -1687,30 +1379,6 @@
         "pashto"
       ]
     },
-    {
-      "id": "project-github-ihyacommunity-khushkhat-extension",
-      "title": "IhyaCommunity/Khushkhat-Extension",
-      "url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
-      "category": "project",
-      "source": "github",
-      "status": "verified",
-      "summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
-      "primary_use": "Interactive Pashto demo and quick qualitative validation",
-      "tasks": [
-        "demo"
-      ],
-      "tags": [
-        "pashto",
-        "project",
-        "github",
-        "demo"
-      ],
-      "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
-      "evidence_url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
-      "markers": [
-        "pashto"
-      ]
-    },
     {
       "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
       "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",

 {
   "generated_on": "2026-02-16T00:00:00Z",
+  "count": 63,
   "resources": [
     {
       "id": "dataset-common-voice-ps-v24",
         "Pashto"
       ]
     },
     {
       "id": "model-pashto-bert",
       "title": "PashtoBERT",
         "pbt_Arab"
       ]
     },
     {
       "id": "dataset-nexdata-99h-pashto-dialogue",
       "title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
         "asr"
       ]
     },
     {
       "id": "paper-s2-psocr-lmm-pashto",
       "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
         "pashto"
       ]
     },
     {
       "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
       "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",

resources/README.md CHANGED Viewed

@@ -4,12 +4,12 @@ Structured, Pashto-focused resource tracking lives in this folder.
 ## Sections
 - Datasets (28): [datasets/README.md](datasets/README.md)
-- Models (18): [models/README.md](models/README.md)
 - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
-- Tools (2): [tools/README.md](tools/README.md)
-- Papers (12): [papers/README.md](papers/README.md)
-- Projects (11): [projects/README.md](projects/README.md)
-- Code (2): [codes/README.md](codes/README.md)
 ## Machine-Readable Catalog
 - Canonical catalog: [catalog/resources.json](catalog/resources.json)
@@ -22,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
 - Run `python scripts/validate_resource_catalog.py` before opening a PR.
 - Run `python scripts/generate_resource_views.py` after catalog changes.
-Verified resource count: `77`

 ## Sections
 - Datasets (28): [datasets/README.md](datasets/README.md)
+- Models (12): [models/README.md](models/README.md)
 - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
+- Tools (0): [tools/README.md](tools/README.md)
+- Papers (8): [papers/README.md](papers/README.md)
+- Projects (10): [projects/README.md](projects/README.md)
+- Code (1): [codes/README.md](codes/README.md)
 ## Machine-Readable Catalog
 - Canonical catalog: [catalog/resources.json](catalog/resources.json)
 - Run `python scripts/validate_resource_catalog.py` before opening a PR.
 - Run `python scripts/generate_resource_views.py` after catalog changes.
+Verified resource count: `63`

resources/catalog/README.md CHANGED Viewed

@@ -12,3 +12,7 @@ This folder holds machine-readable resource data used by docs and GitHub Pages s
 2. Run `python scripts/validate_resource_catalog.py`.
 3. Run `python scripts/generate_resource_views.py`.
 4. Commit both catalog and generated markdown/search files.

 2. Run `python scripts/validate_resource_catalog.py`.
 3. Run `python scripts/generate_resource_views.py`.
 4. Commit both catalog and generated markdown/search files.
+## Promotion guardrail
+- Promote only Pashto-centric resources. Exclude entries where Pashto appears only as a side reference.
+- Accept Pashto naming variants during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `پښتو`).

resources/catalog/pending_candidates.json CHANGED Viewed

The diff for this file is too large to render; see the raw diff.

resources/catalog/resources.json CHANGED Viewed

@@ -206,156 +206,6 @@
         "kaggle"
       ]
     },
-    {
-      "id": "model-whisper-large-v3",
-      "title": "Whisper Large v3",
-      "url": "https://huggingface.co/openai/whisper-large-v3",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Strong multilingual ASR baseline suitable for Pashto bootstrapping.",
-      "primary_use": "ASR baseline and pseudo-labeling",
-      "tasks": [
-        "asr"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Whisper tokenizer map includes ps language key.",
-        "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
-        "markers": [
-          "ps"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "asr",
-        "whisper"
-      ]
-    },
-    {
-      "id": "model-mms-1b-all",
-      "title": "MMS 1B All",
-      "url": "https://huggingface.co/facebook/mms-1b-all",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Multilingual ASR model from MMS for low-resource transfer.",
-      "primary_use": "ASR transfer baseline",
-      "tasks": [
-        "asr"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "MMS coverage table includes pus with ASR support.",
-        "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
-        "markers": [
-          "pus"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "asr",
-        "mms"
-      ]
-    },
-    {
-      "id": "model-mms-tts",
-      "title": "MMS TTS",
-      "url": "https://huggingface.co/facebook/mms-tts",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Multilingual TTS checkpoints useful for Pashto voice synthesis.",
-      "primary_use": "TTS baseline and transfer",
-      "tasks": [
-        "tts"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "MMS coverage table includes pus with TTS support.",
-        "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
-        "markers": [
-          "pus"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "tts",
-        "mms"
-      ]
-    },
-    {
-      "id": "model-nllb-200-distilled-600m",
-      "title": "NLLB-200 Distilled 600M",
-      "url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "General multilingual translation model with Pashto script token support.",
-      "primary_use": "Pashto translation baseline",
-      "tasks": [
-        "mt"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Model special token map includes pbt_Arab.",
-        "evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
-        "markers": [
-          "pbt_Arab"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "mt",
-        "nllb"
-      ]
-    },
-    {
-      "id": "model-opus-mt-en-mul",
-      "title": "OPUS MT en-mul",
-      "url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Translation model that can route English into Pashto via multilingual set.",
-      "primary_use": "English to Pashto translation path",
-      "tasks": [
-        "mt"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Language list includes pus code.",
-        "evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
-        "markers": [
-          "pus"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "mt",
-        "opus"
-      ]
-    },
-    {
-      "id": "model-opus-mt-mul-en",
-      "title": "OPUS MT mul-en",
-      "url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
-      "category": "model",
-      "source": "huggingface",
-      "status": "verified",
-      "summary": "Translation model for Pashto to English via multilingual encoder.",
-      "primary_use": "Pashto to English translation path",
-      "tasks": [
-        "mt"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Language list includes pus code.",
-        "evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
-        "markers": [
-          "pus"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "mt",
-        "opus"
-      ]
-    },
     {
       "id": "model-pashto-bert",
       "title": "PashtoBERT",
@@ -485,162 +335,6 @@
         "mt"
       ]
     },
-    {
-      "id": "tool-faster-whisper",
-      "title": "Faster-Whisper",
-      "url": "https://github.com/SYSTRAN/faster-whisper",
-      "category": "tool",
-      "source": "github",
-      "status": "verified",
-      "summary": "Optimized Whisper inference runtime for faster Pashto ASR experiments.",
-      "primary_use": "ASR inference acceleration",
-      "tasks": [
-        "asr"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Whisper tokenizer includes ps and tool runs Whisper models.",
-        "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
-        "markers": [
-          "ps"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "tooling",
-        "asr"
-      ]
-    },
-    {
-      "id": "tool-coqui-tts",
-      "title": "Coqui TTS",
-      "url": "https://github.com/coqui-ai/TTS",
-      "category": "tool",
-      "source": "github",
-      "status": "verified",
-      "summary": "Open toolkit for TTS training and inference used for Pashto experiments.",
-      "primary_use": "TTS training and inference",
-      "tasks": [
-        "tts"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Can be paired with Pashto-supporting MMS checkpoints.",
-        "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
-        "markers": [
-          "pus"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "tooling",
-        "tts"
-      ]
-    },
-    {
-      "id": "paper-whisper-2212-04356",
-      "title": "Robust Speech Recognition via Large-Scale Weak Supervision",
-      "url": "https://arxiv.org/abs/2212.04356",
-      "category": "paper",
-      "source": "arxiv",
-      "status": "verified",
-      "summary": "Whisper paper used as a foundational ASR reference for Pashto baselines.",
-      "primary_use": "ASR methodology reference",
-      "tasks": [
-        "asr",
-        "research"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Paired with tokenizer language map containing ps.",
-        "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
-        "markers": [
-          "ps"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "paper",
-        "asr"
-      ]
-    },
-    {
-      "id": "paper-mms-2305-13516",
-      "title": "Scaling Speech Technology to 1,000+ Languages",
-      "url": "https://arxiv.org/abs/2305.13516",
-      "category": "paper",
-      "source": "arxiv",
-      "status": "verified",
-      "summary": "MMS paper covering multilingual speech scaling and low-resource transfer.",
-      "primary_use": "ASR and TTS transfer reference",
-      "tasks": [
-        "asr",
-        "tts",
-        "research"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Coverage table marks pus support in MMS release.",
-        "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
-        "markers": [
-          "pus"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "paper",
-        "speech"
-      ]
-    },
-    {
-      "id": "paper-nllb-2207-04672",
-      "title": "No Language Left Behind",
-      "url": "https://arxiv.org/abs/2207.04672",
-      "category": "paper",
-      "source": "arxiv",
-      "status": "verified",
-      "summary": "NLLB paper supporting multilingual MT strategy for Pashto integration.",
-      "primary_use": "MT research reference",
-      "tasks": [
-        "mt",
-        "research"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Model usage in repo references pbt_Arab token support.",
-        "evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
-        "markers": [
-          "pbt_Arab"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "paper",
-        "mt"
-      ]
-    },
-    {
-      "id": "paper-fleurs-2205-12446",
-      "title": "FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech",
-      "url": "https://arxiv.org/abs/2205.12446",
-      "category": "paper",
-      "source": "arxiv",
-      "status": "verified",
-      "summary": "FLEURS benchmark paper supporting multilingual speech evaluation including Pashto.",
-      "primary_use": "Speech benchmark methodology reference",
-      "tasks": [
-        "asr",
-        "benchmarking",
-        "research"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Dataset implementation includes ps_af language code.",
-        "evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
-        "markers": [
-          "ps_af"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "paper",
-        "benchmark"
-      ]
-    },
     {
       "id": "dataset-nexdata-99h-pashto-dialogue",
       "title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
@@ -1061,34 +755,6 @@
         "asr"
       ]
     },
-    {
-      "id": "code-github-mrychlik-worldly-ocr",
-      "title": "worldly-ocr",
-      "url": "https://github.com/mrychlik/worldly-ocr",
-      "category": "code",
-      "source": "github",
-      "status": "verified",
-      "summary": "Open OCR code project that explicitly includes Pashto among target languages.",
-      "primary_use": "Pashto OCR code reference and experimentation",
-      "tasks": [
-        "ocr",
-        "tooling"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Repository description explicitly says OCR for Pashto and Chinese.",
-        "evidence_url": "https://api.github.com/repos/mrychlik/worldly-ocr",
-        "markers": [
-          "Pashto",
-          "OCR"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "code",
-        "github",
-        "ocr"
-      ]
-    },
     {
       "id": "paper-s2-psocr-lmm-pashto",
       "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
@@ -1837,32 +1503,6 @@
         "demo"
       ]
     },
-    {
-      "id": "project-github-ihyacommunity-khushkhat-extension",
-      "title": "IhyaCommunity/Khushkhat-Extension",
-      "url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
-      "category": "project",
-      "source": "github",
-      "status": "verified",
-      "summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
-      "primary_use": "Interactive Pashto demo and quick qualitative validation",
-      "tasks": [
-        "demo"
-      ],
-      "pashto_evidence": {
-        "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
-        "evidence_url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
-        "markers": [
-          "pashto"
-        ]
-      },
-      "tags": [
-        "pashto",
-        "project",
-        "github",
-        "demo"
-      ]
-    },
     {
       "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
       "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",

         "kaggle"
       ]
     },
     {
       "id": "model-pashto-bert",
       "title": "PashtoBERT",
         "mt"
       ]
     },
     {
       "id": "dataset-nexdata-99h-pashto-dialogue",
       "title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
         "asr"
       ]
     },
     {
       "id": "paper-s2-psocr-lmm-pashto",
       "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
         "demo"
       ]
     },
     {
       "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
       "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",

resources/models/README.md CHANGED Viewed

@@ -11,15 +11,11 @@
 | ihanif/whisper-small-pashto | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto) | Pashto ASR baseline and model comparison |
 | ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
 | ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
-| MMS 1B All | [huggingface](https://huggingface.co/facebook/mms-1b-all) | [MMS coverage table includes pus with ASR support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR transfer baseline |
-| MMS TTS | [huggingface](https://huggingface.co/facebook/mms-tts) | [MMS coverage table includes pus with TTS support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS baseline and transfer |
-| NLLB-200 Distilled 600M | [huggingface](https://huggingface.co/facebook/nllb-200-distilled-600M) | [Model special token map includes pbt_Arab. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | Pashto translation baseline |
-| OPUS MT en-mul | [huggingface](https://huggingface.co/Helsinki-NLP/opus-mt-en-mul) | [Language list includes pus code. (`pus`)](https://huggingface.co/Helsinki-NLP/opus-mt-en-mul) | English to Pashto translation path |
-| OPUS MT mul-en | [huggingface](https://huggingface.co/Helsinki-NLP/opus-mt-mul-en) | [Language list includes pus code. (`pus`)](https://huggingface.co/Helsinki-NLP/opus-mt-mul-en) | Pashto to English translation path |
 | PashtoBERT | [huggingface](https://huggingface.co/mdarhri/pashto-bert) | [Model card states training on Pashto corpus data. (`Pashto`)](https://huggingface.co/mdarhri/pashto-bert) | Pashto NLP baseline encoder |
 | wav2vec2 XLS-R 300M Pashto | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | [Model tags include pashto and ps, and model index references FLEURS config ps_af. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | Pashto ASR baseline and comparative experiments |
-| Whisper Large v3 | [huggingface](https://huggingface.co/openai/whisper-large-v3) | [Whisper tokenizer map includes ps language key. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR baseline and pseudo-labeling |
 | Whisper Medium Pashto | [huggingface](https://huggingface.co/ihanif/whisper-medium-pashto) | [Model tags include pashto and ps, and model index uses FLEURS ps_af split. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/whisper-medium-pashto) | Pashto ASR baseline and transcription quality comparisons |
 ## Maintenance
 - Source of truth: [../catalog/resources.json](../catalog/resources.json)

 | ihanif/whisper-small-pashto | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto) | Pashto ASR baseline and model comparison |
 | ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
 | ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
 | PashtoBERT | [huggingface](https://huggingface.co/mdarhri/pashto-bert) | [Model card states training on Pashto corpus data. (`Pashto`)](https://huggingface.co/mdarhri/pashto-bert) | Pashto NLP baseline encoder |
 | wav2vec2 XLS-R 300M Pashto | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | [Model tags include pashto and ps, and model index references FLEURS config ps_af. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | Pashto ASR baseline and comparative experiments |
+| Whisper Base Pashto | [huggingface](https://huggingface.co/ihanif/whisper-base-pashto) | [Model ID includes Pashto and card metadata references FLEURS config ps_af. (`Pashto`, `ps_af`)](https://huggingface.co/api/models/ihanif/whisper-base-pashto) | Pashto ASR baseline and speed-accuracy comparison |
 | Whisper Medium Pashto | [huggingface](https://huggingface.co/ihanif/whisper-medium-pashto) | [Model tags include pashto and ps, and model index uses FLEURS ps_af split. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/whisper-medium-pashto) | Pashto ASR baseline and transcription quality comparisons |
+| zirak-ai/pashto-bert-v1 | [huggingface](https://huggingface.co/zirak-ai/pashto-bert-v1) | [Hugging Face model ID and search tags explicitly include pashto marker. (`pashto`)](https://huggingface.co/zirak-ai/pashto-bert-v1) | Pashto encoder baseline for NLP tasks |
 ## Maintenance
 - Source of truth: [../catalog/resources.json](../catalog/resources.json)

resources/papers/README.md CHANGED Viewed

@@ -7,15 +7,11 @@
 | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
 | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
 | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
-| FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech | [arxiv](https://arxiv.org/abs/2205.12446) | [Dataset implementation includes ps_af language code. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark methodology reference |
 | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
-| No Language Left Behind | [arxiv](https://arxiv.org/abs/2207.04672) | [Model usage in repo references pbt_Arab token support. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | MT research reference |
 | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
 | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
 | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
 | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
-| Robust Speech Recognition via Large-Scale Weak Supervision | [arxiv](https://arxiv.org/abs/2212.04356) | [Paired with tokenizer language map containing ps. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR methodology reference |
-| Scaling Speech Technology to 1,000+ Languages | [arxiv](https://arxiv.org/abs/2305.13516) | [Coverage table marks pus support in MMS release. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR and TTS transfer reference |
 ## Maintenance
 - Source of truth: [../catalog/resources.json](../catalog/resources.json)

 | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
 | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
 | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
 | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
 | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
 | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
 | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
 | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
 ## Maintenance
 - Source of truth: [../catalog/resources.json](../catalog/resources.json)

resources/projects/README.md CHANGED Viewed

@@ -6,7 +6,6 @@
 |---|---|---|---|
 | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
 | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
-| IhyaCommunity/Khushkhat-Extension | [github](https://github.com/IhyaCommunity/Khushkhat-Extension) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/IhyaCommunity/Khushkhat-Extension) | Interactive Pashto demo and quick qualitative validation |
 | nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
 | Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
 | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |

 |---|---|---|---|
 | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
 | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
 | nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
 | Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
 | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |

resources/tools/README.md CHANGED Viewed

@@ -4,8 +4,7 @@
 | Resource | Link | Pashto Evidence | Primary Use |
 |---|---|---|---|
-| Coqui TTS | [github](https://github.com/coqui-ai/TTS) | [Can be paired with Pashto-supporting MMS checkpoints. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS training and inference |
-| Faster-Whisper | [github](https://github.com/SYSTRAN/faster-whisper) | [Whisper tokenizer includes ps and tool runs Whisper models. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR inference acceleration |
 ## Maintenance
 - Source of truth: [../catalog/resources.json](../catalog/resources.json)

 | Resource | Link | Pashto Evidence | Primary Use |
 |---|---|---|---|
+| _None yet_ | - | - | - |
 ## Maintenance
 - Source of truth: [../catalog/resources.json](../catalog/resources.json)

scripts/sync_resources.py CHANGED Viewed

@@ -30,6 +30,19 @@ from urllib.error import HTTPError, URLError
 USER_AGENT = "pashto-resource-sync/1.0"
 MAX_FETCH_RETRIES = 4
 RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
 def _slug(value: str) -> str:
@@ -39,6 +52,28 @@ def _slug(value: str) -> str:
     return value[:80] if value else "resource"
 def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
     if not retry_after:
         return None
@@ -201,16 +236,35 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
     if kind not in {"datasets", "models"}:
         return []
-    query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
-    url = f"https://huggingface.co/api/{kind}?{query}"
-    payload = _fetch_json(url, source_name=f"huggingface-{kind}")
     category = "dataset" if kind == "datasets" else "model"
     out: list[dict[str, Any]] = []
-    for item in payload:
         repo_id = item.get("id") or item.get("modelId")
         if not repo_id:
             continue
         repo_url = f"https://huggingface.co/{'datasets/' if kind == 'datasets' else ''}{repo_id}"
         rid = f"candidate-hf-{kind[:-1]}-{_slug(repo_id)}"
         out.append(
@@ -227,19 +281,40 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
                 tags=["pashto", "candidate", category],
             )
         )
     return out
 def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
-    query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
-    url = f"https://huggingface.co/api/spaces?{query}"
-    payload = _fetch_json(url, source_name="huggingface-spaces")
     out: list[dict[str, Any]] = []
-    for item in payload:
         space_id = item.get("id")
         if not space_id:
             continue
         space_url = f"https://huggingface.co/spaces/{space_id}"
         rid = f"candidate-hf-project-{_slug(space_id)}"
         summary = "Candidate project app returned from Hugging Face Spaces Pashto search."
@@ -257,17 +332,36 @@ def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
                 tags=["pashto", "candidate", "project", "space"],
             )
         )
     return out
 def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
     # Public Kaggle dataset listing endpoint (no auth needed for list responses).
-    query = urllib.parse.urlencode({"search": "pashto", "page": "1"})
-    url = f"https://www.kaggle.com/api/v1/datasets/list?{query}"
-    payload = _fetch_json(url, source_name="kaggle-datasets")
     out: list[dict[str, Any]] = []
-    for item in payload:
         title = (item.get("titleNullable") or "").strip()
         dataset_url = (item.get("urlNullable") or "").strip()
         owner = (item.get("ownerRefNullable") or "").strip()
@@ -275,8 +369,9 @@ def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
         if not title or not dataset_url:
             continue
-        blob = f"{title} {subtitle}".lower()
-        if "pashto" not in blob and "pukhto" not in blob:
             continue
         owner_prefix = f"{owner}/" if owner else ""
@@ -304,7 +399,11 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
     # Query by topic first for high precision, then by keyword for recall.
     query_variants = [
         "topic:pashto",
         "pashto in:name,description,readme",
     ]
     combined: dict[str, dict[str, Any]] = {}
@@ -334,8 +433,10 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
                 item.get("description") or "",
                 " ".join(item.get("topics") or []),
             ]
-        ).lower()
-        if "pashto" not in name_blob and "pukhto" not in name_blob:
             continue
         html_url = item["html_url"]
@@ -367,70 +468,110 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
 def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
-    query = urllib.parse.urlencode(
-        {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
-    )
-    url = f"https://export.arxiv.org/api/query?{query}"
-    try:
-        xml_text = _fetch_text(url, timeout=30.0, source_name="arxiv")
-    except Exception as exc:  # noqa: BLE001
-        if not _is_ssl_cert_error(exc):
-            raise
-        # arXiv occasionally fails cert chain validation in some runner images.
-        insecure_context = ssl._create_unverified_context()
-        print("[warn] arxiv SSL verification failed; retrying with unverified TLS context")
-        xml_text = _fetch_text(
-            url,
-            timeout=30.0,
-            ssl_context=insecure_context,
-            source_name="arxiv",
         )
-    root = ET.fromstring(xml_text)
     ns = {"atom": "http://www.w3.org/2005/Atom"}
     out: list[dict[str, Any]] = []
-    for entry in root.findall("atom:entry", ns):
-        title = (entry.findtext("atom:title", default="", namespaces=ns) or "").strip()
-        link = (entry.findtext("atom:id", default="", namespaces=ns) or "").strip()
-        summary = (entry.findtext("atom:summary", default="", namespaces=ns) or "").strip()
-        if not title or not link:
-            continue
-        rid = f"candidate-arxiv-{_slug(title)}"
-        out.append(
-            _candidate(
-                rid=rid,
-                title=title,
-                url=link,
-                category="paper",
-                source="arxiv",
-                summary=summary[:240] if summary else "Candidate paper returned from arXiv query for Pashto.",
-                evidence_text="Matched by arXiv query: all:pashto.",
-                evidence_url=link,
-                markers=["pashto"],
-                tags=["pashto", "candidate", "paper"],
             )
-        )
     return out
 def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
     fields = "title,url,abstract,year,externalIds"
-    query = urllib.parse.urlencode(
-        {"query": "pashto", "limit": str(limit), "fields": fields}
-    )
-    url = f"https://api.semanticscholar.org/graph/v1/paper/search?{query}"
-    payload = _fetch_json(
-        url,
-        timeout=30.0,
-        source_name="semantic-scholar",
-    )
     out: list[dict[str, Any]] = []
-    for item in payload.get("data", []):
         title = (item.get("title") or "").strip()
         if not title:
             continue
         paper_url = (item.get("url") or "").strip()
         if not paper_url:
             ext = item.get("externalIds") or {}
@@ -450,12 +591,14 @@ def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
                 category="paper",
                 source="other",
                 summary=summary[:240] if summary else "Candidate paper returned from Semantic Scholar search for Pashto.",
-                evidence_text="Matched by Semantic Scholar query: pashto.",
                 evidence_url=paper_url,
                 markers=["pashto"],
                 tags=["pashto", "candidate", "paper"],
             )
         )
     return out

 USER_AGENT = "pashto-resource-sync/1.0"
 MAX_FETCH_RETRIES = 4
 RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
+PASHTO_QUERY_TERMS = ["pashto", "pukhto", "pushto", "pakhto"]  # Romanized spellings; the fetchers issue one search per term.
+PASHTO_TEXT_MARKERS = ("pashto", "pukhto", "pushto", "pakhto")  # NOTE(review): not referenced in the visible hunks — confirm it is used.
+PASHTO_SCRIPT_MARKERS = ("پښتو", "پشتو")  # Native-script spellings, matched as plain substrings.
+PASHTO_WORD_RE = re.compile(  # Romanized spelling with no ASCII letter or digit directly on either side.
+    r"(?<![A-Za-z0-9])(pashto|pukhto|pushto|pakhto)(?![A-Za-z0-9])",
+    re.IGNORECASE,
+)
+PASHTO_CAMEL_RE = re.compile(  # Spelling directly followed by a letter, e.g. "PashtoBERT"; "pushto" is not in this list.
+    r"(?<![A-Za-z0-9])(pashto|pukhto|pakhto)(?=[A-Z])",
+    re.IGNORECASE,  # NOTE(review): IGNORECASE also applies to [A-Z], so a lowercase letter after the spelling matches too — confirm intent.
+)
+PASHTO_CODE_RE = re.compile(r"\b(ps(_af)?|pus|pbt[_-]?arab)\b", re.IGNORECASE)  # Language codes; NOTE(review): a bare "ps" or "pus" word also matches — confirm this is acceptable.
+LOW_SIGNAL_RE = re.compile(r"(^|[-_/ ])(test|tmp|trial|scratch)([-_/ ]|$)", re.IGNORECASE)  # test/tmp/trial/scratch as a whole segment delimited by "-", "_", "/", space, or the string edges.
 def _slug(value: str) -> str:
     return value[:80] if value else "resource"
+def _contains_pashto_marker(value: str) -> bool:  # True when value carries a Pashto spelling, native-script form, or language code.
+    text = (value or "").strip()  # A falsy value (None, "") is treated as empty text.
+    if not text:
+        return False
+    if PASHTO_WORD_RE.search(text):  # Romanized spelling standing on its own.
+        return True
+    if PASHTO_CAMEL_RE.search(text):  # Romanized spelling fused to following letters.
+        return True
+    if any(marker in text for marker in PASHTO_SCRIPT_MARKERS):  # Native-script substring.
+        return True
+    lowered = text.casefold()
+    return bool(PASHTO_CODE_RE.search(lowered))  # Final check: language codes such as ps, ps_af, pus.
+def _is_pashto_centric(*values: str) -> bool:  # True if at least one of the given values contains a Pashto marker.
+    return any(_contains_pashto_marker(value) for value in values)  # NOTE(review): tests marker presence only, not that Pashto is the primary focus — confirm this meets the policy.
+def _is_low_signal_name(value: str) -> bool:  # True when LOW_SIGNAL_RE finds a match anywhere in the name.
+    return bool(LOW_SIGNAL_RE.search(value or ""))  # A falsy value (None, "") is searched as an empty string.
 def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
     if not retry_after:
         return None
     if kind not in {"datasets", "models"}:
         return []
+    combined: dict[str, dict[str, Any]] = {}
+    errors: list[str] = []
+    for term in PASHTO_QUERY_TERMS:
+        query = urllib.parse.urlencode({"search": term, "limit": str(limit)})
+        url = f"https://huggingface.co/api/{kind}?{query}"
+        try:
+            payload = _fetch_json(url, source_name=f"huggingface-{kind}")
+        except Exception as exc:  # noqa: BLE001
+            errors.append(f"{term}: {exc}")
+            continue
+        for item in payload:
+            repo_id = item.get("id") or item.get("modelId")
+            if not repo_id:
+                continue
+            combined[repo_id] = item
+    if not combined and errors:
+        raise RuntimeError("; ".join(errors))
     category = "dataset" if kind == "datasets" else "model"
     out: list[dict[str, Any]] = []
+    for item in combined.values():
         repo_id = item.get("id") or item.get("modelId")
         if not repo_id:
             continue
+        if not _is_pashto_centric(repo_id):
+            continue
+        if _is_low_signal_name(repo_id):
+            continue
         repo_url = f"https://huggingface.co/{'datasets/' if kind == 'datasets' else ''}{repo_id}"
         rid = f"candidate-hf-{kind[:-1]}-{_slug(repo_id)}"
         out.append(
                 tags=["pashto", "candidate", category],
             )
         )
+        if len(out) >= limit:
+            break
     return out
 def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
+    combined: dict[str, dict[str, Any]] = {}
+    errors: list[str] = []
+    for term in PASHTO_QUERY_TERMS:
+        query = urllib.parse.urlencode({"search": term, "limit": str(limit)})
+        url = f"https://huggingface.co/api/spaces?{query}"
+        try:
+            payload = _fetch_json(url, source_name="huggingface-spaces")
+        except Exception as exc:  # noqa: BLE001
+            errors.append(f"{term}: {exc}")
+            continue
+        for item in payload:
+            space_id = item.get("id")
+            if not space_id:
+                continue
+            combined[space_id] = item
+    if not combined and errors:
+        raise RuntimeError("; ".join(errors))
     out: list[dict[str, Any]] = []
+    for item in combined.values():
         space_id = item.get("id")
         if not space_id:
             continue
+        if not _is_pashto_centric(space_id):
+            continue
+        if _is_low_signal_name(space_id):
+            continue
         space_url = f"https://huggingface.co/spaces/{space_id}"
         rid = f"candidate-hf-project-{_slug(space_id)}"
         summary = "Candidate project app returned from Hugging Face Spaces Pashto search."
                 tags=["pashto", "candidate", "project", "space"],
             )
         )
+        if len(out) >= limit:
+            break
     return out
 def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
     # Public Kaggle dataset listing endpoint (no auth needed for list responses).
+    combined: list[dict[str, Any]] = []
+    seen_urls: set[str] = set()
+    errors: list[str] = []
+    for term in PASHTO_QUERY_TERMS:
+        query = urllib.parse.urlencode({"search": term, "page": "1"})
+        url = f"https://www.kaggle.com/api/v1/datasets/list?{query}"
+        try:
+            payload = _fetch_json(url, source_name="kaggle-datasets")
+        except Exception as exc:  # noqa: BLE001
+            errors.append(f"{term}: {exc}")
+            continue
+        for item in payload:
+            dataset_url = (item.get("urlNullable") or "").strip()
+            if not dataset_url or dataset_url in seen_urls:
+                continue
+            seen_urls.add(dataset_url)
+            combined.append(item)
+    if not combined and errors:
+        raise RuntimeError("; ".join(errors))
     out: list[dict[str, Any]] = []
+    for item in combined:
         title = (item.get("titleNullable") or "").strip()
         dataset_url = (item.get("urlNullable") or "").strip()
         owner = (item.get("ownerRefNullable") or "").strip()
         if not title or not dataset_url:
             continue
+        if not _is_pashto_centric(title, subtitle):
+            continue
+        if _is_low_signal_name(title):
             continue
         owner_prefix = f"{owner}/" if owner else ""
     # Query by topic first for high precision, then by keyword for recall.
     query_variants = [
         "topic:pashto",
+        "topic:pukhto",
         "pashto in:name,description,readme",
+        "pukhto in:name,description,readme",
+        "pushto in:name,description,readme",
+        "pakhto in:name,description,readme",
     ]
     combined: dict[str, dict[str, Any]] = {}
                 item.get("description") or "",
                 " ".join(item.get("topics") or []),
             ]
+        )
+        if not _is_pashto_centric(name_blob):
+            continue
+        if _is_low_signal_name(full_name):
             continue
         html_url = item["html_url"]
 def fetch_arxiv(limit: int) -> list[dict[str, Any]]:  # Query arXiv once per Pashto search term; return at most `limit` candidates whose title has a Pashto marker.
+    roots: list[ET.Element] = []
+    errors: list[str] = []
+    for term in PASHTO_QUERY_TERMS:
+        query = urllib.parse.urlencode(
+            {"search_query": f"all:{term}", "start": "0", "max_results": str(limit)}
         )
+        url = f"https://export.arxiv.org/api/query?{query}"
+        try:
+            xml_text = _fetch_text(url, timeout=30.0, source_name="arxiv")
+        except Exception as exc:  # noqa: BLE001
+            if not _is_ssl_cert_error(exc):
+                errors.append(f"{term}: {exc}")  # Record the failure and move on to the next term.
+                continue
+            # arXiv occasionally fails cert chain validation in some runner images.
+            insecure_context = ssl._create_unverified_context()  # NOTE(review): private ssl helper that skips certificate verification — confirm this is acceptable.
+            print("[warn] arxiv SSL verification failed; retrying with unverified TLS context")
+            xml_text = _fetch_text(  # NOTE(review): this retry runs outside the try, so a second failure propagates instead of landing in errors.
+                url,
+                timeout=30.0,
+                ssl_context=insecure_context,
+                source_name="arxiv",
+            )
+        roots.append(ET.fromstring(xml_text))  # NOTE(review): a parse failure here is likewise not collected in errors.
+    if not roots and errors:  # Raise only when no term produced a parsed response.
+        raise RuntimeError("; ".join(errors))
     ns = {"atom": "http://www.w3.org/2005/Atom"}
+    seen_links: set[str] = set()  # Entry ids already emitted, for de-duplication across terms.
     out: list[dict[str, Any]] = []
+    for root in roots:
+        for entry in root.findall("atom:entry", ns):
+            title = (entry.findtext("atom:title", default="", namespaces=ns) or "").strip()
+            link = (entry.findtext("atom:id", default="", namespaces=ns) or "").strip()
+            summary = (entry.findtext("atom:summary", default="", namespaces=ns) or "").strip()
+            if not title or not link:
+                continue
+            if link in seen_links:  # Already emitted from an earlier term's results.
+                continue
+            # Strict: keep only papers with explicit Pashto markers in title.
+            if not _is_pashto_centric(title):
+                continue
+            if _is_low_signal_name(title):
+                continue
+            seen_links.add(link)  # Marked as seen only after the entry passes both filters.
+            rid = f"candidate-arxiv-{_slug(title)}"
+            out.append(
+                _candidate(
+                    rid=rid,
+                    title=title,
+                    url=link,
+                    category="paper",
+                    source="arxiv",
+                    summary=summary[:240] if summary else "Candidate paper returned from arXiv query for Pashto.",
+                    evidence_text="Matched by Pashto marker in paper title from arXiv query results.",
+                    evidence_url=link,
+                    markers=["pashto"],
+                    tags=["pashto", "candidate", "paper"],
+                )
             )
+            if len(out) >= limit:  # Stop as soon as the cap is reached.
+                return out
     return out
 def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
     fields = "title,url,abstract,year,externalIds"
+    combined: dict[str, dict[str, Any]] = {}
+    errors: list[str] = []
+    for term in PASHTO_QUERY_TERMS:
+        query = urllib.parse.urlencode(
+            {"query": term, "limit": str(limit), "fields": fields}
+        )
+        url = f"https://api.semanticscholar.org/graph/v1/paper/search?{query}"
+        try:
+            payload = _fetch_json(
+                url,
+                timeout=30.0,
+                source_name="semantic-scholar",
+            )
+        except Exception as exc:  # noqa: BLE001
+            errors.append(f"{term}: {exc}")
+            continue
+        for item in payload.get("data", []):
+            title = (item.get("title") or "").strip()
+            if not title:
+                continue
+            combined[title] = item
+    if not combined and errors:
+        raise RuntimeError("; ".join(errors))
     out: list[dict[str, Any]] = []
+    for item in combined.values():
         title = (item.get("title") or "").strip()
         if not title:
             continue
+        # Strict: keep only papers with explicit Pashto markers in title.
+        if not _is_pashto_centric(title):
+            continue
+        if _is_low_signal_name(title):
+            continue
         paper_url = (item.get("url") or "").strip()
         if not paper_url:
             ext = item.get("externalIds") or {}
                 category="paper",
                 source="other",
                 summary=summary[:240] if summary else "Candidate paper returned from Semantic Scholar search for Pashto.",
+                evidence_text="Matched by explicit Pashto marker in paper title from Semantic Scholar search.",
                 evidence_url=paper_url,
                 markers=["pashto"],
                 tags=["pashto", "candidate", "paper"],
             )
         )
+        if len(out) >= limit:
+            break
     return out

scripts/validate_resource_catalog.py CHANGED Viewed

@@ -20,6 +20,9 @@ ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project
 ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
 ALLOWED_STATUS = {"verified", "candidate"}
 RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
 def _load_json(path: Path) -> dict[str, Any]:
@@ -39,6 +42,19 @@ def _validate_iso_date(value: str) -> bool:
     return True
 def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
     errors: list[str] = []
     prefix = f"resource[{index}]"
@@ -123,6 +139,14 @@ def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
     if not (isinstance(markers, list) and markers and all(isinstance(marker, str) and marker.strip() for marker in markers)):
         errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")
     return errors

 ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
 ALLOWED_STATUS = {"verified", "candidate"}
 RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
+# Categories for which validate_resource requires a Pashto marker in the title or URL.
+STRICT_PASHTO_CATEGORIES = {"model", "paper", "tool", "code", "project"}
+# "pushto" as a standalone token: must not touch a lowercase ASCII letter or digit
+# on either side. No IGNORECASE flag; _contains_pashto_marker searches case-folded text.
+PASHTO_PUSHTO_WORD_RE = re.compile(r"(?<![a-z0-9])pushto(?![a-z0-9])")
+# Language/locale codes: ps, ps_af, pus, and pbt_arab / pbt-arab / pbtarab.
+# \b counts "_" as a word character, so a code glued to a neighbour by "_" does not match.
+PASHTO_CODE_RE = re.compile(r"\b(ps(_af)?|pus|pbt[_-]?arab)\b")
 def _load_json(path: Path) -> dict[str, Any]:
     return True
+def _contains_pashto_marker(value: str) -> bool:
+    """Return True when *value* carries an explicit Pashto marker.
+
+    Non-string input is reported as having no marker. Latin-script checks
+    run on the case-folded text; the Pashto-script spellings are looked up
+    in the text as given.
+    """
+    if not isinstance(value, str):
+        return False
+    folded = value.casefold()
+    # Romanised spellings accepted anywhere, even inside a longer token.
+    for spelling in ("pashto", "pukhto", "pakhto"):
+        if spelling in folded:
+            return True
+    # "pushto" and the language codes only count as standalone tokens.
+    if PASHTO_PUSHTO_WORD_RE.search(folded) or PASHTO_CODE_RE.search(folded):
+        return True
+    for native in ("پښتو", "پشتو"):
+        if native in value:
+            return True
+    return False
 def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
     errors: list[str] = []
     prefix = f"resource[{index}]"
     if not (isinstance(markers, list) and markers and all(isinstance(marker, str) and marker.strip() for marker in markers)):
         errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")
+    if category in STRICT_PASHTO_CATEGORIES and not (
+        _contains_pashto_marker(title) or _contains_pashto_marker(url)
+    ):
+        errors.append(
+            f"{prefix} must be Pashto-centric for category '{category}' "
+            "(include a Pashto marker in title or URL)"
+        )
     return errors

tests/test_validate_resource_catalog.py CHANGED Viewed

@@ -43,3 +43,21 @@ def test_validate_catalog_fails_for_invalid_evidence_url() -> None:
     catalog["resources"][0]["pashto_evidence"]["evidence_url"] = "not-a-url"
     errors = validate_catalog(catalog)
     assert any("evidence_url" in error for error in errors)

     catalog["resources"][0]["pashto_evidence"]["evidence_url"] = "not-a-url"
     errors = validate_catalog(catalog)
     assert any("evidence_url" in error for error in errors)
+def test_validate_catalog_fails_for_non_pashto_centric_model() -> None:
+    """A model whose title and URL lack any Pashto marker must be rejected."""
+    catalog = _minimal_catalog()
+    entry = catalog["resources"][0]
+    entry["category"] = "model"
+    entry["title"] = "Generic Multilingual Model"
+    entry["url"] = "https://example.org/model"
+    problems = validate_catalog(catalog)
+    assert any("must be Pashto-centric" in problem for problem in problems)
+def test_validate_catalog_allows_pashto_centric_model() -> None:
+    """A model with a Pashto marker in its title and URL validates cleanly."""
+    catalog = _minimal_catalog()
+    entry = catalog["resources"][0]
+    entry["category"] = "model"
+    entry["title"] = "Pashto ASR Model"
+    entry["url"] = "https://example.org/pashto-model"
+    assert validate_catalog(catalog) == []