Expand resource cycle for projects/code and promote new Pashto sources

Files changed (17)

README.md +2 -0
docs/resource_automation.md +8 -0
docs/resource_catalog.md +2 -0
docs/resource_cycle_runbook.md +2 -0
docs/search/resources.json +179 -1
resources/README.md +4 -2
resources/catalog/pending_candidates.json +0 -0
resources/catalog/resource.template.json +1 -1
resources/catalog/resources.json +195 -0
resources/codes/README.md +12 -0
resources/datasets/README.md +3 -0
resources/projects/README.md +14 -0
resources/schema/resource.schema.json +3 -1
scripts/README.md +1 -1
scripts/generate_resource_views.py +4 -0
scripts/sync_resources.py +135 -0
scripts/validate_resource_catalog.py +1 -1

README.md CHANGED Viewed

@@ -89,6 +89,8 @@ python -m pytest -q
 - Benchmarks: [resources/benchmarks/README.md](resources/benchmarks/README.md)
 - Tools: [resources/tools/README.md](resources/tools/README.md)
 - Papers: [resources/papers/README.md](resources/papers/README.md)
 ## Workspaces
 - [data/](data/README.md): datasets, curation, metadata, quality

 - Benchmarks: [resources/benchmarks/README.md](resources/benchmarks/README.md)
 - Tools: [resources/tools/README.md](resources/tools/README.md)
 - Papers: [resources/papers/README.md](resources/papers/README.md)
+- Projects: [resources/projects/README.md](resources/projects/README.md)
+- Code: [resources/codes/README.md](resources/codes/README.md)
 ## Workspaces
 - [data/](data/README.md): datasets, curation, metadata, quality

docs/resource_automation.md CHANGED Viewed

@@ -7,6 +7,14 @@ This repository uses a semi-automated process to keep Pashto resources current w
 - Keep a machine-readable canonical catalog.
 - Prevent unreviewed low-confidence resources from directly entering verified lists.
 ## Files involved
 - Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
 - Candidate feed: [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json)

 - Keep a machine-readable canonical catalog.
 - Prevent unreviewed low-confidence resources from directly entering verified lists.
+## Covered source types
+- Kaggle datasets
+- Hugging Face datasets
+- Hugging Face models
+- Hugging Face Spaces (projects)
+- GitHub repositories (projects and code)
+- Research-paper endpoints
 ## Files involved
 - Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
 - Candidate feed: [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json)

docs/resource_catalog.md CHANGED Viewed

@@ -20,6 +20,8 @@ This index points to validated Pashto-related resources tracked in structured fi
 - Benchmarks: [../resources/benchmarks/README.md](../resources/benchmarks/README.md)
 - Tools: [../resources/tools/README.md](../resources/tools/README.md)
 - Papers: [../resources/papers/README.md](../resources/papers/README.md)
 ## Search page
 - GitHub Pages search UI: [search/index.html](search/index.html)

 - Benchmarks: [../resources/benchmarks/README.md](../resources/benchmarks/README.md)
 - Tools: [../resources/tools/README.md](../resources/tools/README.md)
 - Papers: [../resources/papers/README.md](../resources/papers/README.md)
+- Projects: [../resources/projects/README.md](../resources/projects/README.md)
+- Code: [../resources/codes/README.md](../resources/codes/README.md)
 ## Search page
 - GitHub Pages search UI: [search/index.html](search/index.html)

docs/resource_cycle_runbook.md CHANGED Viewed

@@ -21,6 +21,8 @@ What it executes:
 4. `python scripts/check_links.py`
 5. `python -m pytest -q`
 ## Discovery-only mode
 If you only want fresh candidates:

 4. `python scripts/check_links.py`
 5. `python -m pytest -q`
+Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, and paper endpoints.
 ## Discovery-only mode
 If you only want fresh candidates:

docs/search/resources.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "generated_on": "2026-02-15T00:00:00Z",
-  "count": 30,
   "resources": [
     {
       "id": "dataset-common-voice-ps-v24",
@@ -717,6 +717,184 @@
         "ps",
         "ps_af"
       ]
     }
   ]
 }

 {
   "generated_on": "2026-02-15T00:00:00Z",
+  "count": 37,
   "resources": [
     {
       "id": "dataset-common-voice-ps-v24",
         "ps",
         "ps_af"
       ]
+    },
+    {
+      "id": "dataset-kaggle-pold-pashto-offensive",
+      "title": "POLD - Pashto Offensive Language Dataset",
+      "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
+      "category": "dataset",
+      "source": "kaggle",
+      "status": "verified",
+      "summary": "Benchmark dataset for offensive content detection in Pashto social text.",
+      "primary_use": "Pashto toxicity and moderation NLP benchmarks",
+      "tasks": [
+        "nlp",
+        "classification"
+      ],
+      "tags": [
+        "pashto",
+        "kaggle",
+        "nlp",
+        "toxicity"
+      ],
+      "evidence_text": "Kaggle title and description explicitly state Pashto offensive language benchmark dataset.",
+      "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset",
+      "markers": [
+        "Pashto"
+      ]
+    },
+    {
+      "id": "dataset-kaggle-pashto-english-sentiment-corpus",
+      "title": "Pashto English Bilingual Sentiment Corpus",
+      "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
+      "category": "dataset",
+      "source": "kaggle",
+      "status": "verified",
+      "summary": "Pashto to English bilingual sentiment corpus useful for low-resource sentiment tasks.",
+      "primary_use": "Sentiment analysis and bilingual NLP experiments",
+      "tasks": [
+        "nlp",
+        "sentiment"
+      ],
+      "tags": [
+        "pashto",
+        "kaggle",
+        "sentiment",
+        "bilingual"
+      ],
+      "evidence_text": "Kaggle dataset title and description identify the corpus as Pashto-English sentiment data.",
+      "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus",
+      "markers": [
+        "Pashto"
+      ]
+    },
+    {
+      "id": "dataset-kaggle-urdu-pashto-lexicon",
+      "title": "Urdu-Pashto Lexicon Dataset",
+      "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
+      "category": "dataset",
+      "source": "kaggle",
+      "status": "verified",
+      "summary": "Lexicon of Urdu words with Pashto translations for dictionary and MT support.",
+      "primary_use": "Lexicon and translation lexeme mapping",
+      "tasks": [
+        "nlp",
+        "mt"
+      ],
+      "tags": [
+        "pashto",
+        "kaggle",
+        "lexicon",
+        "translation"
+      ],
+      "evidence_text": "Kaggle metadata describes 7,601 Urdu entries with Pashto translations.",
+      "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset",
+      "markers": [
+        "Pashto"
+      ]
+    },
+    {
+      "id": "project-hf-space-ihanif-pashto-asr-v3",
+      "title": "Pashto ASR V3 Space",
+      "url": "https://huggingface.co/spaces/ihanif/pashto-asr-v3",
+      "category": "project",
+      "source": "huggingface",
+      "status": "verified",
+      "summary": "Interactive Hugging Face Space for Pashto automatic speech recognition demos.",
+      "primary_use": "Project demo for Pashto ASR user testing",
+      "tasks": [
+        "asr",
+        "demo"
+      ],
+      "tags": [
+        "pashto",
+        "project",
+        "huggingface-space",
+        "asr"
+      ],
+      "evidence_text": "Space card title is Pashto ASR V3 and short description states Pashto ASR.",
+      "evidence_url": "https://huggingface.co/api/spaces/ihanif/pashto-asr-v3",
+      "markers": [
+        "Pashto",
+        "ASR"
+      ]
+    },
+    {
+      "id": "project-hf-space-pashto2english-dictionary",
+      "title": "Pashto to English Dictionary Space",
+      "url": "https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary",
+      "category": "project",
+      "source": "huggingface",
+      "status": "verified",
+      "summary": "Streamlit project for Pashto to English dictionary lookups.",
+      "primary_use": "Interactive bilingual lookup project",
+      "tasks": [
+        "dictionary",
+        "translation",
+        "demo"
+      ],
+      "tags": [
+        "pashto",
+        "project",
+        "huggingface-space",
+        "dictionary"
+      ],
+      "evidence_text": "Space metadata title states Pashto to English Dictionary.",
+      "evidence_url": "https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary",
+      "markers": [
+        "Pashto"
+      ]
+    },
+    {
+      "id": "project-hf-space-umar4321-pashto-translator",
+      "title": "Pashto Translator Space",
+      "url": "https://huggingface.co/spaces/Umar4321/Pashto-Translator",
+      "category": "project",
+      "source": "huggingface",
+      "status": "verified",
+      "summary": "Streamlit translator project for Pashto to English and Urdu conversion.",
+      "primary_use": "Interactive translation project demo",
+      "tasks": [
+        "translation",
+        "demo"
+      ],
+      "tags": [
+        "pashto",
+        "project",
+        "huggingface-space",
+        "translation"
+      ],
+      "evidence_text": "Space title is Pashto Translator and description states Pashto to English and Urdu translation.",
+      "evidence_url": "https://huggingface.co/api/spaces/Umar4321/Pashto-Translator",
+      "markers": [
+        "Pashto"
+      ]
+    },
+    {
+      "id": "code-github-ijazul-haq-nlpashto",
+      "title": "nlpashto Toolkit",
+      "url": "https://github.com/ijazul-haq/nlpashto",
+      "category": "code",
+      "source": "github",
+      "status": "verified",
+      "summary": "Pashto NLP toolkit codebase for tokenization, embeddings, and downstream NLP workflows.",
+      "primary_use": "Pashto NLP code integration and experimentation",
+      "tasks": [
+        "nlp",
+        "tooling"
+      ],
+      "tags": [
+        "pashto",
+        "code",
+        "github",
+        "nlp"
+      ],
+      "evidence_text": "Repository name and description explicitly identify a Pashto NLP toolkit.",
+      "evidence_url": "https://api.github.com/repos/ijazul-haq/nlpashto",
+      "markers": [
+        "Pashto",
+        "NLP"
+      ]
     }
   ]
 }

resources/README.md CHANGED Viewed

@@ -3,11 +3,13 @@
 Structured, Pashto-focused resource tracking lives in this folder.
 ## Sections
-- Datasets (11): [datasets/README.md](datasets/README.md)
 - Models (9): [models/README.md](models/README.md)
 - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
 - Tools (2): [tools/README.md](tools/README.md)
 - Papers (4): [papers/README.md](papers/README.md)
 ## Machine-Readable Catalog
 - Canonical catalog: [catalog/resources.json](catalog/resources.json)
@@ -20,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
 - Run `python scripts/validate_resource_catalog.py` before opening a PR.
 - Run `python scripts/generate_resource_views.py` after catalog changes.
-Verified resource count: `30`

 Structured, Pashto-focused resource tracking lives in this folder.
 ## Sections
+- Datasets (14): [datasets/README.md](datasets/README.md)
 - Models (9): [models/README.md](models/README.md)
 - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
 - Tools (2): [tools/README.md](tools/README.md)
 - Papers (4): [papers/README.md](papers/README.md)
+- Projects (3): [projects/README.md](projects/README.md)
+- Code (1): [codes/README.md](codes/README.md)
 ## Machine-Readable Catalog
 - Canonical catalog: [catalog/resources.json](catalog/resources.json)
 - Run `python scripts/validate_resource_catalog.py` before opening a PR.
 - Run `python scripts/generate_resource_views.py` after catalog changes.
+Verified resource count: `37`

resources/catalog/pending_candidates.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

resources/catalog/resource.template.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "id": "example-resource-id",
   "title": "Example Resource Title",
   "url": "https://example.org/resource",
-  "category": "dataset",
   "source": "other",
   "status": "verified",
   "summary": "One-line summary explaining why this resource matters for Pashto in technology.",

   "id": "example-resource-id",
   "title": "Example Resource Title",
   "url": "https://example.org/resource",
+  "category": "project",
   "source": "other",
   "status": "verified",
   "summary": "One-line summary explaining why this resource matters for Pashto in technology.",

resources/catalog/resources.json CHANGED Viewed

@@ -782,6 +782,201 @@
         "whisper",
         "fleurs"
       ]
     }
   ]
 }

         "whisper",
         "fleurs"
       ]
+    },
+    {
+      "id": "dataset-kaggle-pold-pashto-offensive",
+      "title": "POLD - Pashto Offensive Language Dataset",
+      "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
+      "category": "dataset",
+      "source": "kaggle",
+      "status": "verified",
+      "summary": "Benchmark dataset for offensive content detection in Pashto social text.",
+      "primary_use": "Pashto toxicity and moderation NLP benchmarks",
+      "license": "CC BY 4.0",
+      "tasks": [
+        "nlp",
+        "classification"
+      ],
+      "pashto_evidence": {
+        "evidence_text": "Kaggle title and description explicitly state Pashto offensive language benchmark dataset.",
+        "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset",
+        "markers": [
+          "Pashto"
+        ]
+      },
+      "tags": [
+        "pashto",
+        "kaggle",
+        "nlp",
+        "toxicity"
+      ]
+    },
+    {
+      "id": "dataset-kaggle-pashto-english-sentiment-corpus",
+      "title": "Pashto English Bilingual Sentiment Corpus",
+      "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
+      "category": "dataset",
+      "source": "kaggle",
+      "status": "verified",
+      "summary": "Pashto to English bilingual sentiment corpus useful for low-resource sentiment tasks.",
+      "primary_use": "Sentiment analysis and bilingual NLP experiments",
+      "license": "CC BY-NC-SA 4.0",
+      "tasks": [
+        "nlp",
+        "sentiment"
+      ],
+      "pashto_evidence": {
+        "evidence_text": "Kaggle dataset title and description identify the corpus as Pashto-English sentiment data.",
+        "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus",
+        "markers": [
+          "Pashto"
+        ]
+      },
+      "tags": [
+        "pashto",
+        "kaggle",
+        "sentiment",
+        "bilingual"
+      ]
+    },
+    {
+      "id": "dataset-kaggle-urdu-pashto-lexicon",
+      "title": "Urdu-Pashto Lexicon Dataset",
+      "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
+      "category": "dataset",
+      "source": "kaggle",
+      "status": "verified",
+      "summary": "Lexicon of Urdu words with Pashto translations for dictionary and MT support.",
+      "primary_use": "Lexicon and translation lexeme mapping",
+      "license": "CC0",
+      "tasks": [
+        "nlp",
+        "mt"
+      ],
+      "pashto_evidence": {
+        "evidence_text": "Kaggle metadata describes 7,601 Urdu entries with Pashto translations.",
+        "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset",
+        "markers": [
+          "Pashto"
+        ]
+      },
+      "tags": [
+        "pashto",
+        "kaggle",
+        "lexicon",
+        "translation"
+      ]
+    },
+    {
+      "id": "project-hf-space-ihanif-pashto-asr-v3",
+      "title": "Pashto ASR V3 Space",
+      "url": "https://huggingface.co/spaces/ihanif/pashto-asr-v3",
+      "category": "project",
+      "source": "huggingface",
+      "status": "verified",
+      "summary": "Interactive Hugging Face Space for Pashto automatic speech recognition demos.",
+      "primary_use": "Project demo for Pashto ASR user testing",
+      "tasks": [
+        "asr",
+        "demo"
+      ],
+      "pashto_evidence": {
+        "evidence_text": "Space card title is Pashto ASR V3 and short description states Pashto ASR.",
+        "evidence_url": "https://huggingface.co/api/spaces/ihanif/pashto-asr-v3",
+        "markers": [
+          "Pashto",
+          "ASR"
+        ]
+      },
+      "tags": [
+        "pashto",
+        "project",
+        "huggingface-space",
+        "asr"
+      ]
+    },
+    {
+      "id": "project-hf-space-pashto2english-dictionary",
+      "title": "Pashto to English Dictionary Space",
+      "url": "https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary",
+      "category": "project",
+      "source": "huggingface",
+      "status": "verified",
+      "summary": "Streamlit project for Pashto to English dictionary lookups.",
+      "primary_use": "Interactive bilingual lookup project",
+      "tasks": [
+        "dictionary",
+        "translation",
+        "demo"
+      ],
+      "pashto_evidence": {
+        "evidence_text": "Space metadata title states Pashto to English Dictionary.",
+        "evidence_url": "https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary",
+        "markers": [
+          "Pashto"
+        ]
+      },
+      "tags": [
+        "pashto",
+        "project",
+        "huggingface-space",
+        "dictionary"
+      ]
+    },
+    {
+      "id": "project-hf-space-umar4321-pashto-translator",
+      "title": "Pashto Translator Space",
+      "url": "https://huggingface.co/spaces/Umar4321/Pashto-Translator",
+      "category": "project",
+      "source": "huggingface",
+      "status": "verified",
+      "summary": "Streamlit translator project for Pashto to English and Urdu conversion.",
+      "primary_use": "Interactive translation project demo",
+      "tasks": [
+        "translation",
+        "demo"
+      ],
+      "pashto_evidence": {
+        "evidence_text": "Space title is Pashto Translator and description states Pashto to English and Urdu translation.",
+        "evidence_url": "https://huggingface.co/api/spaces/Umar4321/Pashto-Translator",
+        "markers": [
+          "Pashto"
+        ]
+      },
+      "tags": [
+        "pashto",
+        "project",
+        "huggingface-space",
+        "translation"
+      ]
+    },
+    {
+      "id": "code-github-ijazul-haq-nlpashto",
+      "title": "nlpashto Toolkit",
+      "url": "https://github.com/ijazul-haq/nlpashto",
+      "category": "code",
+      "source": "github",
+      "status": "verified",
+      "summary": "Pashto NLP toolkit codebase for tokenization, embeddings, and downstream NLP workflows.",
+      "primary_use": "Pashto NLP code integration and experimentation",
+      "tasks": [
+        "nlp",
+        "tooling"
+      ],
+      "pashto_evidence": {
+        "evidence_text": "Repository name and description explicitly identify a Pashto NLP toolkit.",
+        "evidence_url": "https://api.github.com/repos/ijazul-haq/nlpashto",
+        "markers": [
+          "Pashto",
+          "NLP"
+        ]
+      },
+      "tags": [
+        "pashto",
+        "code",
+        "github",
+        "nlp"
+      ]
     }
   ]
 }

resources/codes/README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+# Code
+## Verified Pashto Resources
+| Resource | Link | Pashto Evidence | Primary Use |
+|---|---|---|---|
+| nlpashto Toolkit | [github](https://github.com/ijazul-haq/nlpashto) | [Repository name and description explicitly identify a Pashto NLP toolkit. (`Pashto`, `NLP`)](https://api.github.com/repos/ijazul-haq/nlpashto) | Pashto NLP code integration and experimentation |
+## Maintenance
+- Source of truth: [../catalog/resources.json](../catalog/resources.json)
+- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
+- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)

resources/datasets/README.md CHANGED Viewed

@@ -10,9 +10,12 @@
 | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
 | OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
 | OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
 | Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
 | Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
 | Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
 | Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
 | Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |

 | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
 | OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
 | OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
+| Pashto English Bilingual Sentiment Corpus | [kaggle](https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus) | [Kaggle dataset title and description identify the corpus as Pashto-English sentiment data. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus) | Sentiment analysis and bilingual NLP experiments |
 | Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
 | Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
 | Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
+| POLD - Pashto Offensive Language Dataset | [kaggle](https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset) | [Kaggle title and description explicitly state Pashto offensive language benchmark dataset. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset) | Pashto toxicity and moderation NLP benchmarks |
+| Urdu-Pashto Lexicon Dataset | [kaggle](https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset) | [Kaggle metadata describes 7,601 Urdu entries with Pashto translations. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset) | Lexicon and translation lexeme mapping |
 | Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
 | Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |

resources/projects/README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# Projects
+## Verified Pashto Resources
+| Resource | Link | Pashto Evidence | Primary Use |
+|---|---|---|---|
+| Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
+| Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
+| Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
+## Maintenance
+- Source of truth: [../catalog/resources.json](../catalog/resources.json)
+- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
+- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)

resources/schema/resource.schema.json CHANGED Viewed

@@ -62,7 +62,9 @@
             "model",
             "benchmark",
             "tool",
-            "paper"
           ]
         },
         "source": {

             "model",
             "benchmark",
             "tool",
+            "paper",
+            "project",
+            "code"
           ]
         },
         "source": {

scripts/README.md CHANGED Viewed

@@ -7,7 +7,7 @@ Automation scripts for quality checks, resource catalog validation, and search i
 - `check_links.py`: ensure markdown links are clickable (optional online reachability check).
 - `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
 - `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
-- `sync_resources.py`: collect new candidate Pashto resources from public endpoints into `resources/catalog/pending_candidates.json`.
 - `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
 ## Usage

 - `check_links.py`: ensure markdown links are clickable (optional online reachability check).
 - `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
 - `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
+- `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub repositories, and paper endpoints into `resources/catalog/pending_candidates.json`.
 - `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
 ## Usage

scripts/generate_resource_views.py CHANGED Viewed

@@ -17,6 +17,8 @@ CATEGORY_CONFIG = {
     "benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
     "tool": ("resources/tools/README.md", "Tools"),
     "paper": ("resources/papers/README.md", "Papers"),
 }
@@ -86,6 +88,8 @@ def _write_resources_home(path: Path, counts: dict[str, int], total_verified: in
         f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
         f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
         f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
         "",
         "## Machine-Readable Catalog",
         "- Canonical catalog: [catalog/resources.json](catalog/resources.json)",

     "benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
     "tool": ("resources/tools/README.md", "Tools"),
     "paper": ("resources/papers/README.md", "Papers"),
+    "project": ("resources/projects/README.md", "Projects"),
+    "code": ("resources/codes/README.md", "Code"),
 }
         f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
         f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
         f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
+        f"- Projects ({counts.get('project', 0)}): [projects/README.md](projects/README.md)",
+        f"- Code ({counts.get('code', 0)}): [codes/README.md](codes/README.md)",
         "",
         "## Machine-Readable Catalog",
         "- Canonical catalog: [catalog/resources.json](catalog/resources.json)",

scripts/sync_resources.py CHANGED Viewed

@@ -108,6 +108,138 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
     return out
 def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
     query = urllib.parse.urlencode(
         {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
@@ -228,8 +360,11 @@ def main() -> int:
     sources_used: list[str] = []
     fetch_steps = [
         ("huggingface-datasets", lambda: fetch_huggingface("datasets", args.limit)),
         ("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
         ("arxiv", lambda: fetch_arxiv(args.limit)),
         ("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
     ]

     return out
+def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
+    # Search Hugging Face Spaces for "pashto" and turn each hit into a
+    # candidate record with category "project" and source "huggingface".
+    # `limit` is forwarded to the API as the `limit` query parameter; the
+    # result list is not truncated again locally.
+    query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
+    url = f"https://huggingface.co/api/spaces?{query}"
+    # assumes the response decodes to a list of dicts - TODO confirm against _fetch_json
+    payload = _fetch_json(url)
+    out: list[dict[str, Any]] = []
+    for item in payload:
+        space_id = item.get("id")
+        # Entries without an id cannot be linked to a Space page, so skip them.
+        if not space_id:
+            continue
+        space_url = f"https://huggingface.co/spaces/{space_id}"
+        rid = f"candidate-hf-project-{_slug(space_id)}"
+        summary = "Candidate project app returned from Hugging Face Spaces Pashto search."
+        # The Space URL is used both as the resource URL and as the evidence URL.
+        out.append(
+            _candidate(
+                rid=rid,
+                title=space_id,
+                url=space_url,
+                category="project",
+                source="huggingface",
+                summary=summary,
+                evidence_text="Matched by Pashto keyword in Hugging Face Spaces search.",
+                evidence_url=space_url,
+                markers=["pashto"],
+                tags=["pashto", "candidate", "project", "space"],
+            )
+        )
+    return out
+def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
+    # Search the Kaggle dataset listing for "pashto" (first page only) and
+    # return candidate records with category "dataset" and source "kaggle".
+    # `limit` is not sent to the API; it only caps the returned list locally.
+    # Public Kaggle dataset listing endpoint (no auth needed for list responses).
+    # NOTE(review): the no-auth claim is not verifiable from this code - confirm.
+    query = urllib.parse.urlencode({"search": "pashto", "page": "1"})
+    url = f"https://www.kaggle.com/api/v1/datasets/list?{query}"
+    # assumes the response decodes to a list of dicts - TODO confirm against _fetch_json
+    payload = _fetch_json(url)
+    out: list[dict[str, Any]] = []
+    for item in payload:
+        # Missing or null fields are normalised to "" before stripping.
+        title = (item.get("titleNullable") or "").strip()
+        dataset_url = (item.get("urlNullable") or "").strip()
+        owner = (item.get("ownerRefNullable") or "").strip()
+        subtitle = (item.get("subtitleNullable") or "").strip()
+        # A candidate needs at least a title and a URL.
+        if not title or not dataset_url:
+            continue
+        # Keep only hits whose title or subtitle mentions "pashto" or "pukhto"
+        # (case-insensitive); other search results are dropped.
+        blob = f"{title} {subtitle}".lower()
+        if "pashto" not in blob and "pukhto" not in blob:
+            continue
+        # The owner, when present, is folded into the id alongside the title.
+        owner_prefix = f"{owner}/" if owner else ""
+        rid = f"candidate-kaggle-dataset-{_slug(owner_prefix + title)}"
+        # The summary is the subtitle (or a fixed fallback) cut to 240 characters.
+        # The marker is always "Pashto", including when the match was "pukhto".
+        out.append(
+            _candidate(
+                rid=rid,
+                title=title,
+                url=dataset_url,
+                category="dataset",
+                source="kaggle",
+                summary=(subtitle or "Candidate Kaggle dataset returned from Pashto search.")[:240],
+                evidence_text="Kaggle dataset title/subtitle includes Pashto keyword.",
+                evidence_url=dataset_url,
+                markers=["Pashto"],
+                tags=["pashto", "candidate", "dataset", "kaggle"],
+            )
+        )
+        # The cap is checked after appending, so a limit of 0 or less still
+        # yields one candidate when any result matches.
+        if len(out) >= limit:
+            break
+    return out
+def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
+    # Search GitHub repositories for Pashto-related projects and return up to
+    # `limit` candidate records (source "github") ordered by star count.
+    # Each candidate is categorised as "project" or "code" by keyword heuristic.
+    # Query by topic first for high precision, then by keyword for recall.
+    query_variants = [
+        "topic:pashto",
+        "pashto in:name,description,readme",
+    ]
+    # Results of both queries are merged by full_name; when a repository shows
+    # up in both, the item from the later query replaces the earlier one.
+    combined: dict[str, dict[str, Any]] = {}
+    for query_text in query_variants:
+        query = urllib.parse.urlencode(
+            {"q": query_text, "sort": "stars", "order": "desc", "per_page": str(limit)}
+        )
+        url = f"https://api.github.com/search/repositories?{query}"
+        # assumes the response decodes to a dict with an "items" list - TODO confirm
+        payload = _fetch_json(url)
+        for item in payload.get("items", []):
+            full_name = item.get("full_name")
+            html_url = item.get("html_url")
+            # Both fields are required later (id/title and URL), so skip partial items.
+            if not full_name or not html_url:
+                continue
+            combined[full_name] = item
+    out: list[dict[str, Any]] = []
+    # Re-sort the merged set by stars, descending; a missing count is treated as 0.
+    for full_name, item in sorted(combined.items(), key=lambda kv: kv[1].get("stargazers_count", 0), reverse=True):
+        # Lower-cased text of name, description and topics. The readme is part
+        # of the search query but not of this blob, so readme-only matches are
+        # dropped by the filter below.
+        name_blob = " ".join(
+            [
+                full_name or "",
+                item.get("name") or "",
+                item.get("description") or "",
+                " ".join(item.get("topics") or []),
+            ]
+        ).lower()
+        if "pashto" not in name_blob and "pukhto" not in name_blob:
+            continue
+        html_url = item["html_url"]
+        category = "project"
+        topics = item.get("topics") or []
+        # Substring test, not whole-word: a keyword embedded in a longer word
+        # also switches the category to "code".
+        # NOTE(review): confirm that loose matching is intended for short tokens like "api".
+        if any(token in name_blob for token in ("toolkit", "library", "nlp", "asr", "tts", "ocr", "api", "code")):
+            category = "code"
+        # The id embeds the category, so a different classification of the
+        # same repository produces a different id.
+        rid = f"candidate-gh-{category}-{_slug(full_name)}"
+        description = (item.get("description") or "").strip()
+        summary = description or "Candidate Pashto-related GitHub repository."
+        # `summary` is never empty at this point, so the else-branch of the
+        # `summary=` argument below is not reached; the text is cut to 240 characters.
+        # Up to three repository topics are appended to the fixed tags.
+        out.append(
+            _candidate(
+                rid=rid,
+                title=full_name,
+                url=html_url,
+                category=category,
+                source="github",
+                summary=summary[:240] if summary else "Candidate Pashto-related GitHub repository.",
+                evidence_text="Repository metadata (name/description/topics) includes Pashto markers.",
+                evidence_url=html_url,
+                markers=["pashto"],
+                tags=["pashto", "candidate", category, "github", *(topics[:3])],
+            )
+        )
+        # The cap is checked after appending, so a limit of 0 or less still
+        # yields one candidate when any repository matches.
+        if len(out) >= limit:
+            break
+    return out
 def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
     query = urllib.parse.urlencode(
         {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
     sources_used: list[str] = []
     fetch_steps = [
+        ("kaggle-datasets", lambda: fetch_kaggle_datasets(args.limit)),
         ("huggingface-datasets", lambda: fetch_huggingface("datasets", args.limit)),
         ("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
+        ("huggingface-spaces", lambda: fetch_huggingface_spaces(args.limit)),
+        ("github-repositories", lambda: fetch_github_pashto_repos(args.limit)),
         ("arxiv", lambda: fetch_arxiv(args.limit)),
         ("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
     ]

scripts/validate_resource_catalog.py CHANGED Viewed

@@ -16,7 +16,7 @@ from typing import Any
 from urllib.parse import urlparse
-ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper"}
 ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
 ALLOWED_STATUS = {"verified", "candidate"}
 RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")

 from urllib.parse import urlparse
+ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project", "code"}
 ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
 ALLOWED_STATUS = {"verified", "candidate"}
 RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")