musaw committed on
Commit
194828a
·
1 Parent(s): 6f1c8bd

sync(hf): snapshot origin main after resource audit cycle

Browse files
.github/workflows/resource_sync.yml CHANGED
@@ -4,11 +4,29 @@ on:
4
  schedule:
5
  - cron: "0 4 * * *"
6
  workflow_dispatch:
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  permissions:
9
  contents: write
10
  pull-requests: write
11
 
 
 
 
 
 
12
  jobs:
13
  sync:
14
  runs-on: ubuntu-latest
@@ -26,11 +44,24 @@ jobs:
26
  python -m pip install --upgrade pip
27
  python -m pip install -e ".[dev]"
28
 
 
 
 
 
 
 
 
 
29
  - name: Sync candidate resources
30
- run: python scripts/sync_resources.py --limit 20
31
 
32
  - name: Auto-promote valid candidates
33
- run: python scripts/promote_candidates.py
 
 
 
 
 
34
 
35
  - name: Validate catalog
36
  run: python scripts/validate_resource_catalog.py
@@ -79,6 +110,7 @@ jobs:
79
  Automated daily resource sync.
80
 
81
  Scope:
 
82
  - Updates `resources/catalog/pending_candidates.json`
83
  - Auto-promotes valid non-duplicate candidates into `resources/catalog/resources.json`
84
  - Regenerates resource indexes and search payload
@@ -88,6 +120,7 @@ jobs:
88
  add-paths: |
89
  resources/catalog/pending_candidates.json
90
  resources/catalog/resources.json
 
91
  resources/README.md
92
  resources/datasets/README.md
93
  resources/models/README.md
 
4
  schedule:
5
  - cron: "0 4 * * *"
6
  workflow_dispatch:
7
+ inputs:
8
+ limit:
9
+ description: "Candidate fetch limit per source"
10
+ required: false
11
+ default: "20"
12
+ max_promotions:
13
+ description: "Optional max number of candidate promotions"
14
+ required: false
15
+ default: ""
16
+ enforce_pashto_relevance:
17
+ description: "Also remove existing entries without Pashto evidence"
18
+ required: false
19
+ default: "true"
20
 
21
  permissions:
22
  contents: write
23
  pull-requests: write
24
 
25
+ env:
26
+ RESOURCE_LIMIT: ${{ github.event.inputs.limit || '20' }}
27
+ MAX_PROMOTIONS: ${{ github.event.inputs.max_promotions || '' }}
28
+ ENFORCE_PASHTO_RELEVANCE: ${{ github.event.inputs.enforce_pashto_relevance || 'true' }}
29
+
30
  jobs:
31
  sync:
32
  runs-on: ubuntu-latest
 
44
  python -m pip install --upgrade pip
45
  python -m pip install -e ".[dev]"
46
 
47
+ - name: Review existing resources for stale or low-value entries
48
+ run: |
49
+ if [ "${ENFORCE_PASHTO_RELEVANCE}" = "true" ]; then
50
+ python scripts/review_existing_resources.py --enforce-pashto-relevance
51
+ else
52
+ python scripts/review_existing_resources.py
53
+ fi
54
+
55
  - name: Sync candidate resources
56
+ run: python scripts/sync_resources.py --limit "${RESOURCE_LIMIT}"
57
 
58
  - name: Auto-promote valid candidates
59
+ run: |
60
+ if [ -n "${MAX_PROMOTIONS}" ]; then
61
+ python scripts/promote_candidates.py --max-promotions "${MAX_PROMOTIONS}"
62
+ else
63
+ python scripts/promote_candidates.py
64
+ fi
65
 
66
  - name: Validate catalog
67
  run: python scripts/validate_resource_catalog.py
 
110
  Automated daily resource sync.
111
 
112
  Scope:
113
+ - Reviews existing catalog entries and removes stale ones only with strong logged reasons
114
  - Updates `resources/catalog/pending_candidates.json`
115
  - Auto-promotes valid non-duplicate candidates into `resources/catalog/resources.json`
116
  - Regenerates resource indexes and search payload
 
120
  add-paths: |
121
  resources/catalog/pending_candidates.json
122
  resources/catalog/resources.json
123
+ resources/catalog/removal_log.json
124
  resources/README.md
125
  resources/datasets/README.md
126
  resources/models/README.md
docs/resource_automation.md CHANGED
@@ -28,6 +28,7 @@ This repository uses automated discovery and promotion to keep Pashto resources
28
  ## Scripts
29
  - Validate catalog: `python scripts/validate_resource_catalog.py`
30
  - Generate markdown and search index: `python scripts/generate_resource_views.py`
 
31
  - Sync new candidates: `python scripts/sync_resources.py --limit 20`
32
  - Auto-promote valid candidates: `python scripts/promote_candidates.py`
33
  - Full run wrapper: `python scripts/run_resource_cycle.py --limit 25`
@@ -39,14 +40,16 @@ This repository uses automated discovery and promotion to keep Pashto resources
39
  - markdown link checks
40
  - tests
41
  - Resource Sync (`.github/workflows/resource_sync.yml`) runs daily, syncs candidates, auto-promotes valid non-duplicate entries, regenerates views, and opens a PR.
 
42
 
43
  ## Promotion flow
44
- 1. Sync candidates into `resources/catalog/pending_candidates.json`.
45
- 2. Auto-promote valid, non-duplicate entries into `resources/catalog/resources.json`.
46
- 3. Run:
 
47
  - `python scripts/validate_resource_catalog.py`
48
  - `python scripts/generate_resource_views.py`
49
- 4. Review PR and merge.
50
 
51
  ## Runbook
52
  - Reusable process guide: [resource_cycle_runbook.md](resource_cycle_runbook.md)
 
28
  ## Scripts
29
  - Validate catalog: `python scripts/validate_resource_catalog.py`
30
  - Generate markdown and search index: `python scripts/generate_resource_views.py`
31
+ - Review existing resources for stale/deleted entries: `python scripts/review_existing_resources.py`
32
  - Sync new candidates: `python scripts/sync_resources.py --limit 20`
33
  - Auto-promote valid candidates: `python scripts/promote_candidates.py`
34
  - Full run wrapper: `python scripts/run_resource_cycle.py --limit 25`
 
40
  - markdown link checks
41
  - tests
42
  - Resource Sync (`.github/workflows/resource_sync.yml`) runs daily, syncs candidates, auto-promotes valid non-duplicate entries, regenerates views, and opens a PR.
43
+ - Before candidate sync, the Resource Sync workflow reviews existing entries and removes stale/deleted or non-Pashto/low-value entries only with strong logged reasons.
44
 
45
  ## Promotion flow
46
+ 1. Review existing resources and remove stale entries with strong reasons.
47
+ 2. Sync candidates into `resources/catalog/pending_candidates.json`.
48
+ 3. Auto-promote valid, non-duplicate, URL-available entries into `resources/catalog/resources.json`.
49
+ 4. Run:
50
  - `python scripts/validate_resource_catalog.py`
51
  - `python scripts/generate_resource_views.py`
52
+ 5. Review PR and merge.
53
 
54
  ## Runbook
55
  - Reusable process guide: [resource_cycle_runbook.md](resource_cycle_runbook.md)
docs/resource_cycle_runbook.md CHANGED
@@ -5,7 +5,7 @@ Use this runbook whenever you want to repeat the resource update process without
5
  ## Daily automation (already enabled)
6
  - Workflow: [../.github/workflows/resource_sync.yml](../.github/workflows/resource_sync.yml)
7
  - Schedule: every day at 04:00 UTC via GitHub Actions cron.
8
- - Output: updates [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json), auto-promotes valid non-duplicate entries into [../resources/catalog/resources.json](../resources/catalog/resources.json), regenerates views, and opens a review PR.
9
 
10
  ## Manual run (single command)
11
  Run from repository root:
@@ -15,12 +15,13 @@ python scripts/run_resource_cycle.py --limit 25
15
  ```
16
 
17
  What it executes:
18
- 1. `python scripts/sync_resources.py --limit 25`
19
- 2. `python scripts/promote_candidates.py`
20
- 3. `python scripts/validate_resource_catalog.py`
21
- 4. `python scripts/generate_resource_views.py`
22
- 5. `python scripts/check_links.py`
23
- 6. `python -m pytest -q`
 
24
 
25
  Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, GitLab repositories, Zenodo records, Dataverse datasets, DataCite DOI records, and paper endpoints (arXiv, Semantic Scholar, OpenAlex, Crossref).
26
 
@@ -33,7 +34,8 @@ If you want fresh candidates without auto-promotion:
33
  5. Commit and push.
34
 
35
  ## Guardrails
36
- - Auto-promotion accepts only entries that pass dedupe and catalog validation checks.
 
37
  - Keep `status: verified` for entries that pass automation checks and repository review.
38
  - Do not promote "reference-only" resources where Pashto is incidental; only Pashto-centric resources are eligible.
39
  - Treat spelling variants as valid Pashto markers during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `pashto-script`).
 
5
  ## Daily automation (already enabled)
6
  - Workflow: [../.github/workflows/resource_sync.yml](../.github/workflows/resource_sync.yml)
7
  - Schedule: every day at 04:00 UTC via GitHub Actions cron.
8
+ - Output: reviews existing resources for stale/deleted links and non-Pashto/low-value entries (removing only with strong logged reasons), updates [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json), auto-promotes valid, non-duplicate, URL-available entries into [../resources/catalog/resources.json](../resources/catalog/resources.json), regenerates views, and opens a review PR.
9
 
10
  ## Manual run (single command)
11
  Run from repository root:
 
15
  ```
16
 
17
  What it executes:
18
+ 1. `python scripts/review_existing_resources.py`
19
+ 2. `python scripts/sync_resources.py --limit 25`
20
+ 3. `python scripts/promote_candidates.py`
21
+ 4. `python scripts/validate_resource_catalog.py`
22
+ 5. `python scripts/generate_resource_views.py`
23
+ 6. `python scripts/check_links.py`
24
+ 7. `python -m pytest -q`
25
 
26
  Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, GitLab repositories, Zenodo records, Dataverse datasets, DataCite DOI records, and paper endpoints (arXiv, Semantic Scholar, OpenAlex, Crossref).
27
 
 
34
  5. Commit and push.
35
 
36
  ## Guardrails
37
+ - Auto-promotion accepts only entries that pass dedupe, URL-availability checks, and catalog validation checks.
38
+ - Existing resources are auto-removed only for strong reasons (for example confirmed hard-missing links, duplicates, or missing Pashto relevance), with reasons stored in `resources/catalog/removal_log.json`.
39
  - Keep `status: verified` for entries that pass automation checks and repository review.
40
  - Do not promote "reference-only" resources where Pashto is incidental; only Pashto-centric resources are eligible.
41
  - Treat spelling variants as valid Pashto markers during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `pashto-script`).
docs/search/resources.json CHANGED
The diff for this file is too large to render. See raw diff
 
resources/README.md CHANGED
@@ -3,13 +3,13 @@
3
  Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
- - Datasets (48): [datasets/README.md](datasets/README.md)
7
- - Models (18): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
  - Tools (0): [tools/README.md](tools/README.md)
10
- - Papers (24): [papers/README.md](papers/README.md)
11
- - Projects (17): [projects/README.md](projects/README.md)
12
- - Code (1): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
15
  - Canonical catalog: [catalog/resources.json](catalog/resources.json)
@@ -22,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
- Verified resource count: `112`
 
3
  Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
+ - Datasets (46): [datasets/README.md](datasets/README.md)
7
+ - Models (19): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
  - Tools (0): [tools/README.md](tools/README.md)
10
+ - Papers (104): [papers/README.md](papers/README.md)
11
+ - Projects (48): [projects/README.md](projects/README.md)
12
+ - Code (4): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
15
  - Canonical catalog: [catalog/resources.json](catalog/resources.json)
 
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
+ Verified resource count: `225`
resources/catalog/pending_candidates.json CHANGED
The diff for this file is too large to render. See raw diff
 
resources/catalog/removal_log.json ADDED
@@ -0,0 +1,1205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "updated_on": "2026-02-22",
3
+ "entries": [
4
+ {
5
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
6
+ "id": "dataset-kaggle-pashto-isolated-words",
7
+ "title": "Pashto Isolated Words Speech Dataset",
8
+ "url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
9
+ "reasons": [
10
+ "URL returned hard-missing HTTP status 404."
11
+ ],
12
+ "evidence": {
13
+ "status_code": 404,
14
+ "final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
15
+ "metadata_pashto": true,
16
+ "direct_pashto": true,
17
+ "page_pashto": false
18
+ }
19
+ },
20
+ {
21
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
22
+ "id": "dataset-kaggle-pashto-word-embeddings",
23
+ "title": "Pashto Word Embeddings",
24
+ "url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
25
+ "reasons": [
26
+ "URL returned hard-missing HTTP status 404."
27
+ ],
28
+ "evidence": {
29
+ "status_code": 404,
30
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
31
+ "metadata_pashto": true,
32
+ "direct_pashto": true,
33
+ "page_pashto": false
34
+ }
35
+ },
36
+ {
37
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
38
+ "id": "dataset-kaggle-pold-pashto-offensive",
39
+ "title": "POLD - Pashto Offensive Language Dataset",
40
+ "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
41
+ "reasons": [
42
+ "URL returned hard-missing HTTP status 404."
43
+ ],
44
+ "evidence": {
45
+ "status_code": 404,
46
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
47
+ "metadata_pashto": true,
48
+ "direct_pashto": true,
49
+ "page_pashto": false
50
+ }
51
+ },
52
+ {
53
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
54
+ "id": "dataset-kaggle-pashto-english-sentiment-corpus",
55
+ "title": "Pashto English Bilingual Sentiment Corpus",
56
+ "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
57
+ "reasons": [
58
+ "URL returned hard-missing HTTP status 404."
59
+ ],
60
+ "evidence": {
61
+ "status_code": 404,
62
+ "final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
63
+ "metadata_pashto": true,
64
+ "direct_pashto": true,
65
+ "page_pashto": false
66
+ }
67
+ },
68
+ {
69
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
70
+ "id": "dataset-kaggle-urdu-pashto-lexicon",
71
+ "title": "Urdu-Pashto Lexicon Dataset",
72
+ "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
73
+ "reasons": [
74
+ "URL returned hard-missing HTTP status 404."
75
+ ],
76
+ "evidence": {
77
+ "status_code": 404,
78
+ "final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
79
+ "metadata_pashto": true,
80
+ "direct_pashto": true,
81
+ "page_pashto": false
82
+ }
83
+ },
84
+ {
85
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
86
+ "id": "dataset-kaggle-drijaz-pashtoocr",
87
+ "title": "PashtoOCR (Kaggle)",
88
+ "url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
89
+ "reasons": [
90
+ "URL returned hard-missing HTTP status 404."
91
+ ],
92
+ "evidence": {
93
+ "status_code": 404,
94
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
95
+ "metadata_pashto": true,
96
+ "direct_pashto": true,
97
+ "page_pashto": false
98
+ }
99
+ },
100
+ {
101
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
102
+ "id": "dataset-kaggle-english-pashto-language-dataset-epld",
103
+ "title": "English-Pashto Language Dataset (EPLD)",
104
+ "url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
105
+ "reasons": [
106
+ "URL returned hard-missing HTTP status 404."
107
+ ],
108
+ "evidence": {
109
+ "status_code": 404,
110
+ "final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
111
+ "metadata_pashto": true,
112
+ "direct_pashto": true,
113
+ "page_pashto": false
114
+ }
115
+ },
116
+ {
117
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
118
+ "id": "dataset-kaggle-katib-s-pashto-text-imagebase-kpti",
119
+ "title": "Katib's Pashto Text Imagebase (KPTI)",
120
+ "url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
121
+ "reasons": [
122
+ "URL returned hard-missing HTTP status 404."
123
+ ],
124
+ "evidence": {
125
+ "status_code": 404,
126
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
127
+ "metadata_pashto": true,
128
+ "direct_pashto": true,
129
+ "page_pashto": false
130
+ }
131
+ },
132
+ {
133
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
134
+ "id": "dataset-kaggle-pashto-ocr",
135
+ "title": "Pashto OCR",
136
+ "url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
137
+ "reasons": [
138
+ "URL returned hard-missing HTTP status 404."
139
+ ],
140
+ "evidence": {
141
+ "status_code": 404,
142
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
143
+ "metadata_pashto": true,
144
+ "direct_pashto": true,
145
+ "page_pashto": false
146
+ }
147
+ },
148
+ {
149
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
150
+ "id": "dataset-kaggle-common-voice-24-0-pashto-speech-dataset",
151
+ "title": "Common Voice 24.0: Pashto Speech Dataset",
152
+ "url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
153
+ "reasons": [
154
+ "URL returned hard-missing HTTP status 404."
155
+ ],
156
+ "evidence": {
157
+ "status_code": 404,
158
+ "final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
159
+ "metadata_pashto": true,
160
+ "direct_pashto": true,
161
+ "page_pashto": false
162
+ }
163
+ },
164
+ {
165
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
166
+ "id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
167
+ "title": "Pashto Isolated Alphabets and Numerals",
168
+ "url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
169
+ "reasons": [
170
+ "URL returned hard-missing HTTP status 404."
171
+ ],
172
+ "evidence": {
173
+ "status_code": 404,
174
+ "final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
175
+ "metadata_pashto": true,
176
+ "direct_pashto": true,
177
+ "page_pashto": false
178
+ }
179
+ },
180
+ {
181
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
182
+ "id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
183
+ "title": "Pashto Poetry",
184
+ "url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
185
+ "reasons": [
186
+ "URL returned hard-missing HTTP status 404."
187
+ ],
188
+ "evidence": {
189
+ "status_code": 404,
190
+ "final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
191
+ "metadata_pashto": true,
192
+ "direct_pashto": true,
193
+ "page_pashto": false
194
+ }
195
+ },
196
+ {
197
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
198
+ "id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
199
+ "title": "Pashto text characters sample",
200
+ "url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
201
+ "reasons": [
202
+ "URL returned hard-missing HTTP status 404."
203
+ ],
204
+ "evidence": {
205
+ "status_code": 404,
206
+ "final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
207
+ "metadata_pashto": true,
208
+ "direct_pashto": true,
209
+ "page_pashto": false
210
+ }
211
+ },
212
+ {
213
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
214
+ "id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
215
+ "title": "pashto_language_alphabets",
216
+ "url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
217
+ "reasons": [
218
+ "URL returned hard-missing HTTP status 404."
219
+ ],
220
+ "evidence": {
221
+ "status_code": 404,
222
+ "final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
223
+ "metadata_pashto": true,
224
+ "direct_pashto": true,
225
+ "page_pashto": false
226
+ }
227
+ },
228
+ {
229
+ "removed_on": "2026-02-21T19:47:22.435531+00:00",
230
+ "id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
231
+ "title": "Pashto_language_characters",
232
+ "url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
233
+ "reasons": [
234
+ "URL returned hard-missing HTTP status 404."
235
+ ],
236
+ "evidence": {
237
+ "status_code": 404,
238
+ "final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
239
+ "metadata_pashto": true,
240
+ "direct_pashto": true,
241
+ "page_pashto": false
242
+ }
243
+ },
244
+ {
245
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
246
+ "id": "candidate-kaggle-dataset-ataullahaali-common-voice-24-0-pashto-speech-dataset",
247
+ "title": "Common Voice 24.0: Pashto Speech Dataset",
248
+ "url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
249
+ "reasons": [
250
+ "URL returned hard-missing HTTP status 404."
251
+ ],
252
+ "evidence": {
253
+ "status_code": 404,
254
+ "final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
255
+ "metadata_pashto": true,
256
+ "direct_pashto": true,
257
+ "page_pashto": false
258
+ }
259
+ },
260
+ {
261
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
262
+ "id": "candidate-kaggle-dataset-rabiakhan827-english-pashto-language-dataset-epld",
263
+ "title": "English-Pashto Language Dataset (EPLD)",
264
+ "url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
265
+ "reasons": [
266
+ "URL returned hard-missing HTTP status 404."
267
+ ],
268
+ "evidence": {
269
+ "status_code": 404,
270
+ "final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
271
+ "metadata_pashto": true,
272
+ "direct_pashto": true,
273
+ "page_pashto": false
274
+ }
275
+ },
276
+ {
277
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
278
+ "id": "candidate-kaggle-dataset-hassanamin-katib-s-pashto-text-imagebase-kpti",
279
+ "title": "Katib's Pashto Text Imagebase (KPTI)",
280
+ "url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
281
+ "reasons": [
282
+ "URL returned hard-missing HTTP status 404."
283
+ ],
284
+ "evidence": {
285
+ "status_code": 404,
286
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
287
+ "metadata_pashto": true,
288
+ "direct_pashto": true,
289
+ "page_pashto": false
290
+ }
291
+ },
292
+ {
293
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
294
+ "id": "candidate-kaggle-dataset-farhadkhan66-pashto-english-bilingual-sentiment-corpus",
295
+ "title": "Pashto English Bilingual Sentiment Corpus",
296
+ "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
297
+ "reasons": [
298
+ "URL returned hard-missing HTTP status 404."
299
+ ],
300
+ "evidence": {
301
+ "status_code": 404,
302
+ "final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
303
+ "metadata_pashto": true,
304
+ "direct_pashto": true,
305
+ "page_pashto": false
306
+ }
307
+ },
308
+ {
309
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
310
+ "id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
311
+ "title": "Pashto Isolated Alphabets and Numerals",
312
+ "url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
313
+ "reasons": [
314
+ "URL returned hard-missing HTTP status 404."
315
+ ],
316
+ "evidence": {
317
+ "status_code": 404,
318
+ "final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
319
+ "metadata_pashto": true,
320
+ "direct_pashto": true,
321
+ "page_pashto": false
322
+ }
323
+ },
324
+ {
325
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
326
+ "id": "candidate-kaggle-dataset-engrirf-pashto-isolated-words-speech-dataset",
327
+ "title": "Pashto Isolated Words Speech Dataset",
328
+ "url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
329
+ "reasons": [
330
+ "URL returned hard-missing HTTP status 404."
331
+ ],
332
+ "evidence": {
333
+ "status_code": 404,
334
+ "final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
335
+ "metadata_pashto": true,
336
+ "direct_pashto": true,
337
+ "page_pashto": false
338
+ }
339
+ },
340
+ {
341
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
342
+ "id": "candidate-kaggle-dataset-hassanamin-pashto-ocr",
343
+ "title": "Pashto OCR",
344
+ "url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
345
+ "reasons": [
346
+ "URL returned hard-missing HTTP status 404."
347
+ ],
348
+ "evidence": {
349
+ "status_code": 404,
350
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
351
+ "metadata_pashto": true,
352
+ "direct_pashto": true,
353
+ "page_pashto": false
354
+ }
355
+ },
356
+ {
357
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
358
+ "id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
359
+ "title": "Pashto Poetry",
360
+ "url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
361
+ "reasons": [
362
+ "URL returned hard-missing HTTP status 404."
363
+ ],
364
+ "evidence": {
365
+ "status_code": 404,
366
+ "final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
367
+ "metadata_pashto": true,
368
+ "direct_pashto": true,
369
+ "page_pashto": false
370
+ }
371
+ },
372
+ {
373
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
374
+ "id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
375
+ "title": "Pashto text characters sample",
376
+ "url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
377
+ "reasons": [
378
+ "URL returned hard-missing HTTP status 404."
379
+ ],
380
+ "evidence": {
381
+ "status_code": 404,
382
+ "final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
383
+ "metadata_pashto": true,
384
+ "direct_pashto": true,
385
+ "page_pashto": false
386
+ }
387
+ },
388
+ {
389
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
390
+ "id": "candidate-kaggle-dataset-drijaz-pashto-word-embeddings",
391
+ "title": "Pashto Word Embeddings",
392
+ "url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
393
+ "reasons": [
394
+ "URL returned hard-missing HTTP status 404."
395
+ ],
396
+ "evidence": {
397
+ "status_code": 404,
398
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
399
+ "metadata_pashto": true,
400
+ "direct_pashto": true,
401
+ "page_pashto": false
402
+ }
403
+ },
404
+ {
405
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
406
+ "id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
407
+ "title": "pashto_language_alphabets",
408
+ "url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
409
+ "reasons": [
410
+ "URL returned hard-missing HTTP status 404."
411
+ ],
412
+ "evidence": {
413
+ "status_code": 404,
414
+ "final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
415
+ "metadata_pashto": true,
416
+ "direct_pashto": true,
417
+ "page_pashto": false
418
+ }
419
+ },
420
+ {
421
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
422
+ "id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
423
+ "title": "Pashto_language_characters",
424
+ "url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
425
+ "reasons": [
426
+ "URL returned hard-missing HTTP status 404."
427
+ ],
428
+ "evidence": {
429
+ "status_code": 404,
430
+ "final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
431
+ "metadata_pashto": true,
432
+ "direct_pashto": true,
433
+ "page_pashto": false
434
+ }
435
+ },
436
+ {
437
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
438
+ "id": "candidate-kaggle-dataset-drijaz-pashtoocr",
439
+ "title": "PashtoOCR",
440
+ "url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
441
+ "reasons": [
442
+ "URL returned hard-missing HTTP status 404."
443
+ ],
444
+ "evidence": {
445
+ "status_code": 404,
446
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
447
+ "metadata_pashto": true,
448
+ "direct_pashto": true,
449
+ "page_pashto": false
450
+ }
451
+ },
452
+ {
453
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
454
+ "id": "candidate-kaggle-dataset-drijaz-pold-pashto-offensive-language-dataset",
455
+ "title": "POLD - Pashto Offensive Language Dataset",
456
+ "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
457
+ "reasons": [
458
+ "URL returned hard-missing HTTP status 404."
459
+ ],
460
+ "evidence": {
461
+ "status_code": 404,
462
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
463
+ "metadata_pashto": true,
464
+ "direct_pashto": true,
465
+ "page_pashto": false
466
+ }
467
+ },
468
+ {
469
+ "removed_on": "2026-02-21T19:59:50.593781+00:00",
470
+ "id": "candidate-kaggle-dataset-shafeeqgigyani-urdu-pashto-lexicon-dataset",
471
+ "title": "Urdu-Pashto Lexicon Dataset",
472
+ "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
473
+ "reasons": [
474
+ "URL returned hard-missing HTTP status 404."
475
+ ],
476
+ "evidence": {
477
+ "status_code": 404,
478
+ "final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
479
+ "metadata_pashto": true,
480
+ "direct_pashto": true,
481
+ "page_pashto": false
482
+ }
483
+ },
484
+ {
485
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
486
+ "id": "candidate-kaggle-dataset-ataullahaali-common-voice-24-0-pashto-speech-dataset",
487
+ "title": "Common Voice 24.0: Pashto Speech Dataset",
488
+ "url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
489
+ "reasons": [
490
+ "URL returned hard-missing HTTP status 404."
491
+ ],
492
+ "evidence": {
493
+ "status_code": 404,
494
+ "final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
495
+ "metadata_pashto": true,
496
+ "direct_pashto": true,
497
+ "page_pashto": false
498
+ }
499
+ },
500
+ {
501
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
502
+ "id": "candidate-kaggle-dataset-rabiakhan827-english-pashto-language-dataset-epld",
503
+ "title": "English-Pashto Language Dataset (EPLD)",
504
+ "url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
505
+ "reasons": [
506
+ "URL returned hard-missing HTTP status 404."
507
+ ],
508
+ "evidence": {
509
+ "status_code": 404,
510
+ "final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
511
+ "metadata_pashto": true,
512
+ "direct_pashto": true,
513
+ "page_pashto": false
514
+ }
515
+ },
516
+ {
517
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
518
+ "id": "candidate-kaggle-dataset-hassanamin-katib-s-pashto-text-imagebase-kpti",
519
+ "title": "Katib's Pashto Text Imagebase (KPTI)",
520
+ "url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
521
+ "reasons": [
522
+ "URL returned hard-missing HTTP status 404."
523
+ ],
524
+ "evidence": {
525
+ "status_code": 404,
526
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
527
+ "metadata_pashto": true,
528
+ "direct_pashto": true,
529
+ "page_pashto": false
530
+ }
531
+ },
532
+ {
533
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
534
+ "id": "candidate-kaggle-dataset-farhadkhan66-pashto-english-bilingual-sentiment-corpus",
535
+ "title": "Pashto English Bilingual Sentiment Corpus",
536
+ "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
537
+ "reasons": [
538
+ "URL returned hard-missing HTTP status 404."
539
+ ],
540
+ "evidence": {
541
+ "status_code": 404,
542
+ "final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
543
+ "metadata_pashto": true,
544
+ "direct_pashto": true,
545
+ "page_pashto": false
546
+ }
547
+ },
548
+ {
549
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
550
+ "id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
551
+ "title": "Pashto Isolated Alphabets and Numerals",
552
+ "url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
553
+ "reasons": [
554
+ "URL returned hard-missing HTTP status 404."
555
+ ],
556
+ "evidence": {
557
+ "status_code": 404,
558
+ "final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
559
+ "metadata_pashto": true,
560
+ "direct_pashto": true,
561
+ "page_pashto": false
562
+ }
563
+ },
564
+ {
565
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
566
+ "id": "candidate-kaggle-dataset-engrirf-pashto-isolated-words-speech-dataset",
567
+ "title": "Pashto Isolated Words Speech Dataset",
568
+ "url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
569
+ "reasons": [
570
+ "URL returned hard-missing HTTP status 404."
571
+ ],
572
+ "evidence": {
573
+ "status_code": 404,
574
+ "final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
575
+ "metadata_pashto": true,
576
+ "direct_pashto": true,
577
+ "page_pashto": false
578
+ }
579
+ },
580
+ {
581
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
582
+ "id": "candidate-kaggle-dataset-hassanamin-pashto-ocr",
583
+ "title": "Pashto OCR",
584
+ "url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
585
+ "reasons": [
586
+ "URL returned hard-missing HTTP status 404."
587
+ ],
588
+ "evidence": {
589
+ "status_code": 404,
590
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
591
+ "metadata_pashto": true,
592
+ "direct_pashto": true,
593
+ "page_pashto": false
594
+ }
595
+ },
596
+ {
597
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
598
+ "id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
599
+ "title": "Pashto Poetry",
600
+ "url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
601
+ "reasons": [
602
+ "URL returned hard-missing HTTP status 404."
603
+ ],
604
+ "evidence": {
605
+ "status_code": 404,
606
+ "final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
607
+ "metadata_pashto": true,
608
+ "direct_pashto": true,
609
+ "page_pashto": false
610
+ }
611
+ },
612
+ {
613
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
614
+ "id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
615
+ "title": "Pashto text characters sample",
616
+ "url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
617
+ "reasons": [
618
+ "URL returned hard-missing HTTP status 404."
619
+ ],
620
+ "evidence": {
621
+ "status_code": 404,
622
+ "final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
623
+ "metadata_pashto": true,
624
+ "direct_pashto": true,
625
+ "page_pashto": false
626
+ }
627
+ },
628
+ {
629
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
630
+ "id": "candidate-kaggle-dataset-drijaz-pashto-word-embeddings",
631
+ "title": "Pashto Word Embeddings",
632
+ "url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
633
+ "reasons": [
634
+ "URL returned hard-missing HTTP status 404."
635
+ ],
636
+ "evidence": {
637
+ "status_code": 404,
638
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
639
+ "metadata_pashto": true,
640
+ "direct_pashto": true,
641
+ "page_pashto": false
642
+ }
643
+ },
644
+ {
645
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
646
+ "id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
647
+ "title": "pashto_language_alphabets",
648
+ "url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
649
+ "reasons": [
650
+ "URL returned hard-missing HTTP status 404."
651
+ ],
652
+ "evidence": {
653
+ "status_code": 404,
654
+ "final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
655
+ "metadata_pashto": true,
656
+ "direct_pashto": true,
657
+ "page_pashto": false
658
+ }
659
+ },
660
+ {
661
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
662
+ "id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
663
+ "title": "Pashto_language_characters",
664
+ "url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
665
+ "reasons": [
666
+ "URL returned hard-missing HTTP status 404."
667
+ ],
668
+ "evidence": {
669
+ "status_code": 404,
670
+ "final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
671
+ "metadata_pashto": true,
672
+ "direct_pashto": true,
673
+ "page_pashto": false
674
+ }
675
+ },
676
+ {
677
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
678
+ "id": "candidate-kaggle-dataset-drijaz-pashtoocr",
679
+ "title": "PashtoOCR",
680
+ "url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
681
+ "reasons": [
682
+ "URL returned hard-missing HTTP status 404."
683
+ ],
684
+ "evidence": {
685
+ "status_code": 404,
686
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
687
+ "metadata_pashto": true,
688
+ "direct_pashto": true,
689
+ "page_pashto": false
690
+ }
691
+ },
692
+ {
693
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
694
+ "id": "candidate-kaggle-dataset-drijaz-pold-pashto-offensive-language-dataset",
695
+ "title": "POLD - Pashto Offensive Language Dataset",
696
+ "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
697
+ "reasons": [
698
+ "URL returned hard-missing HTTP status 404."
699
+ ],
700
+ "evidence": {
701
+ "status_code": 404,
702
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
703
+ "metadata_pashto": true,
704
+ "direct_pashto": true,
705
+ "page_pashto": false
706
+ }
707
+ },
708
+ {
709
+ "removed_on": "2026-02-21T20:13:47.457104+00:00",
710
+ "id": "candidate-kaggle-dataset-shafeeqgigyani-urdu-pashto-lexicon-dataset",
711
+ "title": "Urdu-Pashto Lexicon Dataset",
712
+ "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
713
+ "reasons": [
714
+ "URL returned hard-missing HTTP status 404."
715
+ ],
716
+ "evidence": {
717
+ "status_code": 404,
718
+ "final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
719
+ "metadata_pashto": true,
720
+ "direct_pashto": true,
721
+ "page_pashto": false
722
+ }
723
+ },
724
+ {
725
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
726
+ "id": "candidate-kaggle-dataset-ataullahaali-common-voice-24-0-pashto-speech-dataset",
727
+ "title": "Common Voice 24.0: Pashto Speech Dataset",
728
+ "url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
729
+ "reasons": [
730
+ "URL returned hard-missing HTTP status 404."
731
+ ],
732
+ "evidence": {
733
+ "status_code": 404,
734
+ "final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
735
+ "metadata_pashto": true,
736
+ "direct_pashto": true,
737
+ "page_pashto": false
738
+ }
739
+ },
740
+ {
741
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
742
+ "id": "candidate-kaggle-dataset-rabiakhan827-english-pashto-language-dataset-epld",
743
+ "title": "English-Pashto Language Dataset (EPLD)",
744
+ "url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
745
+ "reasons": [
746
+ "URL returned hard-missing HTTP status 404."
747
+ ],
748
+ "evidence": {
749
+ "status_code": 404,
750
+ "final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
751
+ "metadata_pashto": true,
752
+ "direct_pashto": true,
753
+ "page_pashto": false
754
+ }
755
+ },
756
+ {
757
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
758
+ "id": "candidate-kaggle-dataset-hassanamin-katib-s-pashto-text-imagebase-kpti",
759
+ "title": "Katib's Pashto Text Imagebase (KPTI)",
760
+ "url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
761
+ "reasons": [
762
+ "URL returned hard-missing HTTP status 404."
763
+ ],
764
+ "evidence": {
765
+ "status_code": 404,
766
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
767
+ "metadata_pashto": true,
768
+ "direct_pashto": true,
769
+ "page_pashto": false
770
+ }
771
+ },
772
+ {
773
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
774
+ "id": "candidate-kaggle-dataset-farhadkhan66-pashto-english-bilingual-sentiment-corpus",
775
+ "title": "Pashto English Bilingual Sentiment Corpus",
776
+ "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
777
+ "reasons": [
778
+ "URL returned hard-missing HTTP status 404."
779
+ ],
780
+ "evidence": {
781
+ "status_code": 404,
782
+ "final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
783
+ "metadata_pashto": true,
784
+ "direct_pashto": true,
785
+ "page_pashto": false
786
+ }
787
+ },
788
+ {
789
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
790
+ "id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
791
+ "title": "Pashto Isolated Alphabets and Numerals",
792
+ "url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
793
+ "reasons": [
794
+ "URL returned hard-missing HTTP status 404."
795
+ ],
796
+ "evidence": {
797
+ "status_code": 404,
798
+ "final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
799
+ "metadata_pashto": true,
800
+ "direct_pashto": true,
801
+ "page_pashto": false
802
+ }
803
+ },
804
+ {
805
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
806
+ "id": "candidate-kaggle-dataset-engrirf-pashto-isolated-words-speech-dataset",
807
+ "title": "Pashto Isolated Words Speech Dataset",
808
+ "url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
809
+ "reasons": [
810
+ "URL returned hard-missing HTTP status 404."
811
+ ],
812
+ "evidence": {
813
+ "status_code": 404,
814
+ "final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
815
+ "metadata_pashto": true,
816
+ "direct_pashto": true,
817
+ "page_pashto": false
818
+ }
819
+ },
820
+ {
821
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
822
+ "id": "candidate-kaggle-dataset-hassanamin-pashto-ocr",
823
+ "title": "Pashto OCR",
824
+ "url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
825
+ "reasons": [
826
+ "URL returned hard-missing HTTP status 404."
827
+ ],
828
+ "evidence": {
829
+ "status_code": 404,
830
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
831
+ "metadata_pashto": true,
832
+ "direct_pashto": true,
833
+ "page_pashto": false
834
+ }
835
+ },
836
+ {
837
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
838
+ "id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
839
+ "title": "Pashto Poetry",
840
+ "url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
841
+ "reasons": [
842
+ "URL returned hard-missing HTTP status 404."
843
+ ],
844
+ "evidence": {
845
+ "status_code": 404,
846
+ "final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
847
+ "metadata_pashto": true,
848
+ "direct_pashto": true,
849
+ "page_pashto": false
850
+ }
851
+ },
852
+ {
853
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
854
+ "id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
855
+ "title": "Pashto text characters sample",
856
+ "url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
857
+ "reasons": [
858
+ "URL returned hard-missing HTTP status 404."
859
+ ],
860
+ "evidence": {
861
+ "status_code": 404,
862
+ "final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
863
+ "metadata_pashto": true,
864
+ "direct_pashto": true,
865
+ "page_pashto": false
866
+ }
867
+ },
868
+ {
869
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
870
+ "id": "candidate-kaggle-dataset-drijaz-pashto-word-embeddings",
871
+ "title": "Pashto Word Embeddings",
872
+ "url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
873
+ "reasons": [
874
+ "URL returned hard-missing HTTP status 404."
875
+ ],
876
+ "evidence": {
877
+ "status_code": 404,
878
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
879
+ "metadata_pashto": true,
880
+ "direct_pashto": true,
881
+ "page_pashto": false
882
+ }
883
+ },
884
+ {
885
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
886
+ "id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
887
+ "title": "pashto_language_alphabets",
888
+ "url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
889
+ "reasons": [
890
+ "URL returned hard-missing HTTP status 404."
891
+ ],
892
+ "evidence": {
893
+ "status_code": 404,
894
+ "final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
895
+ "metadata_pashto": true,
896
+ "direct_pashto": true,
897
+ "page_pashto": false
898
+ }
899
+ },
900
+ {
901
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
902
+ "id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
903
+ "title": "Pashto_language_characters",
904
+ "url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
905
+ "reasons": [
906
+ "URL returned hard-missing HTTP status 404."
907
+ ],
908
+ "evidence": {
909
+ "status_code": 404,
910
+ "final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
911
+ "metadata_pashto": true,
912
+ "direct_pashto": true,
913
+ "page_pashto": false
914
+ }
915
+ },
916
+ {
917
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
918
+ "id": "candidate-kaggle-dataset-drijaz-pashtoocr",
919
+ "title": "PashtoOCR",
920
+ "url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
921
+ "reasons": [
922
+ "URL returned hard-missing HTTP status 404."
923
+ ],
924
+ "evidence": {
925
+ "status_code": 404,
926
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
927
+ "metadata_pashto": true,
928
+ "direct_pashto": true,
929
+ "page_pashto": false
930
+ }
931
+ },
932
+ {
933
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
934
+ "id": "candidate-kaggle-dataset-drijaz-pold-pashto-offensive-language-dataset",
935
+ "title": "POLD - Pashto Offensive Language Dataset",
936
+ "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
937
+ "reasons": [
938
+ "URL returned hard-missing HTTP status 404."
939
+ ],
940
+ "evidence": {
941
+ "status_code": 404,
942
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
943
+ "metadata_pashto": true,
944
+ "direct_pashto": true,
945
+ "page_pashto": false
946
+ }
947
+ },
948
+ {
949
+ "removed_on": "2026-02-21T20:27:10.672699+00:00",
950
+ "id": "candidate-kaggle-dataset-shafeeqgigyani-urdu-pashto-lexicon-dataset",
951
+ "title": "Urdu-Pashto Lexicon Dataset",
952
+ "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
953
+ "reasons": [
954
+ "URL returned hard-missing HTTP status 404."
955
+ ],
956
+ "evidence": {
957
+ "status_code": 404,
958
+ "final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
959
+ "metadata_pashto": true,
960
+ "direct_pashto": true,
961
+ "page_pashto": false
962
+ }
963
+ },
964
+ {
965
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
966
+ "id": "dataset-kaggle-pashto-isolated-words",
967
+ "title": "Pashto Isolated Words Speech Dataset",
968
+ "url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
969
+ "reasons": [
970
+ "URL returned hard-missing HTTP status 404."
971
+ ],
972
+ "evidence": {
973
+ "status_code": 404,
974
+ "final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
975
+ "metadata_pashto": true,
976
+ "direct_pashto": true,
977
+ "page_pashto": false
978
+ }
979
+ },
980
+ {
981
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
982
+ "id": "dataset-kaggle-pashto-word-embeddings",
983
+ "title": "Pashto Word Embeddings",
984
+ "url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
985
+ "reasons": [
986
+ "URL returned hard-missing HTTP status 404."
987
+ ],
988
+ "evidence": {
989
+ "status_code": 404,
990
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
991
+ "metadata_pashto": true,
992
+ "direct_pashto": true,
993
+ "page_pashto": false
994
+ }
995
+ },
996
+ {
997
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
998
+ "id": "dataset-kaggle-pold-pashto-offensive",
999
+ "title": "POLD - Pashto Offensive Language Dataset",
1000
+ "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
1001
+ "reasons": [
1002
+ "URL returned hard-missing HTTP status 404."
1003
+ ],
1004
+ "evidence": {
1005
+ "status_code": 404,
1006
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
1007
+ "metadata_pashto": true,
1008
+ "direct_pashto": true,
1009
+ "page_pashto": false
1010
+ }
1011
+ },
1012
+ {
1013
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1014
+ "id": "dataset-kaggle-pashto-english-sentiment-corpus",
1015
+ "title": "Pashto English Bilingual Sentiment Corpus",
1016
+ "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
1017
+ "reasons": [
1018
+ "URL returned hard-missing HTTP status 404."
1019
+ ],
1020
+ "evidence": {
1021
+ "status_code": 404,
1022
+ "final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
1023
+ "metadata_pashto": true,
1024
+ "direct_pashto": true,
1025
+ "page_pashto": false
1026
+ }
1027
+ },
1028
+ {
1029
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1030
+ "id": "dataset-kaggle-urdu-pashto-lexicon",
1031
+ "title": "Urdu-Pashto Lexicon Dataset",
1032
+ "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
1033
+ "reasons": [
1034
+ "URL returned hard-missing HTTP status 404."
1035
+ ],
1036
+ "evidence": {
1037
+ "status_code": 404,
1038
+ "final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
1039
+ "metadata_pashto": true,
1040
+ "direct_pashto": true,
1041
+ "page_pashto": false
1042
+ }
1043
+ },
1044
+ {
1045
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1046
+ "id": "dataset-kaggle-drijaz-pashtoocr",
1047
+ "title": "PashtoOCR (Kaggle)",
1048
+ "url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
1049
+ "reasons": [
1050
+ "URL returned hard-missing HTTP status 404."
1051
+ ],
1052
+ "evidence": {
1053
+ "status_code": 404,
1054
+ "final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
1055
+ "metadata_pashto": true,
1056
+ "direct_pashto": true,
1057
+ "page_pashto": false
1058
+ }
1059
+ },
1060
+ {
1061
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1062
+ "id": "dataset-kaggle-english-pashto-language-dataset-epld",
1063
+ "title": "English-Pashto Language Dataset (EPLD)",
1064
+ "url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
1065
+ "reasons": [
1066
+ "URL returned hard-missing HTTP status 404."
1067
+ ],
1068
+ "evidence": {
1069
+ "status_code": 404,
1070
+ "final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
1071
+ "metadata_pashto": true,
1072
+ "direct_pashto": true,
1073
+ "page_pashto": false
1074
+ }
1075
+ },
1076
+ {
1077
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1078
+ "id": "dataset-kaggle-katib-s-pashto-text-imagebase-kpti",
1079
+ "title": "Katib's Pashto Text Imagebase (KPTI)",
1080
+ "url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
1081
+ "reasons": [
1082
+ "URL returned hard-missing HTTP status 404."
1083
+ ],
1084
+ "evidence": {
1085
+ "status_code": 404,
1086
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
1087
+ "metadata_pashto": true,
1088
+ "direct_pashto": true,
1089
+ "page_pashto": false
1090
+ }
1091
+ },
1092
+ {
1093
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1094
+ "id": "dataset-kaggle-pashto-ocr",
1095
+ "title": "Pashto OCR",
1096
+ "url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
1097
+ "reasons": [
1098
+ "URL returned hard-missing HTTP status 404."
1099
+ ],
1100
+ "evidence": {
1101
+ "status_code": 404,
1102
+ "final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
1103
+ "metadata_pashto": true,
1104
+ "direct_pashto": true,
1105
+ "page_pashto": false
1106
+ }
1107
+ },
1108
+ {
1109
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1110
+ "id": "dataset-kaggle-common-voice-24-0-pashto-speech-dataset",
1111
+ "title": "Common Voice 24.0: Pashto Speech Dataset",
1112
+ "url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
1113
+ "reasons": [
1114
+ "URL returned hard-missing HTTP status 404."
1115
+ ],
1116
+ "evidence": {
1117
+ "status_code": 404,
1118
+ "final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
1119
+ "metadata_pashto": true,
1120
+ "direct_pashto": true,
1121
+ "page_pashto": false
1122
+ }
1123
+ },
1124
+ {
1125
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1126
+ "id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
1127
+ "title": "Pashto Isolated Alphabets and Numerals",
1128
+ "url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
1129
+ "reasons": [
1130
+ "URL returned hard-missing HTTP status 404."
1131
+ ],
1132
+ "evidence": {
1133
+ "status_code": 404,
1134
+ "final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
1135
+ "metadata_pashto": true,
1136
+ "direct_pashto": true,
1137
+ "page_pashto": false
1138
+ }
1139
+ },
1140
+ {
1141
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1142
+ "id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
1143
+ "title": "Pashto Poetry",
1144
+ "url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
1145
+ "reasons": [
1146
+ "URL returned hard-missing HTTP status 404."
1147
+ ],
1148
+ "evidence": {
1149
+ "status_code": 404,
1150
+ "final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
1151
+ "metadata_pashto": true,
1152
+ "direct_pashto": true,
1153
+ "page_pashto": false
1154
+ }
1155
+ },
1156
+ {
1157
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1158
+ "id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
1159
+ "title": "Pashto text characters sample",
1160
+ "url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
1161
+ "reasons": [
1162
+ "URL returned hard-missing HTTP status 404."
1163
+ ],
1164
+ "evidence": {
1165
+ "status_code": 404,
1166
+ "final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
1167
+ "metadata_pashto": true,
1168
+ "direct_pashto": true,
1169
+ "page_pashto": false
1170
+ }
1171
+ },
1172
+ {
1173
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1174
+ "id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
1175
+ "title": "pashto_language_alphabets",
1176
+ "url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
1177
+ "reasons": [
1178
+ "URL returned hard-missing HTTP status 404."
1179
+ ],
1180
+ "evidence": {
1181
+ "status_code": 404,
1182
+ "final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
1183
+ "metadata_pashto": true,
1184
+ "direct_pashto": true,
1185
+ "page_pashto": false
1186
+ }
1187
+ },
1188
+ {
1189
+ "removed_on": "2026-02-21T20:47:45.952635+00:00",
1190
+ "id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
1191
+ "title": "Pashto_language_characters",
1192
+ "url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
1193
+ "reasons": [
1194
+ "URL returned hard-missing HTTP status 404."
1195
+ ],
1196
+ "evidence": {
1197
+ "status_code": 404,
1198
+ "final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
1199
+ "metadata_pashto": true,
1200
+ "direct_pashto": true,
1201
+ "page_pashto": false
1202
+ }
1203
+ }
1204
+ ]
1205
+ }
resources/catalog/resources.json CHANGED
The diff for this file is too large to render. See raw diff
 
resources/codes/README.md CHANGED
@@ -4,7 +4,10 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
 
 
7
  | nlpashto Toolkit | [github](https://github.com/ijazul-haq/nlpashto) | [Repository name and description explicitly identify a Pashto NLP toolkit. (`Pashto`, `NLP`)](https://api.github.com/repos/ijazul-haq/nlpashto) | Pashto NLP code integration and experimentation |
 
8
 
9
  ## Maintenance
10
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | LGUG2Z/tashkil | [github](https://github.com/LGUG2Z/tashkil) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/LGUG2Z/tashkil) | Automated discovery entry for Pashto resource tracking. |
8
+ | mrychlik/worldly-ocr | [github](https://github.com/mrychlik/worldly-ocr) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/mrychlik/worldly-ocr) | Automated discovery entry for Pashto resource tracking. |
9
  | nlpashto Toolkit | [github](https://github.com/ijazul-haq/nlpashto) | [Repository name and description explicitly identify a Pashto NLP toolkit. (`Pashto`, `NLP`)](https://api.github.com/repos/ijazul-haq/nlpashto) | Pashto NLP code integration and experimentation |
10
+ | sinaahmadi/PersoArabicLID | [github](https://github.com/sinaahmadi/PersoArabicLID) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/sinaahmadi/PersoArabicLID) | Automated discovery entry for Pashto resource tracking. |
11
 
12
  ## Maintenance
13
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
resources/datasets/README.md CHANGED
@@ -14,10 +14,9 @@
14
  | arsalagrey/pashto-books | [huggingface](https://huggingface.co/datasets/arsalagrey/pashto-books) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/arsalagrey/pashto-books) | Automated discovery entry for Pashto resource tracking. |
15
  | arsalagrey/pashto-books-json | [huggingface](https://huggingface.co/datasets/arsalagrey/pashto-books-json) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/arsalagrey/pashto-books-json) | Automated discovery entry for Pashto resource tracking. |
16
  | Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
17
- | Common Voice 24.0: Pashto Speech Dataset | [kaggle](https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto) | ASR training and evaluation data source |
18
  | Common Voice Scripted Speech 24.0 - Pashto | [mozilla](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | [Official dataset page is for Pashto. (`Pashto`)](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | ASR training and evaluation |
19
  | English to Pashto Sentences Dataset | [huggingface](https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset) | [Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column. (`Pashto`)](https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset) | MT and bilingual sentence alignment baseline |
20
- | English-Pashto Language Dataset (EPLD) | [kaggle](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | Machine translation and bilingual corpus development |
21
  | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
22
  | IARPA Babel Pashto Language Pack IARPA-babel104b-v0.4bY | [dataverse](https://hdl.handle.net/11272.1/AB2/GLFN3X) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/GLFN3X) | Pashto speech dataset for ASR and language identification experiments |
23
  | ihanif/pashto_asr_wer | [huggingface](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | ASR training and evaluation data source |
@@ -27,29 +26,28 @@
27
  | ihanif/pashto_speech_5k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | ASR training and evaluation data source |
28
  | ihanif/pashto_speech_ds | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | ASR training and evaluation data source |
29
  | ihanif/pashto_speech_parquet_10k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | ASR training and evaluation data source |
30
- | Katib's Pashto Text Imagebase (KPTI) | [kaggle](https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti) | OCR training and evaluation data source |
31
  | koochikoo25/Pashto-Concatenated | [huggingface](https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated) | [Dataset title explicitly states Pashto and card metadata exposes audio-text features and splits. (`Pashto`, `audio`, `transcription`)](https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated) | ASR dataset preparation and split-based benchmark experiments |
 
 
32
  | oowais/pushto-text-to-speech-dataset | [huggingface](https://huggingface.co/datasets/oowais/pushto-text-to-speech-dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/oowais/pushto-text-to-speech-dataset) | ASR training and evaluation data source |
33
  | OPED (Open Pashto-English Dictionary): Preliminary version, 30 October 2025 | [zenodo](https://zenodo.org/records/17487678) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/17487678) | Automated discovery entry for Pashto resource tracking. |
34
  | OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
35
  | OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
36
- | Pashto English Bilingual Sentiment Corpus | [kaggle](https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus) | [Kaggle dataset title and description identify the corpus as Pashto-English sentiment data. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus) | Sentiment analysis and bilingual NLP experiments |
37
- | Pashto Isolated Alphabets and Numerals | [kaggle](https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals) | Automated discovery entry for Pashto resource tracking. |
38
- | Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
39
- | Pashto OCR | [kaggle](https://www.kaggle.com/datasets/hassanamin/pashto-ocr) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/hassanamin/pashto-ocr) | OCR training and evaluation data source |
40
- | Pashto Poetry | [kaggle](https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry) | Automated discovery entry for Pashto resource tracking. |
41
- | Pashto text characters sample | [kaggle](https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample) | Automated discovery entry for Pashto resource tracking. |
42
  | Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
43
- | Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
44
- | pashto_language_alphabets | [kaggle](https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets) | Automated discovery entry for Pashto resource tracking. |
45
- | Pashto_language_characters | [kaggle](https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters) | Automated discovery entry for Pashto resource tracking. |
46
- | PashtoOCR (Kaggle) | [kaggle](https://www.kaggle.com/datasets/drijaz/pashtoocr) | [Kaggle dataset title and subtitle explicitly identify a Pashto OCR dataset. (`Pashto`, `OCR`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pashtoocr) | Pashto OCR dataset benchmarking and training |
47
- | POLD - Pashto Offensive Language Dataset | [kaggle](https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset) | [Kaggle title and description explicitly state Pashto offensive language benchmark dataset. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset) | Pashto toxicity and moderation NLP benchmarks |
48
  | saillab/alpaca_pashto_taco | [huggingface](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | Instruction tuning and LLM adaptation data source |
49
  | SherwinDesouza/pashto-common-voice-20 | [huggingface](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | Pashto data source for NLP experimentation |
 
50
  | tasal9/Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/Pashto_Dataset) | Pashto data source for NLP experimentation |
51
  | tasal9/ZamAI_Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | Pashto data source for NLP experimentation |
52
- | Urdu-Pashto Lexicon Dataset | [kaggle](https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset) | [Kaggle metadata describes 7,601 Urdu entries with Pashto translations. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset) | Lexicon and translation lexeme mapping |
 
 
 
 
53
  | Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
54
  | Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
55
 
 
14
  | arsalagrey/pashto-books | [huggingface](https://huggingface.co/datasets/arsalagrey/pashto-books) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/arsalagrey/pashto-books) | Automated discovery entry for Pashto resource tracking. |
15
  | arsalagrey/pashto-books-json | [huggingface](https://huggingface.co/datasets/arsalagrey/pashto-books-json) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/arsalagrey/pashto-books-json) | Automated discovery entry for Pashto resource tracking. |
16
  | Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
17
+ | Clitic Particles and the Typology of 2P Languages | [zenodo](https://zenodo.org/records/15010591) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15010591) | Automated discovery entry for Pashto resource tracking. |
18
  | Common Voice Scripted Speech 24.0 - Pashto | [mozilla](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | [Official dataset page is for Pashto. (`Pashto`)](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | ASR training and evaluation |
19
  | English to Pashto Sentences Dataset | [huggingface](https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset) | [Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column. (`Pashto`)](https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset) | MT and bilingual sentence alignment baseline |
 
20
  | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
21
  | IARPA Babel Pashto Language Pack IARPA-babel104b-v0.4bY | [dataverse](https://hdl.handle.net/11272.1/AB2/GLFN3X) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/GLFN3X) | Pashto speech dataset for ASR and language identification experiments |
22
  | ihanif/pashto_asr_wer | [huggingface](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | ASR training and evaluation data source |
 
26
  | ihanif/pashto_speech_5k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | ASR training and evaluation data source |
27
  | ihanif/pashto_speech_ds | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | ASR training and evaluation data source |
28
  | ihanif/pashto_speech_parquet_10k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | ASR training and evaluation data source |
 
29
  | koochikoo25/Pashto-Concatenated | [huggingface](https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated) | [Dataset title explicitly states Pashto and card metadata exposes audio-text features and splits. (`Pashto`, `audio`, `transcription`)](https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated) | ASR dataset preparation and split-based benchmark experiments |
30
+ | Multi-Language Conversational Telephone Speech 2011 -- Central Asian | [dataverse](https://hdl.handle.net/11272.1/AB2/YW9PX3) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/YW9PX3) | Automated discovery entry for Pashto resource tracking. |
31
+ | NAVOIY-TERRA Corpus v1.0: First Computational Corpus of Alisher Navoi Works with Nine-Language Semantic Annotations | [datacite](https://zenodo.org/doi/10.5281/zenodo.18602634) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18602634) | Automated discovery entry for Pashto resource tracking. |
32
  | oowais/pushto-text-to-speech-dataset | [huggingface](https://huggingface.co/datasets/oowais/pushto-text-to-speech-dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/oowais/pushto-text-to-speech-dataset) | ASR training and evaluation data source |
33
  | OPED (Open Pashto-English Dictionary): Preliminary version, 30 October 2025 | [zenodo](https://zenodo.org/records/17487678) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/17487678) | Automated discovery entry for Pashto resource tracking. |
34
  | OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
35
  | OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
 
 
 
 
 
 
36
  | Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
37
+ | RATS Language Identification | [dataverse](https://hdl.handle.net/11272.1/AB2/UP3WJC) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/UP3WJC) | Automated discovery entry for Pashto resource tracking. |
38
+ | RATS Low Speech Density | [dataverse](https://doi.org/10.35111/4ENA-FG30) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://doi.org/10.35111/4ENA-FG30) | Automated discovery entry for Pashto resource tracking. |
39
+ | RATS Speaker Identification | [dataverse](https://doi.org/10.35111/ZQET-2102) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://doi.org/10.35111/ZQET-2102) | Automated discovery entry for Pashto resource tracking. |
40
+ | RATS Speech Activity Detection | [dataverse](https://hdl.handle.net/11272.1/AB2/1UISJ7) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/1UISJ7) | Automated discovery entry for Pashto resource tracking. |
 
41
  | saillab/alpaca_pashto_taco | [huggingface](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | Instruction tuning and LLM adaptation data source |
42
  | SherwinDesouza/pashto-common-voice-20 | [huggingface](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | Pashto data source for NLP experimentation |
43
+ | SMAjram: A Large-Scale Synthetic OCR Dataset for Punjabi Shahmukhi (Perso-Arabic) Script | [zenodo](https://zenodo.org/records/15868719) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15868719) | Automated discovery entry for Pashto resource tracking. |
44
  | tasal9/Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/Pashto_Dataset) | Pashto data source for NLP experimentation |
45
  | tasal9/ZamAI_Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | Pashto data source for NLP experimentation |
46
+ | Towards a Typology of Endoclitics | [zenodo](https://zenodo.org/records/15041544) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15041544) | Automated discovery entry for Pashto resource tracking. |
47
+ | TRAD Arabic-French Parallel Text -- Newsgroup | [dataverse](https://hdl.handle.net/11272.1/AB2/0DET8M) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/0DET8M) | Automated discovery entry for Pashto resource tracking. |
48
+ | TRAD Arabic-French Parallel Text -- Newswire | [dataverse](https://doi.org/10.35111/Z1WG-9X78) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://doi.org/10.35111/Z1WG-9X78) | Automated discovery entry for Pashto resource tracking. |
49
+ | TRAD Chinese-French Parallel Text -- Blog | [dataverse](https://hdl.handle.net/11272.1/AB2/ATYE6I) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/ATYE6I) | Automated discovery entry for Pashto resource tracking. |
50
+ | TRAD Chinese-French Parallel Text -- Broadcast News | [dataverse](https://doi.org/10.35111/7FW4-EV85) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://doi.org/10.35111/7FW4-EV85) | Automated discovery entry for Pashto resource tracking. |
51
  | Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
52
  | Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
53
 
resources/models/README.md CHANGED
@@ -13,6 +13,7 @@
13
  | ihanif/whisper-small-pashto-dropout | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto-dropout) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto-dropout) | Pashto ASR baseline and model comparison |
14
  | ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
15
  | ijazulhaq/bert-base-pashto | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto) | Pashto model baseline for downstream NLP tasks |
 
16
  | ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
17
  | Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1 | [huggingface](https://huggingface.co/Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1) | Automated discovery entry for Pashto resource tracking. |
18
  | koochikoo25/pashto-whisper-large | [huggingface](https://huggingface.co/koochikoo25/pashto-whisper-large) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/koochikoo25/pashto-whisper-large) | Pashto ASR baseline and model comparison |
 
13
  | ihanif/whisper-small-pashto-dropout | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto-dropout) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto-dropout) | Pashto ASR baseline and model comparison |
14
  | ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
15
  | ijazulhaq/bert-base-pashto | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto) | Pashto model baseline for downstream NLP tasks |
16
+ | ijazulhaq/bert-base-pashto-c | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-c) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-c) | Automated discovery entry for Pashto resource tracking. |
17
  | ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
18
  | Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1 | [huggingface](https://huggingface.co/Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1) | Automated discovery entry for Pashto resource tracking. |
19
  | koochikoo25/pashto-whisper-large | [huggingface](https://huggingface.co/koochikoo25/pashto-whisper-large) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/koochikoo25/pashto-whisper-large) | Pashto ASR baseline and model comparison |
resources/papers/README.md CHANGED
@@ -4,30 +4,110 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
 
 
 
 
 
 
 
 
 
 
 
7
  | Benchmark Pashto Handwritten Character Dataset and Pashto Object Character Recognition (OCR) Using Deep Neural Network with Rule Activation Function | [openalex](https://doi.org/10.1155/2021/6669672) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1155/2021/6669672) | Pashto handwritten OCR benchmark and methodology reference |
8
  | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
 
 
 
 
 
 
9
  | Database development and automatic speech recognition of isolated Pashto spoken digits using MFCC and K-NN | [openalex](https://doi.org/10.1007/s10772-014-9267-z) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1007/s10772-014-9267-z) | Pashto ASR baseline method reference for digit recognition |
10
  | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
 
 
11
  | Development of a New Image-to-text Conversion System for Pashto, Farsi and Traditional Chinese | [arxiv](http://arxiv.org/abs/2005.08650v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2005.08650v1) | Pashto OCR method reference |
 
 
 
 
 
 
 
12
  | Enhancing Pashto NER Using Machine-Labeled Data and Transformer-Based Models | [other](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | Pashto research reference for methods and benchmarking |
13
  | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
 
 
 
 
 
 
 
 
14
  | From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset | [arxiv](http://arxiv.org/abs/2602.14062v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2602.14062v1) | ASR data quality and release trend reference |
 
 
 
 
 
 
15
  | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
 
16
  | KPTI: Katib's Pashto Text Imagebase and Deep Learning Benchmark | [openalex](https://doi.org/10.1109/icfhr.2016.0090) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icfhr.2016.0090) | Pashto OCR dataset and benchmarking reference |
 
 
 
 
 
 
17
  | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
 
 
 
 
 
18
  | Pashto isolated digits recognition using deep convolutional neural network | [openalex](https://doi.org/10.1016/j.heliyon.2020.e03372) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1016/j.heliyon.2020.e03372) | Pashto speech recognition research reference |
 
19
  | Pashto offensive language detection: a benchmark dataset and monolingual Pashto BERT | [openalex](https://doi.org/10.7717/peerj-cs.1617) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.7717/peerj-cs.1617) | Pashto NLP toxicity detection benchmark and model reference |
 
 
 
20
  | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
 
 
 
 
21
  | PHTI: Pashto Handwritten Text Imagebase for Deep Learning Applications | [openalex](https://doi.org/10.1109/access.2022.3216881) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/access.2022.3216881) | Pashto OCR dataset and benchmark reference |
22
  | Pioneer dataset and recognition of Handwritten Pashto characters using Convolution Neural Networks | [openalex](https://doi.org/10.1177/0020294020964826) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1177/0020294020964826) | Pashto handwritten character recognition reference |
 
23
  | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
24
  | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
 
 
25
  | Recognition of Pashto Handwritten Characters Based on Deep Learning | [openalex](https://doi.org/10.3390/s20205884) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.3390/s20205884) | Pashto OCR model reference for handwritten character recognition |
26
  | Recognizable units in Pashto language for OCR | [openalex](https://doi.org/10.1109/icdar.2015.7333963) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333963) | Pashto OCR preprocessing and unit-design reference |
27
  | Scale and rotation invariant OCR for Pashto cursive script using MDLSTM network | [openalex](https://doi.org/10.1109/icdar.2015.7333931) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333931) | Pashto OCR model architecture reference |
 
 
28
  | Shape analysis of Pashto script and creation of image database for OCR | [openalex](https://doi.org/10.1109/icet.2009.5353160) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icet.2009.5353160) | Pashto OCR dataset design and feature reference |
 
29
  | Speech translation for low-resource languages: the case of Pashto | [openalex](https://doi.org/10.21437/interspeech.2005-723) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.21437/interspeech.2005-723) | Pashto speech translation and low-resource MT reference |
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  | Tuning Traditional Language Processing Approaches for Pashto Text Classification | [arxiv](http://arxiv.org/abs/2305.03737v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2305.03737v1) | Pashto text classification method reference |
 
31
 
32
  ## Maintenance
33
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | (Pushto) Pakhto Nasar Kay Da Matbooa Tarjumo Yova Tanqeedi Mutala/Jaiza. | [other](https://www.semanticscholar.org/paper/0da0e8535262d1f26f04dd6bc2f091474cab4150) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/0da0e8535262d1f26f04dd6bc2f091474cab4150) | Automated discovery entry for Pashto resource tracking. |
8
+ | A Comparative Analysis of Pashto Ghazals and English Sonnets in 17th Century | [other](https://www.semanticscholar.org/paper/55b044485b2f134c69c9b9b6dfeaa7e71e704b3d) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/55b044485b2f134c69c9b9b6dfeaa7e71e704b3d) | Automated discovery entry for Pashto resource tracking. |
9
+ | A Dictionary of the Pukhto, Pushto, or Language of the Afghans | [other](https://www.semanticscholar.org/paper/777c0aa56991f55826339915363de2ceb8dd7141) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/777c0aa56991f55826339915363de2ceb8dd7141) | Automated discovery entry for Pashto resource tracking. |
10
+ | A dictionary of the Pukhto, Pushto, or language of the Afghans; with remarks on the originality of the language, and its affinity to the Semitic and other Oriental tongues, etc. | [other](https://www.semanticscholar.org/paper/d12502a6c245ff6f537bf68d9db4b449dca827bb) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/d12502a6c245ff6f537bf68d9db4b449dca827bb) | Automated discovery entry for Pashto resource tracking. |
11
+ | A grammar of the Puk̲h̲to or Pus̲'h̲to language | [other](https://www.semanticscholar.org/paper/99c46409a55ac0bf68e2c530a377becfcb46dd47) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/99c46409a55ac0bf68e2c530a377becfcb46dd47) | Automated discovery entry for Pashto resource tracking. |
12
+ | A New Etymological Vocabulary of Pashto | [openalex](https://openalex.org/W2071464713) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://openalex.org/W2071464713) | Automated discovery entry for Pashto resource tracking. |
13
+ | A reference grammar of Pashto | [openalex](http://wals.info/refdb/record/7189) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](http://wals.info/refdb/record/7189) | Automated discovery entry for Pashto resource tracking. |
14
+ | An Acoustic Analysis of consonants of Khattak Dialect of Pashto | [other](https://www.semanticscholar.org/paper/ed06d206e60a62c2bebdd487b4f8dea253a9a0a8) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/ed06d206e60a62c2bebdd487b4f8dea253a9a0a8) | Automated discovery entry for Pashto resource tracking. |
15
+ | AN ANALYSIS OF FREUDIAN CONCEPT OF MOURNING IN PASHTO TAPPAS ON THE THEME OF MIGRATION | [zenodo](https://zenodo.org/records/11124039) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/11124039) | Automated discovery entry for Pashto resource tracking. |
16
+ | An Analysis of the Syntactic and Pragmatic Effects on Word Order Flexibility in Pashto and English | [other](https://www.semanticscholar.org/paper/136c23f176399f7dfc45e6ae990a975aafd7da1d) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/136c23f176399f7dfc45e6ae990a975aafd7da1d) | Automated discovery entry for Pashto resource tracking. |
17
+ | Analysing Deep Meaning of Proverbs in Pashto Language | [other](https://www.semanticscholar.org/paper/1a804a9701c5103ed38df3350da61abdf5df2b57) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/1a804a9701c5103ed38df3350da61abdf5df2b57) | Automated discovery entry for Pashto resource tracking. |
18
  | Benchmark Pashto Handwritten Character Dataset and Pashto Object Character Recognition (OCR) Using Deep Neural Network with Rule Activation Function | [openalex](https://doi.org/10.1155/2021/6669672) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1155/2021/6669672) | Pashto handwritten OCR benchmark and methodology reference |
19
  | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
20
+ | CER-HV: A CER-Based Human-in-the-Loop Framework for Cleaning Datasets Applied to Arabic-Script HTR | [datacite](https://arxiv.org/abs/2601.16713) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://arxiv.org/abs/2601.16713) | Automated discovery entry for Pashto resource tracking. |
21
+ | CHALLENGING GENDER ROLES: A FEMINIST ANALYSIS OF GHANI KHAN'S THE PATHANS | [zenodo](https://zenodo.org/records/11216862) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/11216862) | Automated discovery entry for Pashto resource tracking. |
22
+ | Cinematic Misnomers: Examining the Effects of Pashto Movie Titles on the Perception of Pashtun Identity | [other](https://www.semanticscholar.org/paper/1b4c38ce4ceb6ac7846062bb589351cc88a36617) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/1b4c38ce4ceb6ac7846062bb589351cc88a36617) | Automated discovery entry for Pashto resource tracking. |
23
+ | Comparative Study of Adjectives in Pashto and Dari as Cognate Languages | [other](https://www.semanticscholar.org/paper/558e9dd7d4027be391a39f5e5ef988cf05039dc7) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/558e9dd7d4027be391a39f5e5ef988cf05039dc7) | Automated discovery entry for Pashto resource tracking. |
24
+ | Comprehensive Socio-phonetic Study of the Plosive /p/ and Fricative /f/ Merger among Pashto Speakers in Khyber Pakhtunkhwa | [other](https://www.semanticscholar.org/paper/4f01f2250c897dc53099f76a2455471b480f22cf) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/4f01f2250c897dc53099f76a2455471b480f22cf) | Automated discovery entry for Pashto resource tracking. |
25
+ | Critical study of the travelogues of Dr Altaf Yousafzai (In The Context of "Thailand kay Rang", "Nile kay Sang" and "Bakhal-e-Hinduwush Bakhsham") | [zenodo](https://zenodo.org/records/13937101) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/13937101) | Automated discovery entry for Pashto resource tracking. |
26
  | Database development and automatic speech recognition of isolated Pashto spoken digits using MFCC and K-NN | [openalex](https://doi.org/10.1007/s10772-014-9267-z) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1007/s10772-014-9267-z) | Pashto ASR baseline method reference for digit recognition |
27
  | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
28
+ | Depiction of Women's Cries in Pashto Landai Poetry | [zenodo](https://zenodo.org/records/15524281) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15524281) | Automated discovery entry for Pashto resource tracking. |
29
+ | Descriptive Grammar of Pashto and its Dialects | [openalex](https://doi.org/10.1515/9781614512318) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1515/9781614512318) | Automated discovery entry for Pashto resource tracking. |
30
  | Development of a New Image-to-text Conversion System for Pashto, Farsi and Traditional Chinese | [arxiv](http://arxiv.org/abs/2005.08650v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2005.08650v1) | Pashto OCR method reference |
31
+ | Divorce And Women’s Rights: Should Women have Equal Rights? (Pashto) | [datacite](https://opendata.uni-halle.de//handle/1981185920/123991) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de//handle/1981185920/123991) | Automated discovery entry for Pashto resource tracking. |
32
+ | Doing Pashto | [crossref](https://doi.org/10.1080/02690055.2015.1068987) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.1080/02690055.2015.1068987) | Automated discovery entry for Pashto resource tracking. |
33
+ | Editorial Note (Dari) | [datacite](https://opendata.uni-halle.de//handle/1981185920/123994) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de//handle/1981185920/123994) | Automated discovery entry for Pashto resource tracking. |
34
+ | Editorial Note (English) | [datacite](https://opendata.uni-halle.de//handle/1981185920/123993) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de//handle/1981185920/123993) | Automated discovery entry for Pashto resource tracking. |
35
+ | Editorial Note (Pashto) | [datacite](https://opendata.uni-halle.de//handle/1981185920/123995) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de//handle/1981185920/123995) | Automated discovery entry for Pashto resource tracking. |
36
+ | EDUCATIONAL AND LINGUISTIC ASPECTS OF TEXT PREPROCESSING IN PASHTO | [zenodo](https://zenodo.org/records/15917449) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15917449) | Automated discovery entry for Pashto resource tracking. |
37
+ | Embedding Elements from Foreign Language into The Native Language Through Pashto-English Code-Mixed Speech | [zenodo](https://zenodo.org/records/14756994) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/14756994) | Automated discovery entry for Pashto resource tracking. |
38
  | Enhancing Pashto NER Using Machine-Labeled Data and Transformer-Based Models | [other](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | Pashto research reference for methods and benchmarking |
39
  | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
40
+ | Essential Skills for a Lexicographer: Based on Pashto Lexicography | [other](https://www.semanticscholar.org/paper/8fc45aa567cb78713e2fef41d5e748e8ee1d8470) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/8fc45aa567cb78713e2fef41d5e748e8ee1d8470) | Automated discovery entry for Pashto resource tracking. |
41
+ | EVALUATION OF ANTIBACTERIAL ACTIVITY OF ZIZYPHUS JUJUBA | [zenodo](https://zenodo.org/records/3595881) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/3595881) | Automated discovery entry for Pashto resource tracking. |
42
+ | EVALUATION OF ANTIPYRETIC ACTIVITY OF ZIZYPHUS JUJUBA LAM. LEAVES ON ALBINO RATS | [zenodo](https://zenodo.org/records/4269214) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/4269214) | Automated discovery entry for Pashto resource tracking. |
43
+ | Exploring Hospitality as a Cultural Tradition: A Qualitative Study of Pashto and Hindko Customs | [zenodo](https://zenodo.org/records/14872725) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/14872725) | Automated discovery entry for Pashto resource tracking. |
44
+ | Exploring the Impacts of Emotion through Language Learning on Pashto Speakers Young Adulthood in District Peshawar | [other](https://www.semanticscholar.org/paper/4549649112553aabccfac8b918c7e98cdbdd0f09) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/4549649112553aabccfac8b918c7e98cdbdd0f09) | Automated discovery entry for Pashto resource tracking. |
45
+ | Fairness Evaluation and Inference Level Mitigation in LLMs | [datacite](https://figshare.mq.edu.au/articles/thesis/Fairness_Evaluation_and_Inference_Level_Mitigation_in_LLMs/31093552/1) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://figshare.mq.edu.au/articles/thesis/Fairness_Evaluation_and_Inference_Level_Mitigation_in_LLMs/31093552/1) | Automated discovery entry for Pashto resource tracking. |
46
+ | Fragments of life in ‘death world’: an analysis of Pashto poetry as a non-violent resistance to necropolitics | [other](https://www.semanticscholar.org/paper/9726f372b07f677fad23e2ee27a7f50f985e8ed8) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/9726f372b07f677fad23e2ee27a7f50f985e8ed8) | Automated discovery entry for Pashto resource tracking. |
47
+ | Framing Political Bias in Multilingual LLMs Across Pakistani Languages | [datacite](https://arxiv.org/abs/2506.00068) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://arxiv.org/abs/2506.00068) | Automated discovery entry for Pashto resource tracking. |
48
  | From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset | [arxiv](http://arxiv.org/abs/2602.14062v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2602.14062v1) | ASR data quality and release trend reference |
49
+ | From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset | [datacite](https://arxiv.org/abs/2602.14062) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://arxiv.org/abs/2602.14062) | Automated discovery entry for Pashto resource tracking. |
50
+ | Gemination in Pashto | [crossref](https://doi.org/10.24312/ucp-jll.02.02.405) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.24312/ucp-jll.02.02.405) | Automated discovery entry for Pashto resource tracking. |
51
+ | Gender Classification From Pashto Handwritten Text Images | [other](https://www.semanticscholar.org/paper/2d70fffa9224d71f67ad3c1943b8a71b18164eeb) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/2d70fffa9224d71f67ad3c1943b8a71b18164eeb) | Automated discovery entry for Pashto resource tracking. |
52
+ | Introduction to Pashto Word’s Characteristics | [other](https://www.semanticscholar.org/paper/6eb3febbb368a7eaccc6290bcd77683ed3d624aa) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/6eb3febbb368a7eaccc6290bcd77683ed3d624aa) | Automated discovery entry for Pashto resource tracking. |
53
+ | Is the Pushto a Semitic Language | [zenodo](https://zenodo.org/records/16001185) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/16001185) | Automated discovery entry for Pashto resource tracking. |
54
+ | Isolated Handwritten Pashto Character Recognition Using a <i>K</i>‐NN Classification Tool based on Zoning and HOG Feature Extraction Techniques | [openalex](https://doi.org/10.1155/2021/5558373) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1155/2021/5558373) | Automated discovery entry for Pashto resource tracking. |
55
  | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
56
+ | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [openalex](https://doi.org/10.14569/ijacsa.2018.091069) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.14569/ijacsa.2018.091069) | Automated discovery entry for Pashto resource tracking. |
57
  | KPTI: Katib's Pashto Text Imagebase and Deep Learning Benchmark | [openalex](https://doi.org/10.1109/icfhr.2016.0090) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icfhr.2016.0090) | Pashto OCR dataset and benchmarking reference |
58
+ | Language Barrier and its Effect on Learning at the Public Primary School Level in Lahore | [zenodo](https://zenodo.org/records/17728944) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/17728944) | Automated discovery entry for Pashto resource tracking. |
59
+ | Le verbe pashto | [crossref](https://doi.org/10.29091/9783954907083) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.29091/9783954907083) | Automated discovery entry for Pashto resource tracking. |
60
+ | Morphology of Pashto Adverbs: Word and Paradigm Approach | [zenodo](https://zenodo.org/records/16211508) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/16211508) | Automated discovery entry for Pashto resource tracking. |
61
+ | Negation in Pashto | [zenodo](https://zenodo.org/records/18233956) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/18233956) | Automated discovery entry for Pashto resource tracking. |
62
+ | Negotiating Pakhto: Proverbs, Islam and the Construction of Identity among Pashtuns | [other](https://www.semanticscholar.org/paper/8a503f164e0c1f5be13866dad00539c7e5b1cabc) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/8a503f164e0c1f5be13866dad00539c7e5b1cabc) | Automated discovery entry for Pashto resource tracking. |
63
+ | Only 2 of 141 Global Languages Employ a Labial for "Tongue" in 1st position Challenging Saussure's Arbitrariness With Near Universal Embodied Iconicity for Tongue Vs Mouth in "inverse" Control | [datacite](https://zenodo.org/doi/10.5281/zenodo.17807676) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.17807676) | Automated discovery entry for Pashto resource tracking. |
64
  | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
65
+ | Pashto | [crossref](https://doi.org/10.4324/9780203301524-34) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.4324/9780203301524-34) | Automated discovery entry for Pashto resource tracking. |
66
+ | Pashto (Endo-)clitics in a Parallel Architecture | [openalex](http://nbn-resolving.de/urn:nbn:de:bsz:352-0-278290) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](http://nbn-resolving.de/urn:nbn:de:bsz:352-0-278290) | Automated discovery entry for Pashto resource tracking. |
67
+ | Pashto : Pashto-English, English-Pashto dictionary & phrasebook | [other](https://www.semanticscholar.org/paper/8ff77d35396d17225d97772e577e472a2ab1c47a) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/8ff77d35396d17225d97772e577e472a2ab1c47a) | Automated discovery entry for Pashto resource tracking. |
68
+ | Pashto free relatives and triply-filled Comp: Evidence for a headed analysis | [openalex](https://doi.org/10.1016/s0024-3841(96)00032-0) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1016/s0024-3841(96)00032-0) | Automated discovery entry for Pashto resource tracking. |
69
+ | Pashto Handwritten Books | [crossref](https://doi.org/10.1163/9789004737358_003) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.1163/9789004737358_003) | Automated discovery entry for Pashto resource tracking. |
70
  | Pashto isolated digits recognition using deep convolutional neural network | [openalex](https://doi.org/10.1016/j.heliyon.2020.e03372) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1016/j.heliyon.2020.e03372) | Pashto speech recognition research reference |
71
+ | Pashto Language | [crossref](https://doi.org/10.32388/pxbtfv) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.32388/pxbtfv) | Automated discovery entry for Pashto resource tracking. |
72
  | Pashto offensive language detection: a benchmark dataset and monolingual Pashto BERT | [openalex](https://doi.org/10.7717/peerj-cs.1617) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.7717/peerj-cs.1617) | Pashto NLP toxicity detection benchmark and model reference |
73
+ | PASHTO POETRY AND MILITANCY IN KHYBER PAKHTUNKHWA AFTER 9/11: THEMATIC ANALYSIS OF PASHTO POETRY IN RESISTING MILITANCY | [other](https://www.semanticscholar.org/paper/e81d4e7ac6cd7519643bf5d5c0bdfd9be554a8f2) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/e81d4e7ac6cd7519643bf5d5c0bdfd9be554a8f2) | Automated discovery entry for Pashto resource tracking. |
74
+ | Pashto preverbs V | [other](https://www.semanticscholar.org/paper/1f59f22ae99379106b417186f3053c00b5fe391f) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/1f59f22ae99379106b417186f3053c00b5fe391f) | Automated discovery entry for Pashto resource tracking. |
75
+ | Pashto preverbs, III. Compound verbs with preverb | [other](https://www.semanticscholar.org/paper/53eeae3a973d6bb72839e9304be13a0362c92242) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/53eeae3a973d6bb72839e9304be13a0362c92242) | Automated discovery entry for Pashto resource tracking. |
76
  | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
77
+ | Pashto Tappa | [crossref](https://doi.org/10.4324/9781003604877-9) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.4324/9781003604877-9) | Automated discovery entry for Pashto resource tracking. |
78
+ | Pashto Verse | [crossref](https://doi.org/10.1017/s0041977x00072700) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.1017/s0041977x00072700) | Automated discovery entry for Pashto resource tracking. |
79
+ | Persian loanwords and calques in Pashto | [other](https://www.semanticscholar.org/paper/ed232f1c2abd6e6f8a49f04de8ac76bf922521ea) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/ed232f1c2abd6e6f8a49f04de8ac76bf922521ea) | Automated discovery entry for Pashto resource tracking. |
80
+ | Persian, Urdu, and Pashto: A comparative orthographic analysis | [openalex](https://doi.org/10.1093/wsr/wsq005) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1093/wsr/wsq005) | Automated discovery entry for Pashto resource tracking. |
81
  | PHTI: Pashto Handwritten Text Imagebase for Deep Learning Applications | [openalex](https://doi.org/10.1109/access.2022.3216881) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/access.2022.3216881) | Pashto OCR dataset and benchmark reference |
82
  | Pioneer dataset and recognition of Handwritten Pashto characters using Convolution Neural Networks | [openalex](https://doi.org/10.1177/0020294020964826) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1177/0020294020964826) | Pashto handwritten character recognition reference |
83
+ | Portrayal of Death in the Selected Poems of Abdul Ghani Khan and Emily Dickinson: A Comparative Thematic Analysis | [zenodo](https://zenodo.org/records/15046502) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15046502) | Automated discovery entry for Pashto resource tracking. |
84
  | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
85
  | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
86
+ | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [datacite](https://arxiv.org/abs/2505.10055) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://arxiv.org/abs/2505.10055) | Automated discovery entry for Pashto resource tracking. |
87
+ | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [arxiv](http://arxiv.org/abs/2505.10055v2) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2505.10055v2) | Automated discovery entry for Pashto resource tracking. |
88
  | Recognition of Pashto Handwritten Characters Based on Deep Learning | [openalex](https://doi.org/10.3390/s20205884) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.3390/s20205884) | Pashto OCR model reference for handwritten character recognition |
89
  | Recognizable units in Pashto language for OCR | [openalex](https://doi.org/10.1109/icdar.2015.7333963) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333963) | Pashto OCR preprocessing and unit-design reference |
90
  | Scale and rotation invariant OCR for Pashto cursive script using MDLSTM network | [openalex](https://doi.org/10.1109/icdar.2015.7333931) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333931) | Pashto OCR model architecture reference |
91
+ | Scale and rotation invariant recognition of cursive Pashto script using SIFT features | [openalex](https://doi.org/10.1109/icet.2010.5638470) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icet.2010.5638470) | Automated discovery entry for Pashto resource tracking. |
92
+ | Separating phonology from syntax: a reanalysis of Pashto cliticization | [openalex](https://doi.org/10.1017/s0022226700006952) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1017/s0022226700006952) | Automated discovery entry for Pashto resource tracking. |
93
  | Shape analysis of Pashto script and creation of image database for OCR | [openalex](https://doi.org/10.1109/icet.2009.5353160) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icet.2009.5353160) | Pashto OCR dataset design and feature reference |
94
+ | Some Remarks on the Origin of the Afghán People and Dialect and on the Connexion of the Pushto Language with the Zend and Pehlavi and the Hebrew | [zenodo](https://zenodo.org/records/16191315) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/16191315) | Automated discovery entry for Pashto resource tracking. |
95
  | Speech translation for low-resource languages: the case of Pashto | [openalex](https://doi.org/10.21437/interspeech.2005-723) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.21437/interspeech.2005-723) | Pashto speech translation and low-resource MT reference |
96
+ | Summaries in Pashto | [crossref](https://doi.org/10.1097/01.wtf.0000437933.40809.39) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.1097/01.wtf.0000437933.40809.39) | Automated discovery entry for Pashto resource tracking. |
97
+ | SWITCHING SELVES ONLINE:PASHTO-ENGLISH BILINGUALISM,IDENTITY, AND EXPRESSION IN PAKISTAN’S DIGITAL DISCOURSE | [other](https://www.semanticscholar.org/paper/7a330c5fb416a1105866a895748b4336f8ef8100) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/7a330c5fb416a1105866a895748b4336f8ef8100) | Automated discovery entry for Pashto resource tracking. |
98
+ | Syntax and morphology of Baniswola Pashto: investigating universal and dialectal variations | [other](https://www.semanticscholar.org/paper/9f725b3b282cf05f9089002d474010c6021001f9) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/9f725b3b282cf05f9089002d474010c6021001f9) | Automated discovery entry for Pashto resource tracking. |
99
+ | The BBN Byblos Pashto OCR system | [openalex](https://doi.org/10.1145/1031442.1031447) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1145/1031442.1031447) | Automated discovery entry for Pashto resource tracking. |
100
+ | The development and evaluation of an automatic clitic generator for Pashto language | [other](https://www.semanticscholar.org/paper/3d95449d67799fcac83f855984cb0c29cc500d7b) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/3d95449d67799fcac83f855984cb0c29cc500d7b) | Automated discovery entry for Pashto resource tracking. |
101
+ | The grammar of clitics : evidence from Pashto and other languages | [openalex](http://hdl.handle.net/11858/00-001M-0000-0012-7447-0) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](http://hdl.handle.net/11858/00-001M-0000-0012-7447-0) | Automated discovery entry for Pashto resource tracking. |
102
+ | The Influence of the Arabic Language on the Pashto Language: The Abdur-Rahman Baba as a Model (A Case Study Analysis) | [zenodo](https://zenodo.org/records/18174368) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/18174368) | Automated discovery entry for Pashto resource tracking. |
103
+ | The Pashto language and identity‐formation in Pakistan | [openalex](https://doi.org/10.1080/09584939508719759) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1080/09584939508719759) | Automated discovery entry for Pashto resource tracking. |
104
+ | The Regional Criminal Law Framework for the Combat of Organized Crime (Pashto) | [datacite](https://opendata.uni-halle.de//handle/1981185920/123975) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de//handle/1981185920/123975) | Automated discovery entry for Pashto resource tracking. |
105
+ | The Role of Early Literary Biographies (Tazkiri) in the Ancient History of Pashto Literature | [other](https://www.semanticscholar.org/paper/4938170077d3430c2e3f9fadc161ed7b79242917) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/4938170077d3430c2e3f9fadc161ed7b79242917) | Automated discovery entry for Pashto resource tracking. |
106
+ | The Roshani Movement literary services and the contribution of this Movement in the development of Pashto Literature | [other](https://www.semanticscholar.org/paper/88a3cd1ec497844c5997ae1795f8e72bbb314112) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/88a3cd1ec497844c5997ae1795f8e72bbb314112) | Automated discovery entry for Pashto resource tracking. |
107
+ | The Social Structure and Organization of A Pakhto Speaking Community in Afghanistan. | [other](https://www.semanticscholar.org/paper/306e9a04b8835de6e906303b5e27d43a6994cb1d) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/306e9a04b8835de6e906303b5e27d43a6994cb1d) | Automated discovery entry for Pashto resource tracking. |
108
+ | Topicalization in Pashto | [crossref](https://doi.org/10.31703/gssr.2020(v-i).17) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.31703/gssr.2020(v-i).17) | Automated discovery entry for Pashto resource tracking. |
109
  | Tuning Traditional Language Processing Approaches for Pashto Text Classification | [arxiv](http://arxiv.org/abs/2305.03737v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2305.03737v1) | Pashto text classification method reference |
110
+ | بلوچستان میں " فقہ اسلامی " کے فروغ و ارتقا٫ کا تحقیقی جائزہ | [zenodo](https://zenodo.org/records/18049233) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/18049233) | Automated discovery entry for Pashto resource tracking. |
111
 
112
  ## Maintenance
113
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
resources/projects/README.md CHANGED
@@ -4,20 +4,51 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
 
 
7
  | afaaaak/urdu_pashto_translator | [huggingface](https://huggingface.co/spaces/afaaaak/urdu_pashto_translator) | [Space metadata title is Urdu Pashto Translator and the slug includes pashto. (`Pashto`, `translator`)](https://huggingface.co/spaces/afaaaak/urdu_pashto_translator) | Translation demo and bilingual usability testing |
8
  | afaqalinagra/PASHTO-ASR-MODEL | [huggingface](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | Interactive Pashto demo and quick qualitative validation |
9
  | Aizazayyubi/pashto_asr | [huggingface](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | Interactive Pashto ASR demo for qualitative evaluation |
 
10
  | DrSaqlainHassan/PashtoTokenixer | [huggingface](https://huggingface.co/spaces/DrSaqlainHassan/PashtoTokenixer) | [Space card title states Pashto Parts of Speech Identifier and the slug contains Pashto. (`Pashto`, `parts-of-speech`)](https://huggingface.co/spaces/DrSaqlainHassan/PashtoTokenixer) | Pashto NLP demo for token and part-of-speech analysis |
 
 
11
  | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
 
 
 
 
 
 
12
  | ihanif/wav2vec-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
13
  | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
 
 
14
  | ilyas02828/Pashto_Sign_Language | [huggingface](https://huggingface.co/spaces/ilyas02828/Pashto_Sign_Language) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ilyas02828/Pashto_Sign_Language) | Interactive Pashto demo and quick qualitative validation |
 
 
 
15
  | mahmudaq/PashtoASRNMT1 | [huggingface](https://huggingface.co/spaces/mahmudaq/PashtoASRNMT1) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/mahmudaq/PashtoASRNMT1) | Interactive Pashto demo and quick qualitative validation |
 
 
 
 
16
  | nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
 
 
 
 
17
  | Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
18
  | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
19
  | Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
20
  | Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
 
 
 
 
 
 
 
21
  | tasal9/ZamAI-Phi3-Mini-Pashto-Demo | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | Interactive Pashto demo and quick qualitative validation |
22
  | Umar4321/Pashto-To-English-Urdu | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | Interactive Pashto demo and quick qualitative validation |
23
  | ZamAI-Mistral-7B-Pashto Space | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | [Space title and ID explicitly include Pashto and model card metadata exposes project details. (`Pashto`)](https://huggingface.co/api/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | Interactive Pashto LLM project demo |
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | adnankarim/ihanif-whisper-base-pashto | [huggingface](https://huggingface.co/spaces/adnankarim/ihanif-whisper-base-pashto) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/adnankarim/ihanif-whisper-base-pashto) | Automated discovery entry for Pashto resource tracking. |
8
+ | adnankarim/ihanif-whisper-medium-pashto-3e-7 | [huggingface](https://huggingface.co/spaces/adnankarim/ihanif-whisper-medium-pashto-3e-7) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/adnankarim/ihanif-whisper-medium-pashto-3e-7) | Automated discovery entry for Pashto resource tracking. |
9
  | afaaaak/urdu_pashto_translator | [huggingface](https://huggingface.co/spaces/afaaaak/urdu_pashto_translator) | [Space metadata title is Urdu Pashto Translator and the slug includes pashto. (`Pashto`, `translator`)](https://huggingface.co/spaces/afaaaak/urdu_pashto_translator) | Translation demo and bilingual usability testing |
10
  | afaqalinagra/PASHTO-ASR-MODEL | [huggingface](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | Interactive Pashto demo and quick qualitative validation |
11
  | Aizazayyubi/pashto_asr | [huggingface](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | Interactive Pashto ASR demo for qualitative evaluation |
12
+ | amirajorloo/jira-auto-direction-chrome-extension | [github](https://github.com/amirajorloo/jira-auto-direction-chrome-extension) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/amirajorloo/jira-auto-direction-chrome-extension) | Automated discovery entry for Pashto resource tracking. |
13
  | DrSaqlainHassan/PashtoTokenixer | [huggingface](https://huggingface.co/spaces/DrSaqlainHassan/PashtoTokenixer) | [Space card title states Pashto Parts of Speech Identifier and the slug contains Pashto. (`Pashto`, `parts-of-speech`)](https://huggingface.co/spaces/DrSaqlainHassan/PashtoTokenixer) | Pashto NLP demo for token and part-of-speech analysis |
14
+ | Early Pregnancy Loss [Pashto] | [datacite](https://zenodo.org/doi/10.5281/zenodo.18325729) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18325729) | Automated discovery entry for Pashto resource tracking. |
15
+ | Fazlullahmamond/hadith-collection-pashto | [github](https://github.com/Fazlullahmamond/hadith-collection-pashto) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/hadith-collection-pashto) | Automated discovery entry for Pashto resource tracking. |
16
  | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
17
+ | Female Birth Control Part I [Pashto] | [datacite](https://zenodo.org/doi/10.5281/zenodo.18325040) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18325040) | Automated discovery entry for Pashto resource tracking. |
18
+ | Female Birth Control Part II [Pashto] | [datacite](https://zenodo.org/doi/10.5281/zenodo.18325401) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18325401) | Automated discovery entry for Pashto resource tracking. |
19
+ | Haroon-blip/khan-pukhtoon | [github](https://github.com/Haroon-blip/khan-pukhtoon) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Haroon-blip/khan-pukhtoon) | Automated discovery entry for Pashto resource tracking. |
20
+ | Haseeb-007/Pashto-sekho | [huggingface](https://huggingface.co/spaces/Haseeb-007/Pashto-sekho) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Haseeb-007/Pashto-sekho) | Automated discovery entry for Pashto resource tracking. |
21
+ | haseebjanhamraz/PashtoFonts | [github](https://github.com/haseebjanhamraz/PashtoFonts) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/haseebjanhamraz/PashtoFonts) | Automated discovery entry for Pashto resource tracking. |
22
+ | Hassaankabir/Pashto_Malgaray | [huggingface](https://huggingface.co/spaces/Hassaankabir/Pashto_Malgaray) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Hassaankabir/Pashto_Malgaray) | Automated discovery entry for Pashto resource tracking. |
23
  | ihanif/wav2vec-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
24
  | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
25
+ | ihanif/whisper-medium-pashto | [huggingface](https://huggingface.co/spaces/ihanif/whisper-medium-pashto) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/whisper-medium-pashto) | Automated discovery entry for Pashto resource tracking. |
26
+ | IhyaCommunity/Khushkhat-Extension | [github](https://github.com/IhyaCommunity/Khushkhat-Extension) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/IhyaCommunity/Khushkhat-Extension) | Automated discovery entry for Pashto resource tracking. |
27
  | ilyas02828/Pashto_Sign_Language | [huggingface](https://huggingface.co/spaces/ilyas02828/Pashto_Sign_Language) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ilyas02828/Pashto_Sign_Language) | Interactive Pashto demo and quick qualitative validation |
28
+ | Introduction to Postpartum Care for Refugee women [Pashto] | [datacite](https://zenodo.org/doi/10.5281/zenodo.18324878) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18324878) | Automated discovery entry for Pashto resource tracking. |
29
+ | lecramyajiv/fonts-arabic-extra | [github](https://github.com/lecramyajiv/fonts-arabic-extra) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/lecramyajiv/fonts-arabic-extra) | Automated discovery entry for Pashto resource tracking. |
30
+ | lecramyajiv/ttf-x2 | [github](https://github.com/lecramyajiv/ttf-x2) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/lecramyajiv/ttf-x2) | Automated discovery entry for Pashto resource tracking. |
31
  | mahmudaq/PashtoASRNMT1 | [huggingface](https://huggingface.co/spaces/mahmudaq/PashtoASRNMT1) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/mahmudaq/PashtoASRNMT1) | Interactive Pashto demo and quick qualitative validation |
32
+ | mastermoo/pashto-quran | [github](https://github.com/mastermoo/pashto-quran) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/mastermoo/pashto-quran) | Automated discovery entry for Pashto resource tracking. |
33
+ | MuhammadUllah7/PAKHTOONN | [github](https://github.com/MuhammadUllah7/PAKHTOONN) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/MuhammadUllah7/PAKHTOONN) | Automated discovery entry for Pashto resource tracking. |
34
+ | nabeelest/pakhtoodle | [github](https://github.com/nabeelest/pakhtoodle) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/nabeelest/pakhtoodle) | Automated discovery entry for Pashto resource tracking. |
35
+ | NanoNulla/lorem | [github](https://github.com/NanoNulla/lorem) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/NanoNulla/lorem) | Automated discovery entry for Pashto resource tracking. |
36
  | nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
37
+ | Negation in Pashto | [datacite](https://zenodo.org/doi/10.5281/zenodo.18233956) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18233956) | Automated discovery entry for Pashto resource tracking. |
38
+ | nisarmasid/NisAr-PakhtoOn | [github](https://github.com/nisarmasid/NisAr-PakhtoOn) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/nisarmasid/NisAr-PakhtoOn) | Automated discovery entry for Pashto resource tracking. |
39
+ | omid/Persian-Log2Vis | [github](https://github.com/omid/Persian-Log2Vis) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/omid/Persian-Log2Vis) | Automated discovery entry for Pashto resource tracking. |
40
+ | Pakhtoon9900/Pakhtoon- | [github](https://github.com/Pakhtoon9900/Pakhtoon-) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Pakhtoon9900/Pakhtoon-) | Automated discovery entry for Pashto resource tracking. |
41
  | Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
42
  | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
43
  | Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
44
  | Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
45
+ | Pukhtoon203/PUKHTOON | [github](https://github.com/Pukhtoon203/PUKHTOON) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Pukhtoon203/PUKHTOON) | Automated discovery entry for Pashto resource tracking. |
46
+ | Pukhtoonmafia009/Pukhtoonmafia009 | [github](https://github.com/Pukhtoonmafia009/Pukhtoonmafia009) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Pukhtoonmafia009/Pukhtoonmafia009) | Automated discovery entry for Pashto resource tracking. |
47
+ | pukhtoonyar406/pukhtoonyar406 | [github](https://github.com/pukhtoonyar406/pukhtoonyar406) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/pukhtoonyar406/pukhtoonyar406) | Automated discovery entry for Pashto resource tracking. |
48
+ | ShahZamanPatan/Pashto-Baran | [github](https://github.com/ShahZamanPatan/Pashto-Baran) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/ShahZamanPatan/Pashto-Baran) | Automated discovery entry for Pashto resource tracking. |
49
+ | ShawAnonymouse/Pakhtoon | [github](https://github.com/ShawAnonymouse/Pakhtoon) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/ShawAnonymouse/Pakhtoon) | Automated discovery entry for Pashto resource tracking. |
50
+ | tasal9/pashto-base-bloom-space | [huggingface](https://huggingface.co/spaces/tasal9/pashto-base-bloom-space) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/pashto-base-bloom-space) | Automated discovery entry for Pashto resource tracking. |
51
+ | tasal9/ZamAI-mt5-Pashto-Demo | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-mt5-Pashto-Demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/ZamAI-mt5-Pashto-Demo) | Automated discovery entry for Pashto resource tracking. |
52
  | tasal9/ZamAI-Phi3-Mini-Pashto-Demo | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | Interactive Pashto demo and quick qualitative validation |
53
  | Umar4321/Pashto-To-English-Urdu | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | Interactive Pashto demo and quick qualitative validation |
54
  | ZamAI-Mistral-7B-Pashto Space | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | [Space title and ID explicitly include Pashto and model card metadata exposes project details. (`Pashto`)](https://huggingface.co/api/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | Interactive Pashto LLM project demo |
scripts/README.md CHANGED
@@ -9,6 +9,7 @@ Automation scripts for quality checks, resource catalog validation, and search i
9
  - `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
10
  - `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub, GitLab, OpenAlex, Crossref, Zenodo, Dataverse, DataCite, arXiv, and Semantic Scholar into `resources/catalog/pending_candidates.json`.
11
  - `promote_candidates.py`: auto-promote valid non-duplicate entries from `pending_candidates.json` into `resources/catalog/resources.json`.
 
12
  - `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
13
 
14
  ## Usage
@@ -33,11 +34,26 @@ Sync candidate resources for maintainer review:
33
  python scripts/sync_resources.py --limit 20
34
  ```
35
 
 
 
 
 
 
 
 
 
 
 
36
  Auto-promote valid candidates into verified catalog:
37
  ```bash
38
  python scripts/promote_candidates.py
39
  ```
40
 
 
 
 
 
 
41
  Run full repeatable cycle:
42
  ```bash
43
  python scripts/run_resource_cycle.py --limit 25
 
9
  - `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
10
  - `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub, GitLab, OpenAlex, Crossref, Zenodo, Dataverse, DataCite, arXiv, and Semantic Scholar into `resources/catalog/pending_candidates.json`.
11
  - `promote_candidates.py`: auto-promote valid non-duplicate entries from `pending_candidates.json` into `resources/catalog/resources.json`.
12
+ - `review_existing_resources.py`: review current catalog resources, remove stale or no-longer-available entries only when there is strong evidence, and log each removal in `resources/catalog/removal_log.json`.
13
  - `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
14
 
15
  ## Usage
 
34
  python scripts/sync_resources.py --limit 20
35
  ```
36
 
37
+ Review existing resources and remove stale entries before discovery:
38
+ ```bash
39
+ python scripts/review_existing_resources.py
40
+ ```
41
+
42
+ Run stricter relevance cleanup mode:
43
+ ```bash
44
+ python scripts/review_existing_resources.py --enforce-pashto-relevance
45
+ ```
46
+
47
  Auto-promote valid candidates into verified catalog:
48
  ```bash
49
  python scripts/promote_candidates.py
50
  ```
51
 
52
+ Auto-promote while skipping online URL availability checks:
53
+ ```bash
54
+ python scripts/promote_candidates.py --skip-url-check
55
+ ```
56
+
57
  Run full repeatable cycle:
58
  ```bash
59
  python scripts/run_resource_cycle.py --limit 25
scripts/promote_candidates.py CHANGED
@@ -18,8 +18,23 @@ try:
18
  except ModuleNotFoundError:
19
  from validate_resource_catalog import validate_resource
20
 
 
 
 
 
 
21
 
22
  PLACEHOLDER_PRIMARY_USE = "Needs maintainer review before promotion to verified catalog."
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  def _canonical_url(value: str) -> str:
@@ -43,11 +58,24 @@ def _prepare_candidate(candidate: dict[str, Any]) -> dict[str, Any]:
43
  return promoted
44
 
45
 
 
 
 
 
 
 
 
 
 
 
 
46
  def promote_candidates(
47
  catalog: dict[str, Any],
48
  pending_payload: dict[str, Any],
49
  *,
50
  max_promotions: int | None = None,
 
 
51
  ) -> tuple[list[dict[str, Any]], dict[str, int]]:
52
  resources = catalog.get("resources")
53
  if not isinstance(resources, list):
@@ -69,7 +97,7 @@ def promote_candidates(
69
  }
70
 
71
  promoted: list[dict[str, Any]] = []
72
- stats = {"total": len(candidates), "promoted": 0, "duplicate": 0, "invalid": 0}
73
 
74
  for candidate in candidates:
75
  if max_promotions is not None and len(promoted) >= max_promotions:
@@ -90,6 +118,10 @@ def promote_candidates(
90
  stats["duplicate"] += 1
91
  continue
92
 
 
 
 
 
93
  errors = validate_resource(resource, len(resources) + len(promoted))
94
  if errors:
95
  stats["invalid"] += 1
@@ -112,6 +144,8 @@ def main() -> int:
112
  parser.add_argument("--catalog", default="resources/catalog/resources.json")
113
  parser.add_argument("--candidates", default="resources/catalog/pending_candidates.json")
114
  parser.add_argument("--max-promotions", type=int, default=None)
 
 
115
  args = parser.parse_args()
116
 
117
  catalog_path = Path(args.catalog)
@@ -135,18 +169,22 @@ def main() -> int:
135
  catalog,
136
  pending_payload,
137
  max_promotions=args.max_promotions,
 
 
138
  )
139
  if not promoted:
140
  print(
141
  "Promotion complete: no new verified resources "
142
- f"(duplicates={stats['duplicate']}, invalid={stats['invalid']})"
143
  )
144
  return 0
145
 
146
  catalog_path.write_text(json.dumps(catalog, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
147
  print(
148
  "Promotion complete: "
149
- f"promoted={stats['promoted']} duplicate={stats['duplicate']} invalid={stats['invalid']}"
 
 
150
  )
151
  return 0
152
 
 
18
  except ModuleNotFoundError:
19
  from validate_resource_catalog import validate_resource
20
 
21
+ try:
22
+ from scripts.review_existing_resources import probe_resource_url
23
+ except ModuleNotFoundError:
24
+ from review_existing_resources import probe_resource_url
25
+
26
 
27
  PLACEHOLDER_PRIMARY_USE = "Needs maintainer review before promotion to verified catalog."
28
+ NOT_FOUND_PATTERNS = (
29
+ "repository not found",
30
+ "model not found",
31
+ "dataset not found",
32
+ "space not found",
33
+ "page not found",
34
+ "not found",
35
+ "this repository does not exist",
36
+ "we couldn't find",
37
+ )
38
 
39
 
40
  def _canonical_url(value: str) -> str:
 
58
  return promoted
59
 
60
 
61
+ def _candidate_url_unavailable(url: str, timeout: float) -> bool:
62
+ probe = probe_resource_url(url, timeout)
63
+ if probe.hard_missing:
64
+ return True
65
+ if probe.content_sample:
66
+ lowered = probe.content_sample.casefold()
67
+ if any(pattern in lowered for pattern in NOT_FOUND_PATTERNS):
68
+ return True
69
+ return False
70
+
71
+
72
  def promote_candidates(
73
  catalog: dict[str, Any],
74
  pending_payload: dict[str, Any],
75
  *,
76
  max_promotions: int | None = None,
77
+ verify_urls: bool = False,
78
+ url_timeout: float = 10.0,
79
  ) -> tuple[list[dict[str, Any]], dict[str, int]]:
80
  resources = catalog.get("resources")
81
  if not isinstance(resources, list):
 
97
  }
98
 
99
  promoted: list[dict[str, Any]] = []
100
+ stats = {"total": len(candidates), "promoted": 0, "duplicate": 0, "invalid": 0, "unavailable": 0}
101
 
102
  for candidate in candidates:
103
  if max_promotions is not None and len(promoted) >= max_promotions:
 
118
  stats["duplicate"] += 1
119
  continue
120
 
121
+ if verify_urls and _candidate_url_unavailable(url, url_timeout):
122
+ stats["unavailable"] += 1
123
+ continue
124
+
125
  errors = validate_resource(resource, len(resources) + len(promoted))
126
  if errors:
127
  stats["invalid"] += 1
 
144
  parser.add_argument("--catalog", default="resources/catalog/resources.json")
145
  parser.add_argument("--candidates", default="resources/catalog/pending_candidates.json")
146
  parser.add_argument("--max-promotions", type=int, default=None)
147
+ parser.add_argument("--skip-url-check", action="store_true")
148
+ parser.add_argument("--url-timeout", type=float, default=10.0)
149
  args = parser.parse_args()
150
 
151
  catalog_path = Path(args.catalog)
 
169
  catalog,
170
  pending_payload,
171
  max_promotions=args.max_promotions,
172
+ verify_urls=not args.skip_url_check,
173
+ url_timeout=args.url_timeout,
174
  )
175
  if not promoted:
176
  print(
177
  "Promotion complete: no new verified resources "
178
+ f"(duplicates={stats['duplicate']}, invalid={stats['invalid']}, unavailable={stats['unavailable']})"
179
  )
180
  return 0
181
 
182
  catalog_path.write_text(json.dumps(catalog, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
183
  print(
184
  "Promotion complete: "
185
+ "promoted="
186
+ f"{stats['promoted']} duplicate={stats['duplicate']} invalid={stats['invalid']} "
187
+ f"unavailable={stats['unavailable']}"
188
  )
189
  return 0
190
 
scripts/review_existing_resources.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Review existing catalog entries and remove only with strong evidence.
2
+
3
+ This script enforces a conservative pre-sync audit:
4
+ - Keep resources that are reachable and Pashto-relevant.
5
+ - Remove only when there is a strong reason (for example hard 404/410, duplicate ID/URL,
6
+ or no Pashto signal in metadata and live page content).
7
+ - Persist removal reasons in a log for maintainer review.
8
+
9
+ Usage:
10
+ python scripts/review_existing_resources.py
11
+ python scripts/review_existing_resources.py --timeout 15
12
+ python scripts/review_existing_resources.py --dry-run
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import concurrent.futures as futures
19
+ import json
20
+ import re
21
+ import socket
22
+ from dataclasses import dataclass
23
+ from datetime import date, datetime, timezone
24
+ from pathlib import Path
25
+ from typing import Any
26
+ from urllib.error import HTTPError, URLError
27
+ from urllib.request import Request, urlopen
28
+
29
+ try:
30
+ from scripts.validate_resource_catalog import validate_resource
31
+ except ModuleNotFoundError:
32
+ from validate_resource_catalog import validate_resource
33
+
34
+
35
# Sent as the User-Agent header on every probe request.
USER_AGENT = "pashto-resource-review/1.0"

# Maximum number of response-body bytes read from a GET probe.
MAX_BODY_BYTES = 120_000

# HTTP status codes that mark a probed URL as hard-missing.
HARD_REMOVE_HTTP_CODES = {404, 410, 451}

# Lower-case phrases matched as substrings of case-folded page content.
NOT_FOUND_PATTERNS = (
    "repository not found",
    "model not found",
    "dataset not found",
    "space not found",
    "page not found",
    "not found",
    "this repository does not exist",
    "we couldn't find",
)

# primary_use text compared against entries to spot automated-discovery records.
AUTOMATED_PRIMARY_USE = "Automated discovery entry for Pashto resource tracking."

# Romanised spellings of "Pashto" that are not embedded in a longer ASCII
# alphanumeric token (so "Pashto-ASR" matches, "pashtoon" does not).
PASHTO_WORD_RE = re.compile(
    r"(?<![A-Za-z0-9])(pashto|pukhto|pushto|pakhto)(?![A-Za-z0-9])",
    re.IGNORECASE,
)

# Short language/locale style codes matched on word boundaries.
PASHTO_CODE_RE = re.compile(r"\b(ps(_af)?|pus|pbt[_-]?arab)\b", re.IGNORECASE)

# Native-script spellings searched for as plain substrings.
PASHTO_SCRIPT_MARKERS = ("پښتو", "پشتو")
52
+
53
+
54
@dataclass
class UrlProbe:
    """Result record for a single URL probe; every field has a neutral default."""

    # HTTP status code of the response, or None when none was recorded.
    status_code: int | None = None
    # URL reported by the response object, or None when none was recorded.
    final_url: str | None = None
    # Decoded sample of the response body; empty when no body was read.
    content_sample: str = ""
    # True when the probe is considered strong evidence that the URL is gone.
    hard_missing: bool = False
    # Text of the error when the probe could not reach a conclusion.
    uncertain_error: str | None = None
61
+
62
+
63
+ def _contains_pashto_marker(value: str) -> bool:
64
+ text = (value or "").strip()
65
+ if not text:
66
+ return False
67
+ if PASHTO_WORD_RE.search(text):
68
+ return True
69
+ if PASHTO_CODE_RE.search(text):
70
+ return True
71
+ return any(marker in text for marker in PASHTO_SCRIPT_MARKERS)
72
+
73
+
74
+ def _resource_metadata_has_pashto_signal(resource: dict[str, Any]) -> bool:
75
+ values: list[str] = []
76
+ for key in ("title", "url", "summary", "primary_use"):
77
+ item = resource.get(key)
78
+ if isinstance(item, str):
79
+ values.append(item)
80
+
81
+ tags = resource.get("tags")
82
+ if isinstance(tags, list):
83
+ values.extend(tag for tag in tags if isinstance(tag, str))
84
+
85
+ evidence = resource.get("pashto_evidence")
86
+ if isinstance(evidence, dict):
87
+ for key in ("evidence_text", "evidence_url"):
88
+ item = evidence.get(key)
89
+ if isinstance(item, str):
90
+ values.append(item)
91
+ markers = evidence.get("markers")
92
+ if isinstance(markers, list):
93
+ values.extend(marker for marker in markers if isinstance(marker, str))
94
+
95
+ return any(_contains_pashto_marker(value) for value in values)
96
+
97
+
98
+ def _resource_has_direct_pashto_signal(resource: dict[str, Any]) -> bool:
99
+ values: list[str] = []
100
+ for key in ("title", "url"):
101
+ item = resource.get(key)
102
+ if isinstance(item, str):
103
+ values.append(item)
104
+
105
+ evidence = resource.get("pashto_evidence")
106
+ if isinstance(evidence, dict):
107
+ evidence_url = evidence.get("evidence_url")
108
+ if isinstance(evidence_url, str):
109
+ values.append(evidence_url)
110
+ markers = evidence.get("markers")
111
+ if isinstance(markers, list):
112
+ values.extend(marker for marker in markers if isinstance(marker, str))
113
+
114
+ tags = resource.get("tags")
115
+ if isinstance(tags, list):
116
+ values.extend(tag for tag in tags if isinstance(tag, str))
117
+
118
+ return any(_contains_pashto_marker(value) for value in values)
119
+
120
+
121
+ def _is_automated_candidate_like(resource: dict[str, Any]) -> bool:
122
+ rid = resource.get("id")
123
+ primary_use = resource.get("primary_use")
124
+ return (isinstance(rid, str) and rid.startswith("candidate-")) or (
125
+ isinstance(primary_use, str) and primary_use.strip() == AUTOMATED_PRIMARY_USE
126
+ )
127
+
128
+
129
+ def _canonical_url(value: str) -> str:
130
+ return value.rstrip("/")
131
+
132
+
133
+ def _request_url(url: str, method: str, timeout: float) -> UrlProbe:
134
+ request = Request(url, method=method, headers={"User-Agent": USER_AGENT})
135
+ try:
136
+ with urlopen(request, timeout=timeout) as response:
137
+ status = getattr(response, "status", 200)
138
+ final_url = response.geturl()
139
+ sample = ""
140
+ if method == "GET":
141
+ payload = response.read(MAX_BODY_BYTES)
142
+ sample = payload.decode("utf-8", errors="replace")
143
+ return UrlProbe(status_code=status, final_url=final_url, content_sample=sample)
144
+ except HTTPError as exc:
145
+ if method == "GET":
146
+ try:
147
+ payload = exc.read(MAX_BODY_BYTES)
148
+ sample = payload.decode("utf-8", errors="replace")
149
+ except Exception: # noqa: BLE001
150
+ sample = ""
151
+ else:
152
+ sample = ""
153
+ return UrlProbe(
154
+ status_code=exc.code,
155
+ final_url=exc.geturl(),
156
+ content_sample=sample,
157
+ hard_missing=exc.code in HARD_REMOVE_HTTP_CODES,
158
+ )
159
+ except (URLError, TimeoutError, socket.timeout) as exc:
160
+ return UrlProbe(uncertain_error=str(exc))
161
+
162
+
163
def probe_resource_url(url: str, timeout: float) -> UrlProbe:
    """Probe ``url`` with HEAD, following up with GET where that adds evidence.

    Args:
        url: Address to probe.
        timeout: Per-request timeout in seconds.

    Returns:
        The most informative ``UrlProbe`` obtained:

        * HEAD inconclusive (network error) -> the HEAD probe, unchanged.
        * HEAD status in ``HARD_REMOVE_HTTP_CODES`` -> the result of a
          confirming GET, with ``hard_missing`` reflecting the GET status. If
          the GET itself is inconclusive, the HEAD probe is returned flagged
          hard-missing.
        * HEAD status 403/405/429 (or no status) -> the GET probe.
        * HEAD status 2xx/3xx -> the GET probe, or the HEAD probe when the GET
          is inconclusive.
        * Anything else -> the HEAD probe.
    """
    head = _request_url(url, "HEAD", timeout)
    if head.uncertain_error:
        return head

    head_status = head.status_code

    if head_status in HARD_REMOVE_HTTP_CODES:
        # Some servers mishandle HEAD, so a removal-grade status is confirmed
        # with GET before the resource is declared missing.
        confirmation = _request_url(url, "GET", timeout)
        if confirmation.uncertain_error:
            head.hard_missing = True
            return head
        confirmation.hard_missing = confirmation.status_code in HARD_REMOVE_HTTP_CODES
        return confirmation

    if head_status in {403, 405, 429} or head_status is None:
        # HEAD was refused or throttled; GET is the only usable signal.
        get_result = _request_url(url, "GET", timeout)
        if get_result.status_code in HARD_REMOVE_HTTP_CODES:
            get_result.hard_missing = True
        return get_result

    if head_status and 200 <= head_status < 400:
        # Reachable: fetch a body sample for content checks, but keep the
        # successful HEAD result if the GET fails for transient reasons.
        get_result = _request_url(url, "GET", timeout)
        if get_result.uncertain_error:
            return head
        return get_result

    return head
181
+
182
+
183
def _probe_urls_concurrently(
    urls: list[str],
    *,
    timeout: float,
    max_workers: int,
    probe_fn: Any,
) -> dict[str, UrlProbe]:
    """Probe every URL in ``urls`` on a thread pool and map each URL to its probe."""
    results: dict[str, UrlProbe] = {}
    if not urls:
        return results

    worker_count = max(1, min(max_workers, len(urls)))
    with futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        future_map = {executor.submit(probe_fn, url, timeout): url for url in urls}
        for future in futures.as_completed(future_map):
            url = future_map[future]
            try:
                results[url] = future.result()
            except Exception as exc:  # noqa: BLE001
                # A crashing probe is inconclusive, never grounds for removal.
                results[url] = UrlProbe(uncertain_error=str(exc))
    return results


def review_resources(
    catalog: dict[str, Any],
    *,
    timeout: float = 12.0,
    enforce_pashto_relevance: bool = False,
    max_workers: int = 12,
    probe_fn: Any = probe_resource_url,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Audit catalog entries and drop those with a recorded removal reason.

    Args:
        catalog: Catalog payload; ``catalog["resources"]`` must be a list.
        timeout: Timeout in seconds handed to ``probe_fn`` for each URL.
        enforce_pashto_relevance: When True, entries without a Pashto signal in
            metadata or live page content are also removed.
        max_workers: Upper bound on concurrent probe threads.
        probe_fn: Callable ``(url, timeout) -> UrlProbe`` used to probe URLs.

    Returns:
        ``(updated_catalog, report)``. ``updated_catalog`` is a shallow copy of
        ``catalog`` whose ``resources`` and ``updated_on`` are replaced only
        when something was removed. ``report`` has the keys ``checked``,
        ``kept``, ``removed``, ``removals`` and ``warnings``.

    Raises:
        ValueError: If ``catalog["resources"]`` is not a list.
    """
    resources = catalog.get("resources")
    if not isinstance(resources, list):
        raise ValueError("catalog.resources must be a list")

    kept: list[dict[str, Any]] = []
    removals: list[dict[str, Any]] = []
    warnings: list[str] = []
    seen_ids: dict[str, str] = {}
    seen_urls: dict[tuple[str, str], str] = {}

    # Each distinct (whitespace-stripped) URL is probed once.
    candidate_urls = sorted(
        {
            resource["url"].strip()
            for resource in resources
            if isinstance(resource, dict)
            and isinstance(resource.get("url"), str)
            and resource["url"].strip()
        }
    )
    probe_results = _probe_urls_concurrently(
        candidate_urls,
        timeout=timeout,
        max_workers=max_workers,
        probe_fn=probe_fn,
    )

    for index, resource in enumerate(resources):
        if not isinstance(resource, dict):
            removals.append(
                {
                    "id": f"resource-{index}",
                    "title": "",
                    "url": "",
                    "reasons": ["Entry is not a JSON object."],
                    "evidence": {},
                }
            )
            continue

        rid = resource.get("id", "")
        title = resource.get("title", "")
        url = resource.get("url", "")
        category = resource.get("category", "")
        reasons: list[str] = []

        # The stripped form is what was probed, so it is also the lookup key.
        clean_url = url.strip() if isinstance(url, str) else ""

        if not isinstance(rid, str) or not rid.strip():
            reasons.append("Missing or invalid resource id.")
        if not clean_url:
            reasons.append("Missing or invalid resource URL.")

        # Membership (not truthiness of the stored label) decides duplicates,
        # so a kept entry with an empty title still blocks later duplicates.
        if isinstance(rid, str) and rid and rid in seen_ids:
            reasons.append(f"Duplicate resource id; already used by '{seen_ids[rid]}'.")

        canonical_url = _canonical_url(clean_url)
        normalized_category = str(category).strip().casefold() if isinstance(category, str) else ""
        url_key = (normalized_category, canonical_url)
        if canonical_url and url_key in seen_urls:
            reasons.append(
                "Duplicate canonical URL in same category; "
                f"already used by '{seen_urls[url_key]}'."
            )

        validation_errors = validate_resource(resource, index)
        if any(".url must be a valid http/https URL" in error for error in validation_errors):
            reasons.append("Resource URL failed schema validation.")

        probe = UrlProbe()
        if clean_url:
            probe = probe_results.get(clean_url, UrlProbe())
            if probe.hard_missing:
                status_code = probe.status_code if probe.status_code is not None else "unknown"
                reasons.append(f"URL returned hard-missing HTTP status {status_code}.")
            elif probe.uncertain_error:
                label = rid or f"resource-{index}"
                warnings.append(f"{label} URL probe inconclusive: {probe.uncertain_error}")

        metadata_pashto = _resource_metadata_has_pashto_signal(resource)
        direct_pashto = _resource_has_direct_pashto_signal(resource)
        page_pashto = _contains_pashto_marker(probe.content_sample)
        folded_page = probe.content_sample.casefold()
        page_not_found = any(pattern in folded_page for pattern in NOT_FOUND_PATTERNS)

        if page_not_found and not page_pashto:
            reasons.append("Live page content indicates resource is unavailable.")

        if enforce_pashto_relevance and not metadata_pashto and not page_pashto:
            reasons.append("No Pashto signal found in metadata or live page content.")

        if (
            enforce_pashto_relevance
            and _is_automated_candidate_like(resource)
            and not direct_pashto
            and not page_pashto
        ):
            reasons.append("Automated candidate lacks direct Pashto signal and appears low-confidence.")

        if reasons:
            removals.append(
                {
                    "id": rid,
                    "title": title,
                    "url": url,
                    "reasons": reasons,
                    "evidence": {
                        "status_code": probe.status_code,
                        "final_url": probe.final_url,
                        "metadata_pashto": metadata_pashto,
                        "direct_pashto": direct_pashto,
                        "page_pashto": page_pashto,
                    },
                }
            )
            continue

        kept.append(resource)
        # Only kept entries register as "seen", so the first valid occurrence wins.
        if isinstance(rid, str) and rid:
            seen_ids[rid] = title if isinstance(title, str) and title else rid
        if canonical_url:
            seen_urls[url_key] = title if isinstance(title, str) and title else canonical_url

    updated_catalog = dict(catalog)
    if len(kept) != len(resources):
        updated_catalog["resources"] = kept
        updated_catalog["updated_on"] = date.today().isoformat()

    report = {
        "checked": len(resources),
        "kept": len(kept),
        "removed": len(removals),
        "removals": removals,
        "warnings": warnings,
    }
    return updated_catalog, report
325
+
326
+
327
def update_removal_log(log_path: Path, removals: list[dict[str, Any]]) -> None:
    """Append ``removals`` to the JSON removal log at ``log_path``.

    An existing log is extended. A missing log, a log that is not valid JSON,
    or a log whose top-level value is not an object is replaced by a fresh
    payload. Parent directories are created as needed.

    Args:
        log_path: Location of the removal log file.
        removals: Removal records; ``id``, ``title``, ``url``, ``reasons`` and
            ``evidence`` are copied from each, with empty defaults when absent.
    """
    today = date.today().isoformat()

    payload: Any = None
    if log_path.exists():
        try:
            payload = json.loads(log_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            payload = None
    # Valid JSON that is not an object (e.g. a bare list) cannot hold the
    # expected keys, so it is treated like a corrupt log.
    if not isinstance(payload, dict):
        payload = {"updated_on": today, "entries": []}

    entries = payload.get("entries")
    if not isinstance(entries, list):
        entries = []

    # One timestamp per run so all removals from the same review share it.
    removed_on = datetime.now(timezone.utc).isoformat()
    entries.extend(
        {
            "removed_on": removed_on,
            "id": item.get("id", ""),
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "reasons": item.get("reasons", []),
            "evidence": item.get("evidence", {}),
        }
        for item in removals
    )

    payload["updated_on"] = today
    payload["entries"] = entries
    log_path.parent.mkdir(parents=True, exist_ok=True)
    log_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
358
+
359
+
360
def main() -> int:
    """Command-line entry point for the existing-resource review.

    Loads the catalog, runs ``review_resources``, prints a summary with every
    warning and removal, and, unless ``--dry-run`` is given, writes the updated
    catalog and appends removals to the removal log.

    Returns:
        1 when the catalog file is missing or is not valid JSON, otherwise 0.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--catalog", default="resources/catalog/resources.json")
    cli.add_argument("--timeout", type=float, default=12.0)
    cli.add_argument("--max-workers", type=int, default=12)
    cli.add_argument("--removal-log", default="resources/catalog/removal_log.json")
    cli.add_argument("--dry-run", action="store_true")
    cli.add_argument(
        "--enforce-pashto-relevance",
        action="store_true",
        help="Also remove entries that have no Pashto signal in metadata or live page content.",
    )
    args = cli.parse_args()

    catalog_path = Path(args.catalog)
    removal_log_path = Path(args.removal_log)

    if not catalog_path.exists():
        print(f"Missing catalog file: {catalog_path}")
        return 1

    try:
        catalog = json.loads(catalog_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        print(f"Invalid catalog JSON: {exc}")
        return 1

    updated_catalog, report = review_resources(
        catalog,
        timeout=args.timeout,
        enforce_pashto_relevance=args.enforce_pashto_relevance,
        max_workers=args.max_workers,
        probe_fn=probe_resource_url,
    )

    def _persist_catalog() -> None:
        # Same serialisation for both write paths below.
        serialized = json.dumps(updated_catalog, ensure_ascii=False, indent=2) + "\n"
        catalog_path.write_text(serialized, encoding="utf-8")

    warning_count = len(report["warnings"])
    print(
        "Resource review complete: "
        f"checked={report['checked']} kept={report['kept']} removed={report['removed']} "
        f"warnings={warning_count}"
    )

    for warning in report["warnings"]:
        print(f"[warn] {warning}")

    writes_enabled = not args.dry_run
    if report["removed"]:
        for removal in report["removals"]:
            removal_id = removal.get("id", "<unknown>")
            print(f"[remove] {removal_id}")
            for reason in removal.get("reasons", []):
                print(f" - {reason}")

        if writes_enabled:
            _persist_catalog()
            update_removal_log(removal_log_path, report["removals"])
    elif writes_enabled and updated_catalog != catalog:
        # Defensive branch for any non-removal edits.
        _persist_catalog()

    return 0
420
+
421
+
422
+ if __name__ == "__main__":
423
+ raise SystemExit(main())
scripts/run_resource_cycle.py CHANGED
@@ -9,6 +9,8 @@ Usage:
9
  python scripts/run_resource_cycle.py --skip-pytest
10
  python scripts/run_resource_cycle.py --discover-only
11
  python scripts/run_resource_cycle.py --max-promotions 10
 
 
12
  """
13
 
14
  from __future__ import annotations
@@ -37,12 +39,38 @@ def main() -> int:
37
  default=None,
38
  help="Optional cap for auto-promotion count from pending candidates",
39
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  args = parser.parse_args()
41
 
42
  repo_root = Path(__file__).resolve().parents[1]
43
- steps: list[list[str]] = [
44
- ["python", "scripts/sync_resources.py", "--limit", str(args.limit)],
45
- ]
 
 
 
 
 
 
 
 
 
 
46
 
47
  if not args.discover_only:
48
  promote_step = ["python", "scripts/promote_candidates.py"]
 
9
  python scripts/run_resource_cycle.py --skip-pytest
10
  python scripts/run_resource_cycle.py --discover-only
11
  python scripts/run_resource_cycle.py --max-promotions 10
12
+ python scripts/run_resource_cycle.py --skip-existing-review
13
+ python scripts/run_resource_cycle.py --skip-pashto-relevance-check
14
  """
15
 
16
  from __future__ import annotations
 
39
  default=None,
40
  help="Optional cap for auto-promotion count from pending candidates",
41
  )
42
+ parser.add_argument(
43
+ "--skip-existing-review",
44
+ action="store_true",
45
+ help="Skip review/removal of stale existing resources before syncing candidates.",
46
+ )
47
+ parser.add_argument(
48
+ "--resource-timeout",
49
+ type=float,
50
+ default=12.0,
51
+ help="Timeout in seconds for existing-resource URL probes.",
52
+ )
53
+ parser.add_argument(
54
+ "--skip-pashto-relevance-check",
55
+ action="store_true",
56
+ help="Disable Pashto relevance filtering in existing-resource review.",
57
+ )
58
  args = parser.parse_args()
59
 
60
  repo_root = Path(__file__).resolve().parents[1]
61
+ steps: list[list[str]] = []
62
+ if not args.skip_existing_review:
63
+ review_step = [
64
+ "python",
65
+ "scripts/review_existing_resources.py",
66
+ "--timeout",
67
+ str(args.resource_timeout),
68
+ ]
69
+ if not args.skip_pashto_relevance_check:
70
+ review_step.append("--enforce-pashto-relevance")
71
+ steps.append(review_step)
72
+
73
+ steps.append(["python", "scripts/sync_resources.py", "--limit", str(args.limit)])
74
 
75
  if not args.discover_only:
76
  promote_step = ["python", "scripts/promote_candidates.py"]
tests/test_promote_candidates.py CHANGED
@@ -1,5 +1,6 @@
1
  from datetime import date
2
 
 
3
  from scripts.promote_candidates import PLACEHOLDER_PRIMARY_USE, promote_candidates
4
 
5
 
@@ -132,3 +133,29 @@ def test_promote_candidates_respects_max_promotions() -> None:
132
  assert len(promoted) == 1
133
  assert stats["promoted"] == 1
134
  assert len(catalog["resources"]) == 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from datetime import date
2
 
3
+ import scripts.promote_candidates as promote_module
4
  from scripts.promote_candidates import PLACEHOLDER_PRIMARY_USE, promote_candidates
5
 
6
 
 
133
  assert len(promoted) == 1
134
  assert stats["promoted"] == 1
135
  assert len(catalog["resources"]) == 2
136
+
137
+
138
def test_promote_candidates_skips_unavailable_when_url_check_enabled(monkeypatch) -> None:
    """With URL verification on, a candidate reported unavailable is not promoted."""
    unavailable_candidate = _candidate(
        rid="dataset-unavailable",
        title="Pashto Unavailable Dataset",
        url="https://example.org/pashto-unavailable",
    )
    pending = {"candidate_count": 1, "candidates": [unavailable_candidate]}
    catalog = _catalog()

    def always_unavailable(*_args, **_kwargs) -> bool:
        # Stand-in for the network probe: every URL is reported as gone.
        return True

    monkeypatch.setattr(promote_module, "_candidate_url_unavailable", always_unavailable)

    promoted, stats = promote_candidates(catalog, pending, verify_urls=True)

    assert promoted == []
    assert stats["promoted"] == 0
    assert stats["unavailable"] == 1
tests/test_review_existing_resources.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scripts.review_existing_resources import UrlProbe, review_resources
2
+
3
+
4
+ def _resource(*, rid: str, title: str, url: str) -> dict:
5
+ return {
6
+ "id": rid,
7
+ "title": title,
8
+ "url": url,
9
+ "category": "dataset",
10
+ "source": "other",
11
+ "status": "verified",
12
+ "summary": "Resource summary used for catalog review tests.",
13
+ "primary_use": "Testing",
14
+ "tasks": ["nlp"],
15
+ "pashto_evidence": {
16
+ "evidence_text": "Contains Pashto signal in metadata.",
17
+ "evidence_url": url,
18
+ "markers": ["Pashto"],
19
+ },
20
+ "tags": ["pashto", "dataset"],
21
+ }
22
+
23
+
24
def test_review_resources_removes_hard_missing_urls() -> None:
    """An entry whose probe reports a hard 404 is removed and the reason is recorded."""
    entry = _resource(rid="dataset-a", title="Pashto A", url="https://example.org/a")
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [entry]}

    def missing_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(status_code=404, hard_missing=True)

    updated, report = review_resources(catalog, probe_fn=missing_probe)

    assert report["removed"] == 1
    assert updated["resources"] == []
    first_reasons = report["removals"][0]["reasons"]
    assert any("hard-missing HTTP status 404" in reason for reason in first_reasons)
39
+
40
+
41
def test_review_resources_keeps_resource_when_probe_is_inconclusive() -> None:
    """An inconclusive probe keeps the entry and produces exactly one warning."""
    entry = _resource(rid="dataset-a", title="Pashto A", url="https://example.org/a")
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [entry]}

    def timed_out_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(uncertain_error="timed out")

    updated, report = review_resources(catalog, probe_fn=timed_out_probe)

    assert report["removed"] == 0
    assert len(updated["resources"]) == 1
    assert len(report["warnings"]) == 1
56
+
57
+
58
def test_review_resources_removes_duplicate_urls() -> None:
    """Of two same-category entries sharing a URL, the later one is removed."""
    shared_url = "https://example.org/shared"
    first = _resource(rid="dataset-a", title="Pashto A", url=shared_url)
    second = _resource(rid="dataset-b", title="Pashto B", url=shared_url)
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [first, second]}

    def reachable_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(status_code=200, content_sample="Pashto")

    updated, report = review_resources(catalog, probe_fn=reachable_probe)

    assert report["removed"] == 1
    assert len(updated["resources"]) == 1
    first_reasons = report["removals"][0]["reasons"]
    assert any("Duplicate canonical URL" in reason for reason in first_reasons)
76
+
77
+
78
def test_review_resources_allows_same_url_across_different_categories() -> None:
    """Entries sharing a URL are both kept when their categories differ."""
    shared_url = "https://example.org/shared"
    dataset = _resource(rid="dataset-a", title="Pashto A", url=shared_url)
    benchmark = _resource(rid="benchmark-a", title="Pashto A Benchmark", url=shared_url)
    benchmark.update({"category": "benchmark", "tags": ["pashto", "benchmark"]})
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [dataset, benchmark]}

    def reachable_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(status_code=200, content_sample="Pashto")

    updated, report = review_resources(catalog, probe_fn=reachable_probe)

    assert report["removed"] == 0
    assert len(updated["resources"]) == 2
96
+
97
+
98
def test_review_resources_enforces_pashto_relevance_only_when_enabled() -> None:
    """An entry without any Pashto signal is removed only in strict mode."""
    generic = _resource(rid="dataset-x", title="General Dataset", url="https://example.org/general")
    # Strip every Pashto marker the helper adds so only the mode flag matters.
    generic["pashto_evidence"]["evidence_text"] = "Generic metadata note."
    generic["pashto_evidence"]["markers"] = ["generic"]
    generic["tags"] = ["dataset", "general"]
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [generic]}

    def generic_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(status_code=200, content_sample="General language resource")

    updated_relaxed, report_relaxed = review_resources(
        catalog, probe_fn=generic_probe, enforce_pashto_relevance=False
    )
    updated_strict, report_strict = review_resources(
        catalog, probe_fn=generic_probe, enforce_pashto_relevance=True
    )

    assert report_relaxed["removed"] == 0
    assert len(updated_relaxed["resources"]) == 1
    assert report_strict["removed"] == 1
    assert updated_strict["resources"] == []