musaw committed on
Commit ·
194828a
1
Parent(s): 6f1c8bd
sync(hf): snapshot origin main after resource audit cycle
Browse files
- .github/workflows/resource_sync.yml +35 -2
- docs/resource_automation.md +7 -4
- docs/resource_cycle_runbook.md +10 -8
- docs/search/resources.json +0 -0
- resources/README.md +6 -6
- resources/catalog/pending_candidates.json +0 -0
- resources/catalog/removal_log.json +1205 -0
- resources/catalog/resources.json +0 -0
- resources/codes/README.md +3 -0
- resources/datasets/README.md +13 -15
- resources/models/README.md +1 -0
- resources/papers/README.md +80 -0
- resources/projects/README.md +31 -0
- scripts/README.md +16 -0
- scripts/promote_candidates.py +41 -3
- scripts/review_existing_resources.py +423 -0
- scripts/run_resource_cycle.py +31 -3
- tests/test_promote_candidates.py +27 -0
- tests/test_review_existing_resources.py +114 -0
.github/workflows/resource_sync.yml
CHANGED
|
@@ -4,11 +4,29 @@ on:
|
|
| 4 |
schedule:
|
| 5 |
- cron: "0 4 * * *"
|
| 6 |
workflow_dispatch:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
permissions:
|
| 9 |
contents: write
|
| 10 |
pull-requests: write
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
jobs:
|
| 13 |
sync:
|
| 14 |
runs-on: ubuntu-latest
|
|
@@ -26,11 +44,24 @@ jobs:
|
|
| 26 |
python -m pip install --upgrade pip
|
| 27 |
python -m pip install -e ".[dev]"
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
- name: Sync candidate resources
|
| 30 |
-
run: python scripts/sync_resources.py --limit
|
| 31 |
|
| 32 |
- name: Auto-promote valid candidates
|
| 33 |
-
run:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
- name: Validate catalog
|
| 36 |
run: python scripts/validate_resource_catalog.py
|
|
@@ -79,6 +110,7 @@ jobs:
|
|
| 79 |
Automated daily resource sync.
|
| 80 |
|
| 81 |
Scope:
|
|
|
|
| 82 |
- Updates `resources/catalog/pending_candidates.json`
|
| 83 |
- Auto-promotes valid non-duplicate candidates into `resources/catalog/resources.json`
|
| 84 |
- Regenerates resource indexes and search payload
|
|
@@ -88,6 +120,7 @@ jobs:
|
|
| 88 |
add-paths: |
|
| 89 |
resources/catalog/pending_candidates.json
|
| 90 |
resources/catalog/resources.json
|
|
|
|
| 91 |
resources/README.md
|
| 92 |
resources/datasets/README.md
|
| 93 |
resources/models/README.md
|
|
|
|
| 4 |
schedule:
|
| 5 |
- cron: "0 4 * * *"
|
| 6 |
workflow_dispatch:
|
| 7 |
+
inputs:
|
| 8 |
+
limit:
|
| 9 |
+
description: "Candidate fetch limit per source"
|
| 10 |
+
required: false
|
| 11 |
+
default: "20"
|
| 12 |
+
max_promotions:
|
| 13 |
+
description: "Optional max number of candidate promotions"
|
| 14 |
+
required: false
|
| 15 |
+
default: ""
|
| 16 |
+
enforce_pashto_relevance:
|
| 17 |
+
description: "Also remove existing entries without Pashto evidence"
|
| 18 |
+
required: false
|
| 19 |
+
default: "true"
|
| 20 |
|
| 21 |
permissions:
|
| 22 |
contents: write
|
| 23 |
pull-requests: write
|
| 24 |
|
| 25 |
+
env:
|
| 26 |
+
RESOURCE_LIMIT: ${{ github.event.inputs.limit || '20' }}
|
| 27 |
+
MAX_PROMOTIONS: ${{ github.event.inputs.max_promotions || '' }}
|
| 28 |
+
ENFORCE_PASHTO_RELEVANCE: ${{ github.event.inputs.enforce_pashto_relevance || 'true' }}
|
| 29 |
+
|
| 30 |
jobs:
|
| 31 |
sync:
|
| 32 |
runs-on: ubuntu-latest
|
|
|
|
| 44 |
python -m pip install --upgrade pip
|
| 45 |
python -m pip install -e ".[dev]"
|
| 46 |
|
| 47 |
+
- name: Review existing resources for stale or low-value entries
|
| 48 |
+
run: |
|
| 49 |
+
if [ "${ENFORCE_PASHTO_RELEVANCE}" = "true" ]; then
|
| 50 |
+
python scripts/review_existing_resources.py --enforce-pashto-relevance
|
| 51 |
+
else
|
| 52 |
+
python scripts/review_existing_resources.py
|
| 53 |
+
fi
|
| 54 |
+
|
| 55 |
- name: Sync candidate resources
|
| 56 |
+
run: python scripts/sync_resources.py --limit "${RESOURCE_LIMIT}"
|
| 57 |
|
| 58 |
- name: Auto-promote valid candidates
|
| 59 |
+
run: |
|
| 60 |
+
if [ -n "${MAX_PROMOTIONS}" ]; then
|
| 61 |
+
python scripts/promote_candidates.py --max-promotions "${MAX_PROMOTIONS}"
|
| 62 |
+
else
|
| 63 |
+
python scripts/promote_candidates.py
|
| 64 |
+
fi
|
| 65 |
|
| 66 |
- name: Validate catalog
|
| 67 |
run: python scripts/validate_resource_catalog.py
|
|
|
|
| 110 |
Automated daily resource sync.
|
| 111 |
|
| 112 |
Scope:
|
| 113 |
+
- Reviews existing catalog entries and removes stale ones only with strong logged reasons
|
| 114 |
- Updates `resources/catalog/pending_candidates.json`
|
| 115 |
- Auto-promotes valid non-duplicate candidates into `resources/catalog/resources.json`
|
| 116 |
- Regenerates resource indexes and search payload
|
|
|
|
| 120 |
add-paths: |
|
| 121 |
resources/catalog/pending_candidates.json
|
| 122 |
resources/catalog/resources.json
|
| 123 |
+
resources/catalog/removal_log.json
|
| 124 |
resources/README.md
|
| 125 |
resources/datasets/README.md
|
| 126 |
resources/models/README.md
|
docs/resource_automation.md
CHANGED
|
@@ -28,6 +28,7 @@ This repository uses automated discovery and promotion to keep Pashto resources
|
|
| 28 |
## Scripts
|
| 29 |
- Validate catalog: `python scripts/validate_resource_catalog.py`
|
| 30 |
- Generate markdown and search index: `python scripts/generate_resource_views.py`
|
|
|
|
| 31 |
- Sync new candidates: `python scripts/sync_resources.py --limit 20`
|
| 32 |
- Auto-promote valid candidates: `python scripts/promote_candidates.py`
|
| 33 |
- Full run wrapper: `python scripts/run_resource_cycle.py --limit 25`
|
|
@@ -39,14 +40,16 @@ This repository uses automated discovery and promotion to keep Pashto resources
|
|
| 39 |
- markdown link checks
|
| 40 |
- tests
|
| 41 |
- Resource Sync (`.github/workflows/resource_sync.yml`) runs daily, syncs candidates, auto-promotes valid non-duplicate entries, regenerates views, and opens a PR.
|
|
|
|
| 42 |
|
| 43 |
## Promotion flow
|
| 44 |
-
1.
|
| 45 |
-
2.
|
| 46 |
-
3.
|
|
|
|
| 47 |
- `python scripts/validate_resource_catalog.py`
|
| 48 |
- `python scripts/generate_resource_views.py`
|
| 49 |
-
|
| 50 |
|
| 51 |
## Runbook
|
| 52 |
- Reusable process guide: [resource_cycle_runbook.md](resource_cycle_runbook.md)
|
|
|
|
| 28 |
## Scripts
|
| 29 |
- Validate catalog: `python scripts/validate_resource_catalog.py`
|
| 30 |
- Generate markdown and search index: `python scripts/generate_resource_views.py`
|
| 31 |
+
- Review existing resources for stale/deleted entries: `python scripts/review_existing_resources.py`
|
| 32 |
- Sync new candidates: `python scripts/sync_resources.py --limit 20`
|
| 33 |
- Auto-promote valid candidates: `python scripts/promote_candidates.py`
|
| 34 |
- Full run wrapper: `python scripts/run_resource_cycle.py --limit 25`
|
|
|
|
| 40 |
- markdown link checks
|
| 41 |
- tests
|
| 42 |
- Resource Sync (`.github/workflows/resource_sync.yml`) runs daily, syncs candidates, auto-promotes valid non-duplicate entries, regenerates views, and opens a PR.
|
| 43 |
+
- Before candidate sync, it reviews existing entries and removes stale/deleted or non-Pashto/low-value entries only with strong logged reasons.
|
| 44 |
|
| 45 |
## Promotion flow
|
| 46 |
+
1. Review existing resources and remove stale entries with strong reasons.
|
| 47 |
+
2. Sync candidates into `resources/catalog/pending_candidates.json`.
|
| 48 |
+
3. Auto-promote valid, non-duplicate, URL-available entries into `resources/catalog/resources.json`.
|
| 49 |
+
4. Run:
|
| 50 |
- `python scripts/validate_resource_catalog.py`
|
| 51 |
- `python scripts/generate_resource_views.py`
|
| 52 |
+
5. Review PR and merge.
|
| 53 |
|
| 54 |
## Runbook
|
| 55 |
- Reusable process guide: [resource_cycle_runbook.md](resource_cycle_runbook.md)
|
docs/resource_cycle_runbook.md
CHANGED
|
@@ -5,7 +5,7 @@ Use this runbook whenever you want to repeat the resource update process without
|
|
| 5 |
## Daily automation (already enabled)
|
| 6 |
- Workflow: [../.github/workflows/resource_sync.yml](../.github/workflows/resource_sync.yml)
|
| 7 |
- Schedule: every day at 04:00 UTC via GitHub Actions cron.
|
| 8 |
-
- Output: updates [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json), auto-promotes valid non-duplicate entries into [../resources/catalog/resources.json](../resources/catalog/resources.json), regenerates views, and opens a review PR.
|
| 9 |
|
| 10 |
## Manual run (single command)
|
| 11 |
Run from repository root:
|
|
@@ -15,12 +15,13 @@ python scripts/run_resource_cycle.py --limit 25
|
|
| 15 |
```
|
| 16 |
|
| 17 |
What it executes:
|
| 18 |
-
1. `python scripts/
|
| 19 |
-
2. `python scripts/
|
| 20 |
-
3. `python scripts/
|
| 21 |
-
4. `python scripts/
|
| 22 |
-
5. `python scripts/
|
| 23 |
-
6. `python
|
|
|
|
| 24 |
|
| 25 |
Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, GitLab repositories, Zenodo records, Dataverse datasets, DataCite DOI records, and paper endpoints (arXiv, Semantic Scholar, OpenAlex, Crossref).
|
| 26 |
|
|
@@ -33,7 +34,8 @@ If you want fresh candidates without auto-promotion:
|
|
| 33 |
5. Commit and push.
|
| 34 |
|
| 35 |
## Guardrails
|
| 36 |
-
- Auto-promotion accepts only entries that pass dedupe and catalog validation checks.
|
|
|
|
| 37 |
- Keep `status: verified` for entries that pass automation checks and repository review.
|
| 38 |
- Do not promote "reference-only" resources where Pashto is incidental; only Pashto-centric resources are eligible.
|
| 39 |
- Treat spelling variants as valid Pashto markers during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `pashto-script`).
|
|
|
|
| 5 |
## Daily automation (already enabled)
|
| 6 |
- Workflow: [../.github/workflows/resource_sync.yml](../.github/workflows/resource_sync.yml)
|
| 7 |
- Schedule: every day at 04:00 UTC via GitHub Actions cron.
|
| 8 |
+
- Output: reviews existing resources for stale/deleted links and non-Pashto/low-value entries (removing only with strong logged reasons), updates [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json), auto-promotes valid non-duplicate entries into [../resources/catalog/resources.json](../resources/catalog/resources.json), regenerates views, and opens a review PR.
|
| 9 |
|
| 10 |
## Manual run (single command)
|
| 11 |
Run from repository root:
|
|
|
|
| 15 |
```
|
| 16 |
|
| 17 |
What it executes:
|
| 18 |
+
1. `python scripts/review_existing_resources.py`
|
| 19 |
+
2. `python scripts/sync_resources.py --limit 25`
|
| 20 |
+
3. `python scripts/promote_candidates.py`
|
| 21 |
+
4. `python scripts/validate_resource_catalog.py`
|
| 22 |
+
5. `python scripts/generate_resource_views.py`
|
| 23 |
+
6. `python scripts/check_links.py`
|
| 24 |
+
7. `python -m pytest -q`
|
| 25 |
|
| 26 |
Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, GitLab repositories, Zenodo records, Dataverse datasets, DataCite DOI records, and paper endpoints (arXiv, Semantic Scholar, OpenAlex, Crossref).
|
| 27 |
|
|
|
|
| 34 |
5. Commit and push.
|
| 35 |
|
| 36 |
## Guardrails
|
| 37 |
+
- Auto-promotion accepts only entries that pass dedupe, URL-availability checks, and catalog validation checks.
|
| 38 |
+
- Existing resources are auto-removed only for strong reasons (for example confirmed hard-missing links, duplicates, or missing Pashto relevance), with reasons stored in `resources/catalog/removal_log.json`.
|
| 39 |
- Keep `status: verified` for entries that pass automation checks and repository review.
|
| 40 |
- Do not promote "reference-only" resources where Pashto is incidental; only Pashto-centric resources are eligible.
|
| 41 |
- Treat spelling variants as valid Pashto markers during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `pashto-script`).
|
docs/search/resources.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
resources/README.md
CHANGED
|
@@ -3,13 +3,13 @@
|
|
| 3 |
Structured, Pashto-focused resource tracking lives in this folder.
|
| 4 |
|
| 5 |
## Sections
|
| 6 |
-
- Datasets (
|
| 7 |
-
- Models (
|
| 8 |
- Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
|
| 9 |
- Tools (0): [tools/README.md](tools/README.md)
|
| 10 |
-
- Papers (
|
| 11 |
-
- Projects (
|
| 12 |
-
- Code (
|
| 13 |
|
| 14 |
## Machine-Readable Catalog
|
| 15 |
- Canonical catalog: [catalog/resources.json](catalog/resources.json)
|
|
@@ -22,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
|
|
| 22 |
- Run `python scripts/validate_resource_catalog.py` before opening a PR.
|
| 23 |
- Run `python scripts/generate_resource_views.py` after catalog changes.
|
| 24 |
|
| 25 |
-
Verified resource count: `
|
|
|
|
| 3 |
Structured, Pashto-focused resource tracking lives in this folder.
|
| 4 |
|
| 5 |
## Sections
|
| 6 |
+
- Datasets (46): [datasets/README.md](datasets/README.md)
|
| 7 |
+
- Models (19): [models/README.md](models/README.md)
|
| 8 |
- Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
|
| 9 |
- Tools (0): [tools/README.md](tools/README.md)
|
| 10 |
+
- Papers (104): [papers/README.md](papers/README.md)
|
| 11 |
+
- Projects (48): [projects/README.md](projects/README.md)
|
| 12 |
+
- Code (4): [codes/README.md](codes/README.md)
|
| 13 |
|
| 14 |
## Machine-Readable Catalog
|
| 15 |
- Canonical catalog: [catalog/resources.json](catalog/resources.json)
|
|
|
|
| 22 |
- Run `python scripts/validate_resource_catalog.py` before opening a PR.
|
| 23 |
- Run `python scripts/generate_resource_views.py` after catalog changes.
|
| 24 |
|
| 25 |
+
Verified resource count: `225`
|
resources/catalog/pending_candidates.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
resources/catalog/removal_log.json
ADDED
|
@@ -0,0 +1,1205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"updated_on": "2026-02-22",
|
| 3 |
+
"entries": [
|
| 4 |
+
{
|
| 5 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 6 |
+
"id": "dataset-kaggle-pashto-isolated-words",
|
| 7 |
+
"title": "Pashto Isolated Words Speech Dataset",
|
| 8 |
+
"url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 9 |
+
"reasons": [
|
| 10 |
+
"URL returned hard-missing HTTP status 404."
|
| 11 |
+
],
|
| 12 |
+
"evidence": {
|
| 13 |
+
"status_code": 404,
|
| 14 |
+
"final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 15 |
+
"metadata_pashto": true,
|
| 16 |
+
"direct_pashto": true,
|
| 17 |
+
"page_pashto": false
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 22 |
+
"id": "dataset-kaggle-pashto-word-embeddings",
|
| 23 |
+
"title": "Pashto Word Embeddings",
|
| 24 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 25 |
+
"reasons": [
|
| 26 |
+
"URL returned hard-missing HTTP status 404."
|
| 27 |
+
],
|
| 28 |
+
"evidence": {
|
| 29 |
+
"status_code": 404,
|
| 30 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 31 |
+
"metadata_pashto": true,
|
| 32 |
+
"direct_pashto": true,
|
| 33 |
+
"page_pashto": false
|
| 34 |
+
}
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 38 |
+
"id": "dataset-kaggle-pold-pashto-offensive",
|
| 39 |
+
"title": "POLD - Pashto Offensive Language Dataset",
|
| 40 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 41 |
+
"reasons": [
|
| 42 |
+
"URL returned hard-missing HTTP status 404."
|
| 43 |
+
],
|
| 44 |
+
"evidence": {
|
| 45 |
+
"status_code": 404,
|
| 46 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 47 |
+
"metadata_pashto": true,
|
| 48 |
+
"direct_pashto": true,
|
| 49 |
+
"page_pashto": false
|
| 50 |
+
}
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 54 |
+
"id": "dataset-kaggle-pashto-english-sentiment-corpus",
|
| 55 |
+
"title": "Pashto English Bilingual Sentiment Corpus",
|
| 56 |
+
"url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 57 |
+
"reasons": [
|
| 58 |
+
"URL returned hard-missing HTTP status 404."
|
| 59 |
+
],
|
| 60 |
+
"evidence": {
|
| 61 |
+
"status_code": 404,
|
| 62 |
+
"final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 63 |
+
"metadata_pashto": true,
|
| 64 |
+
"direct_pashto": true,
|
| 65 |
+
"page_pashto": false
|
| 66 |
+
}
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 70 |
+
"id": "dataset-kaggle-urdu-pashto-lexicon",
|
| 71 |
+
"title": "Urdu-Pashto Lexicon Dataset",
|
| 72 |
+
"url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 73 |
+
"reasons": [
|
| 74 |
+
"URL returned hard-missing HTTP status 404."
|
| 75 |
+
],
|
| 76 |
+
"evidence": {
|
| 77 |
+
"status_code": 404,
|
| 78 |
+
"final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 79 |
+
"metadata_pashto": true,
|
| 80 |
+
"direct_pashto": true,
|
| 81 |
+
"page_pashto": false
|
| 82 |
+
}
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 86 |
+
"id": "dataset-kaggle-drijaz-pashtoocr",
|
| 87 |
+
"title": "PashtoOCR (Kaggle)",
|
| 88 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 89 |
+
"reasons": [
|
| 90 |
+
"URL returned hard-missing HTTP status 404."
|
| 91 |
+
],
|
| 92 |
+
"evidence": {
|
| 93 |
+
"status_code": 404,
|
| 94 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 95 |
+
"metadata_pashto": true,
|
| 96 |
+
"direct_pashto": true,
|
| 97 |
+
"page_pashto": false
|
| 98 |
+
}
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 102 |
+
"id": "dataset-kaggle-english-pashto-language-dataset-epld",
|
| 103 |
+
"title": "English-Pashto Language Dataset (EPLD)",
|
| 104 |
+
"url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 105 |
+
"reasons": [
|
| 106 |
+
"URL returned hard-missing HTTP status 404."
|
| 107 |
+
],
|
| 108 |
+
"evidence": {
|
| 109 |
+
"status_code": 404,
|
| 110 |
+
"final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 111 |
+
"metadata_pashto": true,
|
| 112 |
+
"direct_pashto": true,
|
| 113 |
+
"page_pashto": false
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 118 |
+
"id": "dataset-kaggle-katib-s-pashto-text-imagebase-kpti",
|
| 119 |
+
"title": "Katib's Pashto Text Imagebase (KPTI)",
|
| 120 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 121 |
+
"reasons": [
|
| 122 |
+
"URL returned hard-missing HTTP status 404."
|
| 123 |
+
],
|
| 124 |
+
"evidence": {
|
| 125 |
+
"status_code": 404,
|
| 126 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 127 |
+
"metadata_pashto": true,
|
| 128 |
+
"direct_pashto": true,
|
| 129 |
+
"page_pashto": false
|
| 130 |
+
}
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 134 |
+
"id": "dataset-kaggle-pashto-ocr",
|
| 135 |
+
"title": "Pashto OCR",
|
| 136 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 137 |
+
"reasons": [
|
| 138 |
+
"URL returned hard-missing HTTP status 404."
|
| 139 |
+
],
|
| 140 |
+
"evidence": {
|
| 141 |
+
"status_code": 404,
|
| 142 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 143 |
+
"metadata_pashto": true,
|
| 144 |
+
"direct_pashto": true,
|
| 145 |
+
"page_pashto": false
|
| 146 |
+
}
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 150 |
+
"id": "dataset-kaggle-common-voice-24-0-pashto-speech-dataset",
|
| 151 |
+
"title": "Common Voice 24.0: Pashto Speech Dataset",
|
| 152 |
+
"url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 153 |
+
"reasons": [
|
| 154 |
+
"URL returned hard-missing HTTP status 404."
|
| 155 |
+
],
|
| 156 |
+
"evidence": {
|
| 157 |
+
"status_code": 404,
|
| 158 |
+
"final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 159 |
+
"metadata_pashto": true,
|
| 160 |
+
"direct_pashto": true,
|
| 161 |
+
"page_pashto": false
|
| 162 |
+
}
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 166 |
+
"id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
|
| 167 |
+
"title": "Pashto Isolated Alphabets and Numerals",
|
| 168 |
+
"url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 169 |
+
"reasons": [
|
| 170 |
+
"URL returned hard-missing HTTP status 404."
|
| 171 |
+
],
|
| 172 |
+
"evidence": {
|
| 173 |
+
"status_code": 404,
|
| 174 |
+
"final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 175 |
+
"metadata_pashto": true,
|
| 176 |
+
"direct_pashto": true,
|
| 177 |
+
"page_pashto": false
|
| 178 |
+
}
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 182 |
+
"id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
|
| 183 |
+
"title": "Pashto Poetry",
|
| 184 |
+
"url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 185 |
+
"reasons": [
|
| 186 |
+
"URL returned hard-missing HTTP status 404."
|
| 187 |
+
],
|
| 188 |
+
"evidence": {
|
| 189 |
+
"status_code": 404,
|
| 190 |
+
"final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 191 |
+
"metadata_pashto": true,
|
| 192 |
+
"direct_pashto": true,
|
| 193 |
+
"page_pashto": false
|
| 194 |
+
}
|
| 195 |
+
},
|
| 196 |
+
{
|
| 197 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 198 |
+
"id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
|
| 199 |
+
"title": "Pashto text characters sample",
|
| 200 |
+
"url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 201 |
+
"reasons": [
|
| 202 |
+
"URL returned hard-missing HTTP status 404."
|
| 203 |
+
],
|
| 204 |
+
"evidence": {
|
| 205 |
+
"status_code": 404,
|
| 206 |
+
"final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 207 |
+
"metadata_pashto": true,
|
| 208 |
+
"direct_pashto": true,
|
| 209 |
+
"page_pashto": false
|
| 210 |
+
}
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 214 |
+
"id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
|
| 215 |
+
"title": "pashto_language_alphabets",
|
| 216 |
+
"url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 217 |
+
"reasons": [
|
| 218 |
+
"URL returned hard-missing HTTP status 404."
|
| 219 |
+
],
|
| 220 |
+
"evidence": {
|
| 221 |
+
"status_code": 404,
|
| 222 |
+
"final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 223 |
+
"metadata_pashto": true,
|
| 224 |
+
"direct_pashto": true,
|
| 225 |
+
"page_pashto": false
|
| 226 |
+
}
|
| 227 |
+
},
|
| 228 |
+
{
|
| 229 |
+
"removed_on": "2026-02-21T19:47:22.435531+00:00",
|
| 230 |
+
"id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
|
| 231 |
+
"title": "Pashto_language_characters",
|
| 232 |
+
"url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 233 |
+
"reasons": [
|
| 234 |
+
"URL returned hard-missing HTTP status 404."
|
| 235 |
+
],
|
| 236 |
+
"evidence": {
|
| 237 |
+
"status_code": 404,
|
| 238 |
+
"final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 239 |
+
"metadata_pashto": true,
|
| 240 |
+
"direct_pashto": true,
|
| 241 |
+
"page_pashto": false
|
| 242 |
+
}
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 246 |
+
"id": "candidate-kaggle-dataset-ataullahaali-common-voice-24-0-pashto-speech-dataset",
|
| 247 |
+
"title": "Common Voice 24.0: Pashto Speech Dataset",
|
| 248 |
+
"url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 249 |
+
"reasons": [
|
| 250 |
+
"URL returned hard-missing HTTP status 404."
|
| 251 |
+
],
|
| 252 |
+
"evidence": {
|
| 253 |
+
"status_code": 404,
|
| 254 |
+
"final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 255 |
+
"metadata_pashto": true,
|
| 256 |
+
"direct_pashto": true,
|
| 257 |
+
"page_pashto": false
|
| 258 |
+
}
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 262 |
+
"id": "candidate-kaggle-dataset-rabiakhan827-english-pashto-language-dataset-epld",
|
| 263 |
+
"title": "English-Pashto Language Dataset (EPLD)",
|
| 264 |
+
"url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 265 |
+
"reasons": [
|
| 266 |
+
"URL returned hard-missing HTTP status 404."
|
| 267 |
+
],
|
| 268 |
+
"evidence": {
|
| 269 |
+
"status_code": 404,
|
| 270 |
+
"final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 271 |
+
"metadata_pashto": true,
|
| 272 |
+
"direct_pashto": true,
|
| 273 |
+
"page_pashto": false
|
| 274 |
+
}
|
| 275 |
+
},
|
| 276 |
+
{
|
| 277 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 278 |
+
"id": "candidate-kaggle-dataset-hassanamin-katib-s-pashto-text-imagebase-kpti",
|
| 279 |
+
"title": "Katib's Pashto Text Imagebase (KPTI)",
|
| 280 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 281 |
+
"reasons": [
|
| 282 |
+
"URL returned hard-missing HTTP status 404."
|
| 283 |
+
],
|
| 284 |
+
"evidence": {
|
| 285 |
+
"status_code": 404,
|
| 286 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 287 |
+
"metadata_pashto": true,
|
| 288 |
+
"direct_pashto": true,
|
| 289 |
+
"page_pashto": false
|
| 290 |
+
}
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 294 |
+
"id": "candidate-kaggle-dataset-farhadkhan66-pashto-english-bilingual-sentiment-corpus",
|
| 295 |
+
"title": "Pashto English Bilingual Sentiment Corpus",
|
| 296 |
+
"url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 297 |
+
"reasons": [
|
| 298 |
+
"URL returned hard-missing HTTP status 404."
|
| 299 |
+
],
|
| 300 |
+
"evidence": {
|
| 301 |
+
"status_code": 404,
|
| 302 |
+
"final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 303 |
+
"metadata_pashto": true,
|
| 304 |
+
"direct_pashto": true,
|
| 305 |
+
"page_pashto": false
|
| 306 |
+
}
|
| 307 |
+
},
|
| 308 |
+
{
|
| 309 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 310 |
+
"id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
|
| 311 |
+
"title": "Pashto Isolated Alphabets and Numerals",
|
| 312 |
+
"url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 313 |
+
"reasons": [
|
| 314 |
+
"URL returned hard-missing HTTP status 404."
|
| 315 |
+
],
|
| 316 |
+
"evidence": {
|
| 317 |
+
"status_code": 404,
|
| 318 |
+
"final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 319 |
+
"metadata_pashto": true,
|
| 320 |
+
"direct_pashto": true,
|
| 321 |
+
"page_pashto": false
|
| 322 |
+
}
|
| 323 |
+
},
|
| 324 |
+
{
|
| 325 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 326 |
+
"id": "candidate-kaggle-dataset-engrirf-pashto-isolated-words-speech-dataset",
|
| 327 |
+
"title": "Pashto Isolated Words Speech Dataset",
|
| 328 |
+
"url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 329 |
+
"reasons": [
|
| 330 |
+
"URL returned hard-missing HTTP status 404."
|
| 331 |
+
],
|
| 332 |
+
"evidence": {
|
| 333 |
+
"status_code": 404,
|
| 334 |
+
"final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 335 |
+
"metadata_pashto": true,
|
| 336 |
+
"direct_pashto": true,
|
| 337 |
+
"page_pashto": false
|
| 338 |
+
}
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 342 |
+
"id": "candidate-kaggle-dataset-hassanamin-pashto-ocr",
|
| 343 |
+
"title": "Pashto OCR",
|
| 344 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 345 |
+
"reasons": [
|
| 346 |
+
"URL returned hard-missing HTTP status 404."
|
| 347 |
+
],
|
| 348 |
+
"evidence": {
|
| 349 |
+
"status_code": 404,
|
| 350 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 351 |
+
"metadata_pashto": true,
|
| 352 |
+
"direct_pashto": true,
|
| 353 |
+
"page_pashto": false
|
| 354 |
+
}
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 358 |
+
"id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
|
| 359 |
+
"title": "Pashto Poetry",
|
| 360 |
+
"url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 361 |
+
"reasons": [
|
| 362 |
+
"URL returned hard-missing HTTP status 404."
|
| 363 |
+
],
|
| 364 |
+
"evidence": {
|
| 365 |
+
"status_code": 404,
|
| 366 |
+
"final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 367 |
+
"metadata_pashto": true,
|
| 368 |
+
"direct_pashto": true,
|
| 369 |
+
"page_pashto": false
|
| 370 |
+
}
|
| 371 |
+
},
|
| 372 |
+
{
|
| 373 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 374 |
+
"id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
|
| 375 |
+
"title": "Pashto text characters sample",
|
| 376 |
+
"url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 377 |
+
"reasons": [
|
| 378 |
+
"URL returned hard-missing HTTP status 404."
|
| 379 |
+
],
|
| 380 |
+
"evidence": {
|
| 381 |
+
"status_code": 404,
|
| 382 |
+
"final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 383 |
+
"metadata_pashto": true,
|
| 384 |
+
"direct_pashto": true,
|
| 385 |
+
"page_pashto": false
|
| 386 |
+
}
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 390 |
+
"id": "candidate-kaggle-dataset-drijaz-pashto-word-embeddings",
|
| 391 |
+
"title": "Pashto Word Embeddings",
|
| 392 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 393 |
+
"reasons": [
|
| 394 |
+
"URL returned hard-missing HTTP status 404."
|
| 395 |
+
],
|
| 396 |
+
"evidence": {
|
| 397 |
+
"status_code": 404,
|
| 398 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 399 |
+
"metadata_pashto": true,
|
| 400 |
+
"direct_pashto": true,
|
| 401 |
+
"page_pashto": false
|
| 402 |
+
}
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 406 |
+
"id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
|
| 407 |
+
"title": "pashto_language_alphabets",
|
| 408 |
+
"url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 409 |
+
"reasons": [
|
| 410 |
+
"URL returned hard-missing HTTP status 404."
|
| 411 |
+
],
|
| 412 |
+
"evidence": {
|
| 413 |
+
"status_code": 404,
|
| 414 |
+
"final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 415 |
+
"metadata_pashto": true,
|
| 416 |
+
"direct_pashto": true,
|
| 417 |
+
"page_pashto": false
|
| 418 |
+
}
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 422 |
+
"id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
|
| 423 |
+
"title": "Pashto_language_characters",
|
| 424 |
+
"url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 425 |
+
"reasons": [
|
| 426 |
+
"URL returned hard-missing HTTP status 404."
|
| 427 |
+
],
|
| 428 |
+
"evidence": {
|
| 429 |
+
"status_code": 404,
|
| 430 |
+
"final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 431 |
+
"metadata_pashto": true,
|
| 432 |
+
"direct_pashto": true,
|
| 433 |
+
"page_pashto": false
|
| 434 |
+
}
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 438 |
+
"id": "candidate-kaggle-dataset-drijaz-pashtoocr",
|
| 439 |
+
"title": "PashtoOCR",
|
| 440 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 441 |
+
"reasons": [
|
| 442 |
+
"URL returned hard-missing HTTP status 404."
|
| 443 |
+
],
|
| 444 |
+
"evidence": {
|
| 445 |
+
"status_code": 404,
|
| 446 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 447 |
+
"metadata_pashto": true,
|
| 448 |
+
"direct_pashto": true,
|
| 449 |
+
"page_pashto": false
|
| 450 |
+
}
|
| 451 |
+
},
|
| 452 |
+
{
|
| 453 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 454 |
+
"id": "candidate-kaggle-dataset-drijaz-pold-pashto-offensive-language-dataset",
|
| 455 |
+
"title": "POLD - Pashto Offensive Language Dataset",
|
| 456 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 457 |
+
"reasons": [
|
| 458 |
+
"URL returned hard-missing HTTP status 404."
|
| 459 |
+
],
|
| 460 |
+
"evidence": {
|
| 461 |
+
"status_code": 404,
|
| 462 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 463 |
+
"metadata_pashto": true,
|
| 464 |
+
"direct_pashto": true,
|
| 465 |
+
"page_pashto": false
|
| 466 |
+
}
|
| 467 |
+
},
|
| 468 |
+
{
|
| 469 |
+
"removed_on": "2026-02-21T19:59:50.593781+00:00",
|
| 470 |
+
"id": "candidate-kaggle-dataset-shafeeqgigyani-urdu-pashto-lexicon-dataset",
|
| 471 |
+
"title": "Urdu-Pashto Lexicon Dataset",
|
| 472 |
+
"url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 473 |
+
"reasons": [
|
| 474 |
+
"URL returned hard-missing HTTP status 404."
|
| 475 |
+
],
|
| 476 |
+
"evidence": {
|
| 477 |
+
"status_code": 404,
|
| 478 |
+
"final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 479 |
+
"metadata_pashto": true,
|
| 480 |
+
"direct_pashto": true,
|
| 481 |
+
"page_pashto": false
|
| 482 |
+
}
|
| 483 |
+
},
|
| 484 |
+
{
|
| 485 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 486 |
+
"id": "candidate-kaggle-dataset-ataullahaali-common-voice-24-0-pashto-speech-dataset",
|
| 487 |
+
"title": "Common Voice 24.0: Pashto Speech Dataset",
|
| 488 |
+
"url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 489 |
+
"reasons": [
|
| 490 |
+
"URL returned hard-missing HTTP status 404."
|
| 491 |
+
],
|
| 492 |
+
"evidence": {
|
| 493 |
+
"status_code": 404,
|
| 494 |
+
"final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 495 |
+
"metadata_pashto": true,
|
| 496 |
+
"direct_pashto": true,
|
| 497 |
+
"page_pashto": false
|
| 498 |
+
}
|
| 499 |
+
},
|
| 500 |
+
{
|
| 501 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 502 |
+
"id": "candidate-kaggle-dataset-rabiakhan827-english-pashto-language-dataset-epld",
|
| 503 |
+
"title": "English-Pashto Language Dataset (EPLD)",
|
| 504 |
+
"url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 505 |
+
"reasons": [
|
| 506 |
+
"URL returned hard-missing HTTP status 404."
|
| 507 |
+
],
|
| 508 |
+
"evidence": {
|
| 509 |
+
"status_code": 404,
|
| 510 |
+
"final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 511 |
+
"metadata_pashto": true,
|
| 512 |
+
"direct_pashto": true,
|
| 513 |
+
"page_pashto": false
|
| 514 |
+
}
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 518 |
+
"id": "candidate-kaggle-dataset-hassanamin-katib-s-pashto-text-imagebase-kpti",
|
| 519 |
+
"title": "Katib's Pashto Text Imagebase (KPTI)",
|
| 520 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 521 |
+
"reasons": [
|
| 522 |
+
"URL returned hard-missing HTTP status 404."
|
| 523 |
+
],
|
| 524 |
+
"evidence": {
|
| 525 |
+
"status_code": 404,
|
| 526 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 527 |
+
"metadata_pashto": true,
|
| 528 |
+
"direct_pashto": true,
|
| 529 |
+
"page_pashto": false
|
| 530 |
+
}
|
| 531 |
+
},
|
| 532 |
+
{
|
| 533 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 534 |
+
"id": "candidate-kaggle-dataset-farhadkhan66-pashto-english-bilingual-sentiment-corpus",
|
| 535 |
+
"title": "Pashto English Bilingual Sentiment Corpus",
|
| 536 |
+
"url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 537 |
+
"reasons": [
|
| 538 |
+
"URL returned hard-missing HTTP status 404."
|
| 539 |
+
],
|
| 540 |
+
"evidence": {
|
| 541 |
+
"status_code": 404,
|
| 542 |
+
"final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 543 |
+
"metadata_pashto": true,
|
| 544 |
+
"direct_pashto": true,
|
| 545 |
+
"page_pashto": false
|
| 546 |
+
}
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 550 |
+
"id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
|
| 551 |
+
"title": "Pashto Isolated Alphabets and Numerals",
|
| 552 |
+
"url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 553 |
+
"reasons": [
|
| 554 |
+
"URL returned hard-missing HTTP status 404."
|
| 555 |
+
],
|
| 556 |
+
"evidence": {
|
| 557 |
+
"status_code": 404,
|
| 558 |
+
"final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 559 |
+
"metadata_pashto": true,
|
| 560 |
+
"direct_pashto": true,
|
| 561 |
+
"page_pashto": false
|
| 562 |
+
}
|
| 563 |
+
},
|
| 564 |
+
{
|
| 565 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 566 |
+
"id": "candidate-kaggle-dataset-engrirf-pashto-isolated-words-speech-dataset",
|
| 567 |
+
"title": "Pashto Isolated Words Speech Dataset",
|
| 568 |
+
"url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 569 |
+
"reasons": [
|
| 570 |
+
"URL returned hard-missing HTTP status 404."
|
| 571 |
+
],
|
| 572 |
+
"evidence": {
|
| 573 |
+
"status_code": 404,
|
| 574 |
+
"final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 575 |
+
"metadata_pashto": true,
|
| 576 |
+
"direct_pashto": true,
|
| 577 |
+
"page_pashto": false
|
| 578 |
+
}
|
| 579 |
+
},
|
| 580 |
+
{
|
| 581 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 582 |
+
"id": "candidate-kaggle-dataset-hassanamin-pashto-ocr",
|
| 583 |
+
"title": "Pashto OCR",
|
| 584 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 585 |
+
"reasons": [
|
| 586 |
+
"URL returned hard-missing HTTP status 404."
|
| 587 |
+
],
|
| 588 |
+
"evidence": {
|
| 589 |
+
"status_code": 404,
|
| 590 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 591 |
+
"metadata_pashto": true,
|
| 592 |
+
"direct_pashto": true,
|
| 593 |
+
"page_pashto": false
|
| 594 |
+
}
|
| 595 |
+
},
|
| 596 |
+
{
|
| 597 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 598 |
+
"id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
|
| 599 |
+
"title": "Pashto Poetry",
|
| 600 |
+
"url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 601 |
+
"reasons": [
|
| 602 |
+
"URL returned hard-missing HTTP status 404."
|
| 603 |
+
],
|
| 604 |
+
"evidence": {
|
| 605 |
+
"status_code": 404,
|
| 606 |
+
"final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 607 |
+
"metadata_pashto": true,
|
| 608 |
+
"direct_pashto": true,
|
| 609 |
+
"page_pashto": false
|
| 610 |
+
}
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 614 |
+
"id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
|
| 615 |
+
"title": "Pashto text characters sample",
|
| 616 |
+
"url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 617 |
+
"reasons": [
|
| 618 |
+
"URL returned hard-missing HTTP status 404."
|
| 619 |
+
],
|
| 620 |
+
"evidence": {
|
| 621 |
+
"status_code": 404,
|
| 622 |
+
"final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 623 |
+
"metadata_pashto": true,
|
| 624 |
+
"direct_pashto": true,
|
| 625 |
+
"page_pashto": false
|
| 626 |
+
}
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 630 |
+
"id": "candidate-kaggle-dataset-drijaz-pashto-word-embeddings",
|
| 631 |
+
"title": "Pashto Word Embeddings",
|
| 632 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 633 |
+
"reasons": [
|
| 634 |
+
"URL returned hard-missing HTTP status 404."
|
| 635 |
+
],
|
| 636 |
+
"evidence": {
|
| 637 |
+
"status_code": 404,
|
| 638 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 639 |
+
"metadata_pashto": true,
|
| 640 |
+
"direct_pashto": true,
|
| 641 |
+
"page_pashto": false
|
| 642 |
+
}
|
| 643 |
+
},
|
| 644 |
+
{
|
| 645 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 646 |
+
"id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
|
| 647 |
+
"title": "pashto_language_alphabets",
|
| 648 |
+
"url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 649 |
+
"reasons": [
|
| 650 |
+
"URL returned hard-missing HTTP status 404."
|
| 651 |
+
],
|
| 652 |
+
"evidence": {
|
| 653 |
+
"status_code": 404,
|
| 654 |
+
"final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 655 |
+
"metadata_pashto": true,
|
| 656 |
+
"direct_pashto": true,
|
| 657 |
+
"page_pashto": false
|
| 658 |
+
}
|
| 659 |
+
},
|
| 660 |
+
{
|
| 661 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 662 |
+
"id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
|
| 663 |
+
"title": "Pashto_language_characters",
|
| 664 |
+
"url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 665 |
+
"reasons": [
|
| 666 |
+
"URL returned hard-missing HTTP status 404."
|
| 667 |
+
],
|
| 668 |
+
"evidence": {
|
| 669 |
+
"status_code": 404,
|
| 670 |
+
"final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 671 |
+
"metadata_pashto": true,
|
| 672 |
+
"direct_pashto": true,
|
| 673 |
+
"page_pashto": false
|
| 674 |
+
}
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 678 |
+
"id": "candidate-kaggle-dataset-drijaz-pashtoocr",
|
| 679 |
+
"title": "PashtoOCR",
|
| 680 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 681 |
+
"reasons": [
|
| 682 |
+
"URL returned hard-missing HTTP status 404."
|
| 683 |
+
],
|
| 684 |
+
"evidence": {
|
| 685 |
+
"status_code": 404,
|
| 686 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 687 |
+
"metadata_pashto": true,
|
| 688 |
+
"direct_pashto": true,
|
| 689 |
+
"page_pashto": false
|
| 690 |
+
}
|
| 691 |
+
},
|
| 692 |
+
{
|
| 693 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 694 |
+
"id": "candidate-kaggle-dataset-drijaz-pold-pashto-offensive-language-dataset",
|
| 695 |
+
"title": "POLD - Pashto Offensive Language Dataset",
|
| 696 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 697 |
+
"reasons": [
|
| 698 |
+
"URL returned hard-missing HTTP status 404."
|
| 699 |
+
],
|
| 700 |
+
"evidence": {
|
| 701 |
+
"status_code": 404,
|
| 702 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 703 |
+
"metadata_pashto": true,
|
| 704 |
+
"direct_pashto": true,
|
| 705 |
+
"page_pashto": false
|
| 706 |
+
}
|
| 707 |
+
},
|
| 708 |
+
{
|
| 709 |
+
"removed_on": "2026-02-21T20:13:47.457104+00:00",
|
| 710 |
+
"id": "candidate-kaggle-dataset-shafeeqgigyani-urdu-pashto-lexicon-dataset",
|
| 711 |
+
"title": "Urdu-Pashto Lexicon Dataset",
|
| 712 |
+
"url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 713 |
+
"reasons": [
|
| 714 |
+
"URL returned hard-missing HTTP status 404."
|
| 715 |
+
],
|
| 716 |
+
"evidence": {
|
| 717 |
+
"status_code": 404,
|
| 718 |
+
"final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 719 |
+
"metadata_pashto": true,
|
| 720 |
+
"direct_pashto": true,
|
| 721 |
+
"page_pashto": false
|
| 722 |
+
}
|
| 723 |
+
},
|
| 724 |
+
{
|
| 725 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 726 |
+
"id": "candidate-kaggle-dataset-ataullahaali-common-voice-24-0-pashto-speech-dataset",
|
| 727 |
+
"title": "Common Voice 24.0: Pashto Speech Dataset",
|
| 728 |
+
"url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 729 |
+
"reasons": [
|
| 730 |
+
"URL returned hard-missing HTTP status 404."
|
| 731 |
+
],
|
| 732 |
+
"evidence": {
|
| 733 |
+
"status_code": 404,
|
| 734 |
+
"final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 735 |
+
"metadata_pashto": true,
|
| 736 |
+
"direct_pashto": true,
|
| 737 |
+
"page_pashto": false
|
| 738 |
+
}
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 742 |
+
"id": "candidate-kaggle-dataset-rabiakhan827-english-pashto-language-dataset-epld",
|
| 743 |
+
"title": "English-Pashto Language Dataset (EPLD)",
|
| 744 |
+
"url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 745 |
+
"reasons": [
|
| 746 |
+
"URL returned hard-missing HTTP status 404."
|
| 747 |
+
],
|
| 748 |
+
"evidence": {
|
| 749 |
+
"status_code": 404,
|
| 750 |
+
"final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 751 |
+
"metadata_pashto": true,
|
| 752 |
+
"direct_pashto": true,
|
| 753 |
+
"page_pashto": false
|
| 754 |
+
}
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 758 |
+
"id": "candidate-kaggle-dataset-hassanamin-katib-s-pashto-text-imagebase-kpti",
|
| 759 |
+
"title": "Katib's Pashto Text Imagebase (KPTI)",
|
| 760 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 761 |
+
"reasons": [
|
| 762 |
+
"URL returned hard-missing HTTP status 404."
|
| 763 |
+
],
|
| 764 |
+
"evidence": {
|
| 765 |
+
"status_code": 404,
|
| 766 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 767 |
+
"metadata_pashto": true,
|
| 768 |
+
"direct_pashto": true,
|
| 769 |
+
"page_pashto": false
|
| 770 |
+
}
|
| 771 |
+
},
|
| 772 |
+
{
|
| 773 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 774 |
+
"id": "candidate-kaggle-dataset-farhadkhan66-pashto-english-bilingual-sentiment-corpus",
|
| 775 |
+
"title": "Pashto English Bilingual Sentiment Corpus",
|
| 776 |
+
"url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 777 |
+
"reasons": [
|
| 778 |
+
"URL returned hard-missing HTTP status 404."
|
| 779 |
+
],
|
| 780 |
+
"evidence": {
|
| 781 |
+
"status_code": 404,
|
| 782 |
+
"final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 783 |
+
"metadata_pashto": true,
|
| 784 |
+
"direct_pashto": true,
|
| 785 |
+
"page_pashto": false
|
| 786 |
+
}
|
| 787 |
+
},
|
| 788 |
+
{
|
| 789 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 790 |
+
"id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
|
| 791 |
+
"title": "Pashto Isolated Alphabets and Numerals",
|
| 792 |
+
"url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 793 |
+
"reasons": [
|
| 794 |
+
"URL returned hard-missing HTTP status 404."
|
| 795 |
+
],
|
| 796 |
+
"evidence": {
|
| 797 |
+
"status_code": 404,
|
| 798 |
+
"final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 799 |
+
"metadata_pashto": true,
|
| 800 |
+
"direct_pashto": true,
|
| 801 |
+
"page_pashto": false
|
| 802 |
+
}
|
| 803 |
+
},
|
| 804 |
+
{
|
| 805 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 806 |
+
"id": "candidate-kaggle-dataset-engrirf-pashto-isolated-words-speech-dataset",
|
| 807 |
+
"title": "Pashto Isolated Words Speech Dataset",
|
| 808 |
+
"url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 809 |
+
"reasons": [
|
| 810 |
+
"URL returned hard-missing HTTP status 404."
|
| 811 |
+
],
|
| 812 |
+
"evidence": {
|
| 813 |
+
"status_code": 404,
|
| 814 |
+
"final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 815 |
+
"metadata_pashto": true,
|
| 816 |
+
"direct_pashto": true,
|
| 817 |
+
"page_pashto": false
|
| 818 |
+
}
|
| 819 |
+
},
|
| 820 |
+
{
|
| 821 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 822 |
+
"id": "candidate-kaggle-dataset-hassanamin-pashto-ocr",
|
| 823 |
+
"title": "Pashto OCR",
|
| 824 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 825 |
+
"reasons": [
|
| 826 |
+
"URL returned hard-missing HTTP status 404."
|
| 827 |
+
],
|
| 828 |
+
"evidence": {
|
| 829 |
+
"status_code": 404,
|
| 830 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 831 |
+
"metadata_pashto": true,
|
| 832 |
+
"direct_pashto": true,
|
| 833 |
+
"page_pashto": false
|
| 834 |
+
}
|
| 835 |
+
},
|
| 836 |
+
{
|
| 837 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 838 |
+
"id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
|
| 839 |
+
"title": "Pashto Poetry",
|
| 840 |
+
"url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 841 |
+
"reasons": [
|
| 842 |
+
"URL returned hard-missing HTTP status 404."
|
| 843 |
+
],
|
| 844 |
+
"evidence": {
|
| 845 |
+
"status_code": 404,
|
| 846 |
+
"final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 847 |
+
"metadata_pashto": true,
|
| 848 |
+
"direct_pashto": true,
|
| 849 |
+
"page_pashto": false
|
| 850 |
+
}
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 854 |
+
"id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
|
| 855 |
+
"title": "Pashto text characters sample",
|
| 856 |
+
"url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 857 |
+
"reasons": [
|
| 858 |
+
"URL returned hard-missing HTTP status 404."
|
| 859 |
+
],
|
| 860 |
+
"evidence": {
|
| 861 |
+
"status_code": 404,
|
| 862 |
+
"final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 863 |
+
"metadata_pashto": true,
|
| 864 |
+
"direct_pashto": true,
|
| 865 |
+
"page_pashto": false
|
| 866 |
+
}
|
| 867 |
+
},
|
| 868 |
+
{
|
| 869 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 870 |
+
"id": "candidate-kaggle-dataset-drijaz-pashto-word-embeddings",
|
| 871 |
+
"title": "Pashto Word Embeddings",
|
| 872 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 873 |
+
"reasons": [
|
| 874 |
+
"URL returned hard-missing HTTP status 404."
|
| 875 |
+
],
|
| 876 |
+
"evidence": {
|
| 877 |
+
"status_code": 404,
|
| 878 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 879 |
+
"metadata_pashto": true,
|
| 880 |
+
"direct_pashto": true,
|
| 881 |
+
"page_pashto": false
|
| 882 |
+
}
|
| 883 |
+
},
|
| 884 |
+
{
|
| 885 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 886 |
+
"id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
|
| 887 |
+
"title": "pashto_language_alphabets",
|
| 888 |
+
"url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 889 |
+
"reasons": [
|
| 890 |
+
"URL returned hard-missing HTTP status 404."
|
| 891 |
+
],
|
| 892 |
+
"evidence": {
|
| 893 |
+
"status_code": 404,
|
| 894 |
+
"final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 895 |
+
"metadata_pashto": true,
|
| 896 |
+
"direct_pashto": true,
|
| 897 |
+
"page_pashto": false
|
| 898 |
+
}
|
| 899 |
+
},
|
| 900 |
+
{
|
| 901 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 902 |
+
"id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
|
| 903 |
+
"title": "Pashto_language_characters",
|
| 904 |
+
"url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 905 |
+
"reasons": [
|
| 906 |
+
"URL returned hard-missing HTTP status 404."
|
| 907 |
+
],
|
| 908 |
+
"evidence": {
|
| 909 |
+
"status_code": 404,
|
| 910 |
+
"final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 911 |
+
"metadata_pashto": true,
|
| 912 |
+
"direct_pashto": true,
|
| 913 |
+
"page_pashto": false
|
| 914 |
+
}
|
| 915 |
+
},
|
| 916 |
+
{
|
| 917 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 918 |
+
"id": "candidate-kaggle-dataset-drijaz-pashtoocr",
|
| 919 |
+
"title": "PashtoOCR",
|
| 920 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 921 |
+
"reasons": [
|
| 922 |
+
"URL returned hard-missing HTTP status 404."
|
| 923 |
+
],
|
| 924 |
+
"evidence": {
|
| 925 |
+
"status_code": 404,
|
| 926 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 927 |
+
"metadata_pashto": true,
|
| 928 |
+
"direct_pashto": true,
|
| 929 |
+
"page_pashto": false
|
| 930 |
+
}
|
| 931 |
+
},
|
| 932 |
+
{
|
| 933 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 934 |
+
"id": "candidate-kaggle-dataset-drijaz-pold-pashto-offensive-language-dataset",
|
| 935 |
+
"title": "POLD - Pashto Offensive Language Dataset",
|
| 936 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 937 |
+
"reasons": [
|
| 938 |
+
"URL returned hard-missing HTTP status 404."
|
| 939 |
+
],
|
| 940 |
+
"evidence": {
|
| 941 |
+
"status_code": 404,
|
| 942 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 943 |
+
"metadata_pashto": true,
|
| 944 |
+
"direct_pashto": true,
|
| 945 |
+
"page_pashto": false
|
| 946 |
+
}
|
| 947 |
+
},
|
| 948 |
+
{
|
| 949 |
+
"removed_on": "2026-02-21T20:27:10.672699+00:00",
|
| 950 |
+
"id": "candidate-kaggle-dataset-shafeeqgigyani-urdu-pashto-lexicon-dataset",
|
| 951 |
+
"title": "Urdu-Pashto Lexicon Dataset",
|
| 952 |
+
"url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 953 |
+
"reasons": [
|
| 954 |
+
"URL returned hard-missing HTTP status 404."
|
| 955 |
+
],
|
| 956 |
+
"evidence": {
|
| 957 |
+
"status_code": 404,
|
| 958 |
+
"final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 959 |
+
"metadata_pashto": true,
|
| 960 |
+
"direct_pashto": true,
|
| 961 |
+
"page_pashto": false
|
| 962 |
+
}
|
| 963 |
+
},
|
| 964 |
+
{
|
| 965 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 966 |
+
"id": "dataset-kaggle-pashto-isolated-words",
|
| 967 |
+
"title": "Pashto Isolated Words Speech Dataset",
|
| 968 |
+
"url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 969 |
+
"reasons": [
|
| 970 |
+
"URL returned hard-missing HTTP status 404."
|
| 971 |
+
],
|
| 972 |
+
"evidence": {
|
| 973 |
+
"status_code": 404,
|
| 974 |
+
"final_url": "https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset",
|
| 975 |
+
"metadata_pashto": true,
|
| 976 |
+
"direct_pashto": true,
|
| 977 |
+
"page_pashto": false
|
| 978 |
+
}
|
| 979 |
+
},
|
| 980 |
+
{
|
| 981 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 982 |
+
"id": "dataset-kaggle-pashto-word-embeddings",
|
| 983 |
+
"title": "Pashto Word Embeddings",
|
| 984 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 985 |
+
"reasons": [
|
| 986 |
+
"URL returned hard-missing HTTP status 404."
|
| 987 |
+
],
|
| 988 |
+
"evidence": {
|
| 989 |
+
"status_code": 404,
|
| 990 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings",
|
| 991 |
+
"metadata_pashto": true,
|
| 992 |
+
"direct_pashto": true,
|
| 993 |
+
"page_pashto": false
|
| 994 |
+
}
|
| 995 |
+
},
|
| 996 |
+
{
|
| 997 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 998 |
+
"id": "dataset-kaggle-pold-pashto-offensive",
|
| 999 |
+
"title": "POLD - Pashto Offensive Language Dataset",
|
| 1000 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 1001 |
+
"reasons": [
|
| 1002 |
+
"URL returned hard-missing HTTP status 404."
|
| 1003 |
+
],
|
| 1004 |
+
"evidence": {
|
| 1005 |
+
"status_code": 404,
|
| 1006 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 1007 |
+
"metadata_pashto": true,
|
| 1008 |
+
"direct_pashto": true,
|
| 1009 |
+
"page_pashto": false
|
| 1010 |
+
}
|
| 1011 |
+
},
|
| 1012 |
+
{
|
| 1013 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1014 |
+
"id": "dataset-kaggle-pashto-english-sentiment-corpus",
|
| 1015 |
+
"title": "Pashto English Bilingual Sentiment Corpus",
|
| 1016 |
+
"url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 1017 |
+
"reasons": [
|
| 1018 |
+
"URL returned hard-missing HTTP status 404."
|
| 1019 |
+
],
|
| 1020 |
+
"evidence": {
|
| 1021 |
+
"status_code": 404,
|
| 1022 |
+
"final_url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 1023 |
+
"metadata_pashto": true,
|
| 1024 |
+
"direct_pashto": true,
|
| 1025 |
+
"page_pashto": false
|
| 1026 |
+
}
|
| 1027 |
+
},
|
| 1028 |
+
{
|
| 1029 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1030 |
+
"id": "dataset-kaggle-urdu-pashto-lexicon",
|
| 1031 |
+
"title": "Urdu-Pashto Lexicon Dataset",
|
| 1032 |
+
"url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 1033 |
+
"reasons": [
|
| 1034 |
+
"URL returned hard-missing HTTP status 404."
|
| 1035 |
+
],
|
| 1036 |
+
"evidence": {
|
| 1037 |
+
"status_code": 404,
|
| 1038 |
+
"final_url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 1039 |
+
"metadata_pashto": true,
|
| 1040 |
+
"direct_pashto": true,
|
| 1041 |
+
"page_pashto": false
|
| 1042 |
+
}
|
| 1043 |
+
},
|
| 1044 |
+
{
|
| 1045 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1046 |
+
"id": "dataset-kaggle-drijaz-pashtoocr",
|
| 1047 |
+
"title": "PashtoOCR (Kaggle)",
|
| 1048 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 1049 |
+
"reasons": [
|
| 1050 |
+
"URL returned hard-missing HTTP status 404."
|
| 1051 |
+
],
|
| 1052 |
+
"evidence": {
|
| 1053 |
+
"status_code": 404,
|
| 1054 |
+
"final_url": "https://www.kaggle.com/datasets/drijaz/pashtoocr",
|
| 1055 |
+
"metadata_pashto": true,
|
| 1056 |
+
"direct_pashto": true,
|
| 1057 |
+
"page_pashto": false
|
| 1058 |
+
}
|
| 1059 |
+
},
|
| 1060 |
+
{
|
| 1061 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1062 |
+
"id": "dataset-kaggle-english-pashto-language-dataset-epld",
|
| 1063 |
+
"title": "English-Pashto Language Dataset (EPLD)",
|
| 1064 |
+
"url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 1065 |
+
"reasons": [
|
| 1066 |
+
"URL returned hard-missing HTTP status 404."
|
| 1067 |
+
],
|
| 1068 |
+
"evidence": {
|
| 1069 |
+
"status_code": 404,
|
| 1070 |
+
"final_url": "https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld",
|
| 1071 |
+
"metadata_pashto": true,
|
| 1072 |
+
"direct_pashto": true,
|
| 1073 |
+
"page_pashto": false
|
| 1074 |
+
}
|
| 1075 |
+
},
|
| 1076 |
+
{
|
| 1077 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1078 |
+
"id": "dataset-kaggle-katib-s-pashto-text-imagebase-kpti",
|
| 1079 |
+
"title": "Katib's Pashto Text Imagebase (KPTI)",
|
| 1080 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 1081 |
+
"reasons": [
|
| 1082 |
+
"URL returned hard-missing HTTP status 404."
|
| 1083 |
+
],
|
| 1084 |
+
"evidence": {
|
| 1085 |
+
"status_code": 404,
|
| 1086 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti",
|
| 1087 |
+
"metadata_pashto": true,
|
| 1088 |
+
"direct_pashto": true,
|
| 1089 |
+
"page_pashto": false
|
| 1090 |
+
}
|
| 1091 |
+
},
|
| 1092 |
+
{
|
| 1093 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1094 |
+
"id": "dataset-kaggle-pashto-ocr",
|
| 1095 |
+
"title": "Pashto OCR",
|
| 1096 |
+
"url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 1097 |
+
"reasons": [
|
| 1098 |
+
"URL returned hard-missing HTTP status 404."
|
| 1099 |
+
],
|
| 1100 |
+
"evidence": {
|
| 1101 |
+
"status_code": 404,
|
| 1102 |
+
"final_url": "https://www.kaggle.com/datasets/hassanamin/pashto-ocr",
|
| 1103 |
+
"metadata_pashto": true,
|
| 1104 |
+
"direct_pashto": true,
|
| 1105 |
+
"page_pashto": false
|
| 1106 |
+
}
|
| 1107 |
+
},
|
| 1108 |
+
{
|
| 1109 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1110 |
+
"id": "dataset-kaggle-common-voice-24-0-pashto-speech-dataset",
|
| 1111 |
+
"title": "Common Voice 24.0: Pashto Speech Dataset",
|
| 1112 |
+
"url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 1113 |
+
"reasons": [
|
| 1114 |
+
"URL returned hard-missing HTTP status 404."
|
| 1115 |
+
],
|
| 1116 |
+
"evidence": {
|
| 1117 |
+
"status_code": 404,
|
| 1118 |
+
"final_url": "https://www.kaggle.com/datasets/ataullahaali/common-voice-scripted-speech-24-0-pashto",
|
| 1119 |
+
"metadata_pashto": true,
|
| 1120 |
+
"direct_pashto": true,
|
| 1121 |
+
"page_pashto": false
|
| 1122 |
+
}
|
| 1123 |
+
},
|
| 1124 |
+
{
|
| 1125 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1126 |
+
"id": "candidate-kaggle-dataset-abdulbasitkh-pashto-isolated-alphabets-and-numerals",
|
| 1127 |
+
"title": "Pashto Isolated Alphabets and Numerals",
|
| 1128 |
+
"url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 1129 |
+
"reasons": [
|
| 1130 |
+
"URL returned hard-missing HTTP status 404."
|
| 1131 |
+
],
|
| 1132 |
+
"evidence": {
|
| 1133 |
+
"status_code": 404,
|
| 1134 |
+
"final_url": "https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals",
|
| 1135 |
+
"metadata_pashto": true,
|
| 1136 |
+
"direct_pashto": true,
|
| 1137 |
+
"page_pashto": false
|
| 1138 |
+
}
|
| 1139 |
+
},
|
| 1140 |
+
{
|
| 1141 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1142 |
+
"id": "candidate-kaggle-dataset-alimuhammadasad-pashto-poetry",
|
| 1143 |
+
"title": "Pashto Poetry",
|
| 1144 |
+
"url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 1145 |
+
"reasons": [
|
| 1146 |
+
"URL returned hard-missing HTTP status 404."
|
| 1147 |
+
],
|
| 1148 |
+
"evidence": {
|
| 1149 |
+
"status_code": 404,
|
| 1150 |
+
"final_url": "https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry",
|
| 1151 |
+
"metadata_pashto": true,
|
| 1152 |
+
"direct_pashto": true,
|
| 1153 |
+
"page_pashto": false
|
| 1154 |
+
}
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1158 |
+
"id": "candidate-kaggle-dataset-mahibullahmudaser-pashto-text-characters-sample",
|
| 1159 |
+
"title": "Pashto text characters sample",
|
| 1160 |
+
"url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 1161 |
+
"reasons": [
|
| 1162 |
+
"URL returned hard-missing HTTP status 404."
|
| 1163 |
+
],
|
| 1164 |
+
"evidence": {
|
| 1165 |
+
"status_code": 404,
|
| 1166 |
+
"final_url": "https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample",
|
| 1167 |
+
"metadata_pashto": true,
|
| 1168 |
+
"direct_pashto": true,
|
| 1169 |
+
"page_pashto": false
|
| 1170 |
+
}
|
| 1171 |
+
},
|
| 1172 |
+
{
|
| 1173 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1174 |
+
"id": "candidate-kaggle-dataset-ahmadferozafshar-pashto-language-alphabets",
|
| 1175 |
+
"title": "pashto_language_alphabets",
|
| 1176 |
+
"url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 1177 |
+
"reasons": [
|
| 1178 |
+
"URL returned hard-missing HTTP status 404."
|
| 1179 |
+
],
|
| 1180 |
+
"evidence": {
|
| 1181 |
+
"status_code": 404,
|
| 1182 |
+
"final_url": "https://www.kaggle.com/datasets/ahmadferozafshar/pashto-language-alphabets",
|
| 1183 |
+
"metadata_pashto": true,
|
| 1184 |
+
"direct_pashto": true,
|
| 1185 |
+
"page_pashto": false
|
| 1186 |
+
}
|
| 1187 |
+
},
|
| 1188 |
+
{
|
| 1189 |
+
"removed_on": "2026-02-21T20:47:45.952635+00:00",
|
| 1190 |
+
"id": "candidate-kaggle-dataset-aimalrezvan-pashto-language-characters",
|
| 1191 |
+
"title": "Pashto_language_characters",
|
| 1192 |
+
"url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 1193 |
+
"reasons": [
|
| 1194 |
+
"URL returned hard-missing HTTP status 404."
|
| 1195 |
+
],
|
| 1196 |
+
"evidence": {
|
| 1197 |
+
"status_code": 404,
|
| 1198 |
+
"final_url": "https://www.kaggle.com/datasets/aimalrezvan/pashto-language-characters",
|
| 1199 |
+
"metadata_pashto": true,
|
| 1200 |
+
"direct_pashto": true,
|
| 1201 |
+
"page_pashto": false
|
| 1202 |
+
}
|
| 1203 |
+
}
|
| 1204 |
+
]
|
| 1205 |
+
}
|
resources/catalog/resources.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
resources/codes/README.md
CHANGED
|
@@ -4,7 +4,10 @@
|
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
|
|
|
|
|
|
| 7 |
| nlpashto Toolkit | [github](https://github.com/ijazul-haq/nlpashto) | [Repository name and description explicitly identify a Pashto NLP toolkit. (`Pashto`, `NLP`)](https://api.github.com/repos/ijazul-haq/nlpashto) | Pashto NLP code integration and experimentation |
|
|
|
|
| 8 |
|
| 9 |
## Maintenance
|
| 10 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
|
|
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
+
| LGUG2Z/tashkil | [github](https://github.com/LGUG2Z/tashkil) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/LGUG2Z/tashkil) | Automated discovery entry for Pashto resource tracking. |
|
| 8 |
+
| mrychlik/worldly-ocr | [github](https://github.com/mrychlik/worldly-ocr) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/mrychlik/worldly-ocr) | Automated discovery entry for Pashto resource tracking. |
|
| 9 |
| nlpashto Toolkit | [github](https://github.com/ijazul-haq/nlpashto) | [Repository name and description explicitly identify a Pashto NLP toolkit. (`Pashto`, `NLP`)](https://api.github.com/repos/ijazul-haq/nlpashto) | Pashto NLP code integration and experimentation |
|
| 10 |
+
| sinaahmadi/PersoArabicLID | [github](https://github.com/sinaahmadi/PersoArabicLID) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/sinaahmadi/PersoArabicLID) | Automated discovery entry for Pashto resource tracking. |
|
| 11 |
|
| 12 |
## Maintenance
|
| 13 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
resources/datasets/README.md
CHANGED
|
@@ -14,10 +14,9 @@
|
|
| 14 |
| arsalagrey/pashto-books | [huggingface](https://huggingface.co/datasets/arsalagrey/pashto-books) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/arsalagrey/pashto-books) | Automated discovery entry for Pashto resource tracking. |
|
| 15 |
| arsalagrey/pashto-books-json | [huggingface](https://huggingface.co/datasets/arsalagrey/pashto-books-json) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/arsalagrey/pashto-books-json) | Automated discovery entry for Pashto resource tracking. |
|
| 16 |
| Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
|
| 17 |
-
|
|
| 18 |
| Common Voice Scripted Speech 24.0 - Pashto | [mozilla](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | [Official dataset page is for Pashto. (`Pashto`)](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | ASR training and evaluation |
|
| 19 |
| English to Pashto Sentences Dataset | [huggingface](https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset) | [Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column. (`Pashto`)](https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset) | MT and bilingual sentence alignment baseline |
|
| 20 |
-
| English-Pashto Language Dataset (EPLD) | [kaggle](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/rabiakhan827/english-pashto-language-dataset-epld) | Machine translation and bilingual corpus development |
|
| 21 |
| Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
|
| 22 |
| IARPA Babel Pashto Language Pack IARPA-babel104b-v0.4bY | [dataverse](https://hdl.handle.net/11272.1/AB2/GLFN3X) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/GLFN3X) | Pashto speech dataset for ASR and language identification experiments |
|
| 23 |
| ihanif/pashto_asr_wer | [huggingface](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | ASR training and evaluation data source |
|
|
@@ -27,29 +26,28 @@
|
|
| 27 |
| ihanif/pashto_speech_5k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | ASR training and evaluation data source |
|
| 28 |
| ihanif/pashto_speech_ds | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | ASR training and evaluation data source |
|
| 29 |
| ihanif/pashto_speech_parquet_10k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | ASR training and evaluation data source |
|
| 30 |
-
| Katib's Pashto Text Imagebase (KPTI) | [kaggle](https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/hassanamin/katibs-pashto-text-imagebase-kpti) | OCR training and evaluation data source |
|
| 31 |
| koochikoo25/Pashto-Concatenated | [huggingface](https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated) | [Dataset title explicitly states Pashto and card metadata exposes audio-text features and splits. (`Pashto`, `audio`, `transcription`)](https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated) | ASR dataset preparation and split-based benchmark experiments |
|
|
|
|
|
|
|
| 32 |
| oowais/pushto-text-to-speech-dataset | [huggingface](https://huggingface.co/datasets/oowais/pushto-text-to-speech-dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/oowais/pushto-text-to-speech-dataset) | ASR training and evaluation data source |
|
| 33 |
| OPED (Open Pashto-English Dictionary): Preliminary version, 30 October 2025 | [zenodo](https://zenodo.org/records/17487678) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/17487678) | Automated discovery entry for Pashto resource tracking. |
|
| 34 |
| OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
|
| 35 |
| OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
|
| 36 |
-
| Pashto English Bilingual Sentiment Corpus | [kaggle](https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus) | [Kaggle dataset title and description identify the corpus as Pashto-English sentiment data. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus) | Sentiment analysis and bilingual NLP experiments |
|
| 37 |
-
| Pashto Isolated Alphabets and Numerals | [kaggle](https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/abdulbasitkh/pashto-isolated-alphabetss-and-numerals) | Automated discovery entry for Pashto resource tracking. |
|
| 38 |
-
| Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
|
| 39 |
-
| Pashto OCR | [kaggle](https://www.kaggle.com/datasets/hassanamin/pashto-ocr) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/hassanamin/pashto-ocr) | OCR training and evaluation data source |
|
| 40 |
-
| Pashto Poetry | [kaggle](https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/alimuhammadasad/pashto-poetry) | Automated discovery entry for Pashto resource tracking. |
|
| 41 |
-
| Pashto text characters sample | [kaggle](https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample) | [Kaggle dataset title/subtitle includes Pashto keyword. (`Pashto`)](https://www.kaggle.com/datasets/mahibullahmudaser/pashtochracterssample) | Automated discovery entry for Pashto resource tracking. |
|
| 42 |
| Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
|
| 43 |
-
|
|
| 44 |
-
|
|
| 45 |
-
|
|
| 46 |
-
|
|
| 47 |
-
| POLD - Pashto Offensive Language Dataset | [kaggle](https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset) | [Kaggle title and description explicitly state Pashto offensive language benchmark dataset. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset) | Pashto toxicity and moderation NLP benchmarks |
|
| 48 |
| saillab/alpaca_pashto_taco | [huggingface](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | Instruction tuning and LLM adaptation data source |
|
| 49 |
| SherwinDesouza/pashto-common-voice-20 | [huggingface](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | Pashto data source for NLP experimentation |
|
|
|
|
| 50 |
| tasal9/Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/Pashto_Dataset) | Pashto data source for NLP experimentation |
|
| 51 |
| tasal9/ZamAI_Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | Pashto data source for NLP experimentation |
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
| Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
|
| 54 |
| Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
|
| 55 |
|
|
|
|
| 14 |
| arsalagrey/pashto-books | [huggingface](https://huggingface.co/datasets/arsalagrey/pashto-books) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/arsalagrey/pashto-books) | Automated discovery entry for Pashto resource tracking. |
|
| 15 |
| arsalagrey/pashto-books-json | [huggingface](https://huggingface.co/datasets/arsalagrey/pashto-books-json) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/arsalagrey/pashto-books-json) | Automated discovery entry for Pashto resource tracking. |
|
| 16 |
| Belebele | [huggingface](https://huggingface.co/datasets/facebook/belebele) | [Dataset includes pbt_Arab subset. (`pbt_Arab`)](https://huggingface.co/datasets/facebook/belebele) | Comprehension and multilingual NLP benchmark |
|
| 17 |
+
| Clitic Particles and the Typology of 2P Languages | [zenodo](https://zenodo.org/records/15010591) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15010591) | Automated discovery entry for Pashto resource tracking. |
|
| 18 |
| Common Voice Scripted Speech 24.0 - Pashto | [mozilla](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | [Official dataset page is for Pashto. (`Pashto`)](https://datacollective.mozillafoundation.org/datasets/cmj8u3pnb00llnxxbfvxo3b14) | ASR training and evaluation |
|
| 19 |
| English to Pashto Sentences Dataset | [huggingface](https://huggingface.co/datasets/adnankhan769/english_to_pashto_sentences_dataset) | [Dataset ID explicitly states English-to-Pashto and includes Pashto-script sentence column. (`Pashto`)](https://huggingface.co/api/datasets/adnankhan769/english_to_pashto_sentences_dataset) | MT and bilingual sentence alignment baseline |
|
|
|
|
| 20 |
| Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
|
| 21 |
| IARPA Babel Pashto Language Pack IARPA-babel104b-v0.4bY | [dataverse](https://hdl.handle.net/11272.1/AB2/GLFN3X) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/GLFN3X) | Pashto speech dataset for ASR and language identification experiments |
|
| 22 |
| ihanif/pashto_asr_wer | [huggingface](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_asr_wer) | ASR training and evaluation data source |
|
|
|
|
| 26 |
| ihanif/pashto_speech_5k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_5k) | ASR training and evaluation data source |
|
| 27 |
| ihanif/pashto_speech_ds | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_ds) | ASR training and evaluation data source |
|
| 28 |
| ihanif/pashto_speech_parquet_10k | [huggingface](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/ihanif/pashto_speech_parquet_10k) | ASR training and evaluation data source |
|
|
|
|
| 29 |
| koochikoo25/Pashto-Concatenated | [huggingface](https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated) | [Dataset title explicitly states Pashto and card metadata exposes audio-text features and splits. (`Pashto`, `audio`, `transcription`)](https://huggingface.co/datasets/koochikoo25/Pashto-Concatenated) | ASR dataset preparation and split-based benchmark experiments |
|
| 30 |
+
| Multi-Language Conversational Telephone Speech 2011 -- Central Asian | [dataverse](https://hdl.handle.net/11272.1/AB2/YW9PX3) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/YW9PX3) | Automated discovery entry for Pashto resource tracking. |
|
| 31 |
+
| NAVOIY-TERRA Corpus v1.0: First Computational Corpus of Alisher Navoi Works with Nine-Language Semantic Annotations | [datacite](https://zenodo.org/doi/10.5281/zenodo.18602634) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18602634) | Automated discovery entry for Pashto resource tracking. |
|
| 32 |
| oowais/pushto-text-to-speech-dataset | [huggingface](https://huggingface.co/datasets/oowais/pushto-text-to-speech-dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/oowais/pushto-text-to-speech-dataset) | ASR training and evaluation data source |
|
| 33 |
| OPED (Open Pashto-English Dictionary): Preliminary version, 30 October 2025 | [zenodo](https://zenodo.org/records/17487678) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/17487678) | Automated discovery entry for Pashto resource tracking. |
|
| 34 |
| OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
|
| 35 |
| OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
| Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
|
| 37 |
+
| RATS Language Identification | [dataverse](https://hdl.handle.net/11272.1/AB2/UP3WJC) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/UP3WJC) | Automated discovery entry for Pashto resource tracking. |
|
| 38 |
+
| RATS Low Speech Density | [dataverse](https://doi.org/10.35111/4ENA-FG30) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://doi.org/10.35111/4ENA-FG30) | Automated discovery entry for Pashto resource tracking. |
|
| 39 |
+
| RATS Speaker Identification | [dataverse](https://doi.org/10.35111/ZQET-2102) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://doi.org/10.35111/ZQET-2102) | Automated discovery entry for Pashto resource tracking. |
|
| 40 |
+
| RATS Speech Activity Detection | [dataverse](https://hdl.handle.net/11272.1/AB2/1UISJ7) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/1UISJ7) | Automated discovery entry for Pashto resource tracking. |
|
|
|
|
| 41 |
| saillab/alpaca_pashto_taco | [huggingface](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/saillab/alpaca_pashto_taco) | Instruction tuning and LLM adaptation data source |
|
| 42 |
| SherwinDesouza/pashto-common-voice-20 | [huggingface](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/SherwinDesouza/pashto-common-voice-20) | Pashto data source for NLP experimentation |
|
| 43 |
+
| SMAjram: A Large-Scale Synthetic OCR Dataset for Punjabi Shahmukhi (Perso-Arabic) Script | [zenodo](https://zenodo.org/records/15868719) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15868719) | Automated discovery entry for Pashto resource tracking. |
|
| 44 |
| tasal9/Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/Pashto_Dataset) | Pashto data source for NLP experimentation |
|
| 45 |
| tasal9/ZamAI_Pashto_Dataset | [huggingface](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/datasets/tasal9/ZamAI_Pashto_Dataset) | Pashto data source for NLP experimentation |
|
| 46 |
+
| Towards a Typology of Endoclitics | [zenodo](https://zenodo.org/records/15041544) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15041544) | Automated discovery entry for Pashto resource tracking. |
|
| 47 |
+
| TRAD Arabic-French Parallel Text -- Newsgroup | [dataverse](https://hdl.handle.net/11272.1/AB2/0DET8M) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/0DET8M) | Automated discovery entry for Pashto resource tracking. |
|
| 48 |
+
| TRAD Arabic-French Parallel Text -- Newswire | [dataverse](https://doi.org/10.35111/Z1WG-9X78) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://doi.org/10.35111/Z1WG-9X78) | Automated discovery entry for Pashto resource tracking. |
|
| 49 |
+
| TRAD Chinese-French Parallel Text -- Blog | [dataverse](https://hdl.handle.net/11272.1/AB2/ATYE6I) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://hdl.handle.net/11272.1/AB2/ATYE6I) | Automated discovery entry for Pashto resource tracking. |
|
| 50 |
+
| TRAD Chinese-French Parallel Text -- Broadcast News | [dataverse](https://doi.org/10.35111/7FW4-EV85) | [Dataverse metadata includes Pashto markers in dataset title or description. (`pashto`)](https://doi.org/10.35111/7FW4-EV85) | Automated discovery entry for Pashto resource tracking. |
|
| 51 |
| Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
|
| 52 |
| Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
|
| 53 |
|
resources/models/README.md
CHANGED
|
@@ -13,6 +13,7 @@
|
|
| 13 |
| ihanif/whisper-small-pashto-dropout | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto-dropout) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto-dropout) | Pashto ASR baseline and model comparison |
|
| 14 |
| ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
|
| 15 |
| ijazulhaq/bert-base-pashto | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto) | Pashto model baseline for downstream NLP tasks |
|
|
|
|
| 16 |
| ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
|
| 17 |
| Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1 | [huggingface](https://huggingface.co/Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1) | Automated discovery entry for Pashto resource tracking. |
|
| 18 |
| koochikoo25/pashto-whisper-large | [huggingface](https://huggingface.co/koochikoo25/pashto-whisper-large) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/koochikoo25/pashto-whisper-large) | Pashto ASR baseline and model comparison |
|
|
|
|
| 13 |
| ihanif/whisper-small-pashto-dropout | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto-dropout) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto-dropout) | Pashto ASR baseline and model comparison |
|
| 14 |
| ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
|
| 15 |
| ijazulhaq/bert-base-pashto | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto) | Pashto model baseline for downstream NLP tasks |
|
| 16 |
+
| ijazulhaq/bert-base-pashto-c | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-c) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-c) | Automated discovery entry for Pashto resource tracking. |
|
| 17 |
| ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
|
| 18 |
| Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1 | [huggingface](https://huggingface.co/Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/Jawaria/wav2vec2-large-xls-r-300m-pashto-colab-final-1) | Automated discovery entry for Pashto resource tracking. |
|
| 19 |
| koochikoo25/pashto-whisper-large | [huggingface](https://huggingface.co/koochikoo25/pashto-whisper-large) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/koochikoo25/pashto-whisper-large) | Pashto ASR baseline and model comparison |
|
resources/papers/README.md
CHANGED
|
@@ -4,30 +4,110 @@
|
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
| Benchmark Pashto Handwritten Character Dataset and Pashto Object Character Recognition (OCR) Using Deep Neural Network with Rule Activation Function | [openalex](https://doi.org/10.1155/2021/6669672) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1155/2021/6669672) | Pashto handwritten OCR benchmark and methodology reference |
|
| 8 |
| Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
| Database development and automatic speech recognition of isolated Pashto spoken digits using MFCC and K-NN | [openalex](https://doi.org/10.1007/s10772-014-9267-z) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1007/s10772-014-9267-z) | Pashto ASR baseline method reference for digit recognition |
|
| 10 |
| Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
|
|
|
|
|
|
|
| 11 |
| Development of a New Image-to-text Conversion System for Pashto, Farsi and Traditional Chinese | [arxiv](http://arxiv.org/abs/2005.08650v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2005.08650v1) | Pashto OCR method reference |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
| Enhancing Pashto NER Using Machine-Labeled Data and Transformer-Based Models | [other](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | Pashto research reference for methods and benchmarking |
|
| 13 |
| Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
| From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset | [arxiv](http://arxiv.org/abs/2602.14062v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2602.14062v1) | ASR data quality and release trend reference |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
| KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
|
|
|
|
| 16 |
| KPTI: Katib's Pashto Text Imagebase and Deep Learning Benchmark | [openalex](https://doi.org/10.1109/icfhr.2016.0090) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icfhr.2016.0090) | Pashto OCR dataset and benchmarking reference |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
| Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
| Pashto isolated digits recognition using deep convolutional neural network | [openalex](https://doi.org/10.1016/j.heliyon.2020.e03372) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1016/j.heliyon.2020.e03372) | Pashto speech recognition research reference |
|
|
|
|
| 19 |
| Pashto offensive language detection: a benchmark dataset and monolingual Pashto BERT | [openalex](https://doi.org/10.7717/peerj-cs.1617) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.7717/peerj-cs.1617) | Pashto NLP toxicity detection benchmark and model reference |
|
|
|
|
|
|
|
|
|
|
| 20 |
| Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
| PHTI: Pashto Handwritten Text Imagebase for Deep Learning Applications | [openalex](https://doi.org/10.1109/access.2022.3216881) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/access.2022.3216881) | Pashto OCR dataset and benchmark reference |
|
| 22 |
| Pioneer dataset and recognition of Handwritten Pashto characters using Convolution Neural Networks | [openalex](https://doi.org/10.1177/0020294020964826) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1177/0020294020964826) | Pashto handwritten character recognition reference |
|
|
|
|
| 23 |
| POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
|
| 24 |
| PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
|
|
|
|
|
|
|
| 25 |
| Recognition of Pashto Handwritten Characters Based on Deep Learning | [openalex](https://doi.org/10.3390/s20205884) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.3390/s20205884) | Pashto OCR model reference for handwritten character recognition |
|
| 26 |
| Recognizable units in Pashto language for OCR | [openalex](https://doi.org/10.1109/icdar.2015.7333963) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333963) | Pashto OCR preprocessing and unit-design reference |
|
| 27 |
| Scale and rotation invariant OCR for Pashto cursive script using MDLSTM network | [openalex](https://doi.org/10.1109/icdar.2015.7333931) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333931) | Pashto OCR model architecture reference |
|
|
|
|
|
|
|
| 28 |
| Shape analysis of Pashto script and creation of image database for OCR | [openalex](https://doi.org/10.1109/icet.2009.5353160) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icet.2009.5353160) | Pashto OCR dataset design and feature reference |
|
|
|
|
| 29 |
| Speech translation for low-resource languages: the case of Pashto | [openalex](https://doi.org/10.21437/interspeech.2005-723) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.21437/interspeech.2005-723) | Pashto speech translation and low-resource MT reference |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
| Tuning Traditional Language Processing Approaches for Pashto Text Classification | [arxiv](http://arxiv.org/abs/2305.03737v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2305.03737v1) | Pashto text classification method reference |
|
|
|
|
| 31 |
|
| 32 |
## Maintenance
|
| 33 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
|
|
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
+
| (Pushto) Pakhto Nasar Kay Da Matbooa Tarjumo Yova Tanqeedi Mutala/Jaiza. | [other](https://www.semanticscholar.org/paper/0da0e8535262d1f26f04dd6bc2f091474cab4150) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/0da0e8535262d1f26f04dd6bc2f091474cab4150) | Automated discovery entry for Pashto resource tracking. |
|
| 8 |
+
| A Comparative Analysis of Pashto Ghazals and English Sonnets in 17th Century | [other](https://www.semanticscholar.org/paper/55b044485b2f134c69c9b9b6dfeaa7e71e704b3d) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/55b044485b2f134c69c9b9b6dfeaa7e71e704b3d) | Automated discovery entry for Pashto resource tracking. |
|
| 9 |
+
| A Dictionary of the Pukhto, Pushto, or Language of the Afghans | [other](https://www.semanticscholar.org/paper/777c0aa56991f55826339915363de2ceb8dd7141) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/777c0aa56991f55826339915363de2ceb8dd7141) | Automated discovery entry for Pashto resource tracking. |
|
| 10 |
+
| A dictionary of the Pukhto, Pushto, or language of the Afghans; with remarks on the originality of the language, and its affinity to the Semitic and other Oriental tongues, etc. | [other](https://www.semanticscholar.org/paper/d12502a6c245ff6f537bf68d9db4b449dca827bb) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/d12502a6c245ff6f537bf68d9db4b449dca827bb) | Automated discovery entry for Pashto resource tracking. |
|
| 11 |
+
| A grammar of the Puk̲h̲to or Pus̲'h̲to language | [other](https://www.semanticscholar.org/paper/99c46409a55ac0bf68e2c530a377becfcb46dd47) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/99c46409a55ac0bf68e2c530a377becfcb46dd47) | Automated discovery entry for Pashto resource tracking. |
|
| 12 |
+
| A New Etymological Vocabulary of Pashto | [openalex](https://openalex.org/W2071464713) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://openalex.org/W2071464713) | Automated discovery entry for Pashto resource tracking. |
|
| 13 |
+
| A reference grammar of Pashto | [openalex](http://wals.info/refdb/record/7189) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](http://wals.info/refdb/record/7189) | Automated discovery entry for Pashto resource tracking. |
|
| 14 |
+
| An Acoustic Analysis of consonants of Khattak Dialect of Pashto | [other](https://www.semanticscholar.org/paper/ed06d206e60a62c2bebdd487b4f8dea253a9a0a8) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/ed06d206e60a62c2bebdd487b4f8dea253a9a0a8) | Automated discovery entry for Pashto resource tracking. |
|
| 15 |
+
| AN ANALYSIS OF FREUDIAN CONCEPT OF MOURNING IN PASHTO TAPPAS ON THE THEME OF MIGRATION | [zenodo](https://zenodo.org/records/11124039) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/11124039) | Automated discovery entry for Pashto resource tracking. |
|
| 16 |
+
| An Analysis of the Syntactic and Pragmatic Effects on Word Order Flexibility in Pashto and English | [other](https://www.semanticscholar.org/paper/136c23f176399f7dfc45e6ae990a975aafd7da1d) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/136c23f176399f7dfc45e6ae990a975aafd7da1d) | Automated discovery entry for Pashto resource tracking. |
|
| 17 |
+
| Analysing Deep Meaning of Proverbs in Pashto Language | [other](https://www.semanticscholar.org/paper/1a804a9701c5103ed38df3350da61abdf5df2b57) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/1a804a9701c5103ed38df3350da61abdf5df2b57) | Automated discovery entry for Pashto resource tracking. |
|
| 18 |
| Benchmark Pashto Handwritten Character Dataset and Pashto Object Character Recognition (OCR) Using Deep Neural Network with Rule Activation Function | [openalex](https://doi.org/10.1155/2021/6669672) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1155/2021/6669672) | Pashto handwritten OCR benchmark and methodology reference |
|
| 19 |
| Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
|
| 20 |
+
| CER-HV: A CER-Based Human-in-the-Loop Framework for Cleaning Datasets Applied to Arabic-Script HTR | [datacite](https://arxiv.org/abs/2601.16713) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://arxiv.org/abs/2601.16713) | Automated discovery entry for Pashto resource tracking. |
|
| 21 |
+
| CHALLENGING GENDER ROLES: A FEMINIST ANALYSIS OF GHANI KHAN'S THE PATHANS | [zenodo](https://zenodo.org/records/11216862) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/11216862) | Automated discovery entry for Pashto resource tracking. |
|
| 22 |
+
| Cinematic Misnomers: Examining the Effects of Pashto Movie Titles on the Perception of Pashtun Identity | [other](https://www.semanticscholar.org/paper/1b4c38ce4ceb6ac7846062bb589351cc88a36617) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/1b4c38ce4ceb6ac7846062bb589351cc88a36617) | Automated discovery entry for Pashto resource tracking. |
|
| 23 |
+
| Comparative Study of Adjectives in Pashto and Dari as Cognate Languages | [other](https://www.semanticscholar.org/paper/558e9dd7d4027be391a39f5e5ef988cf05039dc7) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/558e9dd7d4027be391a39f5e5ef988cf05039dc7) | Automated discovery entry for Pashto resource tracking. |
|
| 24 |
+
| Comprehensive Socio-phonetic Study of the Plosive /p/ and Fricative /f/ Merger among Pashto Speakers in Khyber Pakhtunkhwa | [other](https://www.semanticscholar.org/paper/4f01f2250c897dc53099f76a2455471b480f22cf) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/4f01f2250c897dc53099f76a2455471b480f22cf) | Automated discovery entry for Pashto resource tracking. |
|
| 25 |
+
| Critical study of the travelogues of Dr Altaf Yousafzai (In The Context of "Thailand kay Rang", "Nile kay Sang" and "Bakhal-e-Hinduwush Bakhsham") | [zenodo](https://zenodo.org/records/13937101) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/13937101) | Automated discovery entry for Pashto resource tracking. |
|
| 26 |
| Database development and automatic speech recognition of isolated Pashto spoken digits using MFCC and K-NN | [openalex](https://doi.org/10.1007/s10772-014-9267-z) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1007/s10772-014-9267-z) | Pashto ASR baseline method reference for digit recognition |
|
| 27 |
| Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
|
| 28 |
+
| Depiction of Women's Cries in Pashto Landai Poetry | [zenodo](https://zenodo.org/records/15524281) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15524281) | Automated discovery entry for Pashto resource tracking. |
|
| 29 |
+
| Descriptive Grammar of Pashto and its Dialects | [openalex](https://doi.org/10.1515/9781614512318) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1515/9781614512318) | Automated discovery entry for Pashto resource tracking. |
|
| 30 |
| Development of a New Image-to-text Conversion System for Pashto, Farsi and Traditional Chinese | [arxiv](http://arxiv.org/abs/2005.08650v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2005.08650v1) | Pashto OCR method reference |
|
| 31 |
+
| Divorce And Women’s Rights: Should Women have Equal Rights? (Pashto) | [datacite](https://opendata.uni-halle.de/handle/1981185920/123991) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de/handle/1981185920/123991) | Automated discovery entry for Pashto resource tracking. |
|
| 32 |
+
| Doing Pashto | [crossref](https://doi.org/10.1080/02690055.2015.1068987) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.1080/02690055.2015.1068987) | Automated discovery entry for Pashto resource tracking. |
|
| 33 |
+
| Editorial Note (Dari) | [datacite](https://opendata.uni-halle.de/handle/1981185920/123994) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de/handle/1981185920/123994) | Automated discovery entry for Pashto resource tracking. |
|
| 34 |
+
| Editorial Note (English) | [datacite](https://opendata.uni-halle.de/handle/1981185920/123993) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de/handle/1981185920/123993) | Automated discovery entry for Pashto resource tracking. |
|
| 35 |
+
| Editorial Note (Pashto) | [datacite](https://opendata.uni-halle.de/handle/1981185920/123995) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de/handle/1981185920/123995) | Automated discovery entry for Pashto resource tracking. |
|
| 36 |
+
| EDUCATIONAL AND LINGUISTIC ASPECTS OF TEXT PREPROCESSING IN PASHTO | [zenodo](https://zenodo.org/records/15917449) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15917449) | Automated discovery entry for Pashto resource tracking. |
|
| 37 |
+
| Embedding Elements from Foreign Language into The Native Language Through Pashto-English Code-Mixed Speech | [zenodo](https://zenodo.org/records/14756994) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/14756994) | Automated discovery entry for Pashto resource tracking. |
|
| 38 |
| Enhancing Pashto NER Using Machine-Labeled Data and Transformer-Based Models | [other](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/be851ecf9197ef9bb8bf764abf4db0dda95cd9da) | Pashto research reference for methods and benchmarking |
|
| 39 |
| Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
|
| 40 |
+
| Essential Skills for a Lexicographer: Based on Pashto Lexicography | [other](https://www.semanticscholar.org/paper/8fc45aa567cb78713e2fef41d5e748e8ee1d8470) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/8fc45aa567cb78713e2fef41d5e748e8ee1d8470) | Automated discovery entry for Pashto resource tracking. |
|
| 41 |
+
| EVALUATION OF ANTIBACTERIAL ACTIVITY OF ZIZYPHUS JUJUBA | [zenodo](https://zenodo.org/records/3595881) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/3595881) | Automated discovery entry for Pashto resource tracking. |
|
| 42 |
+
| EVALUATION OF ANTIPYRETIC ACTIVITY OF ZIZYPHUS JUJUBA LAM. LEAVES ON ALBINO RATS | [zenodo](https://zenodo.org/records/4269214) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/4269214) | Automated discovery entry for Pashto resource tracking. |
|
| 43 |
+
| Exploring Hospitality as a Cultural Tradition: A Qualitative Study of Pashto and Hindko Customs | [zenodo](https://zenodo.org/records/14872725) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/14872725) | Automated discovery entry for Pashto resource tracking. |
|
| 44 |
+
| Exploring the Impacts of Emotion through Language Learning on Pashto Speakers Young Adulthood in District Peshawar | [other](https://www.semanticscholar.org/paper/4549649112553aabccfac8b918c7e98cdbdd0f09) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/4549649112553aabccfac8b918c7e98cdbdd0f09) | Automated discovery entry for Pashto resource tracking. |
|
| 45 |
+
| Fairness Evaluation and Inference Level Mitigation in LLMs | [datacite](https://figshare.mq.edu.au/articles/thesis/Fairness_Evaluation_and_Inference_Level_Mitigation_in_LLMs/31093552/1) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://figshare.mq.edu.au/articles/thesis/Fairness_Evaluation_and_Inference_Level_Mitigation_in_LLMs/31093552/1) | Automated discovery entry for Pashto resource tracking. |
|
| 46 |
+
| Fragments of life in ‘death world’: an analysis of Pashto poetry as a non-violent resistance to necropolitics | [other](https://www.semanticscholar.org/paper/9726f372b07f677fad23e2ee27a7f50f985e8ed8) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/9726f372b07f677fad23e2ee27a7f50f985e8ed8) | Automated discovery entry for Pashto resource tracking. |
|
| 47 |
+
| Framing Political Bias in Multilingual LLMs Across Pakistani Languages | [datacite](https://arxiv.org/abs/2506.00068) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://arxiv.org/abs/2506.00068) | Automated discovery entry for Pashto resource tracking. |
|
| 48 |
| From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset | [arxiv](http://arxiv.org/abs/2602.14062v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2602.14062v1) | ASR data quality and release trend reference |
|
| 49 |
+
| From Scarcity to Scale: A Release-Level Analysis of the Pashto Common Voice Dataset | [datacite](https://arxiv.org/abs/2602.14062) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://arxiv.org/abs/2602.14062) | Automated discovery entry for Pashto resource tracking. |
|
| 50 |
+
| Gemination in Pashto | [crossref](https://doi.org/10.24312/ucp-jll.02.02.405) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.24312/ucp-jll.02.02.405) | Automated discovery entry for Pashto resource tracking. |
|
| 51 |
+
| Gender Classification From Pashto Handwritten Text Images | [other](https://www.semanticscholar.org/paper/2d70fffa9224d71f67ad3c1943b8a71b18164eeb) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/2d70fffa9224d71f67ad3c1943b8a71b18164eeb) | Automated discovery entry for Pashto resource tracking. |
|
| 52 |
+
| Introduction to Pashto Word’s Characteristics | [other](https://www.semanticscholar.org/paper/6eb3febbb368a7eaccc6290bcd77683ed3d624aa) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/6eb3febbb368a7eaccc6290bcd77683ed3d624aa) | Automated discovery entry for Pashto resource tracking. |
|
| 53 |
+
| Is the Pushto a Semitic Language | [zenodo](https://zenodo.org/records/16001185) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/16001185) | Automated discovery entry for Pashto resource tracking. |
|
| 54 |
+
| Isolated Handwritten Pashto Character Recognition Using a K‐NN Classification Tool based on Zoning and HOG Feature Extraction Techniques | [openalex](https://doi.org/10.1155/2021/5558373) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1155/2021/5558373) | Automated discovery entry for Pashto resource tracking. |
|
| 55 |
| KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
|
| 56 |
+
| KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [openalex](https://doi.org/10.14569/ijacsa.2018.091069) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.14569/ijacsa.2018.091069) | Automated discovery entry for Pashto resource tracking. |
|
| 57 |
| KPTI: Katib's Pashto Text Imagebase and Deep Learning Benchmark | [openalex](https://doi.org/10.1109/icfhr.2016.0090) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icfhr.2016.0090) | Pashto OCR dataset and benchmarking reference |
|
| 58 |
+
| Language Barrier and its Effect on Learning at the Public Primary School Level in Lahore | [zenodo](https://zenodo.org/records/17728944) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/17728944) | Automated discovery entry for Pashto resource tracking. |
|
| 59 |
+
| Le verbe pashto | [crossref](https://doi.org/10.29091/9783954907083) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.29091/9783954907083) | Automated discovery entry for Pashto resource tracking. |
|
| 60 |
+
| Morphology of Pashto Adverbs: Word and Paradigm Approach | [zenodo](https://zenodo.org/records/16211508) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/16211508) | Automated discovery entry for Pashto resource tracking. |
|
| 61 |
+
| Negation in Pashto | [zenodo](https://zenodo.org/records/18233956) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/18233956) | Automated discovery entry for Pashto resource tracking. |
|
| 62 |
+
| Negotiating Pakhto: Proverbs, Islam and the Construction of Identity among Pashtuns | [other](https://www.semanticscholar.org/paper/8a503f164e0c1f5be13866dad00539c7e5b1cabc) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/8a503f164e0c1f5be13866dad00539c7e5b1cabc) | Automated discovery entry for Pashto resource tracking. |
|
| 63 |
+
| Only 2 of 141 Global Languages Employ a Labial for "Tongue" in 1st position Challenging Saussure's Arbitrariness With Near Universal Embodied Iconicity for Tongue Vs Mouth in "inverse" Control | [datacite](https://zenodo.org/doi/10.5281/zenodo.17807676) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.17807676) | Automated discovery entry for Pashto resource tracking. |
|
| 64 |
| Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
|
| 65 |
+
| Pashto | [crossref](https://doi.org/10.4324/9780203301524-34) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.4324/9780203301524-34) | Automated discovery entry for Pashto resource tracking. |
|
| 66 |
+
| Pashto (Endo-)clitics in a Parallel Architecture | [openalex](http://nbn-resolving.de/urn:nbn:de:bsz:352-0-278290) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](http://nbn-resolving.de/urn:nbn:de:bsz:352-0-278290) | Automated discovery entry for Pashto resource tracking. |
|
| 67 |
+
| Pashto : Pashto-English, English-Pashto dictionary & phrasebook | [other](https://www.semanticscholar.org/paper/8ff77d35396d17225d97772e577e472a2ab1c47a) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/8ff77d35396d17225d97772e577e472a2ab1c47a) | Automated discovery entry for Pashto resource tracking. |
|
| 68 |
+
| Pashto free relatives and triply-filled Comp: Evidence for a headed analysis | [openalex](https://doi.org/10.1016/s0024-3841(96)00032-0) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1016/s0024-3841(96)00032-0) | Automated discovery entry for Pashto resource tracking. |
|
| 69 |
+
| Pashto Handwritten Books | [crossref](https://doi.org/10.1163/9789004737358_003) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.1163/9789004737358_003) | Automated discovery entry for Pashto resource tracking. |
|
| 70 |
| Pashto isolated digits recognition using deep convolutional neural network | [openalex](https://doi.org/10.1016/j.heliyon.2020.e03372) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1016/j.heliyon.2020.e03372) | Pashto speech recognition research reference |
|
| 71 |
+
| Pashto Language | [crossref](https://doi.org/10.32388/pxbtfv) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.32388/pxbtfv) | Automated discovery entry for Pashto resource tracking. |
|
| 72 |
| Pashto offensive language detection: a benchmark dataset and monolingual Pashto BERT | [openalex](https://doi.org/10.7717/peerj-cs.1617) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.7717/peerj-cs.1617) | Pashto NLP toxicity detection benchmark and model reference |
|
| 73 |
+
| PASHTO POETRY AND MILITANCY IN KHYBER PAKHTUNKHWA AFTER 9/11: THEMATIC ANALYSIS OF PASHTO POETRY IN RESISTING MILITANCY | [other](https://www.semanticscholar.org/paper/e81d4e7ac6cd7519643bf5d5c0bdfd9be554a8f2) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/e81d4e7ac6cd7519643bf5d5c0bdfd9be554a8f2) | Automated discovery entry for Pashto resource tracking. |
|
| 74 |
+
| Pashto preverbs V | [other](https://www.semanticscholar.org/paper/1f59f22ae99379106b417186f3053c00b5fe391f) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/1f59f22ae99379106b417186f3053c00b5fe391f) | Automated discovery entry for Pashto resource tracking. |
|
| 75 |
+
| Pashto preverbs, III. Compound verbs with preverb | [other](https://www.semanticscholar.org/paper/53eeae3a973d6bb72839e9304be13a0362c92242) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/53eeae3a973d6bb72839e9304be13a0362c92242) | Automated discovery entry for Pashto resource tracking. |
|
| 76 |
| Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
|
| 77 |
+
| Pashto Tappa | [crossref](https://doi.org/10.4324/9781003604877-9) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.4324/9781003604877-9) | Automated discovery entry for Pashto resource tracking. |
|
| 78 |
+
| Pashto Verse | [crossref](https://doi.org/10.1017/s0041977x00072700) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.1017/s0041977x00072700) | Automated discovery entry for Pashto resource tracking. |
|
| 79 |
+
| Persian loanwords and calques in Pashto | [other](https://www.semanticscholar.org/paper/ed232f1c2abd6e6f8a49f04de8ac76bf922521ea) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/ed232f1c2abd6e6f8a49f04de8ac76bf922521ea) | Automated discovery entry for Pashto resource tracking. |
|
| 80 |
+
| Persian, Urdu, and Pashto: A comparative orthographic analysis | [openalex](https://doi.org/10.1093/wsr/wsq005) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1093/wsr/wsq005) | Automated discovery entry for Pashto resource tracking. |
|
| 81 |
| PHTI: Pashto Handwritten Text Imagebase for Deep Learning Applications | [openalex](https://doi.org/10.1109/access.2022.3216881) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/access.2022.3216881) | Pashto OCR dataset and benchmark reference |
|
| 82 |
| Pioneer dataset and recognition of Handwritten Pashto characters using Convolution Neural Networks | [openalex](https://doi.org/10.1177/0020294020964826) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1177/0020294020964826) | Pashto handwritten character recognition reference |
|
| 83 |
+
| Portrayal of Death in the Selected Poems of Abdul Ghani Khan and Emily Dickinson: A Comparative Thematic Analysis | [zenodo](https://zenodo.org/records/15046502) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/15046502) | Automated discovery entry for Pashto resource tracking. |
|
| 84 |
| POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
|
| 85 |
| PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
|
| 86 |
+
| PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [datacite](https://arxiv.org/abs/2505.10055) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://arxiv.org/abs/2505.10055) | Automated discovery entry for Pashto resource tracking. |
|
| 87 |
+
| PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [arxiv](http://arxiv.org/abs/2505.10055v2) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2505.10055v2) | Automated discovery entry for Pashto resource tracking. |
|
| 88 |
| Recognition of Pashto Handwritten Characters Based on Deep Learning | [openalex](https://doi.org/10.3390/s20205884) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.3390/s20205884) | Pashto OCR model reference for handwritten character recognition |
|
| 89 |
| Recognizable units in Pashto language for OCR | [openalex](https://doi.org/10.1109/icdar.2015.7333963) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333963) | Pashto OCR preprocessing and unit-design reference |
|
| 90 |
| Scale and rotation invariant OCR for Pashto cursive script using MDLSTM network | [openalex](https://doi.org/10.1109/icdar.2015.7333931) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icdar.2015.7333931) | Pashto OCR model architecture reference |
|
| 91 |
+
| Scale and rotation invariant recognition of cursive Pashto script using SIFT features | [openalex](https://doi.org/10.1109/icet.2010.5638470) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icet.2010.5638470) | Automated discovery entry for Pashto resource tracking. |
|
| 92 |
+
| Separating phonology from syntax: a reanalysis of Pashto cliticization | [openalex](https://doi.org/10.1017/s0022226700006952) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1017/s0022226700006952) | Automated discovery entry for Pashto resource tracking. |
|
| 93 |
| Shape analysis of Pashto script and creation of image database for OCR | [openalex](https://doi.org/10.1109/icet.2009.5353160) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1109/icet.2009.5353160) | Pashto OCR dataset design and feature reference |
|
| 94 |
+
| Some Remarks on the Origin of the Afghán People and Dialect and on the Connexion of the Pushto Language with the Zend and Pehlavi and the Hebrew | [zenodo](https://zenodo.org/records/16191315) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/16191315) | Automated discovery entry for Pashto resource tracking. |
|
| 95 |
| Speech translation for low-resource languages: the case of Pashto | [openalex](https://doi.org/10.21437/interspeech.2005-723) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.21437/interspeech.2005-723) | Pashto speech translation and low-resource MT reference |
|
| 96 |
+
| Summaries in Pashto | [crossref](https://doi.org/10.1097/01.wtf.0000437933.40809.39) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.1097/01.wtf.0000437933.40809.39) | Automated discovery entry for Pashto resource tracking. |
|
| 97 |
+
| SWITCHING SELVES ONLINE:PASHTO-ENGLISH BILINGUALISM,IDENTITY, AND EXPRESSION IN PAKISTAN’S DIGITAL DISCOURSE | [other](https://www.semanticscholar.org/paper/7a330c5fb416a1105866a895748b4336f8ef8100) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/7a330c5fb416a1105866a895748b4336f8ef8100) | Automated discovery entry for Pashto resource tracking. |
|
| 98 |
+
| Syntax and morphology of Baniswola Pashto: investigating universal and dialectal variations | [other](https://www.semanticscholar.org/paper/9f725b3b282cf05f9089002d474010c6021001f9) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/9f725b3b282cf05f9089002d474010c6021001f9) | Automated discovery entry for Pashto resource tracking. |
|
| 99 |
+
| The BBN Byblos Pashto OCR system | [openalex](https://doi.org/10.1145/1031442.1031447) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1145/1031442.1031447) | Automated discovery entry for Pashto resource tracking. |
|
| 100 |
+
| The development and evaluation of an automatic clitic generator for Pashto language | [other](https://www.semanticscholar.org/paper/3d95449d67799fcac83f855984cb0c29cc500d7b) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/3d95449d67799fcac83f855984cb0c29cc500d7b) | Automated discovery entry for Pashto resource tracking. |
|
| 101 |
+
| The grammar of clitics : evidence from Pashto and other languages | [openalex](http://hdl.handle.net/11858/00-001M-0000-0012-7447-0) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](http://hdl.handle.net/11858/00-001M-0000-0012-7447-0) | Automated discovery entry for Pashto resource tracking. |
|
| 102 |
+
| The Influence of the Arabic Language on the Pashto Language: The Abdur-Rahman Baba as a Model (A Case Study Analysis) | [zenodo](https://zenodo.org/records/18174368) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/18174368) | Automated discovery entry for Pashto resource tracking. |
|
| 103 |
+
| The Pashto language and identity‐formation in Pakistan | [openalex](https://doi.org/10.1080/09584939508719759) | [Matched by explicit Pashto marker in title from OpenAlex works search. (`pashto`)](https://doi.org/10.1080/09584939508719759) | Automated discovery entry for Pashto resource tracking. |
|
| 104 |
+
| The Regional Criminal Law Framework for the Combat of Organized Crime (Pashto) | [datacite](https://opendata.uni-halle.de//handle/1981185920/123975) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://opendata.uni-halle.de//handle/1981185920/123975) | Automated discovery entry for Pashto resource tracking. |
|
| 105 |
+
| The Role of Early Literary Biographies (Tazkiri) in the Ancient History of Pashto Literature | [other](https://www.semanticscholar.org/paper/4938170077d3430c2e3f9fadc161ed7b79242917) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/4938170077d3430c2e3f9fadc161ed7b79242917) | Automated discovery entry for Pashto resource tracking. |
|
| 106 |
+
| The Roshani Movement literary services and the contribution of this Movement in the development of Pashto Literature | [other](https://www.semanticscholar.org/paper/88a3cd1ec497844c5997ae1795f8e72bbb314112) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/88a3cd1ec497844c5997ae1795f8e72bbb314112) | Automated discovery entry for Pashto resource tracking. |
|
| 107 |
+
| The Social Structure and Organization of A Pakhto Speaking Community in Afghanistan. | [other](https://www.semanticscholar.org/paper/306e9a04b8835de6e906303b5e27d43a6994cb1d) | [Matched by explicit Pashto marker in paper title from Semantic Scholar search. (`pashto`)](https://www.semanticscholar.org/paper/306e9a04b8835de6e906303b5e27d43a6994cb1d) | Automated discovery entry for Pashto resource tracking. |
|
| 108 |
+
| Topicalization in Pashto | [crossref](https://doi.org/10.31703/gssr.2020(v-i).17) | [Matched by explicit Pashto marker in title from Crossref search. (`pashto`)](https://doi.org/10.31703/gssr.2020(v-i).17) | Automated discovery entry for Pashto resource tracking. |
|
| 109 |
| Tuning Traditional Language Processing Approaches for Pashto Text Classification | [arxiv](http://arxiv.org/abs/2305.03737v1) | [Matched by Pashto marker in paper title from arXiv query results. (`pashto`)](http://arxiv.org/abs/2305.03737v1) | Pashto text classification method reference |
|
| 110 |
+
| بلوچستان میں " فقہ اسلامی " کے فروغ و ارتقا٫ کا تحقیقی جائزہ | [zenodo](https://zenodo.org/records/18049233) | [Zenodo metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/records/18049233) | Automated discovery entry for Pashto resource tracking. |
|
| 111 |
|
| 112 |
## Maintenance
|
| 113 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
resources/projects/README.md
CHANGED
|
@@ -4,20 +4,51 @@
|
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
|
|
|
|
|
|
| 7 |
| afaaaak/urdu_pashto_translator | [huggingface](https://huggingface.co/spaces/afaaaak/urdu_pashto_translator) | [Space metadata title is Urdu Pashto Translator and the slug includes pashto. (`Pashto`, `translator`)](https://huggingface.co/spaces/afaaaak/urdu_pashto_translator) | Translation demo and bilingual usability testing |
|
| 8 |
| afaqalinagra/PASHTO-ASR-MODEL | [huggingface](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | Interactive Pashto demo and quick qualitative validation |
|
| 9 |
| Aizazayyubi/pashto_asr | [huggingface](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | Interactive Pashto ASR demo for qualitative evaluation |
|
|
|
|
| 10 |
| DrSaqlainHassan/PashtoTokenixer | [huggingface](https://huggingface.co/spaces/DrSaqlainHassan/PashtoTokenixer) | [Space card title states Pashto Parts of Speech Identifier and the slug contains Pashto. (`Pashto`, `parts-of-speech`)](https://huggingface.co/spaces/DrSaqlainHassan/PashtoTokenixer) | Pashto NLP demo for token and part-of-speech analysis |
|
|
|
|
|
|
|
| 11 |
| Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
| ihanif/wav2vec-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
|
| 13 |
| ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
|
|
|
|
|
|
|
| 14 |
| ilyas02828/Pashto_Sign_Language | [huggingface](https://huggingface.co/spaces/ilyas02828/Pashto_Sign_Language) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ilyas02828/Pashto_Sign_Language) | Interactive Pashto demo and quick qualitative validation |
|
|
|
|
|
|
|
|
|
|
| 15 |
| mahmudaq/PashtoASRNMT1 | [huggingface](https://huggingface.co/spaces/mahmudaq/PashtoASRNMT1) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/mahmudaq/PashtoASRNMT1) | Interactive Pashto demo and quick qualitative validation |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
| nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
| Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
|
| 18 |
| Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
|
| 19 |
| Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
|
| 20 |
| Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
| tasal9/ZamAI-Phi3-Mini-Pashto-Demo | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | Interactive Pashto demo and quick qualitative validation |
|
| 22 |
| Umar4321/Pashto-To-English-Urdu | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | Interactive Pashto demo and quick qualitative validation |
|
| 23 |
| ZamAI-Mistral-7B-Pashto Space | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | [Space title and ID explicitly include Pashto and model card metadata exposes project details. (`Pashto`)](https://huggingface.co/api/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | Interactive Pashto LLM project demo |
|
|
|
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
+
| adnankarim/ihanif-whisper-base-pashto | [huggingface](https://huggingface.co/spaces/adnankarim/ihanif-whisper-base-pashto) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/adnankarim/ihanif-whisper-base-pashto) | Automated discovery entry for Pashto resource tracking. |
|
| 8 |
+
| adnankarim/ihanif-whisper-medium-pashto-3e-7 | [huggingface](https://huggingface.co/spaces/adnankarim/ihanif-whisper-medium-pashto-3e-7) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/adnankarim/ihanif-whisper-medium-pashto-3e-7) | Automated discovery entry for Pashto resource tracking. |
|
| 9 |
| afaaaak/urdu_pashto_translator | [huggingface](https://huggingface.co/spaces/afaaaak/urdu_pashto_translator) | [Space metadata title is Urdu Pashto Translator and the slug includes pashto. (`Pashto`, `translator`)](https://huggingface.co/spaces/afaaaak/urdu_pashto_translator) | Translation demo and bilingual usability testing |
|
| 10 |
| afaqalinagra/PASHTO-ASR-MODEL | [huggingface](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/afaqalinagra/PASHTO-ASR-MODEL) | Interactive Pashto demo and quick qualitative validation |
|
| 11 |
| Aizazayyubi/pashto_asr | [huggingface](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Aizazayyubi/pashto_asr) | Interactive Pashto ASR demo for qualitative evaluation |
|
| 12 |
+
| amirajorloo/jira-auto-direction-chrome-extension | [github](https://github.com/amirajorloo/jira-auto-direction-chrome-extension) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/amirajorloo/jira-auto-direction-chrome-extension) | Automated discovery entry for Pashto resource tracking. |
|
| 13 |
| DrSaqlainHassan/PashtoTokenixer | [huggingface](https://huggingface.co/spaces/DrSaqlainHassan/PashtoTokenixer) | [Space card title states Pashto Parts of Speech Identifier and the slug contains Pashto. (`Pashto`, `parts-of-speech`)](https://huggingface.co/spaces/DrSaqlainHassan/PashtoTokenixer) | Pashto NLP demo for token and part-of-speech analysis |
|
| 14 |
+
| Early Pregnancy Loss [Pashto] | [datacite](https://zenodo.org/doi/10.5281/zenodo.18325729) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18325729) | Automated discovery entry for Pashto resource tracking. |
|
| 15 |
+
| Fazlullahmamond/hadith-collection-pashto | [github](https://github.com/Fazlullahmamond/hadith-collection-pashto) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/hadith-collection-pashto) | Automated discovery entry for Pashto resource tracking. |
|
| 16 |
| Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
|
| 17 |
+
| Female Birth Control Part I [Pashto] | [datacite](https://zenodo.org/doi/10.5281/zenodo.18325040) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18325040) | Automated discovery entry for Pashto resource tracking. |
|
| 18 |
+
| Female Birth Control Part II [Pashto] | [datacite](https://zenodo.org/doi/10.5281/zenodo.18325401) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18325401) | Automated discovery entry for Pashto resource tracking. |
|
| 19 |
+
| Haroon-blip/khan-pukhtoon | [github](https://github.com/Haroon-blip/khan-pukhtoon) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Haroon-blip/khan-pukhtoon) | Automated discovery entry for Pashto resource tracking. |
|
| 20 |
+
| Haseeb-007/Pashto-sekho | [huggingface](https://huggingface.co/spaces/Haseeb-007/Pashto-sekho) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Haseeb-007/Pashto-sekho) | Automated discovery entry for Pashto resource tracking. |
|
| 21 |
+
| haseebjanhamraz/PashtoFonts | [github](https://github.com/haseebjanhamraz/PashtoFonts) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/haseebjanhamraz/PashtoFonts) | Automated discovery entry for Pashto resource tracking. |
|
| 22 |
+
| Hassaankabir/Pashto_Malgaray | [huggingface](https://huggingface.co/spaces/Hassaankabir/Pashto_Malgaray) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Hassaankabir/Pashto_Malgaray) | Automated discovery entry for Pashto resource tracking. |
|
| 23 |
| ihanif/wav2vec-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
|
| 24 |
| ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
|
| 25 |
+
| ihanif/whisper-medium-pashto | [huggingface](https://huggingface.co/spaces/ihanif/whisper-medium-pashto) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/whisper-medium-pashto) | Automated discovery entry for Pashto resource tracking. |
|
| 26 |
+
| IhyaCommunity/Khushkhat-Extension | [github](https://github.com/IhyaCommunity/Khushkhat-Extension) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/IhyaCommunity/Khushkhat-Extension) | Automated discovery entry for Pashto resource tracking. |
|
| 27 |
| ilyas02828/Pashto_Sign_Language | [huggingface](https://huggingface.co/spaces/ilyas02828/Pashto_Sign_Language) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ilyas02828/Pashto_Sign_Language) | Interactive Pashto demo and quick qualitative validation |
|
| 28 |
+
| Introduction to Postpartum Care for Refugee women [Pashto] | [datacite](https://zenodo.org/doi/10.5281/zenodo.18324878) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18324878) | Automated discovery entry for Pashto resource tracking. |
|
| 29 |
+
| lecramyajiv/fonts-arabic-extra | [github](https://github.com/lecramyajiv/fonts-arabic-extra) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/lecramyajiv/fonts-arabic-extra) | Automated discovery entry for Pashto resource tracking. |
|
| 30 |
+
| lecramyajiv/ttf-x2 | [github](https://github.com/lecramyajiv/ttf-x2) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/lecramyajiv/ttf-x2) | Automated discovery entry for Pashto resource tracking. |
|
| 31 |
| mahmudaq/PashtoASRNMT1 | [huggingface](https://huggingface.co/spaces/mahmudaq/PashtoASRNMT1) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/mahmudaq/PashtoASRNMT1) | Interactive Pashto demo and quick qualitative validation |
|
| 32 |
+
| mastermoo/pashto-quran | [github](https://github.com/mastermoo/pashto-quran) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/mastermoo/pashto-quran) | Automated discovery entry for Pashto resource tracking. |
|
| 33 |
+
| MuhammadUllah7/PAKHTOONN | [github](https://github.com/MuhammadUllah7/PAKHTOONN) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/MuhammadUllah7/PAKHTOONN) | Automated discovery entry for Pashto resource tracking. |
|
| 34 |
+
| nabeelest/pakhtoodle | [github](https://github.com/nabeelest/pakhtoodle) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/nabeelest/pakhtoodle) | Automated discovery entry for Pashto resource tracking. |
|
| 35 |
+
| NanoNulla/lorem | [github](https://github.com/NanoNulla/lorem) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/NanoNulla/lorem) | Automated discovery entry for Pashto resource tracking. |
|
| 36 |
| nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
|
| 37 |
+
| Negation in Pashto | [datacite](https://zenodo.org/doi/10.5281/zenodo.18233956) | [DataCite metadata includes Pashto markers in title or description. (`pashto`)](https://zenodo.org/doi/10.5281/zenodo.18233956) | Automated discovery entry for Pashto resource tracking. |
|
| 38 |
+
| nisarmasid/NisAr-PakhtoOn | [github](https://github.com/nisarmasid/NisAr-PakhtoOn) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/nisarmasid/NisAr-PakhtoOn) | Automated discovery entry for Pashto resource tracking. |
|
| 39 |
+
| omid/Persian-Log2Vis | [github](https://github.com/omid/Persian-Log2Vis) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/omid/Persian-Log2Vis) | Automated discovery entry for Pashto resource tracking. |
|
| 40 |
+
| Pakhtoon9900/Pakhtoon- | [github](https://github.com/Pakhtoon9900/Pakhtoon-) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Pakhtoon9900/Pakhtoon-) | Automated discovery entry for Pashto resource tracking. |
|
| 41 |
| Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
|
| 42 |
| Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
|
| 43 |
| Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
|
| 44 |
| Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
|
| 45 |
+
| Pukhtoon203/PUKHTOON | [github](https://github.com/Pukhtoon203/PUKHTOON) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Pukhtoon203/PUKHTOON) | Automated discovery entry for Pashto resource tracking. |
|
| 46 |
+
| Pukhtoonmafia009/Pukhtoonmafia009 | [github](https://github.com/Pukhtoonmafia009/Pukhtoonmafia009) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Pukhtoonmafia009/Pukhtoonmafia009) | Automated discovery entry for Pashto resource tracking. |
|
| 47 |
+
| pukhtoonyar406/pukhtoonyar406 | [github](https://github.com/pukhtoonyar406/pukhtoonyar406) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/pukhtoonyar406/pukhtoonyar406) | Automated discovery entry for Pashto resource tracking. |
|
| 48 |
+
| ShahZamanPatan/Pashto-Baran | [github](https://github.com/ShahZamanPatan/Pashto-Baran) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/ShahZamanPatan/Pashto-Baran) | Automated discovery entry for Pashto resource tracking. |
|
| 49 |
+
| ShawAnonymouse/Pakhtoon | [github](https://github.com/ShawAnonymouse/Pakhtoon) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/ShawAnonymouse/Pakhtoon) | Automated discovery entry for Pashto resource tracking. |
|
| 50 |
+
| tasal9/pashto-base-bloom-space | [huggingface](https://huggingface.co/spaces/tasal9/pashto-base-bloom-space) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/pashto-base-bloom-space) | Automated discovery entry for Pashto resource tracking. |
|
| 51 |
+
| tasal9/ZamAI-mt5-Pashto-Demo | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-mt5-Pashto-Demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/ZamAI-mt5-Pashto-Demo) | Automated discovery entry for Pashto resource tracking. |
|
| 52 |
| tasal9/ZamAI-Phi3-Mini-Pashto-Demo | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/tasal9/ZamAI-Phi3-Mini-Pashto-Demo) | Interactive Pashto demo and quick qualitative validation |
|
| 53 |
| Umar4321/Pashto-To-English-Urdu | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/Umar4321/Pashto-To-English-Urdu) | Interactive Pashto demo and quick qualitative validation |
|
| 54 |
| ZamAI-Mistral-7B-Pashto Space | [huggingface](https://huggingface.co/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | [Space title and ID explicitly include Pashto and model card metadata exposes project details. (`Pashto`)](https://huggingface.co/api/spaces/tasal9/ZamAI-Mistral-7B-Pashto-space) | Interactive Pashto LLM project demo |
|
scripts/README.md
CHANGED
|
@@ -9,6 +9,7 @@ Automation scripts for quality checks, resource catalog validation, and search i
|
|
| 9 |
- `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
|
| 10 |
- `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub, GitLab, OpenAlex, Crossref, Zenodo, Dataverse, DataCite, arXiv, and Semantic Scholar into `resources/catalog/pending_candidates.json`.
|
| 11 |
- `promote_candidates.py`: auto-promote valid non-duplicate entries from `pending_candidates.json` into `resources/catalog/resources.json`.
|
|
|
|
| 12 |
- `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
|
| 13 |
|
| 14 |
## Usage
|
|
@@ -33,11 +34,26 @@ Sync candidate resources for maintainer review:
|
|
| 33 |
python scripts/sync_resources.py --limit 20
|
| 34 |
```
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
Auto-promote valid candidates into verified catalog:
|
| 37 |
```bash
|
| 38 |
python scripts/promote_candidates.py
|
| 39 |
```
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
Run full repeatable cycle:
|
| 42 |
```bash
|
| 43 |
python scripts/run_resource_cycle.py --limit 25
|
|
|
|
| 9 |
- `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
|
| 10 |
- `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub, GitLab, OpenAlex, Crossref, Zenodo, Dataverse, DataCite, arXiv, and Semantic Scholar into `resources/catalog/pending_candidates.json`.
|
| 11 |
- `promote_candidates.py`: auto-promote valid non-duplicate entries from `pending_candidates.json` into `resources/catalog/resources.json`.
|
| 12 |
+
- `review_existing_resources.py`: review current catalog resources, remove stale or no-longer-available entries only when there is strong evidence, and log each removal in `resources/catalog/removal_log.json`.
|
| 13 |
- `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
|
| 14 |
|
| 15 |
## Usage
|
|
|
|
| 34 |
python scripts/sync_resources.py --limit 20
|
| 35 |
```
|
| 36 |
|
| 37 |
+
Review existing resources and remove stale entries before discovery:
|
| 38 |
+
```bash
|
| 39 |
+
python scripts/review_existing_resources.py
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
Run stricter relevance cleanup mode:
|
| 43 |
+
```bash
|
| 44 |
+
python scripts/review_existing_resources.py --enforce-pashto-relevance
|
| 45 |
+
```
|
| 46 |
+
|
| 47 |
Auto-promote valid candidates into verified catalog:
|
| 48 |
```bash
|
| 49 |
python scripts/promote_candidates.py
|
| 50 |
```
|
| 51 |
|
| 52 |
+
Auto-promote while skipping online URL availability checks:
|
| 53 |
+
```bash
|
| 54 |
+
python scripts/promote_candidates.py --skip-url-check
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
Run full repeatable cycle:
|
| 58 |
```bash
|
| 59 |
python scripts/run_resource_cycle.py --limit 25
|
scripts/promote_candidates.py
CHANGED
|
@@ -18,8 +18,23 @@ try:
|
|
| 18 |
except ModuleNotFoundError:
|
| 19 |
from validate_resource_catalog import validate_resource
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
PLACEHOLDER_PRIMARY_USE = "Needs maintainer review before promotion to verified catalog."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
def _canonical_url(value: str) -> str:
|
|
@@ -43,11 +58,24 @@ def _prepare_candidate(candidate: dict[str, Any]) -> dict[str, Any]:
|
|
| 43 |
return promoted
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
def promote_candidates(
|
| 47 |
catalog: dict[str, Any],
|
| 48 |
pending_payload: dict[str, Any],
|
| 49 |
*,
|
| 50 |
max_promotions: int | None = None,
|
|
|
|
|
|
|
| 51 |
) -> tuple[list[dict[str, Any]], dict[str, int]]:
|
| 52 |
resources = catalog.get("resources")
|
| 53 |
if not isinstance(resources, list):
|
|
@@ -69,7 +97,7 @@ def promote_candidates(
|
|
| 69 |
}
|
| 70 |
|
| 71 |
promoted: list[dict[str, Any]] = []
|
| 72 |
-
stats = {"total": len(candidates), "promoted": 0, "duplicate": 0, "invalid": 0}
|
| 73 |
|
| 74 |
for candidate in candidates:
|
| 75 |
if max_promotions is not None and len(promoted) >= max_promotions:
|
|
@@ -90,6 +118,10 @@ def promote_candidates(
|
|
| 90 |
stats["duplicate"] += 1
|
| 91 |
continue
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
errors = validate_resource(resource, len(resources) + len(promoted))
|
| 94 |
if errors:
|
| 95 |
stats["invalid"] += 1
|
|
@@ -112,6 +144,8 @@ def main() -> int:
|
|
| 112 |
parser.add_argument("--catalog", default="resources/catalog/resources.json")
|
| 113 |
parser.add_argument("--candidates", default="resources/catalog/pending_candidates.json")
|
| 114 |
parser.add_argument("--max-promotions", type=int, default=None)
|
|
|
|
|
|
|
| 115 |
args = parser.parse_args()
|
| 116 |
|
| 117 |
catalog_path = Path(args.catalog)
|
|
@@ -135,18 +169,22 @@ def main() -> int:
|
|
| 135 |
catalog,
|
| 136 |
pending_payload,
|
| 137 |
max_promotions=args.max_promotions,
|
|
|
|
|
|
|
| 138 |
)
|
| 139 |
if not promoted:
|
| 140 |
print(
|
| 141 |
"Promotion complete: no new verified resources "
|
| 142 |
-
f"(duplicates={stats['duplicate']}, invalid={stats['invalid']})"
|
| 143 |
)
|
| 144 |
return 0
|
| 145 |
|
| 146 |
catalog_path.write_text(json.dumps(catalog, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
|
| 147 |
print(
|
| 148 |
"Promotion complete: "
|
| 149 |
-
|
|
|
|
|
|
|
| 150 |
)
|
| 151 |
return 0
|
| 152 |
|
|
|
|
| 18 |
except ModuleNotFoundError:
|
| 19 |
from validate_resource_catalog import validate_resource
|
| 20 |
|
| 21 |
+
try:
|
| 22 |
+
from scripts.review_existing_resources import probe_resource_url
|
| 23 |
+
except ModuleNotFoundError:
|
| 24 |
+
from review_existing_resources import probe_resource_url
|
| 25 |
+
|
| 26 |
|
| 27 |
PLACEHOLDER_PRIMARY_USE = "Needs maintainer review before promotion to verified catalog."
|
| 28 |
+
NOT_FOUND_PATTERNS = (
|
| 29 |
+
"repository not found",
|
| 30 |
+
"model not found",
|
| 31 |
+
"dataset not found",
|
| 32 |
+
"space not found",
|
| 33 |
+
"page not found",
|
| 34 |
+
"not found",
|
| 35 |
+
"this repository does not exist",
|
| 36 |
+
"we couldn't find",
|
| 37 |
+
)
|
| 38 |
|
| 39 |
|
| 40 |
def _canonical_url(value: str) -> str:
|
|
|
|
| 58 |
return promoted
|
| 59 |
|
| 60 |
|
| 61 |
+
def _candidate_url_unavailable(url: str, timeout: float) -> bool:
    """Return True when probing *url* indicates the candidate is gone.

    A candidate counts as unavailable when the probe is flagged as
    hard-missing, or when the sampled page content contains any of the
    phrases in ``NOT_FOUND_PATTERNS`` (compared case-insensitively).
    An empty content sample is never treated as evidence.
    """
    result = probe_resource_url(url, timeout)
    if result.hard_missing:
        return True

    body = result.content_sample
    if not body:
        return False

    haystack = body.casefold()
    for needle in NOT_FOUND_PATTERNS:
        if needle in haystack:
            return True
    return False
|
| 70 |
+
|
| 71 |
+
|
| 72 |
def promote_candidates(
|
| 73 |
catalog: dict[str, Any],
|
| 74 |
pending_payload: dict[str, Any],
|
| 75 |
*,
|
| 76 |
max_promotions: int | None = None,
|
| 77 |
+
verify_urls: bool = False,
|
| 78 |
+
url_timeout: float = 10.0,
|
| 79 |
) -> tuple[list[dict[str, Any]], dict[str, int]]:
|
| 80 |
resources = catalog.get("resources")
|
| 81 |
if not isinstance(resources, list):
|
|
|
|
| 97 |
}
|
| 98 |
|
| 99 |
promoted: list[dict[str, Any]] = []
|
| 100 |
+
stats = {"total": len(candidates), "promoted": 0, "duplicate": 0, "invalid": 0, "unavailable": 0}
|
| 101 |
|
| 102 |
for candidate in candidates:
|
| 103 |
if max_promotions is not None and len(promoted) >= max_promotions:
|
|
|
|
| 118 |
stats["duplicate"] += 1
|
| 119 |
continue
|
| 120 |
|
| 121 |
+
if verify_urls and _candidate_url_unavailable(url, url_timeout):
|
| 122 |
+
stats["unavailable"] += 1
|
| 123 |
+
continue
|
| 124 |
+
|
| 125 |
errors = validate_resource(resource, len(resources) + len(promoted))
|
| 126 |
if errors:
|
| 127 |
stats["invalid"] += 1
|
|
|
|
| 144 |
parser.add_argument("--catalog", default="resources/catalog/resources.json")
|
| 145 |
parser.add_argument("--candidates", default="resources/catalog/pending_candidates.json")
|
| 146 |
parser.add_argument("--max-promotions", type=int, default=None)
|
| 147 |
+
parser.add_argument("--skip-url-check", action="store_true")
|
| 148 |
+
parser.add_argument("--url-timeout", type=float, default=10.0)
|
| 149 |
args = parser.parse_args()
|
| 150 |
|
| 151 |
catalog_path = Path(args.catalog)
|
|
|
|
| 169 |
catalog,
|
| 170 |
pending_payload,
|
| 171 |
max_promotions=args.max_promotions,
|
| 172 |
+
verify_urls=not args.skip_url_check,
|
| 173 |
+
url_timeout=args.url_timeout,
|
| 174 |
)
|
| 175 |
if not promoted:
|
| 176 |
print(
|
| 177 |
"Promotion complete: no new verified resources "
|
| 178 |
+
f"(duplicates={stats['duplicate']}, invalid={stats['invalid']}, unavailable={stats['unavailable']})"
|
| 179 |
)
|
| 180 |
return 0
|
| 181 |
|
| 182 |
catalog_path.write_text(json.dumps(catalog, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
|
| 183 |
print(
|
| 184 |
"Promotion complete: "
|
| 185 |
+
"promoted="
|
| 186 |
+
f"{stats['promoted']} duplicate={stats['duplicate']} invalid={stats['invalid']} "
|
| 187 |
+
f"unavailable={stats['unavailable']}"
|
| 188 |
)
|
| 189 |
return 0
|
| 190 |
|
scripts/review_existing_resources.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Review existing catalog entries and remove only with strong evidence.
|
| 2 |
+
|
| 3 |
+
This script enforces a conservative pre-sync audit:
|
| 4 |
+
- Keep resources that are reachable and Pashto-relevant.
|
| 5 |
+
- Remove only when there is a strong reason (for example hard 404/410, duplicate ID/URL,
|
| 6 |
+
or no Pashto signal in metadata and live page content).
|
| 7 |
+
- Persist removal reasons in a log for maintainer review.
|
| 8 |
+
|
| 9 |
+
Usage:
|
| 10 |
+
python scripts/review_existing_resources.py
|
| 11 |
+
python scripts/review_existing_resources.py --timeout 15
|
| 12 |
+
python scripts/review_existing_resources.py --dry-run
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import argparse
|
| 18 |
+
import concurrent.futures as futures
|
| 19 |
+
import json
|
| 20 |
+
import re
|
| 21 |
+
import socket
|
| 22 |
+
from dataclasses import dataclass
|
| 23 |
+
from datetime import date, datetime, timezone
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
from typing import Any
|
| 26 |
+
from urllib.error import HTTPError, URLError
|
| 27 |
+
from urllib.request import Request, urlopen
|
| 28 |
+
|
| 29 |
+
try:
|
| 30 |
+
from scripts.validate_resource_catalog import validate_resource
|
| 31 |
+
except ModuleNotFoundError:
|
| 32 |
+
from validate_resource_catalog import validate_resource
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# User-Agent header value sent with every probe request.
USER_AGENT = "pashto-resource-review/1.0"

# Maximum number of body bytes read when sampling a page.
MAX_BODY_BYTES = 120_000

# HTTP status codes treated as hard evidence that a resource is gone.
HARD_REMOVE_HTTP_CODES = {404, 410, 451}

# Phrases searched for in the case-folded page sample to spot "soft" not-found pages.
NOT_FOUND_PATTERNS = (
    "repository not found",
    "model not found",
    "dataset not found",
    "space not found",
    "page not found",
    "not found",
    "this repository does not exist",
    "we couldn't find",
)

# `primary_use` text that marks an entry as an automated discovery.
AUTOMATED_PRIMARY_USE = "Automated discovery entry for Pashto resource tracking."

# Latin spellings of "Pashto" that are not embedded in a longer alphanumeric run.
PASHTO_WORD_RE = re.compile(
    r"(?<![A-Za-z0-9])(pashto|pukhto|pushto|pakhto)(?![A-Za-z0-9])",
    flags=re.IGNORECASE,
)

# Short code tokens (ps, ps_af, pus, pbt_arab / pbt-arab / pbtarab) on word boundaries.
PASHTO_CODE_RE = re.compile(
    r"\b(ps(_af)?|pus|pbt[_-]?arab)\b",
    flags=re.IGNORECASE,
)

# Native-script markers matched by plain substring search.
PASHTO_SCRIPT_MARKERS = ("پښتو", "پشتو")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@dataclass
|
| 55 |
+
class UrlProbe:
|
| 56 |
+
status_code: int | None = None
|
| 57 |
+
final_url: str | None = None
|
| 58 |
+
content_sample: str = ""
|
| 59 |
+
hard_missing: bool = False
|
| 60 |
+
uncertain_error: str | None = None
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _contains_pashto_marker(value: str) -> bool:
    """Return True when *value* contains any recognised Pashto marker.

    Checks, in order: the Latin-spelling regex, the code-token regex, and
    the native-script substrings. Falsy or whitespace-only input yields False.
    """
    candidate = (value or "").strip()
    if not candidate:
        return False

    if PASHTO_WORD_RE.search(candidate) or PASHTO_CODE_RE.search(candidate):
        return True

    for script_marker in PASHTO_SCRIPT_MARKERS:
        if script_marker in candidate:
            return True
    return False
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _resource_metadata_has_pashto_signal(resource: dict[str, Any]) -> bool:
    """Return True when any metadata text of *resource* carries a Pashto marker.

    Inspected fields: ``title``, ``url``, ``summary``, ``primary_use``, string
    items of ``tags``, and within ``pashto_evidence`` the ``evidence_text``,
    ``evidence_url`` and string items of ``markers``. Non-string values and
    wrongly typed containers are ignored.
    """
    texts: list[str] = [
        resource[field]
        for field in ("title", "url", "summary", "primary_use")
        if isinstance(resource.get(field), str)
    ]

    tag_list = resource.get("tags")
    if isinstance(tag_list, list):
        texts += [tag for tag in tag_list if isinstance(tag, str)]

    evidence = resource.get("pashto_evidence")
    if isinstance(evidence, dict):
        texts += [
            evidence[field]
            for field in ("evidence_text", "evidence_url")
            if isinstance(evidence.get(field), str)
        ]
        marker_list = evidence.get("markers")
        if isinstance(marker_list, list):
            texts += [marker for marker in marker_list if isinstance(marker, str)]

    for text in texts:
        if _contains_pashto_marker(text):
            return True
    return False
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _resource_has_direct_pashto_signal(resource: dict[str, Any]) -> bool:
    """Return True when a narrower set of fields carries a Pashto marker.

    Only ``title``, ``url``, ``pashto_evidence.evidence_url``, string items of
    ``pashto_evidence.markers`` and string items of ``tags`` are inspected;
    ``summary``, ``primary_use`` and ``evidence_text`` are deliberately not.
    """
    texts: list[str] = [
        resource[field]
        for field in ("title", "url")
        if isinstance(resource.get(field), str)
    ]

    evidence = resource.get("pashto_evidence")
    if isinstance(evidence, dict):
        link = evidence.get("evidence_url")
        if isinstance(link, str):
            texts.append(link)
        marker_list = evidence.get("markers")
        if isinstance(marker_list, list):
            texts += [marker for marker in marker_list if isinstance(marker, str)]

    tag_list = resource.get("tags")
    if isinstance(tag_list, list):
        texts += [tag for tag in tag_list if isinstance(tag, str)]

    for text in texts:
        if _contains_pashto_marker(text):
            return True
    return False
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def _is_automated_candidate_like(resource: dict[str, Any]) -> bool:
    """Return True when *resource* looks like an unreviewed automated entry.

    That is the case when its ``id`` is a string starting with ``candidate-``
    or its stripped ``primary_use`` equals ``AUTOMATED_PRIMARY_USE``.
    """
    identifier = resource.get("id")
    if isinstance(identifier, str) and identifier.startswith("candidate-"):
        return True

    usage = resource.get("primary_use")
    return isinstance(usage, str) and usage.strip() == AUTOMATED_PRIMARY_USE
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _canonical_url(value: str) -> str:
|
| 130 |
+
return value.rstrip("/")
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _request_url(url: str, method: str, timeout: float) -> UrlProbe:
    """Issue one HTTP request and summarise the outcome as a ``UrlProbe``.

    Args:
        url: Address to request.
        method: HTTP method; a body sample is read only when this is ``"GET"``.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        A ``UrlProbe`` with the status code, final URL and (for GET) up to
        ``MAX_BODY_BYTES`` of decoded body. HTTP error responses are reported
        with their status code and ``hard_missing`` set when the code is in
        ``HARD_REMOVE_HTTP_CODES``. Any other failure (malformed URL, network
        or protocol error) is returned as a probe with ``uncertain_error`` set
        rather than raised, so one bad URL cannot abort a whole run.
    """
    # Local import: the module's import block does not bring in http.client.
    from http.client import HTTPException

    try:
        # Request() itself raises ValueError for malformed URLs, so build it inside the try.
        request = Request(url, method=method, headers={"User-Agent": USER_AGENT})
        with urlopen(request, timeout=timeout) as response:
            status = getattr(response, "status", 200)
            final_url = response.geturl()
            sample = ""
            if method == "GET":
                payload = response.read(MAX_BODY_BYTES)
                sample = payload.decode("utf-8", errors="replace")
            return UrlProbe(status_code=status, final_url=final_url, content_sample=sample)
    except HTTPError as exc:
        sample = ""
        if method == "GET":
            try:
                payload = exc.read(MAX_BODY_BYTES)
                sample = payload.decode("utf-8", errors="replace")
            except Exception:  # noqa: BLE001
                # Best effort: the error body is optional evidence only.
                sample = ""
        return UrlProbe(
            status_code=exc.code,
            final_url=exc.geturl(),
            content_sample=sample,
            hard_missing=exc.code in HARD_REMOVE_HTTP_CODES,
        )
    except (OSError, ValueError, HTTPException) as exc:
        # OSError covers URLError, TimeoutError and socket.timeout as well as raw
        # connection errors raised while reading the response; HTTPException covers
        # protocol-level failures. Fall back to the exception type name so the
        # error text is never empty (an empty string would read as "no error").
        return UrlProbe(uncertain_error=str(exc) or type(exc).__name__)
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def probe_resource_url(url: str, timeout: float) -> UrlProbe:
    """Probe *url* with HEAD first and follow up with GET where useful.

    - An inconclusive HEAD (``uncertain_error`` set) is returned as is.
    - A HEAD status in ``HARD_REMOVE_HTTP_CODES`` is returned flagged hard-missing.
    - A HEAD status of 403, 405, 429 or no status at all is retried with GET,
      and the GET result is returned (flagged hard-missing when applicable).
    - A HEAD status in the 200-399 range is followed by GET to sample the page;
      the HEAD result is kept if that GET turns out inconclusive.
    - Any other HEAD result is returned unchanged.
    """
    head_probe = _request_url(url, "HEAD", timeout)
    if head_probe.uncertain_error:
        return head_probe

    code = head_probe.status_code
    if code in HARD_REMOVE_HTTP_CODES:
        head_probe.hard_missing = True
        return head_probe

    if code is None or code in {403, 405, 429}:
        body_probe = _request_url(url, "GET", timeout)
        if body_probe.status_code in HARD_REMOVE_HTTP_CODES:
            body_probe.hard_missing = True
        return body_probe

    if 200 <= code < 400:
        body_probe = _request_url(url, "GET", timeout)
        return head_probe if body_probe.uncertain_error else body_probe

    return head_probe
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def review_resources(
    catalog: dict[str, Any],
    *,
    timeout: float = 12.0,
    enforce_pashto_relevance: bool = False,
    max_workers: int = 12,
    probe_fn: Any = probe_resource_url,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """Audit every catalog resource and split the list into kept and removed.

    Args:
        catalog: Catalog payload; ``catalog["resources"]`` must be a list.
        timeout: Per-request timeout in seconds handed to ``probe_fn``.
        enforce_pashto_relevance: When True, also remove entries without a
            Pashto signal (see the relevance checks below).
        max_workers: Upper bound on probe threads; the actual count is clamped
            to between 1 and the number of unique URLs.
        probe_fn: Callable ``(url, timeout) -> UrlProbe`` used to probe URLs.

    Returns:
        ``(updated_catalog, report)``. ``updated_catalog`` is a shallow copy of
        *catalog*; its ``resources`` and ``updated_on`` are replaced only when
        at least one entry was removed. ``report`` holds the ``checked``,
        ``kept`` and ``removed`` counts plus the ``removals`` and ``warnings``
        lists.

    Raises:
        ValueError: If ``catalog["resources"]`` is not a list.
    """
    resources = catalog.get("resources")
    if not isinstance(resources, list):
        raise ValueError("catalog.resources must be a list")

    kept: list[dict[str, Any]] = []
    removals: list[dict[str, Any]] = []
    warnings: list[str] = []
    seen_ids: dict[str, str] = {}
    seen_urls: dict[tuple[str, str], str] = {}

    # Probe each distinct (whitespace-stripped) URL once, concurrently.
    probe_results: dict[str, UrlProbe] = {}
    candidate_urls = sorted(
        {
            resource.get("url", "").strip()
            for resource in resources
            if isinstance(resource, dict) and isinstance(resource.get("url"), str) and resource.get("url", "").strip()
        }
    )
    if candidate_urls:
        worker_count = max(1, min(max_workers, len(candidate_urls)))
        with futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
            future_map = {executor.submit(probe_fn, url, timeout): url for url in candidate_urls}
            for future in futures.as_completed(future_map):
                url = future_map[future]
                try:
                    probe_results[url] = future.result()
                except Exception as exc:  # noqa: BLE001
                    # A failing probe is inconclusive, never grounds for removal.
                    probe_results[url] = UrlProbe(uncertain_error=str(exc))

    for index, resource in enumerate(resources):
        if not isinstance(resource, dict):
            removals.append(
                {
                    "id": f"resource-{index}",
                    "title": "",
                    "url": "",
                    "reasons": ["Entry is not a JSON object."],
                    "evidence": {},
                }
            )
            continue

        rid = resource.get("id", "")
        title = resource.get("title", "")
        url = resource.get("url", "")
        category = resource.get("category", "")
        reasons: list[str] = []

        if not isinstance(rid, str) or not rid.strip():
            reasons.append("Missing or invalid resource id.")
        if not isinstance(url, str) or not url.strip():
            reasons.append("Missing or invalid resource URL.")

        if isinstance(rid, str) and rid:
            previous = seen_ids.get(rid)
            # Compare against None: the stored label may legitimately be an empty title.
            if previous is not None:
                reasons.append(f"Duplicate resource id; already used by '{previous}'.")

        canonical_url = _canonical_url(url) if isinstance(url, str) else ""
        normalized_category = str(category).strip().casefold() if isinstance(category, str) else ""
        if canonical_url:
            previous = seen_urls.get((normalized_category, canonical_url))
            if previous is not None:
                reasons.append(
                    "Duplicate canonical URL in same category; "
                    f"already used by '{previous}'."
                )

        validation_errors = validate_resource(resource, index)
        if any(".url must be a valid http/https URL" in error for error in validation_errors):
            reasons.append("Resource URL failed schema validation.")

        probe = UrlProbe()
        if isinstance(url, str) and url.strip():
            # Probe results are keyed by the stripped URL, so strip for the lookup too.
            probe = probe_results.get(url.strip(), UrlProbe())
            if probe.hard_missing:
                status_code = probe.status_code if probe.status_code is not None else "unknown"
                reasons.append(f"URL returned hard-missing HTTP status {status_code}.")
            elif probe.uncertain_error:
                warnings.append(f"{rid or f'resource-{index}'} URL probe inconclusive: {probe.uncertain_error}")

        metadata_pashto = _resource_metadata_has_pashto_signal(resource)
        direct_pashto = _resource_has_direct_pashto_signal(resource)
        page_pashto = _contains_pashto_marker(probe.content_sample)
        lowered_sample = probe.content_sample.casefold()
        page_not_found = any(pattern in lowered_sample for pattern in NOT_FOUND_PATTERNS)

        # A not-found phrase only counts when the page shows no Pashto marker at all.
        if page_not_found and not page_pashto:
            reasons.append("Live page content indicates resource is unavailable.")

        if enforce_pashto_relevance and not metadata_pashto and not page_pashto:
            reasons.append("No Pashto signal found in metadata or live page content.")

        if enforce_pashto_relevance and _is_automated_candidate_like(resource) and not direct_pashto and not page_pashto:
            reasons.append("Automated candidate lacks direct Pashto signal and appears low-confidence.")

        if reasons:
            removals.append(
                {
                    "id": rid,
                    "title": title,
                    "url": url,
                    "reasons": reasons,
                    "evidence": {
                        "status_code": probe.status_code,
                        "final_url": probe.final_url,
                        "metadata_pashto": metadata_pashto,
                        "direct_pashto": direct_pashto,
                        "page_pashto": page_pashto,
                    },
                }
            )
            continue

        # Only kept entries are registered, so a removed entry never shadows a later one.
        kept.append(resource)
        if isinstance(rid, str) and rid:
            seen_ids[rid] = title if isinstance(title, str) else rid
        if canonical_url:
            seen_urls[(normalized_category, canonical_url)] = (
                title if isinstance(title, str) else canonical_url
            )

    updated_catalog = dict(catalog)
    if len(kept) != len(resources):
        updated_catalog["resources"] = kept
        updated_catalog["updated_on"] = date.today().isoformat()

    report = {
        "checked": len(resources),
        "kept": len(kept),
        "removed": len(removals),
        "removals": removals,
        "warnings": warnings,
    }
    return updated_catalog, report
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def update_removal_log(log_path: Path, removals: list[dict[str, Any]]) -> None:
    """Append removal records to the JSON removal log at ``log_path``.

    The log is a JSON object holding an ``updated_on`` ISO date and an
    ``entries`` list. Every item in ``removals`` is appended as a new entry,
    all stamped with one shared UTC ``removed_on`` timestamp. Missing parent
    directories are created and the file is rewritten in full.

    A log that is missing, is not valid JSON, or whose top-level JSON value is
    not an object is treated as empty. An ``entries`` value that is not a list
    is likewise replaced by an empty list.

    Args:
        log_path: Location of the removal log file.
        removals: Removal records. The ``id``, ``title``, ``url``, ``reasons``
            and ``evidence`` keys are copied; absent keys get empty defaults.
    """
    today = date.today().isoformat()
    payload: dict[str, Any] = {"updated_on": today, "entries": []}
    if log_path.exists():
        try:
            loaded = json.loads(log_path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            loaded = None
        # Valid JSON can still be a list, string or number. Only an object has
        # the expected shape; anything else falls back to the empty payload
        # instead of failing on ``.get`` below.
        if isinstance(loaded, dict):
            payload = loaded

    entries = payload.get("entries")
    if not isinstance(entries, list):
        entries = []

    # One timestamp for the whole batch so entries from a single run group together.
    removed_on = datetime.now(timezone.utc).isoformat()
    entries.extend(
        {
            "removed_on": removed_on,
            "id": item.get("id", ""),
            "title": item.get("title", ""),
            "url": item.get("url", ""),
            "reasons": item.get("reasons", []),
            "evidence": item.get("evidence", {}),
        }
        for item in removals
    )

    payload["updated_on"] = today
    payload["entries"] = entries
    log_path.parent.mkdir(parents=True, exist_ok=True)
    log_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def main() -> int:
    """Run the existing-resource review from the command line.

    Loads the catalog JSON, passes it to ``review_resources`` with the
    configured timeout, worker count and relevance flag, prints a summary
    plus every warning and removal, and (unless ``--dry-run`` is given)
    writes the updated catalog and appends the removals to the removal log.

    Returns:
        ``1`` when the catalog file is missing or is not valid JSON,
        otherwise ``0``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--catalog", default="resources/catalog/resources.json")
    cli.add_argument("--timeout", type=float, default=12.0)
    cli.add_argument("--max-workers", type=int, default=12)
    cli.add_argument("--removal-log", default="resources/catalog/removal_log.json")
    cli.add_argument("--dry-run", action="store_true")
    cli.add_argument(
        "--enforce-pashto-relevance",
        action="store_true",
        help="Also remove entries that have no Pashto signal in metadata or live page content.",
    )
    args = cli.parse_args()

    catalog_path = Path(args.catalog)
    removal_log_path = Path(args.removal_log)
    if not catalog_path.exists():
        print(f"Missing catalog file: {catalog_path}")
        return 1

    try:
        catalog = json.loads(catalog_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        print(f"Invalid catalog JSON: {exc}")
        return 1

    updated_catalog, report = review_resources(
        catalog,
        timeout=args.timeout,
        enforce_pashto_relevance=args.enforce_pashto_relevance,
        max_workers=args.max_workers,
        probe_fn=probe_resource_url,
    )

    print(
        "Resource review complete: "
        f"checked={report['checked']} kept={report['kept']} removed={report['removed']} "
        f"warnings={len(report['warnings'])}"
    )

    if report["warnings"]:
        for warning in report["warnings"]:
            print(f"[warn] {warning}")

    # Removals are always reported, including in dry-run mode.
    if report["removed"]:
        for removal in report["removals"]:
            print(f"[remove] {removal.get('id', '<unknown>')}")
            for reason in removal.get("reasons", []):
                print(f" - {reason}")

    # Persist only outside dry-run, and only when something was removed or the
    # catalog otherwise differs from what was loaded.
    if not args.dry_run and (report["removed"] or updated_catalog != catalog):
        catalog_path.write_text(json.dumps(updated_catalog, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
        if report["removed"]:
            update_removal_log(removal_log_path, report["removals"])

    return 0
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
scripts/run_resource_cycle.py
CHANGED
|
@@ -9,6 +9,8 @@ Usage:
|
|
| 9 |
python scripts/run_resource_cycle.py --skip-pytest
|
| 10 |
python scripts/run_resource_cycle.py --discover-only
|
| 11 |
python scripts/run_resource_cycle.py --max-promotions 10
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
from __future__ import annotations
|
|
@@ -37,12 +39,38 @@ def main() -> int:
|
|
| 37 |
default=None,
|
| 38 |
help="Optional cap for auto-promotion count from pending candidates",
|
| 39 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
args = parser.parse_args()
|
| 41 |
|
| 42 |
repo_root = Path(__file__).resolve().parents[1]
|
| 43 |
-
steps: list[list[str]] = [
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
if not args.discover_only:
|
| 48 |
promote_step = ["python", "scripts/promote_candidates.py"]
|
|
|
|
| 9 |
python scripts/run_resource_cycle.py --skip-pytest
|
| 10 |
python scripts/run_resource_cycle.py --discover-only
|
| 11 |
python scripts/run_resource_cycle.py --max-promotions 10
|
| 12 |
+
python scripts/run_resource_cycle.py --skip-existing-review
|
| 13 |
+
python scripts/run_resource_cycle.py --skip-pashto-relevance-check
|
| 14 |
"""
|
| 15 |
|
| 16 |
from __future__ import annotations
|
|
|
|
| 39 |
default=None,
|
| 40 |
help="Optional cap for auto-promotion count from pending candidates",
|
| 41 |
)
|
| 42 |
+
parser.add_argument(
|
| 43 |
+
"--skip-existing-review",
|
| 44 |
+
action="store_true",
|
| 45 |
+
help="Skip review/removal of stale existing resources before syncing candidates.",
|
| 46 |
+
)
|
| 47 |
+
parser.add_argument(
|
| 48 |
+
"--resource-timeout",
|
| 49 |
+
type=float,
|
| 50 |
+
default=12.0,
|
| 51 |
+
help="Timeout in seconds for existing-resource URL probes.",
|
| 52 |
+
)
|
| 53 |
+
parser.add_argument(
|
| 54 |
+
"--skip-pashto-relevance-check",
|
| 55 |
+
action="store_true",
|
| 56 |
+
help="Disable Pashto relevance filtering in existing-resource review.",
|
| 57 |
+
)
|
| 58 |
args = parser.parse_args()
|
| 59 |
|
| 60 |
repo_root = Path(__file__).resolve().parents[1]
|
| 61 |
+
steps: list[list[str]] = []
|
| 62 |
+
if not args.skip_existing_review:
|
| 63 |
+
review_step = [
|
| 64 |
+
"python",
|
| 65 |
+
"scripts/review_existing_resources.py",
|
| 66 |
+
"--timeout",
|
| 67 |
+
str(args.resource_timeout),
|
| 68 |
+
]
|
| 69 |
+
if not args.skip_pashto_relevance_check:
|
| 70 |
+
review_step.append("--enforce-pashto-relevance")
|
| 71 |
+
steps.append(review_step)
|
| 72 |
+
|
| 73 |
+
steps.append(["python", "scripts/sync_resources.py", "--limit", str(args.limit)])
|
| 74 |
|
| 75 |
if not args.discover_only:
|
| 76 |
promote_step = ["python", "scripts/promote_candidates.py"]
|
tests/test_promote_candidates.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from datetime import date
|
| 2 |
|
|
|
|
| 3 |
from scripts.promote_candidates import PLACEHOLDER_PRIMARY_USE, promote_candidates
|
| 4 |
|
| 5 |
|
|
@@ -132,3 +133,29 @@ def test_promote_candidates_respects_max_promotions() -> None:
|
|
| 132 |
assert len(promoted) == 1
|
| 133 |
assert stats["promoted"] == 1
|
| 134 |
assert len(catalog["resources"]) == 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from datetime import date
|
| 2 |
|
| 3 |
+
import scripts.promote_candidates as promote_module
|
| 4 |
from scripts.promote_candidates import PLACEHOLDER_PRIMARY_USE, promote_candidates
|
| 5 |
|
| 6 |
|
|
|
|
| 133 |
assert len(promoted) == 1
|
| 134 |
assert stats["promoted"] == 1
|
| 135 |
assert len(catalog["resources"]) == 2
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def test_promote_candidates_skips_unavailable_when_url_check_enabled(monkeypatch) -> None:
    """With URL verification on, a candidate flagged unavailable is not promoted."""
    catalog = _catalog()
    unavailable_candidate = _candidate(
        rid="dataset-unavailable",
        title="Pashto Unavailable Dataset",
        url="https://example.org/pashto-unavailable",
    )
    pending = {"candidate_count": 1, "candidates": [unavailable_candidate]}

    # Stub the availability check so every candidate is reported as unavailable.
    def always_unavailable(*_args, **_kwargs) -> bool:
        return True

    monkeypatch.setattr(promote_module, "_candidate_url_unavailable", always_unavailable)

    promoted, stats = promote_candidates(catalog, pending, verify_urls=True)

    assert promoted == []
    assert stats["promoted"] == 0
    assert stats["unavailable"] == 1
|
tests/test_review_existing_resources.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from scripts.review_existing_resources import UrlProbe, review_resources
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def _resource(*, rid: str, title: str, url: str) -> dict:
|
| 5 |
+
return {
|
| 6 |
+
"id": rid,
|
| 7 |
+
"title": title,
|
| 8 |
+
"url": url,
|
| 9 |
+
"category": "dataset",
|
| 10 |
+
"source": "other",
|
| 11 |
+
"status": "verified",
|
| 12 |
+
"summary": "Resource summary used for catalog review tests.",
|
| 13 |
+
"primary_use": "Testing",
|
| 14 |
+
"tasks": ["nlp"],
|
| 15 |
+
"pashto_evidence": {
|
| 16 |
+
"evidence_text": "Contains Pashto signal in metadata.",
|
| 17 |
+
"evidence_url": url,
|
| 18 |
+
"markers": ["Pashto"],
|
| 19 |
+
},
|
| 20 |
+
"tags": ["pashto", "dataset"],
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_review_resources_removes_hard_missing_urls() -> None:
    """A probe reporting a hard-missing 404 removes the resource and records the reason."""
    resource = _resource(rid="dataset-a", title="Pashto A", url="https://example.org/a")
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [resource]}

    def hard_missing_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(status_code=404, hard_missing=True)

    updated, report = review_resources(catalog, probe_fn=hard_missing_probe)

    assert report["removed"] == 1
    assert updated["resources"] == []
    reasons = report["removals"][0]["reasons"]
    assert any("hard-missing HTTP status 404" in reason for reason in reasons)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_review_resources_keeps_resource_when_probe_is_inconclusive() -> None:
    """A probe that only reports an uncertain error keeps the resource and yields one warning."""
    resource = _resource(rid="dataset-a", title="Pashto A", url="https://example.org/a")
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [resource]}

    def timed_out_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(uncertain_error="timed out")

    updated, report = review_resources(catalog, probe_fn=timed_out_probe)

    assert report["removed"] == 0
    assert len(updated["resources"]) == 1
    assert len(report["warnings"]) == 1
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_review_resources_removes_duplicate_urls() -> None:
    """Two same-category resources sharing one URL: exactly one is removed as a duplicate."""
    shared_url = "https://example.org/shared"
    first = _resource(rid="dataset-a", title="Pashto A", url=shared_url)
    second = _resource(rid="dataset-b", title="Pashto B", url=shared_url)
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [first, second]}

    def healthy_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(status_code=200, content_sample="Pashto")

    updated, report = review_resources(catalog, probe_fn=healthy_probe)

    assert report["removed"] == 1
    assert len(updated["resources"]) == 1
    reasons = report["removals"][0]["reasons"]
    assert any("Duplicate canonical URL" in reason for reason in reasons)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_review_resources_allows_same_url_across_different_categories() -> None:
    """The same URL listed under two different categories is not treated as a duplicate."""
    shared_url = "https://example.org/shared"
    dataset = _resource(rid="dataset-a", title="Pashto A", url=shared_url)
    # Same URL as the dataset entry, but filed under the "benchmark" category.
    benchmark = {
        **_resource(rid="benchmark-a", title="Pashto A Benchmark", url=shared_url),
        "category": "benchmark",
        "tags": ["pashto", "benchmark"],
    }
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [dataset, benchmark]}

    def healthy_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(status_code=200, content_sample="Pashto")

    updated, report = review_resources(catalog, probe_fn=healthy_probe)

    assert report["removed"] == 0
    assert len(updated["resources"]) == 2
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def test_review_resources_enforces_pashto_relevance_only_when_enabled() -> None:
    """A resource without Pashto in its metadata or page is removed only when enforcement is on."""
    non_pashto = _resource(rid="dataset-x", title="General Dataset", url="https://example.org/general")
    # Strip every Pashto mention the helper puts into the metadata.
    non_pashto["pashto_evidence"].update(
        evidence_text="Generic metadata note.",
        markers=["generic"],
    )
    non_pashto["tags"] = ["dataset", "general"]
    catalog = {"version": "1.0.0", "updated_on": "2026-02-20", "resources": [non_pashto]}

    def generic_page_probe(_url: str, _timeout: float) -> UrlProbe:
        return UrlProbe(status_code=200, content_sample="General language resource")

    updated_relaxed, report_relaxed = review_resources(
        catalog, probe_fn=generic_page_probe, enforce_pashto_relevance=False
    )
    updated_strict, report_strict = review_resources(
        catalog, probe_fn=generic_page_probe, enforce_pashto_relevance=True
    )

    assert report_relaxed["removed"] == 0
    assert len(updated_relaxed["resources"]) == 1
    assert report_strict["removed"] == 1
    assert updated_strict["resources"] == []
|