musaw committed on
Commit ·
ed6f1f9
1
Parent(s): 9899fdf
chore(resources): enforce Pashto-centric policy and prune reference-only entries
Browse files
- docs/resource_catalog.md +2 -1
- docs/resource_cycle_runbook.md +2 -0
- docs/search/resources.json +1 -333
- resources/README.md +6 -6
- resources/catalog/README.md +4 -0
- resources/catalog/pending_candidates.json +0 -0
- resources/catalog/resources.json +0 -360
- resources/models/README.md +2 -6
- resources/papers/README.md +0 -4
- resources/projects/README.md +0 -1
- resources/tools/README.md +1 -2
- scripts/sync_resources.py +208 -65
- scripts/validate_resource_catalog.py +24 -0
- tests/test_validate_resource_catalog.py +18 -0
docs/resource_catalog.md
CHANGED
|
@@ -6,7 +6,8 @@ This index points to validated Pashto-related resources tracked in structured fi
|
|
| 6 |
|
| 7 |
## Validation method
|
| 8 |
- Verify source URL resolves to official page or canonical repository.
|
| 9 |
-
- Verify explicit Pashto support markers (`Pashto`, `ps`, `ps_af`, `pus`, `pbt_Arab`) where possible.
|
|
|
|
| 10 |
- Include only resources with practical use for this repository.
|
| 11 |
|
| 12 |
## Structured catalog
|
|
|
|
| 6 |
|
| 7 |
## Validation method
|
| 8 |
- Verify source URL resolves to official page or canonical repository.
|
| 9 |
+
- Verify explicit Pashto support markers (`Pashto`, `Pukhto`, `Pushto`, `Pakhto`, `پښتو`, `ps`, `ps_af`, `pus`, `pbt_Arab`) where possible.
|
| 10 |
+
- Reject resources where Pashto is only mentioned in passing and the primary work is focused on another language.
|
| 11 |
- Include only resources with practical use for this repository.
|
| 12 |
|
| 13 |
## Structured catalog
|
docs/resource_cycle_runbook.md
CHANGED
|
@@ -42,4 +42,6 @@ After discovery, promote only approved resources:
|
|
| 42 |
## Guardrails
|
| 43 |
- Do not auto-promote candidates without evidence and license review.
|
| 44 |
- Keep `status: verified` only for reviewed entries.
|
|
|
|
|
|
|
| 45 |
- Generated files must be committed after catalog updates.
|
|
|
|
| 42 |
## Guardrails
|
| 43 |
- Do not auto-promote candidates without evidence and license review.
|
| 44 |
- Keep `status: verified` only for reviewed entries.
|
| 45 |
+
- Do not promote "reference-only" resources where Pashto is incidental; only Pashto-centric resources are eligible.
|
| 46 |
+
- Treat spelling variants as valid Pashto markers during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `پښتو`).
|
| 47 |
- Generated files must be committed after catalog updates.
|
docs/search/resources.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"generated_on": "2026-02-16T00:00:00Z",
|
| 3 |
-
"count":
|
| 4 |
"resources": [
|
| 5 |
{
|
| 6 |
"id": "dataset-common-voice-ps-v24",
|
|
@@ -190,144 +190,6 @@
|
|
| 190 |
"Pashto"
|
| 191 |
]
|
| 192 |
},
|
| 193 |
-
{
|
| 194 |
-
"id": "model-whisper-large-v3",
|
| 195 |
-
"title": "Whisper Large v3",
|
| 196 |
-
"url": "https://huggingface.co/openai/whisper-large-v3",
|
| 197 |
-
"category": "model",
|
| 198 |
-
"source": "huggingface",
|
| 199 |
-
"status": "verified",
|
| 200 |
-
"summary": "Strong multilingual ASR baseline suitable for Pashto bootstrapping.",
|
| 201 |
-
"primary_use": "ASR baseline and pseudo-labeling",
|
| 202 |
-
"tasks": [
|
| 203 |
-
"asr"
|
| 204 |
-
],
|
| 205 |
-
"tags": [
|
| 206 |
-
"pashto",
|
| 207 |
-
"asr",
|
| 208 |
-
"whisper"
|
| 209 |
-
],
|
| 210 |
-
"evidence_text": "Whisper tokenizer map includes ps language key.",
|
| 211 |
-
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 212 |
-
"markers": [
|
| 213 |
-
"ps"
|
| 214 |
-
]
|
| 215 |
-
},
|
| 216 |
-
{
|
| 217 |
-
"id": "model-mms-1b-all",
|
| 218 |
-
"title": "MMS 1B All",
|
| 219 |
-
"url": "https://huggingface.co/facebook/mms-1b-all",
|
| 220 |
-
"category": "model",
|
| 221 |
-
"source": "huggingface",
|
| 222 |
-
"status": "verified",
|
| 223 |
-
"summary": "Multilingual ASR model from MMS for low-resource transfer.",
|
| 224 |
-
"primary_use": "ASR transfer baseline",
|
| 225 |
-
"tasks": [
|
| 226 |
-
"asr"
|
| 227 |
-
],
|
| 228 |
-
"tags": [
|
| 229 |
-
"pashto",
|
| 230 |
-
"asr",
|
| 231 |
-
"mms"
|
| 232 |
-
],
|
| 233 |
-
"evidence_text": "MMS coverage table includes pus with ASR support.",
|
| 234 |
-
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 235 |
-
"markers": [
|
| 236 |
-
"pus"
|
| 237 |
-
]
|
| 238 |
-
},
|
| 239 |
-
{
|
| 240 |
-
"id": "model-mms-tts",
|
| 241 |
-
"title": "MMS TTS",
|
| 242 |
-
"url": "https://huggingface.co/facebook/mms-tts",
|
| 243 |
-
"category": "model",
|
| 244 |
-
"source": "huggingface",
|
| 245 |
-
"status": "verified",
|
| 246 |
-
"summary": "Multilingual TTS checkpoints useful for Pashto voice synthesis.",
|
| 247 |
-
"primary_use": "TTS baseline and transfer",
|
| 248 |
-
"tasks": [
|
| 249 |
-
"tts"
|
| 250 |
-
],
|
| 251 |
-
"tags": [
|
| 252 |
-
"pashto",
|
| 253 |
-
"tts",
|
| 254 |
-
"mms"
|
| 255 |
-
],
|
| 256 |
-
"evidence_text": "MMS coverage table includes pus with TTS support.",
|
| 257 |
-
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 258 |
-
"markers": [
|
| 259 |
-
"pus"
|
| 260 |
-
]
|
| 261 |
-
},
|
| 262 |
-
{
|
| 263 |
-
"id": "model-nllb-200-distilled-600m",
|
| 264 |
-
"title": "NLLB-200 Distilled 600M",
|
| 265 |
-
"url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
|
| 266 |
-
"category": "model",
|
| 267 |
-
"source": "huggingface",
|
| 268 |
-
"status": "verified",
|
| 269 |
-
"summary": "General multilingual translation model with Pashto script token support.",
|
| 270 |
-
"primary_use": "Pashto translation baseline",
|
| 271 |
-
"tasks": [
|
| 272 |
-
"mt"
|
| 273 |
-
],
|
| 274 |
-
"tags": [
|
| 275 |
-
"pashto",
|
| 276 |
-
"mt",
|
| 277 |
-
"nllb"
|
| 278 |
-
],
|
| 279 |
-
"evidence_text": "Model special token map includes pbt_Arab.",
|
| 280 |
-
"evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
|
| 281 |
-
"markers": [
|
| 282 |
-
"pbt_Arab"
|
| 283 |
-
]
|
| 284 |
-
},
|
| 285 |
-
{
|
| 286 |
-
"id": "model-opus-mt-en-mul",
|
| 287 |
-
"title": "OPUS MT en-mul",
|
| 288 |
-
"url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
|
| 289 |
-
"category": "model",
|
| 290 |
-
"source": "huggingface",
|
| 291 |
-
"status": "verified",
|
| 292 |
-
"summary": "Translation model that can route English into Pashto via multilingual set.",
|
| 293 |
-
"primary_use": "English to Pashto translation path",
|
| 294 |
-
"tasks": [
|
| 295 |
-
"mt"
|
| 296 |
-
],
|
| 297 |
-
"tags": [
|
| 298 |
-
"pashto",
|
| 299 |
-
"mt",
|
| 300 |
-
"opus"
|
| 301 |
-
],
|
| 302 |
-
"evidence_text": "Language list includes pus code.",
|
| 303 |
-
"evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
|
| 304 |
-
"markers": [
|
| 305 |
-
"pus"
|
| 306 |
-
]
|
| 307 |
-
},
|
| 308 |
-
{
|
| 309 |
-
"id": "model-opus-mt-mul-en",
|
| 310 |
-
"title": "OPUS MT mul-en",
|
| 311 |
-
"url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
|
| 312 |
-
"category": "model",
|
| 313 |
-
"source": "huggingface",
|
| 314 |
-
"status": "verified",
|
| 315 |
-
"summary": "Translation model for Pashto to English via multilingual encoder.",
|
| 316 |
-
"primary_use": "Pashto to English translation path",
|
| 317 |
-
"tasks": [
|
| 318 |
-
"mt"
|
| 319 |
-
],
|
| 320 |
-
"tags": [
|
| 321 |
-
"pashto",
|
| 322 |
-
"mt",
|
| 323 |
-
"opus"
|
| 324 |
-
],
|
| 325 |
-
"evidence_text": "Language list includes pus code.",
|
| 326 |
-
"evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
|
| 327 |
-
"markers": [
|
| 328 |
-
"pus"
|
| 329 |
-
]
|
| 330 |
-
},
|
| 331 |
{
|
| 332 |
"id": "model-pashto-bert",
|
| 333 |
"title": "PashtoBERT",
|
|
@@ -447,150 +309,6 @@
|
|
| 447 |
"pbt_Arab"
|
| 448 |
]
|
| 449 |
},
|
| 450 |
-
{
|
| 451 |
-
"id": "tool-faster-whisper",
|
| 452 |
-
"title": "Faster-Whisper",
|
| 453 |
-
"url": "https://github.com/SYSTRAN/faster-whisper",
|
| 454 |
-
"category": "tool",
|
| 455 |
-
"source": "github",
|
| 456 |
-
"status": "verified",
|
| 457 |
-
"summary": "Optimized Whisper inference runtime for faster Pashto ASR experiments.",
|
| 458 |
-
"primary_use": "ASR inference acceleration",
|
| 459 |
-
"tasks": [
|
| 460 |
-
"asr"
|
| 461 |
-
],
|
| 462 |
-
"tags": [
|
| 463 |
-
"pashto",
|
| 464 |
-
"tooling",
|
| 465 |
-
"asr"
|
| 466 |
-
],
|
| 467 |
-
"evidence_text": "Whisper tokenizer includes ps and tool runs Whisper models.",
|
| 468 |
-
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 469 |
-
"markers": [
|
| 470 |
-
"ps"
|
| 471 |
-
]
|
| 472 |
-
},
|
| 473 |
-
{
|
| 474 |
-
"id": "tool-coqui-tts",
|
| 475 |
-
"title": "Coqui TTS",
|
| 476 |
-
"url": "https://github.com/coqui-ai/TTS",
|
| 477 |
-
"category": "tool",
|
| 478 |
-
"source": "github",
|
| 479 |
-
"status": "verified",
|
| 480 |
-
"summary": "Open toolkit for TTS training and inference used for Pashto experiments.",
|
| 481 |
-
"primary_use": "TTS training and inference",
|
| 482 |
-
"tasks": [
|
| 483 |
-
"tts"
|
| 484 |
-
],
|
| 485 |
-
"tags": [
|
| 486 |
-
"pashto",
|
| 487 |
-
"tooling",
|
| 488 |
-
"tts"
|
| 489 |
-
],
|
| 490 |
-
"evidence_text": "Can be paired with Pashto-supporting MMS checkpoints.",
|
| 491 |
-
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 492 |
-
"markers": [
|
| 493 |
-
"pus"
|
| 494 |
-
]
|
| 495 |
-
},
|
| 496 |
-
{
|
| 497 |
-
"id": "paper-whisper-2212-04356",
|
| 498 |
-
"title": "Robust Speech Recognition via Large-Scale Weak Supervision",
|
| 499 |
-
"url": "https://arxiv.org/abs/2212.04356",
|
| 500 |
-
"category": "paper",
|
| 501 |
-
"source": "arxiv",
|
| 502 |
-
"status": "verified",
|
| 503 |
-
"summary": "Whisper paper used as a foundational ASR reference for Pashto baselines.",
|
| 504 |
-
"primary_use": "ASR methodology reference",
|
| 505 |
-
"tasks": [
|
| 506 |
-
"asr",
|
| 507 |
-
"research"
|
| 508 |
-
],
|
| 509 |
-
"tags": [
|
| 510 |
-
"pashto",
|
| 511 |
-
"paper",
|
| 512 |
-
"asr"
|
| 513 |
-
],
|
| 514 |
-
"evidence_text": "Paired with tokenizer language map containing ps.",
|
| 515 |
-
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 516 |
-
"markers": [
|
| 517 |
-
"ps"
|
| 518 |
-
]
|
| 519 |
-
},
|
| 520 |
-
{
|
| 521 |
-
"id": "paper-mms-2305-13516",
|
| 522 |
-
"title": "Scaling Speech Technology to 1,000+ Languages",
|
| 523 |
-
"url": "https://arxiv.org/abs/2305.13516",
|
| 524 |
-
"category": "paper",
|
| 525 |
-
"source": "arxiv",
|
| 526 |
-
"status": "verified",
|
| 527 |
-
"summary": "MMS paper covering multilingual speech scaling and low-resource transfer.",
|
| 528 |
-
"primary_use": "ASR and TTS transfer reference",
|
| 529 |
-
"tasks": [
|
| 530 |
-
"asr",
|
| 531 |
-
"tts",
|
| 532 |
-
"research"
|
| 533 |
-
],
|
| 534 |
-
"tags": [
|
| 535 |
-
"pashto",
|
| 536 |
-
"paper",
|
| 537 |
-
"speech"
|
| 538 |
-
],
|
| 539 |
-
"evidence_text": "Coverage table marks pus support in MMS release.",
|
| 540 |
-
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 541 |
-
"markers": [
|
| 542 |
-
"pus"
|
| 543 |
-
]
|
| 544 |
-
},
|
| 545 |
-
{
|
| 546 |
-
"id": "paper-nllb-2207-04672",
|
| 547 |
-
"title": "No Language Left Behind",
|
| 548 |
-
"url": "https://arxiv.org/abs/2207.04672",
|
| 549 |
-
"category": "paper",
|
| 550 |
-
"source": "arxiv",
|
| 551 |
-
"status": "verified",
|
| 552 |
-
"summary": "NLLB paper supporting multilingual MT strategy for Pashto integration.",
|
| 553 |
-
"primary_use": "MT research reference",
|
| 554 |
-
"tasks": [
|
| 555 |
-
"mt",
|
| 556 |
-
"research"
|
| 557 |
-
],
|
| 558 |
-
"tags": [
|
| 559 |
-
"pashto",
|
| 560 |
-
"paper",
|
| 561 |
-
"mt"
|
| 562 |
-
],
|
| 563 |
-
"evidence_text": "Model usage in repo references pbt_Arab token support.",
|
| 564 |
-
"evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
|
| 565 |
-
"markers": [
|
| 566 |
-
"pbt_Arab"
|
| 567 |
-
]
|
| 568 |
-
},
|
| 569 |
-
{
|
| 570 |
-
"id": "paper-fleurs-2205-12446",
|
| 571 |
-
"title": "FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech",
|
| 572 |
-
"url": "https://arxiv.org/abs/2205.12446",
|
| 573 |
-
"category": "paper",
|
| 574 |
-
"source": "arxiv",
|
| 575 |
-
"status": "verified",
|
| 576 |
-
"summary": "FLEURS benchmark paper supporting multilingual speech evaluation including Pashto.",
|
| 577 |
-
"primary_use": "Speech benchmark methodology reference",
|
| 578 |
-
"tasks": [
|
| 579 |
-
"asr",
|
| 580 |
-
"benchmarking",
|
| 581 |
-
"research"
|
| 582 |
-
],
|
| 583 |
-
"tags": [
|
| 584 |
-
"pashto",
|
| 585 |
-
"paper",
|
| 586 |
-
"benchmark"
|
| 587 |
-
],
|
| 588 |
-
"evidence_text": "Dataset implementation includes ps_af language code.",
|
| 589 |
-
"evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
|
| 590 |
-
"markers": [
|
| 591 |
-
"ps_af"
|
| 592 |
-
]
|
| 593 |
-
},
|
| 594 |
{
|
| 595 |
"id": "dataset-nexdata-99h-pashto-dialogue",
|
| 596 |
"title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
|
|
@@ -972,32 +690,6 @@
|
|
| 972 |
"asr"
|
| 973 |
]
|
| 974 |
},
|
| 975 |
-
{
|
| 976 |
-
"id": "code-github-mrychlik-worldly-ocr",
|
| 977 |
-
"title": "worldly-ocr",
|
| 978 |
-
"url": "https://github.com/mrychlik/worldly-ocr",
|
| 979 |
-
"category": "code",
|
| 980 |
-
"source": "github",
|
| 981 |
-
"status": "verified",
|
| 982 |
-
"summary": "Open OCR code project that explicitly includes Pashto among target languages.",
|
| 983 |
-
"primary_use": "Pashto OCR code reference and experimentation",
|
| 984 |
-
"tasks": [
|
| 985 |
-
"ocr",
|
| 986 |
-
"tooling"
|
| 987 |
-
],
|
| 988 |
-
"tags": [
|
| 989 |
-
"pashto",
|
| 990 |
-
"code",
|
| 991 |
-
"github",
|
| 992 |
-
"ocr"
|
| 993 |
-
],
|
| 994 |
-
"evidence_text": "Repository description explicitly says OCR for Pashto and Chinese.",
|
| 995 |
-
"evidence_url": "https://api.github.com/repos/mrychlik/worldly-ocr",
|
| 996 |
-
"markers": [
|
| 997 |
-
"Pashto",
|
| 998 |
-
"OCR"
|
| 999 |
-
]
|
| 1000 |
-
},
|
| 1001 |
{
|
| 1002 |
"id": "paper-s2-psocr-lmm-pashto",
|
| 1003 |
"title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
|
|
@@ -1687,30 +1379,6 @@
|
|
| 1687 |
"pashto"
|
| 1688 |
]
|
| 1689 |
},
|
| 1690 |
-
{
|
| 1691 |
-
"id": "project-github-ihyacommunity-khushkhat-extension",
|
| 1692 |
-
"title": "IhyaCommunity/Khushkhat-Extension",
|
| 1693 |
-
"url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
|
| 1694 |
-
"category": "project",
|
| 1695 |
-
"source": "github",
|
| 1696 |
-
"status": "verified",
|
| 1697 |
-
"summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
|
| 1698 |
-
"primary_use": "Interactive Pashto demo and quick qualitative validation",
|
| 1699 |
-
"tasks": [
|
| 1700 |
-
"demo"
|
| 1701 |
-
],
|
| 1702 |
-
"tags": [
|
| 1703 |
-
"pashto",
|
| 1704 |
-
"project",
|
| 1705 |
-
"github",
|
| 1706 |
-
"demo"
|
| 1707 |
-
],
|
| 1708 |
-
"evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
|
| 1709 |
-
"evidence_url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
|
| 1710 |
-
"markers": [
|
| 1711 |
-
"pashto"
|
| 1712 |
-
]
|
| 1713 |
-
},
|
| 1714 |
{
|
| 1715 |
"id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
|
| 1716 |
"title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
|
|
|
|
| 1 |
{
|
| 2 |
"generated_on": "2026-02-16T00:00:00Z",
|
| 3 |
+
"count": 63,
|
| 4 |
"resources": [
|
| 5 |
{
|
| 6 |
"id": "dataset-common-voice-ps-v24",
|
|
|
|
| 190 |
"Pashto"
|
| 191 |
]
|
| 192 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
{
|
| 194 |
"id": "model-pashto-bert",
|
| 195 |
"title": "PashtoBERT",
|
|
|
|
| 309 |
"pbt_Arab"
|
| 310 |
]
|
| 311 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
{
|
| 313 |
"id": "dataset-nexdata-99h-pashto-dialogue",
|
| 314 |
"title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
|
|
|
|
| 690 |
"asr"
|
| 691 |
]
|
| 692 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 693 |
{
|
| 694 |
"id": "paper-s2-psocr-lmm-pashto",
|
| 695 |
"title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
|
|
|
|
| 1379 |
"pashto"
|
| 1380 |
]
|
| 1381 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1382 |
{
|
| 1383 |
"id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
|
| 1384 |
"title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
|
resources/README.md
CHANGED
|
@@ -4,12 +4,12 @@ Structured, Pashto-focused resource tracking lives in this folder.
|
|
| 4 |
|
| 5 |
## Sections
|
| 6 |
- Datasets (28): [datasets/README.md](datasets/README.md)
|
| 7 |
-
- Models (
|
| 8 |
- Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
|
| 9 |
-
- Tools (
|
| 10 |
-
- Papers (
|
| 11 |
-
- Projects (
|
| 12 |
-
- Code (
|
| 13 |
|
| 14 |
## Machine-Readable Catalog
|
| 15 |
- Canonical catalog: [catalog/resources.json](catalog/resources.json)
|
|
@@ -22,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
|
|
| 22 |
- Run `python scripts/validate_resource_catalog.py` before opening a PR.
|
| 23 |
- Run `python scripts/generate_resource_views.py` after catalog changes.
|
| 24 |
|
| 25 |
-
Verified resource count: `
|
|
|
|
| 4 |
|
| 5 |
## Sections
|
| 6 |
- Datasets (28): [datasets/README.md](datasets/README.md)
|
| 7 |
+
- Models (12): [models/README.md](models/README.md)
|
| 8 |
- Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
|
| 9 |
+
- Tools (0): [tools/README.md](tools/README.md)
|
| 10 |
+
- Papers (8): [papers/README.md](papers/README.md)
|
| 11 |
+
- Projects (10): [projects/README.md](projects/README.md)
|
| 12 |
+
- Code (1): [codes/README.md](codes/README.md)
|
| 13 |
|
| 14 |
## Machine-Readable Catalog
|
| 15 |
- Canonical catalog: [catalog/resources.json](catalog/resources.json)
|
|
|
|
| 22 |
- Run `python scripts/validate_resource_catalog.py` before opening a PR.
|
| 23 |
- Run `python scripts/generate_resource_views.py` after catalog changes.
|
| 24 |
|
| 25 |
+
Verified resource count: `63`
|
resources/catalog/README.md
CHANGED
|
@@ -12,3 +12,7 @@ This folder holds machine-readable resource data used by docs and GitHub Pages s
|
|
| 12 |
2. Run `python scripts/validate_resource_catalog.py`.
|
| 13 |
3. Run `python scripts/generate_resource_views.py`.
|
| 14 |
4. Commit both catalog and generated markdown/search files.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
2. Run `python scripts/validate_resource_catalog.py`.
|
| 13 |
3. Run `python scripts/generate_resource_views.py`.
|
| 14 |
4. Commit both catalog and generated markdown/search files.
|
| 15 |
+
|
| 16 |
+
## Promotion guardrail
|
| 17 |
+
- Promote only Pashto-centric resources. Exclude entries where Pashto appears only as a side reference.
|
| 18 |
+
- Accept Pashto naming variants during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `پښتو`).
|
resources/catalog/pending_candidates.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
resources/catalog/resources.json
CHANGED
|
@@ -206,156 +206,6 @@
|
|
| 206 |
"kaggle"
|
| 207 |
]
|
| 208 |
},
|
| 209 |
-
{
|
| 210 |
-
"id": "model-whisper-large-v3",
|
| 211 |
-
"title": "Whisper Large v3",
|
| 212 |
-
"url": "https://huggingface.co/openai/whisper-large-v3",
|
| 213 |
-
"category": "model",
|
| 214 |
-
"source": "huggingface",
|
| 215 |
-
"status": "verified",
|
| 216 |
-
"summary": "Strong multilingual ASR baseline suitable for Pashto bootstrapping.",
|
| 217 |
-
"primary_use": "ASR baseline and pseudo-labeling",
|
| 218 |
-
"tasks": [
|
| 219 |
-
"asr"
|
| 220 |
-
],
|
| 221 |
-
"pashto_evidence": {
|
| 222 |
-
"evidence_text": "Whisper tokenizer map includes ps language key.",
|
| 223 |
-
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 224 |
-
"markers": [
|
| 225 |
-
"ps"
|
| 226 |
-
]
|
| 227 |
-
},
|
| 228 |
-
"tags": [
|
| 229 |
-
"pashto",
|
| 230 |
-
"asr",
|
| 231 |
-
"whisper"
|
| 232 |
-
]
|
| 233 |
-
},
|
| 234 |
-
{
|
| 235 |
-
"id": "model-mms-1b-all",
|
| 236 |
-
"title": "MMS 1B All",
|
| 237 |
-
"url": "https://huggingface.co/facebook/mms-1b-all",
|
| 238 |
-
"category": "model",
|
| 239 |
-
"source": "huggingface",
|
| 240 |
-
"status": "verified",
|
| 241 |
-
"summary": "Multilingual ASR model from MMS for low-resource transfer.",
|
| 242 |
-
"primary_use": "ASR transfer baseline",
|
| 243 |
-
"tasks": [
|
| 244 |
-
"asr"
|
| 245 |
-
],
|
| 246 |
-
"pashto_evidence": {
|
| 247 |
-
"evidence_text": "MMS coverage table includes pus with ASR support.",
|
| 248 |
-
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 249 |
-
"markers": [
|
| 250 |
-
"pus"
|
| 251 |
-
]
|
| 252 |
-
},
|
| 253 |
-
"tags": [
|
| 254 |
-
"pashto",
|
| 255 |
-
"asr",
|
| 256 |
-
"mms"
|
| 257 |
-
]
|
| 258 |
-
},
|
| 259 |
-
{
|
| 260 |
-
"id": "model-mms-tts",
|
| 261 |
-
"title": "MMS TTS",
|
| 262 |
-
"url": "https://huggingface.co/facebook/mms-tts",
|
| 263 |
-
"category": "model",
|
| 264 |
-
"source": "huggingface",
|
| 265 |
-
"status": "verified",
|
| 266 |
-
"summary": "Multilingual TTS checkpoints useful for Pashto voice synthesis.",
|
| 267 |
-
"primary_use": "TTS baseline and transfer",
|
| 268 |
-
"tasks": [
|
| 269 |
-
"tts"
|
| 270 |
-
],
|
| 271 |
-
"pashto_evidence": {
|
| 272 |
-
"evidence_text": "MMS coverage table includes pus with TTS support.",
|
| 273 |
-
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 274 |
-
"markers": [
|
| 275 |
-
"pus"
|
| 276 |
-
]
|
| 277 |
-
},
|
| 278 |
-
"tags": [
|
| 279 |
-
"pashto",
|
| 280 |
-
"tts",
|
| 281 |
-
"mms"
|
| 282 |
-
]
|
| 283 |
-
},
|
| 284 |
-
{
|
| 285 |
-
"id": "model-nllb-200-distilled-600m",
|
| 286 |
-
"title": "NLLB-200 Distilled 600M",
|
| 287 |
-
"url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
|
| 288 |
-
"category": "model",
|
| 289 |
-
"source": "huggingface",
|
| 290 |
-
"status": "verified",
|
| 291 |
-
"summary": "General multilingual translation model with Pashto script token support.",
|
| 292 |
-
"primary_use": "Pashto translation baseline",
|
| 293 |
-
"tasks": [
|
| 294 |
-
"mt"
|
| 295 |
-
],
|
| 296 |
-
"pashto_evidence": {
|
| 297 |
-
"evidence_text": "Model special token map includes pbt_Arab.",
|
| 298 |
-
"evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
|
| 299 |
-
"markers": [
|
| 300 |
-
"pbt_Arab"
|
| 301 |
-
]
|
| 302 |
-
},
|
| 303 |
-
"tags": [
|
| 304 |
-
"pashto",
|
| 305 |
-
"mt",
|
| 306 |
-
"nllb"
|
| 307 |
-
]
|
| 308 |
-
},
|
| 309 |
-
{
|
| 310 |
-
"id": "model-opus-mt-en-mul",
|
| 311 |
-
"title": "OPUS MT en-mul",
|
| 312 |
-
"url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
|
| 313 |
-
"category": "model",
|
| 314 |
-
"source": "huggingface",
|
| 315 |
-
"status": "verified",
|
| 316 |
-
"summary": "Translation model that can route English into Pashto via multilingual set.",
|
| 317 |
-
"primary_use": "English to Pashto translation path",
|
| 318 |
-
"tasks": [
|
| 319 |
-
"mt"
|
| 320 |
-
],
|
| 321 |
-
"pashto_evidence": {
|
| 322 |
-
"evidence_text": "Language list includes pus code.",
|
| 323 |
-
"evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
|
| 324 |
-
"markers": [
|
| 325 |
-
"pus"
|
| 326 |
-
]
|
| 327 |
-
},
|
| 328 |
-
"tags": [
|
| 329 |
-
"pashto",
|
| 330 |
-
"mt",
|
| 331 |
-
"opus"
|
| 332 |
-
]
|
| 333 |
-
},
|
| 334 |
-
{
|
| 335 |
-
"id": "model-opus-mt-mul-en",
|
| 336 |
-
"title": "OPUS MT mul-en",
|
| 337 |
-
"url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
|
| 338 |
-
"category": "model",
|
| 339 |
-
"source": "huggingface",
|
| 340 |
-
"status": "verified",
|
| 341 |
-
"summary": "Translation model for Pashto to English via multilingual encoder.",
|
| 342 |
-
"primary_use": "Pashto to English translation path",
|
| 343 |
-
"tasks": [
|
| 344 |
-
"mt"
|
| 345 |
-
],
|
| 346 |
-
"pashto_evidence": {
|
| 347 |
-
"evidence_text": "Language list includes pus code.",
|
| 348 |
-
"evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
|
| 349 |
-
"markers": [
|
| 350 |
-
"pus"
|
| 351 |
-
]
|
| 352 |
-
},
|
| 353 |
-
"tags": [
|
| 354 |
-
"pashto",
|
| 355 |
-
"mt",
|
| 356 |
-
"opus"
|
| 357 |
-
]
|
| 358 |
-
},
|
| 359 |
{
|
| 360 |
"id": "model-pashto-bert",
|
| 361 |
"title": "PashtoBERT",
|
|
@@ -485,162 +335,6 @@
|
|
| 485 |
"mt"
|
| 486 |
]
|
| 487 |
},
|
| 488 |
-
{
|
| 489 |
-
"id": "tool-faster-whisper",
|
| 490 |
-
"title": "Faster-Whisper",
|
| 491 |
-
"url": "https://github.com/SYSTRAN/faster-whisper",
|
| 492 |
-
"category": "tool",
|
| 493 |
-
"source": "github",
|
| 494 |
-
"status": "verified",
|
| 495 |
-
"summary": "Optimized Whisper inference runtime for faster Pashto ASR experiments.",
|
| 496 |
-
"primary_use": "ASR inference acceleration",
|
| 497 |
-
"tasks": [
|
| 498 |
-
"asr"
|
| 499 |
-
],
|
| 500 |
-
"pashto_evidence": {
|
| 501 |
-
"evidence_text": "Whisper tokenizer includes ps and tool runs Whisper models.",
|
| 502 |
-
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 503 |
-
"markers": [
|
| 504 |
-
"ps"
|
| 505 |
-
]
|
| 506 |
-
},
|
| 507 |
-
"tags": [
|
| 508 |
-
"pashto",
|
| 509 |
-
"tooling",
|
| 510 |
-
"asr"
|
| 511 |
-
]
|
| 512 |
-
},
|
| 513 |
-
{
|
| 514 |
-
"id": "tool-coqui-tts",
|
| 515 |
-
"title": "Coqui TTS",
|
| 516 |
-
"url": "https://github.com/coqui-ai/TTS",
|
| 517 |
-
"category": "tool",
|
| 518 |
-
"source": "github",
|
| 519 |
-
"status": "verified",
|
| 520 |
-
"summary": "Open toolkit for TTS training and inference used for Pashto experiments.",
|
| 521 |
-
"primary_use": "TTS training and inference",
|
| 522 |
-
"tasks": [
|
| 523 |
-
"tts"
|
| 524 |
-
],
|
| 525 |
-
"pashto_evidence": {
|
| 526 |
-
"evidence_text": "Can be paired with Pashto-supporting MMS checkpoints.",
|
| 527 |
-
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 528 |
-
"markers": [
|
| 529 |
-
"pus"
|
| 530 |
-
]
|
| 531 |
-
},
|
| 532 |
-
"tags": [
|
| 533 |
-
"pashto",
|
| 534 |
-
"tooling",
|
| 535 |
-
"tts"
|
| 536 |
-
]
|
| 537 |
-
},
|
| 538 |
-
{
|
| 539 |
-
"id": "paper-whisper-2212-04356",
|
| 540 |
-
"title": "Robust Speech Recognition via Large-Scale Weak Supervision",
|
| 541 |
-
"url": "https://arxiv.org/abs/2212.04356",
|
| 542 |
-
"category": "paper",
|
| 543 |
-
"source": "arxiv",
|
| 544 |
-
"status": "verified",
|
| 545 |
-
"summary": "Whisper paper used as a foundational ASR reference for Pashto baselines.",
|
| 546 |
-
"primary_use": "ASR methodology reference",
|
| 547 |
-
"tasks": [
|
| 548 |
-
"asr",
|
| 549 |
-
"research"
|
| 550 |
-
],
|
| 551 |
-
"pashto_evidence": {
|
| 552 |
-
"evidence_text": "Paired with tokenizer language map containing ps.",
|
| 553 |
-
"evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
|
| 554 |
-
"markers": [
|
| 555 |
-
"ps"
|
| 556 |
-
]
|
| 557 |
-
},
|
| 558 |
-
"tags": [
|
| 559 |
-
"pashto",
|
| 560 |
-
"paper",
|
| 561 |
-
"asr"
|
| 562 |
-
]
|
| 563 |
-
},
|
| 564 |
-
{
|
| 565 |
-
"id": "paper-mms-2305-13516",
|
| 566 |
-
"title": "Scaling Speech Technology to 1,000+ Languages",
|
| 567 |
-
"url": "https://arxiv.org/abs/2305.13516",
|
| 568 |
-
"category": "paper",
|
| 569 |
-
"source": "arxiv",
|
| 570 |
-
"status": "verified",
|
| 571 |
-
"summary": "MMS paper covering multilingual speech scaling and low-resource transfer.",
|
| 572 |
-
"primary_use": "ASR and TTS transfer reference",
|
| 573 |
-
"tasks": [
|
| 574 |
-
"asr",
|
| 575 |
-
"tts",
|
| 576 |
-
"research"
|
| 577 |
-
],
|
| 578 |
-
"pashto_evidence": {
|
| 579 |
-
"evidence_text": "Coverage table marks pus support in MMS release.",
|
| 580 |
-
"evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
|
| 581 |
-
"markers": [
|
| 582 |
-
"pus"
|
| 583 |
-
]
|
| 584 |
-
},
|
| 585 |
-
"tags": [
|
| 586 |
-
"pashto",
|
| 587 |
-
"paper",
|
| 588 |
-
"speech"
|
| 589 |
-
]
|
| 590 |
-
},
|
| 591 |
-
{
|
| 592 |
-
"id": "paper-nllb-2207-04672",
|
| 593 |
-
"title": "No Language Left Behind",
|
| 594 |
-
"url": "https://arxiv.org/abs/2207.04672",
|
| 595 |
-
"category": "paper",
|
| 596 |
-
"source": "arxiv",
|
| 597 |
-
"status": "verified",
|
| 598 |
-
"summary": "NLLB paper supporting multilingual MT strategy for Pashto integration.",
|
| 599 |
-
"primary_use": "MT research reference",
|
| 600 |
-
"tasks": [
|
| 601 |
-
"mt",
|
| 602 |
-
"research"
|
| 603 |
-
],
|
| 604 |
-
"pashto_evidence": {
|
| 605 |
-
"evidence_text": "Model usage in repo references pbt_Arab token support.",
|
| 606 |
-
"evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
|
| 607 |
-
"markers": [
|
| 608 |
-
"pbt_Arab"
|
| 609 |
-
]
|
| 610 |
-
},
|
| 611 |
-
"tags": [
|
| 612 |
-
"pashto",
|
| 613 |
-
"paper",
|
| 614 |
-
"mt"
|
| 615 |
-
]
|
| 616 |
-
},
|
| 617 |
-
{
|
| 618 |
-
"id": "paper-fleurs-2205-12446",
|
| 619 |
-
"title": "FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech",
|
| 620 |
-
"url": "https://arxiv.org/abs/2205.12446",
|
| 621 |
-
"category": "paper",
|
| 622 |
-
"source": "arxiv",
|
| 623 |
-
"status": "verified",
|
| 624 |
-
"summary": "FLEURS benchmark paper supporting multilingual speech evaluation including Pashto.",
|
| 625 |
-
"primary_use": "Speech benchmark methodology reference",
|
| 626 |
-
"tasks": [
|
| 627 |
-
"asr",
|
| 628 |
-
"benchmarking",
|
| 629 |
-
"research"
|
| 630 |
-
],
|
| 631 |
-
"pashto_evidence": {
|
| 632 |
-
"evidence_text": "Dataset implementation includes ps_af language code.",
|
| 633 |
-
"evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
|
| 634 |
-
"markers": [
|
| 635 |
-
"ps_af"
|
| 636 |
-
]
|
| 637 |
-
},
|
| 638 |
-
"tags": [
|
| 639 |
-
"pashto",
|
| 640 |
-
"paper",
|
| 641 |
-
"benchmark"
|
| 642 |
-
]
|
| 643 |
-
},
|
| 644 |
{
|
| 645 |
"id": "dataset-nexdata-99h-pashto-dialogue",
|
| 646 |
"title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
|
|
@@ -1061,34 +755,6 @@
|
|
| 1061 |
"asr"
|
| 1062 |
]
|
| 1063 |
},
|
| 1064 |
-
{
|
| 1065 |
-
"id": "code-github-mrychlik-worldly-ocr",
|
| 1066 |
-
"title": "worldly-ocr",
|
| 1067 |
-
"url": "https://github.com/mrychlik/worldly-ocr",
|
| 1068 |
-
"category": "code",
|
| 1069 |
-
"source": "github",
|
| 1070 |
-
"status": "verified",
|
| 1071 |
-
"summary": "Open OCR code project that explicitly includes Pashto among target languages.",
|
| 1072 |
-
"primary_use": "Pashto OCR code reference and experimentation",
|
| 1073 |
-
"tasks": [
|
| 1074 |
-
"ocr",
|
| 1075 |
-
"tooling"
|
| 1076 |
-
],
|
| 1077 |
-
"pashto_evidence": {
|
| 1078 |
-
"evidence_text": "Repository description explicitly says OCR for Pashto and Chinese.",
|
| 1079 |
-
"evidence_url": "https://api.github.com/repos/mrychlik/worldly-ocr",
|
| 1080 |
-
"markers": [
|
| 1081 |
-
"Pashto",
|
| 1082 |
-
"OCR"
|
| 1083 |
-
]
|
| 1084 |
-
},
|
| 1085 |
-
"tags": [
|
| 1086 |
-
"pashto",
|
| 1087 |
-
"code",
|
| 1088 |
-
"github",
|
| 1089 |
-
"ocr"
|
| 1090 |
-
]
|
| 1091 |
-
},
|
| 1092 |
{
|
| 1093 |
"id": "paper-s2-psocr-lmm-pashto",
|
| 1094 |
"title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
|
|
@@ -1837,32 +1503,6 @@
|
|
| 1837 |
"demo"
|
| 1838 |
]
|
| 1839 |
},
|
| 1840 |
-
{
|
| 1841 |
-
"id": "project-github-ihyacommunity-khushkhat-extension",
|
| 1842 |
-
"title": "IhyaCommunity/Khushkhat-Extension",
|
| 1843 |
-
"url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
|
| 1844 |
-
"category": "project",
|
| 1845 |
-
"source": "github",
|
| 1846 |
-
"status": "verified",
|
| 1847 |
-
"summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
|
| 1848 |
-
"primary_use": "Interactive Pashto demo and quick qualitative validation",
|
| 1849 |
-
"tasks": [
|
| 1850 |
-
"demo"
|
| 1851 |
-
],
|
| 1852 |
-
"pashto_evidence": {
|
| 1853 |
-
"evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
|
| 1854 |
-
"evidence_url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
|
| 1855 |
-
"markers": [
|
| 1856 |
-
"pashto"
|
| 1857 |
-
]
|
| 1858 |
-
},
|
| 1859 |
-
"tags": [
|
| 1860 |
-
"pashto",
|
| 1861 |
-
"project",
|
| 1862 |
-
"github",
|
| 1863 |
-
"demo"
|
| 1864 |
-
]
|
| 1865 |
-
},
|
| 1866 |
{
|
| 1867 |
"id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
|
| 1868 |
"title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
|
|
|
|
| 206 |
"kaggle"
|
| 207 |
]
|
| 208 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
{
|
| 210 |
"id": "model-pashto-bert",
|
| 211 |
"title": "PashtoBERT",
|
|
|
|
| 335 |
"mt"
|
| 336 |
]
|
| 337 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
{
|
| 339 |
"id": "dataset-nexdata-99h-pashto-dialogue",
|
| 340 |
"title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
|
|
|
|
| 755 |
"asr"
|
| 756 |
]
|
| 757 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 758 |
{
|
| 759 |
"id": "paper-s2-psocr-lmm-pashto",
|
| 760 |
"title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
|
|
|
|
| 1503 |
"demo"
|
| 1504 |
]
|
| 1505 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1506 |
{
|
| 1507 |
"id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
|
| 1508 |
"title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
|
resources/models/README.md
CHANGED
|
@@ -11,15 +11,11 @@
|
|
| 11 |
| ihanif/whisper-small-pashto | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto) | Pashto ASR baseline and model comparison |
|
| 12 |
| ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
|
| 13 |
| ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
|
| 14 |
-
| MMS 1B All | [huggingface](https://huggingface.co/facebook/mms-1b-all) | [MMS coverage table includes pus with ASR support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR transfer baseline |
|
| 15 |
-
| MMS TTS | [huggingface](https://huggingface.co/facebook/mms-tts) | [MMS coverage table includes pus with TTS support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS baseline and transfer |
|
| 16 |
-
| NLLB-200 Distilled 600M | [huggingface](https://huggingface.co/facebook/nllb-200-distilled-600M) | [Model special token map includes pbt_Arab. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | Pashto translation baseline |
|
| 17 |
-
| OPUS MT en-mul | [huggingface](https://huggingface.co/Helsinki-NLP/opus-mt-en-mul) | [Language list includes pus code. (`pus`)](https://huggingface.co/Helsinki-NLP/opus-mt-en-mul) | English to Pashto translation path |
|
| 18 |
-
| OPUS MT mul-en | [huggingface](https://huggingface.co/Helsinki-NLP/opus-mt-mul-en) | [Language list includes pus code. (`pus`)](https://huggingface.co/Helsinki-NLP/opus-mt-mul-en) | Pashto to English translation path |
|
| 19 |
| PashtoBERT | [huggingface](https://huggingface.co/mdarhri/pashto-bert) | [Model card states training on Pashto corpus data. (`Pashto`)](https://huggingface.co/mdarhri/pashto-bert) | Pashto NLP baseline encoder |
|
| 20 |
| wav2vec2 XLS-R 300M Pashto | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | [Model tags include pashto and ps, and model index references FLEURS config ps_af. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | Pashto ASR baseline and comparative experiments |
|
| 21 |
-
| Whisper
|
| 22 |
| Whisper Medium Pashto | [huggingface](https://huggingface.co/ihanif/whisper-medium-pashto) | [Model tags include pashto and ps, and model index uses FLEURS ps_af split. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/whisper-medium-pashto) | Pashto ASR baseline and transcription quality comparisons |
|
|
|
|
| 23 |
|
| 24 |
## Maintenance
|
| 25 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
|
|
|
| 11 |
| ihanif/whisper-small-pashto | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto) | Pashto ASR baseline and model comparison |
|
| 12 |
| ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
|
| 13 |
| ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
| PashtoBERT | [huggingface](https://huggingface.co/mdarhri/pashto-bert) | [Model card states training on Pashto corpus data. (`Pashto`)](https://huggingface.co/mdarhri/pashto-bert) | Pashto NLP baseline encoder |
|
| 15 |
| wav2vec2 XLS-R 300M Pashto | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | [Model tags include pashto and ps, and model index references FLEURS config ps_af. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | Pashto ASR baseline and comparative experiments |
|
| 16 |
+
| Whisper Base Pashto | [huggingface](https://huggingface.co/ihanif/whisper-base-pashto) | [Model ID includes Pashto and card metadata references FLEURS config ps_af. (`Pashto`, `ps_af`)](https://huggingface.co/api/models/ihanif/whisper-base-pashto) | Pashto ASR baseline and speed-accuracy comparison |
|
| 17 |
| Whisper Medium Pashto | [huggingface](https://huggingface.co/ihanif/whisper-medium-pashto) | [Model tags include pashto and ps, and model index uses FLEURS ps_af split. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/whisper-medium-pashto) | Pashto ASR baseline and transcription quality comparisons |
|
| 18 |
+
| zirak-ai/pashto-bert-v1 | [huggingface](https://huggingface.co/zirak-ai/pashto-bert-v1) | [Hugging Face model ID and search tags explicitly include pashto marker. (`pashto`)](https://huggingface.co/zirak-ai/pashto-bert-v1) | Pashto encoder baseline for NLP tasks |
|
| 19 |
|
| 20 |
## Maintenance
|
| 21 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
resources/papers/README.md
CHANGED
|
@@ -7,15 +7,11 @@
|
|
| 7 |
| Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
|
| 8 |
| Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
|
| 9 |
| Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
|
| 10 |
-
| FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech | [arxiv](https://arxiv.org/abs/2205.12446) | [Dataset implementation includes ps_af language code. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark methodology reference |
|
| 11 |
| KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
|
| 12 |
-
| No Language Left Behind | [arxiv](https://arxiv.org/abs/2207.04672) | [Model usage in repo references pbt_Arab token support. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | MT research reference |
|
| 13 |
| Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
|
| 14 |
| Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
|
| 15 |
| POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
|
| 16 |
| PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
|
| 17 |
-
| Robust Speech Recognition via Large-Scale Weak Supervision | [arxiv](https://arxiv.org/abs/2212.04356) | [Paired with tokenizer language map containing ps. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR methodology reference |
|
| 18 |
-
| Scaling Speech Technology to 1,000+ Languages | [arxiv](https://arxiv.org/abs/2305.13516) | [Coverage table marks pus support in MMS release. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR and TTS transfer reference |
|
| 19 |
|
| 20 |
## Maintenance
|
| 21 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
|
|
|
| 7 |
| Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
|
| 8 |
| Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
|
| 9 |
| Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
|
|
|
|
| 10 |
| KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
|
|
|
|
| 11 |
| Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
|
| 12 |
| Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
|
| 13 |
| POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
|
| 14 |
| PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
|
|
|
|
|
|
|
| 15 |
|
| 16 |
## Maintenance
|
| 17 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
resources/projects/README.md
CHANGED
|
@@ -6,7 +6,6 @@
|
|
| 6 |
|---|---|---|---|
|
| 7 |
| Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
|
| 8 |
| ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
|
| 9 |
-
| IhyaCommunity/Khushkhat-Extension | [github](https://github.com/IhyaCommunity/Khushkhat-Extension) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/IhyaCommunity/Khushkhat-Extension) | Interactive Pashto demo and quick qualitative validation |
|
| 10 |
| nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
|
| 11 |
| Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
|
| 12 |
| Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
|
|
|
|
| 6 |
|---|---|---|---|
|
| 7 |
| Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
|
| 8 |
| ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
|
|
|
|
| 9 |
| nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
|
| 10 |
| Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
|
| 11 |
| Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
|
resources/tools/README.md
CHANGED
|
@@ -4,8 +4,7 @@
|
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
-
|
|
| 8 |
-
| Faster-Whisper | [github](https://github.com/SYSTRAN/faster-whisper) | [Whisper tokenizer includes ps and tool runs Whisper models. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR inference acceleration |
|
| 9 |
|
| 10 |
## Maintenance
|
| 11 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
|
|
|
| 4 |
|
| 5 |
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
|---|---|---|---|
|
| 7 |
+
| _None yet_ | - | - | - |
|
|
|
|
| 8 |
|
| 9 |
## Maintenance
|
| 10 |
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
scripts/sync_resources.py
CHANGED
|
@@ -30,6 +30,19 @@ from urllib.error import HTTPError, URLError
|
|
| 30 |
USER_AGENT = "pashto-resource-sync/1.0"
|
| 31 |
MAX_FETCH_RETRIES = 4
|
| 32 |
RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
def _slug(value: str) -> str:
|
|
@@ -39,6 +52,28 @@ def _slug(value: str) -> str:
|
|
| 39 |
return value[:80] if value else "resource"
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
|
| 43 |
if not retry_after:
|
| 44 |
return None
|
|
@@ -201,16 +236,35 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
|
|
| 201 |
if kind not in {"datasets", "models"}:
|
| 202 |
return []
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
category = "dataset" if kind == "datasets" else "model"
|
| 209 |
out: list[dict[str, Any]] = []
|
| 210 |
-
for item in
|
| 211 |
repo_id = item.get("id") or item.get("modelId")
|
| 212 |
if not repo_id:
|
| 213 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
repo_url = f"https://huggingface.co/{'datasets/' if kind == 'datasets' else ''}{repo_id}"
|
| 215 |
rid = f"candidate-hf-{kind[:-1]}-{_slug(repo_id)}"
|
| 216 |
out.append(
|
|
@@ -227,19 +281,40 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
|
|
| 227 |
tags=["pashto", "candidate", category],
|
| 228 |
)
|
| 229 |
)
|
|
|
|
|
|
|
| 230 |
return out
|
| 231 |
|
| 232 |
|
| 233 |
def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
out: list[dict[str, Any]] = []
|
| 239 |
-
for item in
|
| 240 |
space_id = item.get("id")
|
| 241 |
if not space_id:
|
| 242 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
space_url = f"https://huggingface.co/spaces/{space_id}"
|
| 244 |
rid = f"candidate-hf-project-{_slug(space_id)}"
|
| 245 |
summary = "Candidate project app returned from Hugging Face Spaces Pashto search."
|
|
@@ -257,17 +332,36 @@ def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
|
|
| 257 |
tags=["pashto", "candidate", "project", "space"],
|
| 258 |
)
|
| 259 |
)
|
|
|
|
|
|
|
| 260 |
return out
|
| 261 |
|
| 262 |
|
| 263 |
def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
|
| 264 |
# Public Kaggle dataset listing endpoint (no auth needed for list responses).
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
out: list[dict[str, Any]] = []
|
| 270 |
-
for item in
|
| 271 |
title = (item.get("titleNullable") or "").strip()
|
| 272 |
dataset_url = (item.get("urlNullable") or "").strip()
|
| 273 |
owner = (item.get("ownerRefNullable") or "").strip()
|
|
@@ -275,8 +369,9 @@ def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
|
|
| 275 |
if not title or not dataset_url:
|
| 276 |
continue
|
| 277 |
|
| 278 |
-
|
| 279 |
-
|
|
|
|
| 280 |
continue
|
| 281 |
|
| 282 |
owner_prefix = f"{owner}/" if owner else ""
|
|
@@ -304,7 +399,11 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
|
|
| 304 |
# Query by topic first for high precision, then by keyword for recall.
|
| 305 |
query_variants = [
|
| 306 |
"topic:pashto",
|
|
|
|
| 307 |
"pashto in:name,description,readme",
|
|
|
|
|
|
|
|
|
|
| 308 |
]
|
| 309 |
|
| 310 |
combined: dict[str, dict[str, Any]] = {}
|
|
@@ -334,8 +433,10 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
|
|
| 334 |
item.get("description") or "",
|
| 335 |
" ".join(item.get("topics") or []),
|
| 336 |
]
|
| 337 |
-
)
|
| 338 |
-
if
|
|
|
|
|
|
|
| 339 |
continue
|
| 340 |
|
| 341 |
html_url = item["html_url"]
|
|
@@ -367,70 +468,110 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
|
|
| 367 |
|
| 368 |
|
| 369 |
def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
xml_text = _fetch_text(url, timeout=30.0, source_name="arxiv")
|
| 376 |
-
except Exception as exc: # noqa: BLE001
|
| 377 |
-
if not _is_ssl_cert_error(exc):
|
| 378 |
-
raise
|
| 379 |
-
# arXiv occasionally fails cert chain validation in some runner images.
|
| 380 |
-
insecure_context = ssl._create_unverified_context()
|
| 381 |
-
print("[warn] arxiv SSL verification failed; retrying with unverified TLS context")
|
| 382 |
-
xml_text = _fetch_text(
|
| 383 |
-
url,
|
| 384 |
-
timeout=30.0,
|
| 385 |
-
ssl_context=insecure_context,
|
| 386 |
-
source_name="arxiv",
|
| 387 |
)
|
| 388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
ns = {"atom": "http://www.w3.org/2005/Atom"}
|
| 390 |
|
|
|
|
| 391 |
out: list[dict[str, Any]] = []
|
| 392 |
-
for
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
| 412 |
)
|
| 413 |
-
|
|
|
|
| 414 |
return out
|
| 415 |
|
| 416 |
|
| 417 |
def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
|
| 418 |
fields = "title,url,abstract,year,externalIds"
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
out: list[dict[str, Any]] = []
|
| 430 |
-
for item in
|
| 431 |
title = (item.get("title") or "").strip()
|
| 432 |
if not title:
|
| 433 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 434 |
paper_url = (item.get("url") or "").strip()
|
| 435 |
if not paper_url:
|
| 436 |
ext = item.get("externalIds") or {}
|
|
@@ -450,12 +591,14 @@ def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
|
|
| 450 |
category="paper",
|
| 451 |
source="other",
|
| 452 |
summary=summary[:240] if summary else "Candidate paper returned from Semantic Scholar search for Pashto.",
|
| 453 |
-
evidence_text="Matched by Semantic Scholar
|
| 454 |
evidence_url=paper_url,
|
| 455 |
markers=["pashto"],
|
| 456 |
tags=["pashto", "candidate", "paper"],
|
| 457 |
)
|
| 458 |
)
|
|
|
|
|
|
|
| 459 |
return out
|
| 460 |
|
| 461 |
|
|
|
|
| 30 |
USER_AGENT = "pashto-resource-sync/1.0"
|
| 31 |
MAX_FETCH_RETRIES = 4
|
| 32 |
RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
|
| 33 |
+
# Latin-script spellings of the language name used as search terms, one query
# per spelling, when discovering candidates from upstream sources.
PASHTO_QUERY_TERMS = ["pashto", "pukhto", "pushto", "pakhto"]
# Same spellings, kept as an immutable tuple for marker checks.
PASHTO_TEXT_MARKERS = ("pashto", "pukhto", "pushto", "pakhto")
# Arabic-script spellings of the language name; matched by plain substring test.
PASHTO_SCRIPT_MARKERS = ("پښتو", "پشتو")
# A spelling that stands alone: not glued to an ASCII letter or digit on either
# side, so "whisper-small-pashto" matches but "pashtoon" does not.
PASHTO_WORD_RE = re.compile(
    r"(?<![A-Za-z0-9])(pashto|pukhto|pushto|pakhto)(?![A-Za-z0-9])",
    re.IGNORECASE,
)
# A spelling fused to a capitalised suffix, e.g. "PashtoBERT" or "pashtoASR".
# Case-insensitivity is scoped to the name via (?i:...) only: a pattern-wide
# re.IGNORECASE would also make the [A-Z] lookahead accept lowercase letters,
# turning this into a plain prefix match (e.g. "pashtoon").
PASHTO_CAMEL_RE = re.compile(
    r"(?<![A-Za-z0-9])((?i:pashto|pukhto|pushto|pakhto))(?=[A-Z])",
)
# Language/locale codes used for Pashto: ps, ps_af, pus, pbt_Arab (with "_",
# "-" or no separator), each delimited by word boundaries.
PASHTO_CODE_RE = re.compile(r"\b(ps(_af)?|pus|pbt[_-]?arab)\b", re.IGNORECASE)
# Throwaway-looking name segments (test/tmp/trial/scratch) delimited by
# "-", "_", "/", a space, or the start/end of the string.
LOW_SIGNAL_RE = re.compile(r"(^|[-_/ ])(test|tmp|trial|scratch)([-_/ ]|$)", re.IGNORECASE)
|
| 46 |
|
| 47 |
|
| 48 |
def _slug(value: str) -> str:
|
|
|
|
| 52 |
return value[:80] if value else "resource"
|
| 53 |
|
| 54 |
|
| 55 |
+
def _contains_pashto_marker(value: str) -> bool:
    """Report whether *value* carries any recognised Pashto marker.

    The text is stripped first; empty or ``None`` input is never a match.
    Checks run from most to least specific: a standalone Latin-script
    spelling (``PASHTO_WORD_RE``), a spelling fused to a following token
    (``PASHTO_CAMEL_RE``), an Arabic-script spelling
    (``PASHTO_SCRIPT_MARKERS``), and finally a language code
    (``PASHTO_CODE_RE``) searched in the case-folded text.
    """
    candidate = (value or "").strip()
    if not candidate:
        return False

    # Latin-script name, either standalone or glued to a following token.
    if PASHTO_WORD_RE.search(candidate) or PASHTO_CAMEL_RE.search(candidate):
        return True

    # Native-script spellings are matched as plain substrings.
    for script_marker in PASHTO_SCRIPT_MARKERS:
        if script_marker in candidate:
            return True

    # Last resort: a bare language/locale code.
    return PASHTO_CODE_RE.search(candidate.casefold()) is not None
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _is_pashto_centric(*values: str) -> bool:
    """Return True as soon as any of *values* contains a Pashto marker.

    With no arguments, or when none of the values match, the result is False.
    """
    for candidate in values:
        if _contains_pashto_marker(candidate):
            return True
    return False
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _is_low_signal_name(value: str) -> bool:
    """Return True when *value* contains a segment matched by ``LOW_SIGNAL_RE``.

    ``None`` or empty input is treated as an empty string and never matches.
    """
    name = value or ""
    return LOW_SIGNAL_RE.search(name) is not None
|
| 75 |
+
|
| 76 |
+
|
| 77 |
def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
|
| 78 |
if not retry_after:
|
| 79 |
return None
|
|
|
|
| 236 |
if kind not in {"datasets", "models"}:
|
| 237 |
return []
|
| 238 |
|
| 239 |
+
combined: dict[str, dict[str, Any]] = {}
|
| 240 |
+
errors: list[str] = []
|
| 241 |
+
for term in PASHTO_QUERY_TERMS:
|
| 242 |
+
query = urllib.parse.urlencode({"search": term, "limit": str(limit)})
|
| 243 |
+
url = f"https://huggingface.co/api/{kind}?{query}"
|
| 244 |
+
try:
|
| 245 |
+
payload = _fetch_json(url, source_name=f"huggingface-{kind}")
|
| 246 |
+
except Exception as exc: # noqa: BLE001
|
| 247 |
+
errors.append(f"{term}: {exc}")
|
| 248 |
+
continue
|
| 249 |
+
for item in payload:
|
| 250 |
+
repo_id = item.get("id") or item.get("modelId")
|
| 251 |
+
if not repo_id:
|
| 252 |
+
continue
|
| 253 |
+
combined[repo_id] = item
|
| 254 |
+
|
| 255 |
+
if not combined and errors:
|
| 256 |
+
raise RuntimeError("; ".join(errors))
|
| 257 |
|
| 258 |
category = "dataset" if kind == "datasets" else "model"
|
| 259 |
out: list[dict[str, Any]] = []
|
| 260 |
+
for item in combined.values():
|
| 261 |
repo_id = item.get("id") or item.get("modelId")
|
| 262 |
if not repo_id:
|
| 263 |
continue
|
| 264 |
+
if not _is_pashto_centric(repo_id):
|
| 265 |
+
continue
|
| 266 |
+
if _is_low_signal_name(repo_id):
|
| 267 |
+
continue
|
| 268 |
repo_url = f"https://huggingface.co/{'datasets/' if kind == 'datasets' else ''}{repo_id}"
|
| 269 |
rid = f"candidate-hf-{kind[:-1]}-{_slug(repo_id)}"
|
| 270 |
out.append(
|
|
|
|
| 281 |
tags=["pashto", "candidate", category],
|
| 282 |
)
|
| 283 |
)
|
| 284 |
+
if len(out) >= limit:
|
| 285 |
+
break
|
| 286 |
return out
|
| 287 |
|
| 288 |
|
| 289 |
def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
|
| 290 |
+
combined: dict[str, dict[str, Any]] = {}
|
| 291 |
+
errors: list[str] = []
|
| 292 |
+
for term in PASHTO_QUERY_TERMS:
|
| 293 |
+
query = urllib.parse.urlencode({"search": term, "limit": str(limit)})
|
| 294 |
+
url = f"https://huggingface.co/api/spaces?{query}"
|
| 295 |
+
try:
|
| 296 |
+
payload = _fetch_json(url, source_name="huggingface-spaces")
|
| 297 |
+
except Exception as exc: # noqa: BLE001
|
| 298 |
+
errors.append(f"{term}: {exc}")
|
| 299 |
+
continue
|
| 300 |
+
for item in payload:
|
| 301 |
+
space_id = item.get("id")
|
| 302 |
+
if not space_id:
|
| 303 |
+
continue
|
| 304 |
+
combined[space_id] = item
|
| 305 |
+
|
| 306 |
+
if not combined and errors:
|
| 307 |
+
raise RuntimeError("; ".join(errors))
|
| 308 |
|
| 309 |
out: list[dict[str, Any]] = []
|
| 310 |
+
for item in combined.values():
|
| 311 |
space_id = item.get("id")
|
| 312 |
if not space_id:
|
| 313 |
continue
|
| 314 |
+
if not _is_pashto_centric(space_id):
|
| 315 |
+
continue
|
| 316 |
+
if _is_low_signal_name(space_id):
|
| 317 |
+
continue
|
| 318 |
space_url = f"https://huggingface.co/spaces/{space_id}"
|
| 319 |
rid = f"candidate-hf-project-{_slug(space_id)}"
|
| 320 |
summary = "Candidate project app returned from Hugging Face Spaces Pashto search."
|
|
|
|
| 332 |
tags=["pashto", "candidate", "project", "space"],
|
| 333 |
)
|
| 334 |
)
|
| 335 |
+
if len(out) >= limit:
|
| 336 |
+
break
|
| 337 |
return out
|
| 338 |
|
| 339 |
|
| 340 |
def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
|
| 341 |
# Public Kaggle dataset listing endpoint (no auth needed for list responses).
|
| 342 |
+
combined: list[dict[str, Any]] = []
|
| 343 |
+
seen_urls: set[str] = set()
|
| 344 |
+
errors: list[str] = []
|
| 345 |
+
for term in PASHTO_QUERY_TERMS:
|
| 346 |
+
query = urllib.parse.urlencode({"search": term, "page": "1"})
|
| 347 |
+
url = f"https://www.kaggle.com/api/v1/datasets/list?{query}"
|
| 348 |
+
try:
|
| 349 |
+
payload = _fetch_json(url, source_name="kaggle-datasets")
|
| 350 |
+
except Exception as exc: # noqa: BLE001
|
| 351 |
+
errors.append(f"{term}: {exc}")
|
| 352 |
+
continue
|
| 353 |
+
for item in payload:
|
| 354 |
+
dataset_url = (item.get("urlNullable") or "").strip()
|
| 355 |
+
if not dataset_url or dataset_url in seen_urls:
|
| 356 |
+
continue
|
| 357 |
+
seen_urls.add(dataset_url)
|
| 358 |
+
combined.append(item)
|
| 359 |
+
|
| 360 |
+
if not combined and errors:
|
| 361 |
+
raise RuntimeError("; ".join(errors))
|
| 362 |
|
| 363 |
out: list[dict[str, Any]] = []
|
| 364 |
+
for item in combined:
|
| 365 |
title = (item.get("titleNullable") or "").strip()
|
| 366 |
dataset_url = (item.get("urlNullable") or "").strip()
|
| 367 |
owner = (item.get("ownerRefNullable") or "").strip()
|
|
|
|
| 369 |
if not title or not dataset_url:
|
| 370 |
continue
|
| 371 |
|
| 372 |
+
if not _is_pashto_centric(title, subtitle):
|
| 373 |
+
continue
|
| 374 |
+
if _is_low_signal_name(title):
|
| 375 |
continue
|
| 376 |
|
| 377 |
owner_prefix = f"{owner}/" if owner else ""
|
|
|
|
| 399 |
# Query by topic first for high precision, then by keyword for recall.
|
| 400 |
query_variants = [
|
| 401 |
"topic:pashto",
|
| 402 |
+
"topic:pukhto",
|
| 403 |
"pashto in:name,description,readme",
|
| 404 |
+
"pukhto in:name,description,readme",
|
| 405 |
+
"pushto in:name,description,readme",
|
| 406 |
+
"pakhto in:name,description,readme",
|
| 407 |
]
|
| 408 |
|
| 409 |
combined: dict[str, dict[str, Any]] = {}
|
|
|
|
| 433 |
item.get("description") or "",
|
| 434 |
" ".join(item.get("topics") or []),
|
| 435 |
]
|
| 436 |
+
)
|
| 437 |
+
if not _is_pashto_centric(name_blob):
|
| 438 |
+
continue
|
| 439 |
+
if _is_low_signal_name(full_name):
|
| 440 |
continue
|
| 441 |
|
| 442 |
html_url = item["html_url"]
|
|
|
|
| 468 |
|
| 469 |
|
| 470 |
def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
    """Collect Pashto-titled paper candidates from the arXiv export API.

    One query is issued per entry in ``PASHTO_QUERY_TERMS``. A term whose
    request (including the unverified-TLS retry) or XML parsing fails is
    recorded and skipped, so the remaining terms can still contribute results.

    Args:
        limit: Sent as ``max_results`` with every query and used as the cap on
            the number of candidates returned.

    Returns:
        Candidate records built by ``_candidate``, de-duplicated by the Atom
        entry id, containing only entries whose title carries a Pashto marker
        and is not a low-signal name.

    Raises:
        RuntimeError: If no term produced a parseable response and at least
            one error was recorded; the message joins the per-term errors.
    """
    roots: list[ET.Element] = []
    errors: list[str] = []
    for term in PASHTO_QUERY_TERMS:
        query = urllib.parse.urlencode(
            {"search_query": f"all:{term}", "start": "0", "max_results": str(limit)}
        )
        url = f"https://export.arxiv.org/api/query?{query}"
        try:
            xml_text = _fetch_text(url, timeout=30.0, source_name="arxiv")
        except Exception as exc:  # noqa: BLE001
            if not _is_ssl_cert_error(exc):
                errors.append(f"{term}: {exc}")
                continue
            # arXiv occasionally fails cert chain validation in some runner images.
            # SECURITY: the retry below disables certificate verification, so the
            # response is not authenticated; it is only used to list candidates.
            insecure_context = ssl._create_unverified_context()
            print("[warn] arxiv SSL verification failed; retrying with unverified TLS context")
            try:
                xml_text = _fetch_text(
                    url,
                    timeout=30.0,
                    ssl_context=insecure_context,
                    source_name="arxiv",
                )
            except Exception as retry_exc:  # noqa: BLE001
                # Keep going: a failed retry for one term must not discard the
                # responses already collected for the other terms.
                errors.append(f"{term}: {retry_exc}")
                continue

        try:
            roots.append(ET.fromstring(xml_text))
        except ET.ParseError as exc:
            # A malformed payload is treated like any other per-term failure.
            errors.append(f"{term}: {exc}")

    if not roots and errors:
        raise RuntimeError("; ".join(errors))

    ns = {"atom": "http://www.w3.org/2005/Atom"}

    seen_links: set[str] = set()
    out: list[dict[str, Any]] = []
    for root in roots:
        for entry in root.findall("atom:entry", ns):
            title = (entry.findtext("atom:title", default="", namespaces=ns) or "").strip()
            link = (entry.findtext("atom:id", default="", namespaces=ns) or "").strip()
            summary = (entry.findtext("atom:summary", default="", namespaces=ns) or "").strip()
            if not title or not link:
                continue
            # The same paper can come back for several query terms.
            if link in seen_links:
                continue
            # Strict: keep only papers with explicit Pashto markers in title.
            if not _is_pashto_centric(title):
                continue
            if _is_low_signal_name(title):
                continue

            seen_links.add(link)
            rid = f"candidate-arxiv-{_slug(title)}"
            out.append(
                _candidate(
                    rid=rid,
                    title=title,
                    url=link,
                    category="paper",
                    source="arxiv",
                    summary=summary[:240] if summary else "Candidate paper returned from arXiv query for Pashto.",
                    evidence_text="Matched by Pashto marker in paper title from arXiv query results.",
                    evidence_url=link,
                    markers=["pashto"],
                    tags=["pashto", "candidate", "paper"],
                )
            )
            if len(out) >= limit:
                return out
    return out
|
| 536 |
|
| 537 |
|
| 538 |
def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
|
| 539 |
fields = "title,url,abstract,year,externalIds"
|
| 540 |
+
combined: dict[str, dict[str, Any]] = {}
|
| 541 |
+
errors: list[str] = []
|
| 542 |
+
for term in PASHTO_QUERY_TERMS:
|
| 543 |
+
query = urllib.parse.urlencode(
|
| 544 |
+
{"query": term, "limit": str(limit), "fields": fields}
|
| 545 |
+
)
|
| 546 |
+
url = f"https://api.semanticscholar.org/graph/v1/paper/search?{query}"
|
| 547 |
+
try:
|
| 548 |
+
payload = _fetch_json(
|
| 549 |
+
url,
|
| 550 |
+
timeout=30.0,
|
| 551 |
+
source_name="semantic-scholar",
|
| 552 |
+
)
|
| 553 |
+
except Exception as exc: # noqa: BLE001
|
| 554 |
+
errors.append(f"{term}: {exc}")
|
| 555 |
+
continue
|
| 556 |
+
for item in payload.get("data", []):
|
| 557 |
+
title = (item.get("title") or "").strip()
|
| 558 |
+
if not title:
|
| 559 |
+
continue
|
| 560 |
+
combined[title] = item
|
| 561 |
+
|
| 562 |
+
if not combined and errors:
|
| 563 |
+
raise RuntimeError("; ".join(errors))
|
| 564 |
|
| 565 |
out: list[dict[str, Any]] = []
|
| 566 |
+
for item in combined.values():
|
| 567 |
title = (item.get("title") or "").strip()
|
| 568 |
if not title:
|
| 569 |
continue
|
| 570 |
+
# Strict: keep only papers with explicit Pashto markers in title.
|
| 571 |
+
if not _is_pashto_centric(title):
|
| 572 |
+
continue
|
| 573 |
+
if _is_low_signal_name(title):
|
| 574 |
+
continue
|
| 575 |
paper_url = (item.get("url") or "").strip()
|
| 576 |
if not paper_url:
|
| 577 |
ext = item.get("externalIds") or {}
|
|
|
|
| 591 |
category="paper",
|
| 592 |
source="other",
|
| 593 |
summary=summary[:240] if summary else "Candidate paper returned from Semantic Scholar search for Pashto.",
|
| 594 |
+
evidence_text="Matched by explicit Pashto marker in paper title from Semantic Scholar search.",
|
| 595 |
evidence_url=paper_url,
|
| 596 |
markers=["pashto"],
|
| 597 |
tags=["pashto", "candidate", "paper"],
|
| 598 |
)
|
| 599 |
)
|
| 600 |
+
if len(out) >= limit:
|
| 601 |
+
break
|
| 602 |
return out
|
| 603 |
|
| 604 |
|
scripts/validate_resource_catalog.py
CHANGED
|
@@ -20,6 +20,9 @@ ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project
|
|
| 20 |
ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
|
| 21 |
ALLOWED_STATUS = {"verified", "candidate"}
|
| 22 |
RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
|
| 25 |
def _load_json(path: Path) -> dict[str, Any]:
|
|
@@ -39,6 +42,19 @@ def _validate_iso_date(value: str) -> bool:
|
|
| 39 |
return True
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
|
| 43 |
errors: list[str] = []
|
| 44 |
prefix = f"resource[{index}]"
|
|
@@ -123,6 +139,14 @@ def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
|
|
| 123 |
if not (isinstance(markers, list) and markers and all(isinstance(marker, str) and marker.strip() for marker in markers)):
|
| 124 |
errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
return errors
|
| 127 |
|
| 128 |
|
|
|
|
| 20 |
ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
|
| 21 |
ALLOWED_STATUS = {"verified", "candidate"}
|
| 22 |
# Resource ids: lowercase alphanumeric first character, then lowercase
# alphanumerics, ".", "_" or "-".
RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
|
| 23 |
+
# Categories for which validate_resource requires a Pashto marker in the
# title or the URL.
STRICT_PASHTO_CATEGORIES = {"model", "paper", "tool", "code", "project"}
|
| 24 |
+
# "pushto" only as a standalone token. Compiled without IGNORECASE because
# _contains_pashto_marker applies it to case-folded text.
PASHTO_PUSHTO_WORD_RE = re.compile(r"(?<![a-z0-9])pushto(?![a-z0-9])")
|
| 25 |
+
# Language / locale codes delimited by \b; also applied to case-folded text.
# "_" is a word character for \b, so a code glued to an underscore does not match.
PASHTO_CODE_RE = re.compile(r"\b(ps(_af)?|pus|pbt[_-]?arab)\b")
|
| 26 |
|
| 27 |
|
| 28 |
def _load_json(path: Path) -> dict[str, Any]:
|
|
|
|
| 42 |
return True
|
| 43 |
|
| 44 |
|
| 45 |
+
def _contains_pashto_marker(value: str) -> bool:
    """Report whether *value* contains a Pashto marker.

    Non-string input yields ``False``. The case-folded text is checked for the
    substrings "pashto", "pukhto" and "pakhto", then against
    ``PASHTO_PUSHTO_WORD_RE`` and ``PASHTO_CODE_RE``; finally the original
    text is checked for the native-script spellings.
    """
    if not isinstance(value, str):
        return False

    folded = value.casefold()
    # These three spellings count even when embedded in a longer word.
    if "pashto" in folded or "pukhto" in folded or "pakhto" in folded:
        return True

    # "pushto" and the language codes must appear as delimited tokens.
    for pattern in (PASHTO_PUSHTO_WORD_RE, PASHTO_CODE_RE):
        if pattern.search(folded):
            return True

    return "پښتو" in value or "پشتو" in value
|
| 56 |
+
|
| 57 |
+
|
| 58 |
def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
|
| 59 |
errors: list[str] = []
|
| 60 |
prefix = f"resource[{index}]"
|
|
|
|
| 139 |
if not (isinstance(markers, list) and markers and all(isinstance(marker, str) and marker.strip() for marker in markers)):
|
| 140 |
errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")
|
| 141 |
|
| 142 |
+
if category in STRICT_PASHTO_CATEGORIES and not (
|
| 143 |
+
_contains_pashto_marker(title) or _contains_pashto_marker(url)
|
| 144 |
+
):
|
| 145 |
+
errors.append(
|
| 146 |
+
f"{prefix} must be Pashto-centric for category '{category}' "
|
| 147 |
+
"(include a Pashto marker in title or URL)"
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
return errors
|
| 151 |
|
| 152 |
|
tests/test_validate_resource_catalog.py
CHANGED
|
@@ -43,3 +43,21 @@ def test_validate_catalog_fails_for_invalid_evidence_url() -> None:
|
|
| 43 |
catalog["resources"][0]["pashto_evidence"]["evidence_url"] = "not-a-url"
|
| 44 |
errors = validate_catalog(catalog)
|
| 45 |
assert any("evidence_url" in error for error in errors)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
catalog["resources"][0]["pashto_evidence"]["evidence_url"] = "not-a-url"
|
| 44 |
errors = validate_catalog(catalog)
|
| 45 |
assert any("evidence_url" in error for error in errors)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def test_validate_catalog_fails_for_non_pashto_centric_model() -> None:
    """A model entry with no Pashto marker in title or URL must be rejected."""
    catalog = _minimal_catalog()
    entry = catalog["resources"][0]
    entry["category"] = "model"
    entry["title"] = "Generic Multilingual Model"
    entry["url"] = "https://example.org/model"

    messages = validate_catalog(catalog)

    assert any("must be Pashto-centric" in message for message in messages)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def test_validate_catalog_allows_pashto_centric_model() -> None:
    """A model entry whose title and URL name Pashto validates cleanly."""
    catalog = _minimal_catalog()
    entry = catalog["resources"][0]
    entry["category"] = "model"
    entry["title"] = "Pashto ASR Model"
    entry["url"] = "https://example.org/pashto-model"

    messages = validate_catalog(catalog)

    assert messages == []
|