musaw committed on
Commit
ed6f1f9
·
1 Parent(s): 9899fdf

chore(resources): enforce Pashto-centric policy and prune reference-only entries

Browse files
docs/resource_catalog.md CHANGED
@@ -6,7 +6,8 @@ This index points to validated Pashto-related resources tracked in structured fi
6
 
7
  ## Validation method
8
  - Verify source URL resolves to official page or canonical repository.
9
- - Verify explicit Pashto support markers (`Pashto`, `ps`, `ps_af`, `pus`, `pbt_Arab`) where possible.
 
10
  - Include only resources with practical use for this repository.
11
 
12
  ## Structured catalog
 
6
 
7
  ## Validation method
8
  - Verify source URL resolves to official page or canonical repository.
9
+ - Verify explicit Pashto support markers (`Pashto`, `Pukhto`, `Pushto`, `Pakhto`, `پښتو`, `ps`, `ps_af`, `pus`, `pbt_Arab`) where possible.
10
+ - Reject resources where Pashto is only mentioned in passing and the primary work is focused on another language.
11
  - Include only resources with practical use for this repository.
12
 
13
  ## Structured catalog
docs/resource_cycle_runbook.md CHANGED
@@ -42,4 +42,6 @@ After discovery, promote only approved resources:
42
  ## Guardrails
43
  - Do not auto-promote candidates without evidence and license review.
44
  - Keep `status: verified` only for reviewed entries.
 
 
45
  - Generated files must be committed after catalog updates.
 
42
  ## Guardrails
43
  - Do not auto-promote candidates without evidence and license review.
44
  - Keep `status: verified` only for reviewed entries.
45
+ - Do not promote "reference-only" resources where Pashto is incidental; only Pashto-centric resources are eligible.
46
+ - Treat spelling variants as valid Pashto markers during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `پښتو`).
47
  - Generated files must be committed after catalog updates.
docs/search/resources.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "generated_on": "2026-02-16T00:00:00Z",
3
- "count": 77,
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
@@ -190,144 +190,6 @@
190
  "Pashto"
191
  ]
192
  },
193
- {
194
- "id": "model-whisper-large-v3",
195
- "title": "Whisper Large v3",
196
- "url": "https://huggingface.co/openai/whisper-large-v3",
197
- "category": "model",
198
- "source": "huggingface",
199
- "status": "verified",
200
- "summary": "Strong multilingual ASR baseline suitable for Pashto bootstrapping.",
201
- "primary_use": "ASR baseline and pseudo-labeling",
202
- "tasks": [
203
- "asr"
204
- ],
205
- "tags": [
206
- "pashto",
207
- "asr",
208
- "whisper"
209
- ],
210
- "evidence_text": "Whisper tokenizer map includes ps language key.",
211
- "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
212
- "markers": [
213
- "ps"
214
- ]
215
- },
216
- {
217
- "id": "model-mms-1b-all",
218
- "title": "MMS 1B All",
219
- "url": "https://huggingface.co/facebook/mms-1b-all",
220
- "category": "model",
221
- "source": "huggingface",
222
- "status": "verified",
223
- "summary": "Multilingual ASR model from MMS for low-resource transfer.",
224
- "primary_use": "ASR transfer baseline",
225
- "tasks": [
226
- "asr"
227
- ],
228
- "tags": [
229
- "pashto",
230
- "asr",
231
- "mms"
232
- ],
233
- "evidence_text": "MMS coverage table includes pus with ASR support.",
234
- "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
235
- "markers": [
236
- "pus"
237
- ]
238
- },
239
- {
240
- "id": "model-mms-tts",
241
- "title": "MMS TTS",
242
- "url": "https://huggingface.co/facebook/mms-tts",
243
- "category": "model",
244
- "source": "huggingface",
245
- "status": "verified",
246
- "summary": "Multilingual TTS checkpoints useful for Pashto voice synthesis.",
247
- "primary_use": "TTS baseline and transfer",
248
- "tasks": [
249
- "tts"
250
- ],
251
- "tags": [
252
- "pashto",
253
- "tts",
254
- "mms"
255
- ],
256
- "evidence_text": "MMS coverage table includes pus with TTS support.",
257
- "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
258
- "markers": [
259
- "pus"
260
- ]
261
- },
262
- {
263
- "id": "model-nllb-200-distilled-600m",
264
- "title": "NLLB-200 Distilled 600M",
265
- "url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
266
- "category": "model",
267
- "source": "huggingface",
268
- "status": "verified",
269
- "summary": "General multilingual translation model with Pashto script token support.",
270
- "primary_use": "Pashto translation baseline",
271
- "tasks": [
272
- "mt"
273
- ],
274
- "tags": [
275
- "pashto",
276
- "mt",
277
- "nllb"
278
- ],
279
- "evidence_text": "Model special token map includes pbt_Arab.",
280
- "evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
281
- "markers": [
282
- "pbt_Arab"
283
- ]
284
- },
285
- {
286
- "id": "model-opus-mt-en-mul",
287
- "title": "OPUS MT en-mul",
288
- "url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
289
- "category": "model",
290
- "source": "huggingface",
291
- "status": "verified",
292
- "summary": "Translation model that can route English into Pashto via multilingual set.",
293
- "primary_use": "English to Pashto translation path",
294
- "tasks": [
295
- "mt"
296
- ],
297
- "tags": [
298
- "pashto",
299
- "mt",
300
- "opus"
301
- ],
302
- "evidence_text": "Language list includes pus code.",
303
- "evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
304
- "markers": [
305
- "pus"
306
- ]
307
- },
308
- {
309
- "id": "model-opus-mt-mul-en",
310
- "title": "OPUS MT mul-en",
311
- "url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
312
- "category": "model",
313
- "source": "huggingface",
314
- "status": "verified",
315
- "summary": "Translation model for Pashto to English via multilingual encoder.",
316
- "primary_use": "Pashto to English translation path",
317
- "tasks": [
318
- "mt"
319
- ],
320
- "tags": [
321
- "pashto",
322
- "mt",
323
- "opus"
324
- ],
325
- "evidence_text": "Language list includes pus code.",
326
- "evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
327
- "markers": [
328
- "pus"
329
- ]
330
- },
331
  {
332
  "id": "model-pashto-bert",
333
  "title": "PashtoBERT",
@@ -447,150 +309,6 @@
447
  "pbt_Arab"
448
  ]
449
  },
450
- {
451
- "id": "tool-faster-whisper",
452
- "title": "Faster-Whisper",
453
- "url": "https://github.com/SYSTRAN/faster-whisper",
454
- "category": "tool",
455
- "source": "github",
456
- "status": "verified",
457
- "summary": "Optimized Whisper inference runtime for faster Pashto ASR experiments.",
458
- "primary_use": "ASR inference acceleration",
459
- "tasks": [
460
- "asr"
461
- ],
462
- "tags": [
463
- "pashto",
464
- "tooling",
465
- "asr"
466
- ],
467
- "evidence_text": "Whisper tokenizer includes ps and tool runs Whisper models.",
468
- "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
469
- "markers": [
470
- "ps"
471
- ]
472
- },
473
- {
474
- "id": "tool-coqui-tts",
475
- "title": "Coqui TTS",
476
- "url": "https://github.com/coqui-ai/TTS",
477
- "category": "tool",
478
- "source": "github",
479
- "status": "verified",
480
- "summary": "Open toolkit for TTS training and inference used for Pashto experiments.",
481
- "primary_use": "TTS training and inference",
482
- "tasks": [
483
- "tts"
484
- ],
485
- "tags": [
486
- "pashto",
487
- "tooling",
488
- "tts"
489
- ],
490
- "evidence_text": "Can be paired with Pashto-supporting MMS checkpoints.",
491
- "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
492
- "markers": [
493
- "pus"
494
- ]
495
- },
496
- {
497
- "id": "paper-whisper-2212-04356",
498
- "title": "Robust Speech Recognition via Large-Scale Weak Supervision",
499
- "url": "https://arxiv.org/abs/2212.04356",
500
- "category": "paper",
501
- "source": "arxiv",
502
- "status": "verified",
503
- "summary": "Whisper paper used as a foundational ASR reference for Pashto baselines.",
504
- "primary_use": "ASR methodology reference",
505
- "tasks": [
506
- "asr",
507
- "research"
508
- ],
509
- "tags": [
510
- "pashto",
511
- "paper",
512
- "asr"
513
- ],
514
- "evidence_text": "Paired with tokenizer language map containing ps.",
515
- "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
516
- "markers": [
517
- "ps"
518
- ]
519
- },
520
- {
521
- "id": "paper-mms-2305-13516",
522
- "title": "Scaling Speech Technology to 1,000+ Languages",
523
- "url": "https://arxiv.org/abs/2305.13516",
524
- "category": "paper",
525
- "source": "arxiv",
526
- "status": "verified",
527
- "summary": "MMS paper covering multilingual speech scaling and low-resource transfer.",
528
- "primary_use": "ASR and TTS transfer reference",
529
- "tasks": [
530
- "asr",
531
- "tts",
532
- "research"
533
- ],
534
- "tags": [
535
- "pashto",
536
- "paper",
537
- "speech"
538
- ],
539
- "evidence_text": "Coverage table marks pus support in MMS release.",
540
- "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
541
- "markers": [
542
- "pus"
543
- ]
544
- },
545
- {
546
- "id": "paper-nllb-2207-04672",
547
- "title": "No Language Left Behind",
548
- "url": "https://arxiv.org/abs/2207.04672",
549
- "category": "paper",
550
- "source": "arxiv",
551
- "status": "verified",
552
- "summary": "NLLB paper supporting multilingual MT strategy for Pashto integration.",
553
- "primary_use": "MT research reference",
554
- "tasks": [
555
- "mt",
556
- "research"
557
- ],
558
- "tags": [
559
- "pashto",
560
- "paper",
561
- "mt"
562
- ],
563
- "evidence_text": "Model usage in repo references pbt_Arab token support.",
564
- "evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
565
- "markers": [
566
- "pbt_Arab"
567
- ]
568
- },
569
- {
570
- "id": "paper-fleurs-2205-12446",
571
- "title": "FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech",
572
- "url": "https://arxiv.org/abs/2205.12446",
573
- "category": "paper",
574
- "source": "arxiv",
575
- "status": "verified",
576
- "summary": "FLEURS benchmark paper supporting multilingual speech evaluation including Pashto.",
577
- "primary_use": "Speech benchmark methodology reference",
578
- "tasks": [
579
- "asr",
580
- "benchmarking",
581
- "research"
582
- ],
583
- "tags": [
584
- "pashto",
585
- "paper",
586
- "benchmark"
587
- ],
588
- "evidence_text": "Dataset implementation includes ps_af language code.",
589
- "evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
590
- "markers": [
591
- "ps_af"
592
- ]
593
- },
594
  {
595
  "id": "dataset-nexdata-99h-pashto-dialogue",
596
  "title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
@@ -972,32 +690,6 @@
972
  "asr"
973
  ]
974
  },
975
- {
976
- "id": "code-github-mrychlik-worldly-ocr",
977
- "title": "worldly-ocr",
978
- "url": "https://github.com/mrychlik/worldly-ocr",
979
- "category": "code",
980
- "source": "github",
981
- "status": "verified",
982
- "summary": "Open OCR code project that explicitly includes Pashto among target languages.",
983
- "primary_use": "Pashto OCR code reference and experimentation",
984
- "tasks": [
985
- "ocr",
986
- "tooling"
987
- ],
988
- "tags": [
989
- "pashto",
990
- "code",
991
- "github",
992
- "ocr"
993
- ],
994
- "evidence_text": "Repository description explicitly says OCR for Pashto and Chinese.",
995
- "evidence_url": "https://api.github.com/repos/mrychlik/worldly-ocr",
996
- "markers": [
997
- "Pashto",
998
- "OCR"
999
- ]
1000
- },
1001
  {
1002
  "id": "paper-s2-psocr-lmm-pashto",
1003
  "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
@@ -1687,30 +1379,6 @@
1687
  "pashto"
1688
  ]
1689
  },
1690
- {
1691
- "id": "project-github-ihyacommunity-khushkhat-extension",
1692
- "title": "IhyaCommunity/Khushkhat-Extension",
1693
- "url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
1694
- "category": "project",
1695
- "source": "github",
1696
- "status": "verified",
1697
- "summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
1698
- "primary_use": "Interactive Pashto demo and quick qualitative validation",
1699
- "tasks": [
1700
- "demo"
1701
- ],
1702
- "tags": [
1703
- "pashto",
1704
- "project",
1705
- "github",
1706
- "demo"
1707
- ],
1708
- "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
1709
- "evidence_url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
1710
- "markers": [
1711
- "pashto"
1712
- ]
1713
- },
1714
  {
1715
  "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
1716
  "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
 
1
  {
2
  "generated_on": "2026-02-16T00:00:00Z",
3
+ "count": 63,
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
 
190
  "Pashto"
191
  ]
192
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  {
194
  "id": "model-pashto-bert",
195
  "title": "PashtoBERT",
 
309
  "pbt_Arab"
310
  ]
311
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  {
313
  "id": "dataset-nexdata-99h-pashto-dialogue",
314
  "title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
 
690
  "asr"
691
  ]
692
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
693
  {
694
  "id": "paper-s2-psocr-lmm-pashto",
695
  "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
 
1379
  "pashto"
1380
  ]
1381
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1382
  {
1383
  "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
1384
  "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
resources/README.md CHANGED
@@ -4,12 +4,12 @@ Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
  - Datasets (28): [datasets/README.md](datasets/README.md)
7
- - Models (18): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
- - Tools (2): [tools/README.md](tools/README.md)
10
- - Papers (12): [papers/README.md](papers/README.md)
11
- - Projects (11): [projects/README.md](projects/README.md)
12
- - Code (2): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
15
  - Canonical catalog: [catalog/resources.json](catalog/resources.json)
@@ -22,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
- Verified resource count: `77`
 
4
 
5
  ## Sections
6
  - Datasets (28): [datasets/README.md](datasets/README.md)
7
+ - Models (12): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
+ - Tools (0): [tools/README.md](tools/README.md)
10
+ - Papers (8): [papers/README.md](papers/README.md)
11
+ - Projects (10): [projects/README.md](projects/README.md)
12
+ - Code (1): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
15
  - Canonical catalog: [catalog/resources.json](catalog/resources.json)
 
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
+ Verified resource count: `63`
resources/catalog/README.md CHANGED
@@ -12,3 +12,7 @@ This folder holds machine-readable resource data used by docs and GitHub Pages s
12
  2. Run `python scripts/validate_resource_catalog.py`.
13
  3. Run `python scripts/generate_resource_views.py`.
14
  4. Commit both catalog and generated markdown/search files.
 
 
 
 
 
12
  2. Run `python scripts/validate_resource_catalog.py`.
13
  3. Run `python scripts/generate_resource_views.py`.
14
  4. Commit both catalog and generated markdown/search files.
15
+
16
+ ## Promotion guardrail
17
+ - Promote only Pashto-centric resources. Exclude entries where Pashto appears only as a side reference.
18
+ - Accept Pashto naming variants during review (`pashto`, `pukhto`, `pushto`, `pakhto`, `پښتو`).
resources/catalog/pending_candidates.json CHANGED
The diff for this file is too large to render. See raw diff
 
resources/catalog/resources.json CHANGED
@@ -206,156 +206,6 @@
206
  "kaggle"
207
  ]
208
  },
209
- {
210
- "id": "model-whisper-large-v3",
211
- "title": "Whisper Large v3",
212
- "url": "https://huggingface.co/openai/whisper-large-v3",
213
- "category": "model",
214
- "source": "huggingface",
215
- "status": "verified",
216
- "summary": "Strong multilingual ASR baseline suitable for Pashto bootstrapping.",
217
- "primary_use": "ASR baseline and pseudo-labeling",
218
- "tasks": [
219
- "asr"
220
- ],
221
- "pashto_evidence": {
222
- "evidence_text": "Whisper tokenizer map includes ps language key.",
223
- "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
224
- "markers": [
225
- "ps"
226
- ]
227
- },
228
- "tags": [
229
- "pashto",
230
- "asr",
231
- "whisper"
232
- ]
233
- },
234
- {
235
- "id": "model-mms-1b-all",
236
- "title": "MMS 1B All",
237
- "url": "https://huggingface.co/facebook/mms-1b-all",
238
- "category": "model",
239
- "source": "huggingface",
240
- "status": "verified",
241
- "summary": "Multilingual ASR model from MMS for low-resource transfer.",
242
- "primary_use": "ASR transfer baseline",
243
- "tasks": [
244
- "asr"
245
- ],
246
- "pashto_evidence": {
247
- "evidence_text": "MMS coverage table includes pus with ASR support.",
248
- "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
249
- "markers": [
250
- "pus"
251
- ]
252
- },
253
- "tags": [
254
- "pashto",
255
- "asr",
256
- "mms"
257
- ]
258
- },
259
- {
260
- "id": "model-mms-tts",
261
- "title": "MMS TTS",
262
- "url": "https://huggingface.co/facebook/mms-tts",
263
- "category": "model",
264
- "source": "huggingface",
265
- "status": "verified",
266
- "summary": "Multilingual TTS checkpoints useful for Pashto voice synthesis.",
267
- "primary_use": "TTS baseline and transfer",
268
- "tasks": [
269
- "tts"
270
- ],
271
- "pashto_evidence": {
272
- "evidence_text": "MMS coverage table includes pus with TTS support.",
273
- "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
274
- "markers": [
275
- "pus"
276
- ]
277
- },
278
- "tags": [
279
- "pashto",
280
- "tts",
281
- "mms"
282
- ]
283
- },
284
- {
285
- "id": "model-nllb-200-distilled-600m",
286
- "title": "NLLB-200 Distilled 600M",
287
- "url": "https://huggingface.co/facebook/nllb-200-distilled-600M",
288
- "category": "model",
289
- "source": "huggingface",
290
- "status": "verified",
291
- "summary": "General multilingual translation model with Pashto script token support.",
292
- "primary_use": "Pashto translation baseline",
293
- "tasks": [
294
- "mt"
295
- ],
296
- "pashto_evidence": {
297
- "evidence_text": "Model special token map includes pbt_Arab.",
298
- "evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
299
- "markers": [
300
- "pbt_Arab"
301
- ]
302
- },
303
- "tags": [
304
- "pashto",
305
- "mt",
306
- "nllb"
307
- ]
308
- },
309
- {
310
- "id": "model-opus-mt-en-mul",
311
- "title": "OPUS MT en-mul",
312
- "url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
313
- "category": "model",
314
- "source": "huggingface",
315
- "status": "verified",
316
- "summary": "Translation model that can route English into Pashto via multilingual set.",
317
- "primary_use": "English to Pashto translation path",
318
- "tasks": [
319
- "mt"
320
- ],
321
- "pashto_evidence": {
322
- "evidence_text": "Language list includes pus code.",
323
- "evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-en-mul",
324
- "markers": [
325
- "pus"
326
- ]
327
- },
328
- "tags": [
329
- "pashto",
330
- "mt",
331
- "opus"
332
- ]
333
- },
334
- {
335
- "id": "model-opus-mt-mul-en",
336
- "title": "OPUS MT mul-en",
337
- "url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
338
- "category": "model",
339
- "source": "huggingface",
340
- "status": "verified",
341
- "summary": "Translation model for Pashto to English via multilingual encoder.",
342
- "primary_use": "Pashto to English translation path",
343
- "tasks": [
344
- "mt"
345
- ],
346
- "pashto_evidence": {
347
- "evidence_text": "Language list includes pus code.",
348
- "evidence_url": "https://huggingface.co/Helsinki-NLP/opus-mt-mul-en",
349
- "markers": [
350
- "pus"
351
- ]
352
- },
353
- "tags": [
354
- "pashto",
355
- "mt",
356
- "opus"
357
- ]
358
- },
359
  {
360
  "id": "model-pashto-bert",
361
  "title": "PashtoBERT",
@@ -485,162 +335,6 @@
485
  "mt"
486
  ]
487
  },
488
- {
489
- "id": "tool-faster-whisper",
490
- "title": "Faster-Whisper",
491
- "url": "https://github.com/SYSTRAN/faster-whisper",
492
- "category": "tool",
493
- "source": "github",
494
- "status": "verified",
495
- "summary": "Optimized Whisper inference runtime for faster Pashto ASR experiments.",
496
- "primary_use": "ASR inference acceleration",
497
- "tasks": [
498
- "asr"
499
- ],
500
- "pashto_evidence": {
501
- "evidence_text": "Whisper tokenizer includes ps and tool runs Whisper models.",
502
- "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
503
- "markers": [
504
- "ps"
505
- ]
506
- },
507
- "tags": [
508
- "pashto",
509
- "tooling",
510
- "asr"
511
- ]
512
- },
513
- {
514
- "id": "tool-coqui-tts",
515
- "title": "Coqui TTS",
516
- "url": "https://github.com/coqui-ai/TTS",
517
- "category": "tool",
518
- "source": "github",
519
- "status": "verified",
520
- "summary": "Open toolkit for TTS training and inference used for Pashto experiments.",
521
- "primary_use": "TTS training and inference",
522
- "tasks": [
523
- "tts"
524
- ],
525
- "pashto_evidence": {
526
- "evidence_text": "Can be paired with Pashto-supporting MMS checkpoints.",
527
- "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
528
- "markers": [
529
- "pus"
530
- ]
531
- },
532
- "tags": [
533
- "pashto",
534
- "tooling",
535
- "tts"
536
- ]
537
- },
538
- {
539
- "id": "paper-whisper-2212-04356",
540
- "title": "Robust Speech Recognition via Large-Scale Weak Supervision",
541
- "url": "https://arxiv.org/abs/2212.04356",
542
- "category": "paper",
543
- "source": "arxiv",
544
- "status": "verified",
545
- "summary": "Whisper paper used as a foundational ASR reference for Pashto baselines.",
546
- "primary_use": "ASR methodology reference",
547
- "tasks": [
548
- "asr",
549
- "research"
550
- ],
551
- "pashto_evidence": {
552
- "evidence_text": "Paired with tokenizer language map containing ps.",
553
- "evidence_url": "https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py",
554
- "markers": [
555
- "ps"
556
- ]
557
- },
558
- "tags": [
559
- "pashto",
560
- "paper",
561
- "asr"
562
- ]
563
- },
564
- {
565
- "id": "paper-mms-2305-13516",
566
- "title": "Scaling Speech Technology to 1,000+ Languages",
567
- "url": "https://arxiv.org/abs/2305.13516",
568
- "category": "paper",
569
- "source": "arxiv",
570
- "status": "verified",
571
- "summary": "MMS paper covering multilingual speech scaling and low-resource transfer.",
572
- "primary_use": "ASR and TTS transfer reference",
573
- "tasks": [
574
- "asr",
575
- "tts",
576
- "research"
577
- ],
578
- "pashto_evidence": {
579
- "evidence_text": "Coverage table marks pus support in MMS release.",
580
- "evidence_url": "https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html",
581
- "markers": [
582
- "pus"
583
- ]
584
- },
585
- "tags": [
586
- "pashto",
587
- "paper",
588
- "speech"
589
- ]
590
- },
591
- {
592
- "id": "paper-nllb-2207-04672",
593
- "title": "No Language Left Behind",
594
- "url": "https://arxiv.org/abs/2207.04672",
595
- "category": "paper",
596
- "source": "arxiv",
597
- "status": "verified",
598
- "summary": "NLLB paper supporting multilingual MT strategy for Pashto integration.",
599
- "primary_use": "MT research reference",
600
- "tasks": [
601
- "mt",
602
- "research"
603
- ],
604
- "pashto_evidence": {
605
- "evidence_text": "Model usage in repo references pbt_Arab token support.",
606
- "evidence_url": "https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json",
607
- "markers": [
608
- "pbt_Arab"
609
- ]
610
- },
611
- "tags": [
612
- "pashto",
613
- "paper",
614
- "mt"
615
- ]
616
- },
617
- {
618
- "id": "paper-fleurs-2205-12446",
619
- "title": "FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech",
620
- "url": "https://arxiv.org/abs/2205.12446",
621
- "category": "paper",
622
- "source": "arxiv",
623
- "status": "verified",
624
- "summary": "FLEURS benchmark paper supporting multilingual speech evaluation including Pashto.",
625
- "primary_use": "Speech benchmark methodology reference",
626
- "tasks": [
627
- "asr",
628
- "benchmarking",
629
- "research"
630
- ],
631
- "pashto_evidence": {
632
- "evidence_text": "Dataset implementation includes ps_af language code.",
633
- "evidence_url": "https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py",
634
- "markers": [
635
- "ps_af"
636
- ]
637
- },
638
- "tags": [
639
- "pashto",
640
- "paper",
641
- "benchmark"
642
- ]
643
- },
644
  {
645
  "id": "dataset-nexdata-99h-pashto-dialogue",
646
  "title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
@@ -1061,34 +755,6 @@
1061
  "asr"
1062
  ]
1063
  },
1064
- {
1065
- "id": "code-github-mrychlik-worldly-ocr",
1066
- "title": "worldly-ocr",
1067
- "url": "https://github.com/mrychlik/worldly-ocr",
1068
- "category": "code",
1069
- "source": "github",
1070
- "status": "verified",
1071
- "summary": "Open OCR code project that explicitly includes Pashto among target languages.",
1072
- "primary_use": "Pashto OCR code reference and experimentation",
1073
- "tasks": [
1074
- "ocr",
1075
- "tooling"
1076
- ],
1077
- "pashto_evidence": {
1078
- "evidence_text": "Repository description explicitly says OCR for Pashto and Chinese.",
1079
- "evidence_url": "https://api.github.com/repos/mrychlik/worldly-ocr",
1080
- "markers": [
1081
- "Pashto",
1082
- "OCR"
1083
- ]
1084
- },
1085
- "tags": [
1086
- "pashto",
1087
- "code",
1088
- "github",
1089
- "ocr"
1090
- ]
1091
- },
1092
  {
1093
  "id": "paper-s2-psocr-lmm-pashto",
1094
  "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
@@ -1837,32 +1503,6 @@
1837
  "demo"
1838
  ]
1839
  },
1840
- {
1841
- "id": "project-github-ihyacommunity-khushkhat-extension",
1842
- "title": "IhyaCommunity/Khushkhat-Extension",
1843
- "url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
1844
- "category": "project",
1845
- "source": "github",
1846
- "status": "verified",
1847
- "summary": "Pashto-focused interactive project discovered from github for demonstration and quick evaluation.",
1848
- "primary_use": "Interactive Pashto demo and quick qualitative validation",
1849
- "tasks": [
1850
- "demo"
1851
- ],
1852
- "pashto_evidence": {
1853
- "evidence_text": "Repository metadata (name/description/topics) includes Pashto markers.",
1854
- "evidence_url": "https://github.com/IhyaCommunity/Khushkhat-Extension",
1855
- "markers": [
1856
- "pashto"
1857
- ]
1858
- },
1859
- "tags": [
1860
- "pashto",
1861
- "project",
1862
- "github",
1863
- "demo"
1864
- ]
1865
- },
1866
  {
1867
  "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
1868
  "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
 
206
  "kaggle"
207
  ]
208
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  {
210
  "id": "model-pashto-bert",
211
  "title": "PashtoBERT",
 
335
  "mt"
336
  ]
337
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  {
339
  "id": "dataset-nexdata-99h-pashto-dialogue",
340
  "title": "99 Hours Pashto Spontaneous Dialogue Smartphone Speech Dataset",
 
755
  "asr"
756
  ]
757
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
758
  {
759
  "id": "paper-s2-psocr-lmm-pashto",
760
  "title": "PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language",
 
1503
  "demo"
1504
  ]
1505
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1506
  {
1507
  "id": "paper-s2-benchmarking-whisper-for-low-resource-speech-recognition-an-n-shot-evaluation-on-pashto-pu",
1508
  "title": "Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu",
resources/models/README.md CHANGED
@@ -11,15 +11,11 @@
11
  | ihanif/whisper-small-pashto | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto) | Pashto ASR baseline and model comparison |
12
  | ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
13
  | ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
14
- | MMS 1B All | [huggingface](https://huggingface.co/facebook/mms-1b-all) | [MMS coverage table includes pus with ASR support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR transfer baseline |
15
- | MMS TTS | [huggingface](https://huggingface.co/facebook/mms-tts) | [MMS coverage table includes pus with TTS support. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS baseline and transfer |
16
- | NLLB-200 Distilled 600M | [huggingface](https://huggingface.co/facebook/nllb-200-distilled-600M) | [Model special token map includes pbt_Arab. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | Pashto translation baseline |
17
- | OPUS MT en-mul | [huggingface](https://huggingface.co/Helsinki-NLP/opus-mt-en-mul) | [Language list includes pus code. (`pus`)](https://huggingface.co/Helsinki-NLP/opus-mt-en-mul) | English to Pashto translation path |
18
- | OPUS MT mul-en | [huggingface](https://huggingface.co/Helsinki-NLP/opus-mt-mul-en) | [Language list includes pus code. (`pus`)](https://huggingface.co/Helsinki-NLP/opus-mt-mul-en) | Pashto to English translation path |
19
  | PashtoBERT | [huggingface](https://huggingface.co/mdarhri/pashto-bert) | [Model card states training on Pashto corpus data. (`Pashto`)](https://huggingface.co/mdarhri/pashto-bert) | Pashto NLP baseline encoder |
20
  | wav2vec2 XLS-R 300M Pashto | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | [Model tags include pashto and ps, and model index references FLEURS config ps_af. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | Pashto ASR baseline and comparative experiments |
21
- | Whisper Large v3 | [huggingface](https://huggingface.co/openai/whisper-large-v3) | [Whisper tokenizer map includes ps language key. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR baseline and pseudo-labeling |
22
  | Whisper Medium Pashto | [huggingface](https://huggingface.co/ihanif/whisper-medium-pashto) | [Model tags include pashto and ps, and model index uses FLEURS ps_af split. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/whisper-medium-pashto) | Pashto ASR baseline and transcription quality comparisons |
 
23
 
24
  ## Maintenance
25
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
 
11
  | ihanif/whisper-small-pashto | [huggingface](https://huggingface.co/ihanif/whisper-small-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/whisper-small-pashto) | Pashto ASR baseline and model comparison |
12
  | ihanif/xls-r-1b-pashto | [huggingface](https://huggingface.co/ihanif/xls-r-1b-pashto) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ihanif/xls-r-1b-pashto) | Pashto ASR baseline and model comparison |
13
  | ijazulhaq/bert-base-pashto-v1 | [huggingface](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | [Matched by Pashto keyword in Hugging Face search results. (`pashto`)](https://huggingface.co/ijazulhaq/bert-base-pashto-v1) | Pashto model baseline for downstream NLP tasks |
 
 
 
 
 
14
  | PashtoBERT | [huggingface](https://huggingface.co/mdarhri/pashto-bert) | [Model card states training on Pashto corpus data. (`Pashto`)](https://huggingface.co/mdarhri/pashto-bert) | Pashto NLP baseline encoder |
15
  | wav2vec2 XLS-R 300M Pashto | [huggingface](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | [Model tags include pashto and ps, and model index references FLEURS config ps_af. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/wav2vec2-xls-r-300m-pashto) | Pashto ASR baseline and comparative experiments |
16
+ | Whisper Base Pashto | [huggingface](https://huggingface.co/ihanif/whisper-base-pashto) | [Model ID includes Pashto and card metadata references FLEURS config ps_af. (`Pashto`, `ps_af`)](https://huggingface.co/api/models/ihanif/whisper-base-pashto) | Pashto ASR baseline and speed-accuracy comparison |
17
  | Whisper Medium Pashto | [huggingface](https://huggingface.co/ihanif/whisper-medium-pashto) | [Model tags include pashto and ps, and model index uses FLEURS ps_af split. (`pashto`, `ps`, `ps_af`)](https://huggingface.co/ihanif/whisper-medium-pashto) | Pashto ASR baseline and transcription quality comparisons |
18
+ | zirak-ai/pashto-bert-v1 | [huggingface](https://huggingface.co/zirak-ai/pashto-bert-v1) | [Hugging Face model ID and search tags explicitly include pashto marker. (`pashto`)](https://huggingface.co/zirak-ai/pashto-bert-v1) | Pashto encoder baseline for NLP tasks |
19
 
20
  ## Maintenance
21
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
resources/papers/README.md CHANGED
@@ -7,15 +7,11 @@
7
  | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
8
  | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
9
  | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
10
- | FLEURS: Few-shot Learning Evaluation of Universal Representations of Speech | [arxiv](https://arxiv.org/abs/2205.12446) | [Dataset implementation includes ps_af language code. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark methodology reference |
11
  | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
12
- | No Language Left Behind | [arxiv](https://arxiv.org/abs/2207.04672) | [Model usage in repo references pbt_Arab token support. (`pbt_Arab`)](https://huggingface.co/facebook/nllb-200-distilled-600M/blob/main/special_tokens_map.json) | MT research reference |
13
  | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
14
  | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
15
  | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
16
  | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
17
- | Robust Speech Recognition via Large-Scale Weak Supervision | [arxiv](https://arxiv.org/abs/2212.04356) | [Paired with tokenizer language map containing ps. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR methodology reference |
18
- | Scaling Speech Technology to 1,000+ Languages | [arxiv](https://arxiv.org/abs/2305.13516) | [Coverage table marks pus support in MMS release. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | ASR and TTS transfer reference |
19
 
20
  ## Maintenance
21
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
 
7
  | Benchmarking Whisper for Low-Resource Speech Recognition: An N-Shot Evaluation on Pashto, Punjabi, and Urdu | [other](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/13104eddc785756132a19242ac7e74442b145693) | Pashto research reference for methods and benchmarking |
8
  | Deep Learning-Based Detection of One and Two-Column Textual Blocks in Camera-Captured Pashto Documents Images | [other](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/8c9d2628e23d5c27edc656071f11f0e78124d182) | Pashto research reference for methods and benchmarking |
9
  | Enhancing Pashto Text Classification using Language Processing Techniques for Single And Multi-Label Analysis | [arxiv](http://arxiv.org/abs/2305.03201v1) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/2305.03201v1) | Pashto research reference for methods and benchmarking |
 
10
  | KNN and ANN-based Recognition of Handwritten Pashto Letters using Zoning Features | [arxiv](http://arxiv.org/abs/1904.03391v2) | [Matched by arXiv query: all:pashto. (`pashto`)](http://arxiv.org/abs/1904.03391v2) | Pashto research reference for methods and benchmarking |
 
11
  | Out-of-Vocabulary Pashto Spell Checker using Morphological Operations | [other](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/802aae68a6a7fdfb29d51be03fb2b09e29311fa7) | Pashto research reference for methods and benchmarking |
12
  | Pashto Shallow Parsing: A Deep Learning Approach | [other](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/be36455bb4eca60accb3e6866f345132f0dac1e5) | Pashto research reference for methods and benchmarking |
13
  | POS tagging of low-resource Pashto language: annotated corpus and BERT-based model | [other](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | [Matched by Semantic Scholar query: pashto. (`pashto`)](https://www.semanticscholar.org/paper/1b2d5c896fec735483e8c8fb0a75e13125e08769) | Pashto research reference for methods and benchmarking |
14
  | PsOCR: Benchmarking Large Multimodal Models for Optical Character Recognition in Low-resource Pashto Language | [other](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | [Paper title explicitly references low-resource Pashto language OCR benchmarking. (`Pashto`, `OCR`)](https://www.semanticscholar.org/paper/d2743c0dcdbc65f5b46fcec2f0ba7cb379c4134f) | Pashto OCR research baseline and evaluation reference |
 
 
15
 
16
  ## Maintenance
17
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
resources/projects/README.md CHANGED
@@ -6,7 +6,6 @@
6
  |---|---|---|---|
7
  | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
8
  | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
9
- | IhyaCommunity/Khushkhat-Extension | [github](https://github.com/IhyaCommunity/Khushkhat-Extension) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/IhyaCommunity/Khushkhat-Extension) | Interactive Pashto demo and quick qualitative validation |
10
  | nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
11
  | Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
12
  | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
 
6
  |---|---|---|---|
7
  | Fazlullahmamond/Pashto-Typing | [github](https://github.com/Fazlullahmamond/Pashto-Typing) | [Repository metadata (name/description/topics) includes Pashto markers. (`pashto`)](https://github.com/Fazlullahmamond/Pashto-Typing) | Interactive Pashto demo and quick qualitative validation |
8
  | ihanif/wav2vec2-bert-pashto-asr | [huggingface](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/ihanif/wav2vec2-bert-pashto-asr) | Interactive Pashto demo and quick qualitative validation |
 
9
  | nasirkhansayyad/pashto-whisper-demo | [huggingface](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | [Matched by Pashto keyword in Hugging Face Spaces search. (`pashto`)](https://huggingface.co/spaces/nasirkhansayyad/pashto-whisper-demo) | Interactive Pashto demo and quick qualitative validation |
10
  | Pashto ASR Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr) | [Space ID includes pashto-asr and is returned by Hugging Face Pashto space search. (`pashto`, `asr`)](https://huggingface.co/api/spaces/ihanif/pashto-asr) | Live Pashto speech-to-text demo project |
11
  | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
resources/tools/README.md CHANGED
@@ -4,8 +4,7 @@
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
- | Coqui TTS | [github](https://github.com/coqui-ai/TTS) | [Can be paired with Pashto-supporting MMS checkpoints. (`pus`)](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) | TTS training and inference |
8
- | Faster-Whisper | [github](https://github.com/SYSTRAN/faster-whisper) | [Whisper tokenizer includes ps and tool runs Whisper models. (`ps`)](https://raw.githubusercontent.com/openai/whisper/main/whisper/tokenizer.py) | ASR inference acceleration |
9
 
10
  ## Maintenance
11
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
 
4
 
5
  | Resource | Link | Pashto Evidence | Primary Use |
6
  |---|---|---|---|
7
+ | _None yet_ | - | - | - |
 
8
 
9
  ## Maintenance
10
  - Source of truth: [../catalog/resources.json](../catalog/resources.json)
scripts/sync_resources.py CHANGED
@@ -30,6 +30,19 @@ from urllib.error import HTTPError, URLError
30
  USER_AGENT = "pashto-resource-sync/1.0"
31
  MAX_FETCH_RETRIES = 4
32
  RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  def _slug(value: str) -> str:
@@ -39,6 +52,28 @@ def _slug(value: str) -> str:
39
  return value[:80] if value else "resource"
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
43
  if not retry_after:
44
  return None
@@ -201,16 +236,35 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
201
  if kind not in {"datasets", "models"}:
202
  return []
203
 
204
- query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
205
- url = f"https://huggingface.co/api/{kind}?{query}"
206
- payload = _fetch_json(url, source_name=f"huggingface-{kind}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
  category = "dataset" if kind == "datasets" else "model"
209
  out: list[dict[str, Any]] = []
210
- for item in payload:
211
  repo_id = item.get("id") or item.get("modelId")
212
  if not repo_id:
213
  continue
 
 
 
 
214
  repo_url = f"https://huggingface.co/{'datasets/' if kind == 'datasets' else ''}{repo_id}"
215
  rid = f"candidate-hf-{kind[:-1]}-{_slug(repo_id)}"
216
  out.append(
@@ -227,19 +281,40 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
227
  tags=["pashto", "candidate", category],
228
  )
229
  )
 
 
230
  return out
231
 
232
 
233
  def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
234
- query = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
235
- url = f"https://huggingface.co/api/spaces?{query}"
236
- payload = _fetch_json(url, source_name="huggingface-spaces")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  out: list[dict[str, Any]] = []
239
- for item in payload:
240
  space_id = item.get("id")
241
  if not space_id:
242
  continue
 
 
 
 
243
  space_url = f"https://huggingface.co/spaces/{space_id}"
244
  rid = f"candidate-hf-project-{_slug(space_id)}"
245
  summary = "Candidate project app returned from Hugging Face Spaces Pashto search."
@@ -257,17 +332,36 @@ def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
257
  tags=["pashto", "candidate", "project", "space"],
258
  )
259
  )
 
 
260
  return out
261
 
262
 
263
  def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
264
  # Public Kaggle dataset listing endpoint (no auth needed for list responses).
265
- query = urllib.parse.urlencode({"search": "pashto", "page": "1"})
266
- url = f"https://www.kaggle.com/api/v1/datasets/list?{query}"
267
- payload = _fetch_json(url, source_name="kaggle-datasets")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
 
269
  out: list[dict[str, Any]] = []
270
- for item in payload:
271
  title = (item.get("titleNullable") or "").strip()
272
  dataset_url = (item.get("urlNullable") or "").strip()
273
  owner = (item.get("ownerRefNullable") or "").strip()
@@ -275,8 +369,9 @@ def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
275
  if not title or not dataset_url:
276
  continue
277
 
278
- blob = f"{title} {subtitle}".lower()
279
- if "pashto" not in blob and "pukhto" not in blob:
 
280
  continue
281
 
282
  owner_prefix = f"{owner}/" if owner else ""
@@ -304,7 +399,11 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
304
  # Query by topic first for high precision, then by keyword for recall.
305
  query_variants = [
306
  "topic:pashto",
 
307
  "pashto in:name,description,readme",
 
 
 
308
  ]
309
 
310
  combined: dict[str, dict[str, Any]] = {}
@@ -334,8 +433,10 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
334
  item.get("description") or "",
335
  " ".join(item.get("topics") or []),
336
  ]
337
- ).lower()
338
- if "pashto" not in name_blob and "pukhto" not in name_blob:
 
 
339
  continue
340
 
341
  html_url = item["html_url"]
@@ -367,70 +468,110 @@ def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
367
 
368
 
369
  def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
370
- query = urllib.parse.urlencode(
371
- {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
372
- )
373
- url = f"https://export.arxiv.org/api/query?{query}"
374
- try:
375
- xml_text = _fetch_text(url, timeout=30.0, source_name="arxiv")
376
- except Exception as exc: # noqa: BLE001
377
- if not _is_ssl_cert_error(exc):
378
- raise
379
- # arXiv occasionally fails cert chain validation in some runner images.
380
- insecure_context = ssl._create_unverified_context()
381
- print("[warn] arxiv SSL verification failed; retrying with unverified TLS context")
382
- xml_text = _fetch_text(
383
- url,
384
- timeout=30.0,
385
- ssl_context=insecure_context,
386
- source_name="arxiv",
387
  )
388
- root = ET.fromstring(xml_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  ns = {"atom": "http://www.w3.org/2005/Atom"}
390
 
 
391
  out: list[dict[str, Any]] = []
392
- for entry in root.findall("atom:entry", ns):
393
- title = (entry.findtext("atom:title", default="", namespaces=ns) or "").strip()
394
- link = (entry.findtext("atom:id", default="", namespaces=ns) or "").strip()
395
- summary = (entry.findtext("atom:summary", default="", namespaces=ns) or "").strip()
396
- if not title or not link:
397
- continue
 
 
 
 
 
 
 
 
398
 
399
- rid = f"candidate-arxiv-{_slug(title)}"
400
- out.append(
401
- _candidate(
402
- rid=rid,
403
- title=title,
404
- url=link,
405
- category="paper",
406
- source="arxiv",
407
- summary=summary[:240] if summary else "Candidate paper returned from arXiv query for Pashto.",
408
- evidence_text="Matched by arXiv query: all:pashto.",
409
- evidence_url=link,
410
- markers=["pashto"],
411
- tags=["pashto", "candidate", "paper"],
 
 
412
  )
413
- )
 
414
  return out
415
 
416
 
417
  def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
418
  fields = "title,url,abstract,year,externalIds"
419
- query = urllib.parse.urlencode(
420
- {"query": "pashto", "limit": str(limit), "fields": fields}
421
- )
422
- url = f"https://api.semanticscholar.org/graph/v1/paper/search?{query}"
423
- payload = _fetch_json(
424
- url,
425
- timeout=30.0,
426
- source_name="semantic-scholar",
427
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
 
429
  out: list[dict[str, Any]] = []
430
- for item in payload.get("data", []):
431
  title = (item.get("title") or "").strip()
432
  if not title:
433
  continue
 
 
 
 
 
434
  paper_url = (item.get("url") or "").strip()
435
  if not paper_url:
436
  ext = item.get("externalIds") or {}
@@ -450,12 +591,14 @@ def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
450
  category="paper",
451
  source="other",
452
  summary=summary[:240] if summary else "Candidate paper returned from Semantic Scholar search for Pashto.",
453
- evidence_text="Matched by Semantic Scholar query: pashto.",
454
  evidence_url=paper_url,
455
  markers=["pashto"],
456
  tags=["pashto", "candidate", "paper"],
457
  )
458
  )
 
 
459
  return out
460
 
461
 
 
30
  USER_AGENT = "pashto-resource-sync/1.0"
31
  MAX_FETCH_RETRIES = 4
32
  RETRYABLE_HTTP_CODES = {429, 500, 502, 503, 504}
33
+ PASHTO_QUERY_TERMS = ["pashto", "pukhto", "pushto", "pakhto"]
34
+ PASHTO_TEXT_MARKERS = ("pashto", "pukhto", "pushto", "pakhto")
35
+ PASHTO_SCRIPT_MARKERS = ("پښتو", "پشتو")
36
+ PASHTO_WORD_RE = re.compile(
37
+ r"(?<![A-Za-z0-9])(pashto|pukhto|pushto|pakhto)(?![A-Za-z0-9])",
38
+ re.IGNORECASE,
39
+ )
40
+ PASHTO_CAMEL_RE = re.compile(
41
+ r"(?<![A-Za-z0-9])(pashto|pukhto|pakhto)(?=[A-Z])",
42
+ re.IGNORECASE,
43
+ )
44
+ PASHTO_CODE_RE = re.compile(r"\b(ps(_af)?|pus|pbt[_-]?arab)\b", re.IGNORECASE)
45
+ LOW_SIGNAL_RE = re.compile(r"(^|[-_/ ])(test|tmp|trial|scratch)([-_/ ]|$)", re.IGNORECASE)
46
 
47
 
48
  def _slug(value: str) -> str:
 
52
  return value[:80] if value else "resource"
53
 
54
 
55
def _contains_pashto_marker(value: str) -> bool:
    """Report whether *value* carries an explicit Pashto marker.

    Checks are tried in this order and the first hit wins:

    1. A romanized spelling delimited by non-alphanumerics (``PASHTO_WORD_RE``).
    2. A romanized spelling glued to a following letter (``PASHTO_CAMEL_RE``).
    3. A native-script spelling from ``PASHTO_SCRIPT_MARKERS``.
    4. A language code such as ``ps``, ``ps_af``, ``pus`` or ``pbt_Arab``
       (``PASHTO_CODE_RE``), matched against the case-folded text.

    Empty, whitespace-only, or falsy input yields ``False``.

    NOTE(review): ``PASHTO_CAMEL_RE`` is compiled with ``re.IGNORECASE``, so
    its ``[A-Z]`` lookahead also accepts lowercase letters (for example
    "pashtoon" matches) -- confirm that this permissiveness is intended.
    """
    candidate = (value or "").strip()
    if not candidate:
        return False

    # Romanized spellings: standalone word first, then the glued/camel form.
    if PASHTO_WORD_RE.search(candidate) or PASHTO_CAMEL_RE.search(candidate):
        return True

    # Native-script spellings are matched as plain substrings.
    for script_marker in PASHTO_SCRIPT_MARKERS:
        if script_marker in candidate:
            return True

    # Language codes are the last resort.
    return PASHTO_CODE_RE.search(candidate.casefold()) is not None
67
+
68
+
69
def _is_pashto_centric(*values: str) -> bool:
    """Return True if at least one of *values* contains a Pashto marker.

    Each value is tested with ``_contains_pashto_marker``; evaluation stops
    at the first match. Calling with no values returns ``False``.
    """
    for value in values:
        if _contains_pashto_marker(value):
            return True
    return False
71
+
72
+
73
def _is_low_signal_name(value: str) -> bool:
    """Return True when *value* matches ``LOW_SIGNAL_RE``.

    A falsy *value* (``None`` or ``""``) is searched as the empty string.
    """
    text = value or ""
    return LOW_SIGNAL_RE.search(text) is not None
75
+
76
+
77
  def _parse_retry_after_seconds(retry_after: str | None) -> float | None:
78
  if not retry_after:
79
  return None
 
236
  if kind not in {"datasets", "models"}:
237
  return []
238
 
239
+ combined: dict[str, dict[str, Any]] = {}
240
+ errors: list[str] = []
241
+ for term in PASHTO_QUERY_TERMS:
242
+ query = urllib.parse.urlencode({"search": term, "limit": str(limit)})
243
+ url = f"https://huggingface.co/api/{kind}?{query}"
244
+ try:
245
+ payload = _fetch_json(url, source_name=f"huggingface-{kind}")
246
+ except Exception as exc: # noqa: BLE001
247
+ errors.append(f"{term}: {exc}")
248
+ continue
249
+ for item in payload:
250
+ repo_id = item.get("id") or item.get("modelId")
251
+ if not repo_id:
252
+ continue
253
+ combined[repo_id] = item
254
+
255
+ if not combined and errors:
256
+ raise RuntimeError("; ".join(errors))
257
 
258
  category = "dataset" if kind == "datasets" else "model"
259
  out: list[dict[str, Any]] = []
260
+ for item in combined.values():
261
  repo_id = item.get("id") or item.get("modelId")
262
  if not repo_id:
263
  continue
264
+ if not _is_pashto_centric(repo_id):
265
+ continue
266
+ if _is_low_signal_name(repo_id):
267
+ continue
268
  repo_url = f"https://huggingface.co/{'datasets/' if kind == 'datasets' else ''}{repo_id}"
269
  rid = f"candidate-hf-{kind[:-1]}-{_slug(repo_id)}"
270
  out.append(
 
281
  tags=["pashto", "candidate", category],
282
  )
283
  )
284
+ if len(out) >= limit:
285
+ break
286
  return out
287
 
288
 
289
  def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
290
+ combined: dict[str, dict[str, Any]] = {}
291
+ errors: list[str] = []
292
+ for term in PASHTO_QUERY_TERMS:
293
+ query = urllib.parse.urlencode({"search": term, "limit": str(limit)})
294
+ url = f"https://huggingface.co/api/spaces?{query}"
295
+ try:
296
+ payload = _fetch_json(url, source_name="huggingface-spaces")
297
+ except Exception as exc: # noqa: BLE001
298
+ errors.append(f"{term}: {exc}")
299
+ continue
300
+ for item in payload:
301
+ space_id = item.get("id")
302
+ if not space_id:
303
+ continue
304
+ combined[space_id] = item
305
+
306
+ if not combined and errors:
307
+ raise RuntimeError("; ".join(errors))
308
 
309
  out: list[dict[str, Any]] = []
310
+ for item in combined.values():
311
  space_id = item.get("id")
312
  if not space_id:
313
  continue
314
+ if not _is_pashto_centric(space_id):
315
+ continue
316
+ if _is_low_signal_name(space_id):
317
+ continue
318
  space_url = f"https://huggingface.co/spaces/{space_id}"
319
  rid = f"candidate-hf-project-{_slug(space_id)}"
320
  summary = "Candidate project app returned from Hugging Face Spaces Pashto search."
 
332
  tags=["pashto", "candidate", "project", "space"],
333
  )
334
  )
335
+ if len(out) >= limit:
336
+ break
337
  return out
338
 
339
 
340
  def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
341
  # Public Kaggle dataset listing endpoint (no auth needed for list responses).
342
+ combined: list[dict[str, Any]] = []
343
+ seen_urls: set[str] = set()
344
+ errors: list[str] = []
345
+ for term in PASHTO_QUERY_TERMS:
346
+ query = urllib.parse.urlencode({"search": term, "page": "1"})
347
+ url = f"https://www.kaggle.com/api/v1/datasets/list?{query}"
348
+ try:
349
+ payload = _fetch_json(url, source_name="kaggle-datasets")
350
+ except Exception as exc: # noqa: BLE001
351
+ errors.append(f"{term}: {exc}")
352
+ continue
353
+ for item in payload:
354
+ dataset_url = (item.get("urlNullable") or "").strip()
355
+ if not dataset_url or dataset_url in seen_urls:
356
+ continue
357
+ seen_urls.add(dataset_url)
358
+ combined.append(item)
359
+
360
+ if not combined and errors:
361
+ raise RuntimeError("; ".join(errors))
362
 
363
  out: list[dict[str, Any]] = []
364
+ for item in combined:
365
  title = (item.get("titleNullable") or "").strip()
366
  dataset_url = (item.get("urlNullable") or "").strip()
367
  owner = (item.get("ownerRefNullable") or "").strip()
 
369
  if not title or not dataset_url:
370
  continue
371
 
372
+ if not _is_pashto_centric(title, subtitle):
373
+ continue
374
+ if _is_low_signal_name(title):
375
  continue
376
 
377
  owner_prefix = f"{owner}/" if owner else ""
 
399
  # Query by topic first for high precision, then by keyword for recall.
400
  query_variants = [
401
  "topic:pashto",
402
+ "topic:pukhto",
403
  "pashto in:name,description,readme",
404
+ "pukhto in:name,description,readme",
405
+ "pushto in:name,description,readme",
406
+ "pakhto in:name,description,readme",
407
  ]
408
 
409
  combined: dict[str, dict[str, Any]] = {}
 
433
  item.get("description") or "",
434
  " ".join(item.get("topics") or []),
435
  ]
436
+ )
437
+ if not _is_pashto_centric(name_blob):
438
+ continue
439
+ if _is_low_signal_name(full_name):
440
  continue
441
 
442
  html_url = item["html_url"]
 
468
 
469
 
470
  def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
471
+ roots: list[ET.Element] = []
472
+ errors: list[str] = []
473
+ for term in PASHTO_QUERY_TERMS:
474
+ query = urllib.parse.urlencode(
475
+ {"search_query": f"all:{term}", "start": "0", "max_results": str(limit)}
 
 
 
 
 
 
 
 
 
 
 
 
476
  )
477
+ url = f"https://export.arxiv.org/api/query?{query}"
478
+ try:
479
+ xml_text = _fetch_text(url, timeout=30.0, source_name="arxiv")
480
+ except Exception as exc: # noqa: BLE001
481
+ if not _is_ssl_cert_error(exc):
482
+ errors.append(f"{term}: {exc}")
483
+ continue
484
+ # arXiv occasionally fails cert chain validation in some runner images.
485
+ insecure_context = ssl._create_unverified_context()
486
+ print("[warn] arxiv SSL verification failed; retrying with unverified TLS context")
487
+ xml_text = _fetch_text(
488
+ url,
489
+ timeout=30.0,
490
+ ssl_context=insecure_context,
491
+ source_name="arxiv",
492
+ )
493
+ roots.append(ET.fromstring(xml_text))
494
+
495
+ if not roots and errors:
496
+ raise RuntimeError("; ".join(errors))
497
+
498
  ns = {"atom": "http://www.w3.org/2005/Atom"}
499
 
500
+ seen_links: set[str] = set()
501
  out: list[dict[str, Any]] = []
502
+ for root in roots:
503
+ for entry in root.findall("atom:entry", ns):
504
+ title = (entry.findtext("atom:title", default="", namespaces=ns) or "").strip()
505
+ link = (entry.findtext("atom:id", default="", namespaces=ns) or "").strip()
506
+ summary = (entry.findtext("atom:summary", default="", namespaces=ns) or "").strip()
507
+ if not title or not link:
508
+ continue
509
+ if link in seen_links:
510
+ continue
511
+ # Strict: keep only papers with explicit Pashto markers in title.
512
+ if not _is_pashto_centric(title):
513
+ continue
514
+ if _is_low_signal_name(title):
515
+ continue
516
 
517
+ seen_links.add(link)
518
+ rid = f"candidate-arxiv-{_slug(title)}"
519
+ out.append(
520
+ _candidate(
521
+ rid=rid,
522
+ title=title,
523
+ url=link,
524
+ category="paper",
525
+ source="arxiv",
526
+ summary=summary[:240] if summary else "Candidate paper returned from arXiv query for Pashto.",
527
+ evidence_text="Matched by Pashto marker in paper title from arXiv query results.",
528
+ evidence_url=link,
529
+ markers=["pashto"],
530
+ tags=["pashto", "candidate", "paper"],
531
+ )
532
  )
533
+ if len(out) >= limit:
534
+ return out
535
  return out
536
 
537
 
538
  def fetch_semantic_scholar(limit: int) -> list[dict[str, Any]]:
539
  fields = "title,url,abstract,year,externalIds"
540
+ combined: dict[str, dict[str, Any]] = {}
541
+ errors: list[str] = []
542
+ for term in PASHTO_QUERY_TERMS:
543
+ query = urllib.parse.urlencode(
544
+ {"query": term, "limit": str(limit), "fields": fields}
545
+ )
546
+ url = f"https://api.semanticscholar.org/graph/v1/paper/search?{query}"
547
+ try:
548
+ payload = _fetch_json(
549
+ url,
550
+ timeout=30.0,
551
+ source_name="semantic-scholar",
552
+ )
553
+ except Exception as exc: # noqa: BLE001
554
+ errors.append(f"{term}: {exc}")
555
+ continue
556
+ for item in payload.get("data", []):
557
+ title = (item.get("title") or "").strip()
558
+ if not title:
559
+ continue
560
+ combined[title] = item
561
+
562
+ if not combined and errors:
563
+ raise RuntimeError("; ".join(errors))
564
 
565
  out: list[dict[str, Any]] = []
566
+ for item in combined.values():
567
  title = (item.get("title") or "").strip()
568
  if not title:
569
  continue
570
+ # Strict: keep only papers with explicit Pashto markers in title.
571
+ if not _is_pashto_centric(title):
572
+ continue
573
+ if _is_low_signal_name(title):
574
+ continue
575
  paper_url = (item.get("url") or "").strip()
576
  if not paper_url:
577
  ext = item.get("externalIds") or {}
 
591
  category="paper",
592
  source="other",
593
  summary=summary[:240] if summary else "Candidate paper returned from Semantic Scholar search for Pashto.",
594
+ evidence_text="Matched by explicit Pashto marker in paper title from Semantic Scholar search.",
595
  evidence_url=paper_url,
596
  markers=["pashto"],
597
  tags=["pashto", "candidate", "paper"],
598
  )
599
  )
600
+ if len(out) >= limit:
601
+ break
602
  return out
603
 
604
 
scripts/validate_resource_catalog.py CHANGED
@@ -20,6 +20,9 @@ ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project
20
  ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
21
  ALLOWED_STATUS = {"verified", "candidate"}
22
  RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
 
 
 
23
 
24
 
25
  def _load_json(path: Path) -> dict[str, Any]:
@@ -39,6 +42,19 @@ def _validate_iso_date(value: str) -> bool:
39
  return True
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
43
  errors: list[str] = []
44
  prefix = f"resource[{index}]"
@@ -123,6 +139,14 @@ def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
123
  if not (isinstance(markers, list) and markers and all(isinstance(marker, str) and marker.strip() for marker in markers)):
124
  errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")
125
 
 
 
 
 
 
 
 
 
126
  return errors
127
 
128
 
 
20
  ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
21
  ALLOWED_STATUS = {"verified", "candidate"}
22
  RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
23
# Categories whose entries must carry a Pashto marker in their title or URL.
STRICT_PASHTO_CATEGORIES = {"model", "paper", "tool", "code", "project"}

# "pushto" only when it is not glued to other ASCII letters/digits, so runs
# such as "pushtoken" are not mistaken for the language name. The pattern is
# lowercase-only: callers are expected to pass case-folded text.
PASHTO_PUSHTO_WORD_RE = re.compile(r"(?<![a-z0-9])pushto(?![a-z0-9])")

# Language/locale codes: ps, ps_af, pus, pbt_arab / pbt-arab / pbtarab.
# Explicit alphanumeric lookarounds are used instead of ``\b`` because ``\b``
# counts "_" as a word character and would therefore miss codes delimited by
# underscores (e.g. "common_voice_ps", "mms_tts_pus"), which are common in
# dataset and model identifiers. Lowercase-only, like the pattern above.
PASHTO_CODE_RE = re.compile(
    r"(?<![a-z0-9])(ps(_af)?|pus|pbt[_-]?arab)(?![a-z0-9])"
)
26
 
27
 
28
  def _load_json(path: Path) -> dict[str, Any]:
 
42
  return True
43
 
44
 
45
def _contains_pashto_marker(value: str) -> bool:
    """Report whether ``value`` carries an explicit Pashto marker.

    Anything that is not a ``str`` is never a match. Latin-script checks run
    on the case-folded text: the substrings "pashto", "pukhto" and "pakhto",
    then ``PASHTO_PUSHTO_WORD_RE`` and ``PASHTO_CODE_RE``. Native-script
    spellings are looked up in the unmodified value.
    """
    if not isinstance(value, str):
        return False

    folded = value.casefold()
    latin_hit = (
        "pashto" in folded
        or "pukhto" in folded
        or "pakhto" in folded
        or PASHTO_PUSHTO_WORD_RE.search(folded) is not None
        or PASHTO_CODE_RE.search(folded) is not None
    )
    if latin_hit:
        return True

    # Arabic-script spellings have no case, so the raw value is searched.
    return "پښتو" in value or "پشتو" in value
56
+
57
+
58
  def validate_resource(resource: dict[str, Any], index: int) -> list[str]:
59
  errors: list[str] = []
60
  prefix = f"resource[{index}]"
 
139
  if not (isinstance(markers, list) and markers and all(isinstance(marker, str) and marker.strip() for marker in markers)):
140
  errors.append(f"{prefix}.pashto_evidence.markers must be a non-empty list of strings")
141
 
142
+ if category in STRICT_PASHTO_CATEGORIES and not (
143
+ _contains_pashto_marker(title) or _contains_pashto_marker(url)
144
+ ):
145
+ errors.append(
146
+ f"{prefix} must be Pashto-centric for category '{category}' "
147
+ "(include a Pashto marker in title or URL)"
148
+ )
149
+
150
  return errors
151
 
152
 
tests/test_validate_resource_catalog.py CHANGED
@@ -43,3 +43,21 @@ def test_validate_catalog_fails_for_invalid_evidence_url() -> None:
43
  catalog["resources"][0]["pashto_evidence"]["evidence_url"] = "not-a-url"
44
  errors = validate_catalog(catalog)
45
  assert any("evidence_url" in error for error in errors)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  catalog["resources"][0]["pashto_evidence"]["evidence_url"] = "not-a-url"
44
  errors = validate_catalog(catalog)
45
  assert any("evidence_url" in error for error in errors)
46
+
47
+
48
def test_validate_catalog_fails_for_non_pashto_centric_model() -> None:
    """A model without any Pashto marker in title or URL is rejected."""
    catalog = _minimal_catalog()
    first = catalog["resources"][0]
    first.update(
        {
            "category": "model",
            "title": "Generic Multilingual Model",
            "url": "https://example.org/model",
        }
    )

    messages = validate_catalog(catalog)

    assert any("must be Pashto-centric" in message for message in messages)
55
+
56
+
57
def test_validate_catalog_allows_pashto_centric_model() -> None:
    """A model whose title and URL name Pashto validates without errors."""
    catalog = _minimal_catalog()
    first = catalog["resources"][0]
    first.update(
        {
            "category": "model",
            "title": "Pashto ASR Model",
            "url": "https://example.org/pashto-model",
        }
    )

    assert validate_catalog(catalog) == []