musaw committed on
Commit
081627f
·
1 Parent(s): 574cd8c

Expand resource cycle for projects/code and promote new Pashto sources

Browse files
README.md CHANGED
@@ -89,6 +89,8 @@ python -m pytest -q
89
  - Benchmarks: [resources/benchmarks/README.md](resources/benchmarks/README.md)
90
  - Tools: [resources/tools/README.md](resources/tools/README.md)
91
  - Papers: [resources/papers/README.md](resources/papers/README.md)
 
 
92
 
93
  ## Workspaces
94
  - [data/](data/README.md): datasets, curation, metadata, quality
 
89
  - Benchmarks: [resources/benchmarks/README.md](resources/benchmarks/README.md)
90
  - Tools: [resources/tools/README.md](resources/tools/README.md)
91
  - Papers: [resources/papers/README.md](resources/papers/README.md)
92
+ - Projects: [resources/projects/README.md](resources/projects/README.md)
93
+ - Code: [resources/codes/README.md](resources/codes/README.md)
94
 
95
  ## Workspaces
96
  - [data/](data/README.md): datasets, curation, metadata, quality
docs/resource_automation.md CHANGED
@@ -7,6 +7,14 @@ This repository uses a semi-automated process to keep Pashto resources current w
7
  - Keep a machine-readable canonical catalog.
8
  - Prevent unreviewed low-confidence resources from directly entering verified lists.
9
 
 
 
 
 
 
 
 
 
10
  ## Files involved
11
  - Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
12
  - Candidate feed: [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json)
 
7
  - Keep a machine-readable canonical catalog.
8
  - Prevent unreviewed low-confidence resources from directly entering verified lists.
9
 
10
+ ## Covered source types
11
+ - Kaggle datasets
12
+ - Hugging Face datasets
13
+ - Hugging Face models
14
+ - Hugging Face Spaces (projects)
15
+ - GitHub repositories (projects and code)
16
+ - Research-paper endpoints
17
+
18
  ## Files involved
19
  - Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
20
  - Candidate feed: [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json)
docs/resource_catalog.md CHANGED
@@ -20,6 +20,8 @@ This index points to validated Pashto-related resources tracked in structured fi
20
  - Benchmarks: [../resources/benchmarks/README.md](../resources/benchmarks/README.md)
21
  - Tools: [../resources/tools/README.md](../resources/tools/README.md)
22
  - Papers: [../resources/papers/README.md](../resources/papers/README.md)
 
 
23
 
24
  ## Search page
25
  - GitHub Pages search UI: [search/index.html](search/index.html)
 
20
  - Benchmarks: [../resources/benchmarks/README.md](../resources/benchmarks/README.md)
21
  - Tools: [../resources/tools/README.md](../resources/tools/README.md)
22
  - Papers: [../resources/papers/README.md](../resources/papers/README.md)
23
+ - Projects: [../resources/projects/README.md](../resources/projects/README.md)
24
+ - Code: [../resources/codes/README.md](../resources/codes/README.md)
25
 
26
  ## Search page
27
  - GitHub Pages search UI: [search/index.html](search/index.html)
docs/resource_cycle_runbook.md CHANGED
@@ -21,6 +21,8 @@ What it executes:
21
  4. `python scripts/check_links.py`
22
  5. `python -m pytest -q`
23
 
 
 
24
  ## Discovery-only mode
25
  If you only want fresh candidates:
26
 
 
21
  4. `python scripts/check_links.py`
22
  5. `python -m pytest -q`
23
 
24
+ Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, and paper endpoints.
25
+
26
  ## Discovery-only mode
27
  If you only want fresh candidates:
28
 
docs/search/resources.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "generated_on": "2026-02-15T00:00:00Z",
3
- "count": 30,
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
@@ -717,6 +717,184 @@
717
  "ps",
718
  "ps_af"
719
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
720
  }
721
  ]
722
  }
 
1
  {
2
  "generated_on": "2026-02-15T00:00:00Z",
3
+ "count": 37,
4
  "resources": [
5
  {
6
  "id": "dataset-common-voice-ps-v24",
 
717
  "ps",
718
  "ps_af"
719
  ]
720
+ },
721
+ {
722
+ "id": "dataset-kaggle-pold-pashto-offensive",
723
+ "title": "POLD - Pashto Offensive Language Dataset",
724
+ "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
725
+ "category": "dataset",
726
+ "source": "kaggle",
727
+ "status": "verified",
728
+ "summary": "Benchmark dataset for offensive content detection in Pashto social text.",
729
+ "primary_use": "Pashto toxicity and moderation NLP benchmarks",
730
+ "tasks": [
731
+ "nlp",
732
+ "classification"
733
+ ],
734
+ "tags": [
735
+ "pashto",
736
+ "kaggle",
737
+ "nlp",
738
+ "toxicity"
739
+ ],
740
+ "evidence_text": "Kaggle title and description explicitly state Pashto offensive language benchmark dataset.",
741
+ "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset",
742
+ "markers": [
743
+ "Pashto"
744
+ ]
745
+ },
746
+ {
747
+ "id": "dataset-kaggle-pashto-english-sentiment-corpus",
748
+ "title": "Pashto English Bilingual Sentiment Corpus",
749
+ "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
750
+ "category": "dataset",
751
+ "source": "kaggle",
752
+ "status": "verified",
753
+ "summary": "Pashto to English bilingual sentiment corpus useful for low-resource sentiment tasks.",
754
+ "primary_use": "Sentiment analysis and bilingual NLP experiments",
755
+ "tasks": [
756
+ "nlp",
757
+ "sentiment"
758
+ ],
759
+ "tags": [
760
+ "pashto",
761
+ "kaggle",
762
+ "sentiment",
763
+ "bilingual"
764
+ ],
765
+ "evidence_text": "Kaggle dataset title and description identify the corpus as Pashto-English sentiment data.",
766
+ "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus",
767
+ "markers": [
768
+ "Pashto"
769
+ ]
770
+ },
771
+ {
772
+ "id": "dataset-kaggle-urdu-pashto-lexicon",
773
+ "title": "Urdu-Pashto Lexicon Dataset",
774
+ "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
775
+ "category": "dataset",
776
+ "source": "kaggle",
777
+ "status": "verified",
778
+ "summary": "Lexicon of Urdu words with Pashto translations for dictionary and MT support.",
779
+ "primary_use": "Lexicon and translation lexeme mapping",
780
+ "tasks": [
781
+ "nlp",
782
+ "mt"
783
+ ],
784
+ "tags": [
785
+ "pashto",
786
+ "kaggle",
787
+ "lexicon",
788
+ "translation"
789
+ ],
790
+ "evidence_text": "Kaggle metadata describes 7,601 Urdu entries with Pashto translations.",
791
+ "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset",
792
+ "markers": [
793
+ "Pashto"
794
+ ]
795
+ },
796
+ {
797
+ "id": "project-hf-space-ihanif-pashto-asr-v3",
798
+ "title": "Pashto ASR V3 Space",
799
+ "url": "https://huggingface.co/spaces/ihanif/pashto-asr-v3",
800
+ "category": "project",
801
+ "source": "huggingface",
802
+ "status": "verified",
803
+ "summary": "Interactive Hugging Face Space for Pashto automatic speech recognition demos.",
804
+ "primary_use": "Project demo for Pashto ASR user testing",
805
+ "tasks": [
806
+ "asr",
807
+ "demo"
808
+ ],
809
+ "tags": [
810
+ "pashto",
811
+ "project",
812
+ "huggingface-space",
813
+ "asr"
814
+ ],
815
+ "evidence_text": "Space card title is Pashto ASR V3 and short description states Pashto ASR.",
816
+ "evidence_url": "https://huggingface.co/api/spaces/ihanif/pashto-asr-v3",
817
+ "markers": [
818
+ "Pashto",
819
+ "ASR"
820
+ ]
821
+ },
822
+ {
823
+ "id": "project-hf-space-pashto2english-dictionary",
824
+ "title": "Pashto to English Dictionary Space",
825
+ "url": "https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary",
826
+ "category": "project",
827
+ "source": "huggingface",
828
+ "status": "verified",
829
+ "summary": "Streamlit project for Pashto to English dictionary lookups.",
830
+ "primary_use": "Interactive bilingual lookup project",
831
+ "tasks": [
832
+ "dictionary",
833
+ "translation",
834
+ "demo"
835
+ ],
836
+ "tags": [
837
+ "pashto",
838
+ "project",
839
+ "huggingface-space",
840
+ "dictionary"
841
+ ],
842
+ "evidence_text": "Space metadata title states Pashto to English Dictionary.",
843
+ "evidence_url": "https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary",
844
+ "markers": [
845
+ "Pashto"
846
+ ]
847
+ },
848
+ {
849
+ "id": "project-hf-space-umar4321-pashto-translator",
850
+ "title": "Pashto Translator Space",
851
+ "url": "https://huggingface.co/spaces/Umar4321/Pashto-Translator",
852
+ "category": "project",
853
+ "source": "huggingface",
854
+ "status": "verified",
855
+ "summary": "Streamlit translator project for Pashto to English and Urdu conversion.",
856
+ "primary_use": "Interactive translation project demo",
857
+ "tasks": [
858
+ "translation",
859
+ "demo"
860
+ ],
861
+ "tags": [
862
+ "pashto",
863
+ "project",
864
+ "huggingface-space",
865
+ "translation"
866
+ ],
867
+ "evidence_text": "Space title is Pashto Translator and description states Pashto to English and Urdu translation.",
868
+ "evidence_url": "https://huggingface.co/api/spaces/Umar4321/Pashto-Translator",
869
+ "markers": [
870
+ "Pashto"
871
+ ]
872
+ },
873
+ {
874
+ "id": "code-github-ijazul-haq-nlpashto",
875
+ "title": "nlpashto Toolkit",
876
+ "url": "https://github.com/ijazul-haq/nlpashto",
877
+ "category": "code",
878
+ "source": "github",
879
+ "status": "verified",
880
+ "summary": "Pashto NLP toolkit codebase for tokenization, embeddings, and downstream NLP workflows.",
881
+ "primary_use": "Pashto NLP code integration and experimentation",
882
+ "tasks": [
883
+ "nlp",
884
+ "tooling"
885
+ ],
886
+ "tags": [
887
+ "pashto",
888
+ "code",
889
+ "github",
890
+ "nlp"
891
+ ],
892
+ "evidence_text": "Repository name and description explicitly identify a Pashto NLP toolkit.",
893
+ "evidence_url": "https://api.github.com/repos/ijazul-haq/nlpashto",
894
+ "markers": [
895
+ "Pashto",
896
+ "NLP"
897
+ ]
898
  }
899
  ]
900
  }
resources/README.md CHANGED
@@ -3,11 +3,13 @@
3
  Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
- - Datasets (11): [datasets/README.md](datasets/README.md)
7
  - Models (9): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
  - Tools (2): [tools/README.md](tools/README.md)
10
  - Papers (4): [papers/README.md](papers/README.md)
 
 
11
 
12
  ## Machine-Readable Catalog
13
  - Canonical catalog: [catalog/resources.json](catalog/resources.json)
@@ -20,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
20
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
21
  - Run `python scripts/generate_resource_views.py` after catalog changes.
22
 
23
- Verified resource count: `30`
 
3
  Structured, Pashto-focused resource tracking lives in this folder.
4
 
5
  ## Sections
6
+ - Datasets (14): [datasets/README.md](datasets/README.md)
7
  - Models (9): [models/README.md](models/README.md)
8
  - Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
9
  - Tools (2): [tools/README.md](tools/README.md)
10
  - Papers (4): [papers/README.md](papers/README.md)
11
+ - Projects (3): [projects/README.md](projects/README.md)
12
+ - Code (1): [codes/README.md](codes/README.md)
13
 
14
  ## Machine-Readable Catalog
15
  - Canonical catalog: [catalog/resources.json](catalog/resources.json)
 
22
  - Run `python scripts/validate_resource_catalog.py` before opening a PR.
23
  - Run `python scripts/generate_resource_views.py` after catalog changes.
24
 
25
+ Verified resource count: `37`
resources/catalog/pending_candidates.json CHANGED
The diff for this file is too large to render. See raw diff
 
resources/catalog/resource.template.json CHANGED
@@ -2,7 +2,7 @@
2
  "id": "example-resource-id",
3
  "title": "Example Resource Title",
4
  "url": "https://example.org/resource",
5
- "category": "dataset",
6
  "source": "other",
7
  "status": "verified",
8
  "summary": "One-line summary explaining why this resource matters for Pashto in technology.",
 
2
  "id": "example-resource-id",
3
  "title": "Example Resource Title",
4
  "url": "https://example.org/resource",
5
+ "category": "project",
6
  "source": "other",
7
  "status": "verified",
8
  "summary": "One-line summary explaining why this resource matters for Pashto in technology.",
resources/catalog/resources.json CHANGED
@@ -782,6 +782,201 @@
782
  "whisper",
783
  "fleurs"
784
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785
  }
786
  ]
787
  }
 
782
  "whisper",
783
  "fleurs"
784
  ]
785
+ },
786
+ {
787
+ "id": "dataset-kaggle-pold-pashto-offensive",
788
+ "title": "POLD - Pashto Offensive Language Dataset",
789
+ "url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
790
+ "category": "dataset",
791
+ "source": "kaggle",
792
+ "status": "verified",
793
+ "summary": "Benchmark dataset for offensive content detection in Pashto social text.",
794
+ "primary_use": "Pashto toxicity and moderation NLP benchmarks",
795
+ "license": "CC BY 4.0",
796
+ "tasks": [
797
+ "nlp",
798
+ "classification"
799
+ ],
800
+ "pashto_evidence": {
801
+ "evidence_text": "Kaggle title and description explicitly state Pashto offensive language benchmark dataset.",
802
+ "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset",
803
+ "markers": [
804
+ "Pashto"
805
+ ]
806
+ },
807
+ "tags": [
808
+ "pashto",
809
+ "kaggle",
810
+ "nlp",
811
+ "toxicity"
812
+ ]
813
+ },
814
+ {
815
+ "id": "dataset-kaggle-pashto-english-sentiment-corpus",
816
+ "title": "Pashto English Bilingual Sentiment Corpus",
817
+ "url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
818
+ "category": "dataset",
819
+ "source": "kaggle",
820
+ "status": "verified",
821
+ "summary": "Pashto to English bilingual sentiment corpus useful for low-resource sentiment tasks.",
822
+ "primary_use": "Sentiment analysis and bilingual NLP experiments",
823
+ "license": "CC BY-NC-SA 4.0",
824
+ "tasks": [
825
+ "nlp",
826
+ "sentiment"
827
+ ],
828
+ "pashto_evidence": {
829
+ "evidence_text": "Kaggle dataset title and description identify the corpus as Pashto-English sentiment data.",
830
+ "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus",
831
+ "markers": [
832
+ "Pashto"
833
+ ]
834
+ },
835
+ "tags": [
836
+ "pashto",
837
+ "kaggle",
838
+ "sentiment",
839
+ "bilingual"
840
+ ]
841
+ },
842
+ {
843
+ "id": "dataset-kaggle-urdu-pashto-lexicon",
844
+ "title": "Urdu-Pashto Lexicon Dataset",
845
+ "url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
846
+ "category": "dataset",
847
+ "source": "kaggle",
848
+ "status": "verified",
849
+ "summary": "Lexicon of Urdu words with Pashto translations for dictionary and MT support.",
850
+ "primary_use": "Lexicon and translation lexeme mapping",
851
+ "license": "CC0",
852
+ "tasks": [
853
+ "nlp",
854
+ "mt"
855
+ ],
856
+ "pashto_evidence": {
857
+ "evidence_text": "Kaggle metadata describes 7,601 Urdu entries with Pashto translations.",
858
+ "evidence_url": "https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset",
859
+ "markers": [
860
+ "Pashto"
861
+ ]
862
+ },
863
+ "tags": [
864
+ "pashto",
865
+ "kaggle",
866
+ "lexicon",
867
+ "translation"
868
+ ]
869
+ },
870
+ {
871
+ "id": "project-hf-space-ihanif-pashto-asr-v3",
872
+ "title": "Pashto ASR V3 Space",
873
+ "url": "https://huggingface.co/spaces/ihanif/pashto-asr-v3",
874
+ "category": "project",
875
+ "source": "huggingface",
876
+ "status": "verified",
877
+ "summary": "Interactive Hugging Face Space for Pashto automatic speech recognition demos.",
878
+ "primary_use": "Project demo for Pashto ASR user testing",
879
+ "tasks": [
880
+ "asr",
881
+ "demo"
882
+ ],
883
+ "pashto_evidence": {
884
+ "evidence_text": "Space card title is Pashto ASR V3 and short description states Pashto ASR.",
885
+ "evidence_url": "https://huggingface.co/api/spaces/ihanif/pashto-asr-v3",
886
+ "markers": [
887
+ "Pashto",
888
+ "ASR"
889
+ ]
890
+ },
891
+ "tags": [
892
+ "pashto",
893
+ "project",
894
+ "huggingface-space",
895
+ "asr"
896
+ ]
897
+ },
898
+ {
899
+ "id": "project-hf-space-pashto2english-dictionary",
900
+ "title": "Pashto to English Dictionary Space",
901
+ "url": "https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary",
902
+ "category": "project",
903
+ "source": "huggingface",
904
+ "status": "verified",
905
+ "summary": "Streamlit project for Pashto to English dictionary lookups.",
906
+ "primary_use": "Interactive bilingual lookup project",
907
+ "tasks": [
908
+ "dictionary",
909
+ "translation",
910
+ "demo"
911
+ ],
912
+ "pashto_evidence": {
913
+ "evidence_text": "Space metadata title states Pashto to English Dictionary.",
914
+ "evidence_url": "https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary",
915
+ "markers": [
916
+ "Pashto"
917
+ ]
918
+ },
919
+ "tags": [
920
+ "pashto",
921
+ "project",
922
+ "huggingface-space",
923
+ "dictionary"
924
+ ]
925
+ },
926
+ {
927
+ "id": "project-hf-space-umar4321-pashto-translator",
928
+ "title": "Pashto Translator Space",
929
+ "url": "https://huggingface.co/spaces/Umar4321/Pashto-Translator",
930
+ "category": "project",
931
+ "source": "huggingface",
932
+ "status": "verified",
933
+ "summary": "Streamlit translator project for Pashto to English and Urdu conversion.",
934
+ "primary_use": "Interactive translation project demo",
935
+ "tasks": [
936
+ "translation",
937
+ "demo"
938
+ ],
939
+ "pashto_evidence": {
940
+ "evidence_text": "Space title is Pashto Translator and description states Pashto to English and Urdu translation.",
941
+ "evidence_url": "https://huggingface.co/api/spaces/Umar4321/Pashto-Translator",
942
+ "markers": [
943
+ "Pashto"
944
+ ]
945
+ },
946
+ "tags": [
947
+ "pashto",
948
+ "project",
949
+ "huggingface-space",
950
+ "translation"
951
+ ]
952
+ },
953
+ {
954
+ "id": "code-github-ijazul-haq-nlpashto",
955
+ "title": "nlpashto Toolkit",
956
+ "url": "https://github.com/ijazul-haq/nlpashto",
957
+ "category": "code",
958
+ "source": "github",
959
+ "status": "verified",
960
+ "summary": "Pashto NLP toolkit codebase for tokenization, embeddings, and downstream NLP workflows.",
961
+ "primary_use": "Pashto NLP code integration and experimentation",
962
+ "tasks": [
963
+ "nlp",
964
+ "tooling"
965
+ ],
966
+ "pashto_evidence": {
967
+ "evidence_text": "Repository name and description explicitly identify a Pashto NLP toolkit.",
968
+ "evidence_url": "https://api.github.com/repos/ijazul-haq/nlpashto",
969
+ "markers": [
970
+ "Pashto",
971
+ "NLP"
972
+ ]
973
+ },
974
+ "tags": [
975
+ "pashto",
976
+ "code",
977
+ "github",
978
+ "nlp"
979
+ ]
980
  }
981
  ]
982
  }
resources/codes/README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Code
2
+
3
+ ## Verified Pashto Resources
4
+
5
+ | Resource | Link | Pashto Evidence | Primary Use |
6
+ |---|---|---|---|
7
+ | nlpashto Toolkit | [github](https://github.com/ijazul-haq/nlpashto) | [Repository name and description explicitly identify a Pashto NLP toolkit. (`Pashto`, `NLP`)](https://api.github.com/repos/ijazul-haq/nlpashto) | Pashto NLP code integration and experimentation |
8
+
9
+ ## Maintenance
10
+ - Source of truth: [../catalog/resources.json](../catalog/resources.json)
11
+ - Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
12
+ - Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
resources/datasets/README.md CHANGED
@@ -10,9 +10,12 @@
10
  | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
11
  | OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
12
  | OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
 
13
  | Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
14
  | Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
15
  | Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
 
 
16
  | Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
17
  | Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
18
 
 
10
  | Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
11
  | OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
12
  | OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
13
+ | Pashto English Bilingual Sentiment Corpus | [kaggle](https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus) | [Kaggle dataset title and description identify the corpus as Pashto-English sentiment data. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus) | Sentiment analysis and bilingual NLP experiments |
14
  | Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
15
  | Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
16
  | Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
17
+ | POLD - Pashto Offensive Language Dataset | [kaggle](https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset) | [Kaggle title and description explicitly state Pashto offensive language benchmark dataset. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset) | Pashto toxicity and moderation NLP benchmarks |
18
+ | Urdu-Pashto Lexicon Dataset | [kaggle](https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset) | [Kaggle metadata describes 7,601 Urdu entries with Pashto translations. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset) | Lexicon and translation lexeme mapping |
19
  | Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
20
  | Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
21
 
resources/projects/README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Projects
2
+
3
+ ## Verified Pashto Resources
4
+
5
+ | Resource | Link | Pashto Evidence | Primary Use |
6
+ |---|---|---|---|
7
+ | Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
8
+ | Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
9
+ | Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
10
+
11
+ ## Maintenance
12
+ - Source of truth: [../catalog/resources.json](../catalog/resources.json)
13
+ - Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
14
+ - Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
resources/schema/resource.schema.json CHANGED
@@ -62,7 +62,9 @@
62
  "model",
63
  "benchmark",
64
  "tool",
65
- "paper"
 
 
66
  ]
67
  },
68
  "source": {
 
62
  "model",
63
  "benchmark",
64
  "tool",
65
+ "paper",
66
+ "project",
67
+ "code"
68
  ]
69
  },
70
  "source": {
scripts/README.md CHANGED
@@ -7,7 +7,7 @@ Automation scripts for quality checks, resource catalog validation, and search i
7
  - `check_links.py`: ensure markdown links are clickable (optional online reachability check).
8
  - `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
9
  - `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
10
- - `sync_resources.py`: collect new candidate Pashto resources from public endpoints into `resources/catalog/pending_candidates.json`.
11
  - `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
12
 
13
  ## Usage
 
7
  - `check_links.py`: ensure markdown links are clickable (optional online reachability check).
8
  - `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
9
  - `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
10
+ - `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub repositories, and paper endpoints into `resources/catalog/pending_candidates.json`.
11
  - `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
12
 
13
  ## Usage
scripts/generate_resource_views.py CHANGED
@@ -17,6 +17,8 @@ CATEGORY_CONFIG = {
17
  "benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
18
  "tool": ("resources/tools/README.md", "Tools"),
19
  "paper": ("resources/papers/README.md", "Papers"),
 
 
20
  }
21
 
22
 
@@ -86,6 +88,8 @@ def _write_resources_home(path: Path, counts: dict[str, int], total_verified: in
86
  f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
87
  f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
88
  f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
 
 
89
  "",
90
  "## Machine-Readable Catalog",
91
  "- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
 
17
  "benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
18
  "tool": ("resources/tools/README.md", "Tools"),
19
  "paper": ("resources/papers/README.md", "Papers"),
20
+ "project": ("resources/projects/README.md", "Projects"),
21
+ "code": ("resources/codes/README.md", "Code"),
22
  }
23
 
24
 
 
88
  f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
89
  f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
90
  f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
91
+ f"- Projects ({counts.get('project', 0)}): [projects/README.md](projects/README.md)",
92
+ f"- Code ({counts.get('code', 0)}): [codes/README.md](codes/README.md)",
93
  "",
94
  "## Machine-Readable Catalog",
95
  "- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
scripts/sync_resources.py CHANGED
@@ -108,6 +108,138 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
108
  return out
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
112
  query = urllib.parse.urlencode(
113
  {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
@@ -228,8 +360,11 @@ def main() -> int:
228
  sources_used: list[str] = []
229
 
230
  fetch_steps = [
 
231
  ("huggingface-datasets", lambda: fetch_huggingface("datasets", args.limit)),
232
  ("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
 
 
233
  ("arxiv", lambda: fetch_arxiv(args.limit)),
234
  ("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
235
  ]
 
108
  return out
109
 
110
 
111
def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
    """Build candidate "project" records from a Hugging Face Spaces search.

    Queries the Spaces listing endpoint with the search term "pashto" and
    passes ``limit`` through as the ``limit`` query parameter.

    Args:
        limit: Value sent to the API as the ``limit`` query parameter.

    Returns:
        One record (as produced by ``_candidate``) per returned entry that
        has a truthy ``"id"``; entries without one are skipped.
    """
    params = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
    listing = _fetch_json(f"https://huggingface.co/api/spaces?{params}")

    # Lazily read each entry's id and drop the missing/empty ones.
    space_ids = filter(None, (entry.get("id") for entry in listing))

    candidates: list[dict[str, Any]] = []
    for space_id in space_ids:
        page_url = f"https://huggingface.co/spaces/{space_id}"
        record = _candidate(
            rid=f"candidate-hf-project-{_slug(space_id)}",
            title=space_id,
            url=page_url,
            category="project",
            source="huggingface",
            summary="Candidate project app returned from Hugging Face Spaces Pashto search.",
            evidence_text="Matched by Pashto keyword in Hugging Face Spaces search.",
            evidence_url=page_url,
            markers=["pashto"],
            tags=["pashto", "candidate", "project", "space"],
        )
        candidates.append(record)
    return candidates
139
+
140
+
141
def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
    """Build candidate "dataset" records from a Kaggle dataset search.

    Requests page 1 of the Kaggle dataset listing for the term "pashto" and
    keeps only entries whose title or subtitle mentions "pashto" or "pukhto"
    (case-insensitive).

    Args:
        limit: Collection stops once this many records have been gathered.
            The check runs after each append, so a match is always recorded
            before the limit is consulted.

    Returns:
        Records (as produced by ``_candidate``) for the matching datasets,
        in the order the listing returned them.
    """
    # Public Kaggle dataset listing endpoint (no auth needed for list responses).
    params = urllib.parse.urlencode({"search": "pashto", "page": "1"})
    listing = _fetch_json(f"https://www.kaggle.com/api/v1/datasets/list?{params}")

    field_names = ("titleNullable", "urlNullable", "ownerRefNullable", "subtitleNullable")
    keywords = ("pashto", "pukhto")

    candidates: list[dict[str, Any]] = []
    for record in listing:
        # Missing or null fields are normalised to "" before stripping.
        title, dataset_url, owner, subtitle = (
            (record.get(name) or "").strip() for name in field_names
        )
        if not (title and dataset_url):
            continue

        searchable = f"{title} {subtitle}".lower()
        if not any(keyword in searchable for keyword in keywords):
            continue

        # The owner, when present, is folded into the id as "owner/title".
        slug_source = f"{owner}/{title}" if owner else title
        candidates.append(
            _candidate(
                rid=f"candidate-kaggle-dataset-{_slug(slug_source)}",
                title=title,
                url=dataset_url,
                category="dataset",
                source="kaggle",
                summary=(subtitle or "Candidate Kaggle dataset returned from Pashto search.")[:240],
                evidence_text="Kaggle dataset title/subtitle includes Pashto keyword.",
                evidence_url=dataset_url,
                markers=["Pashto"],
                tags=["pashto", "candidate", "dataset", "kaggle"],
            )
        )
        if len(candidates) >= limit:
            break
    return candidates
179
+
180
+
181
def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
    """Build candidate "project"/"code" records from GitHub repository search.

    Runs two searches (by topic, then by keyword), merges the results by
    ``full_name``, orders them by ``stargazers_count`` descending, and keeps
    repositories whose name, description or topics mention "pashto" or
    "pukhto".

    Args:
        limit: Sent as ``per_page`` for each search; collection also stops
            once this many records have been gathered (checked after each
            append).

    Returns:
        Records (as produced by ``_candidate``). A repository is categorised
        as "code" when its lower-cased metadata contains one of the hint
        substrings below, and as "project" otherwise.
    """
    code_hints = ("toolkit", "library", "nlp", "asr", "tts", "ocr", "api", "code")
    fallback_summary = "Candidate Pashto-related GitHub repository."

    # Query by topic first for high precision, then by keyword for recall.
    searches = ("topic:pashto", "pashto in:name,description,readme")

    repos_by_name: dict[str, dict[str, Any]] = {}
    for search_text in searches:
        params = urllib.parse.urlencode(
            {"q": search_text, "sort": "stars", "order": "desc", "per_page": str(limit)}
        )
        response = _fetch_json(f"https://api.github.com/search/repositories?{params}")
        for repo in response.get("items", []):
            repo_name = repo.get("full_name")
            # Both the name and the web URL are required downstream.
            if repo_name and repo.get("html_url"):
                # A later search overwrites the payload but keeps the first slot.
                repos_by_name[repo_name] = repo

    def _stars(pair: tuple[str, dict[str, Any]]) -> Any:
        return pair[1].get("stargazers_count", 0)

    ranked = list(repos_by_name.items())
    ranked.sort(key=_stars, reverse=True)

    candidates: list[dict[str, Any]] = []
    for repo_name, repo in ranked:
        topics = repo.get("topics") or []
        metadata = " ".join(
            (
                repo_name,
                repo.get("name") or "",
                repo.get("description") or "",
                " ".join(topics),
            )
        ).lower()
        if not ("pashto" in metadata or "pukhto" in metadata):
            continue

        is_code = any(hint in metadata for hint in code_hints)
        category = "code" if is_code else "project"

        repo_url = repo["html_url"]
        summary = (repo.get("description") or "").strip() or fallback_summary
        candidates.append(
            _candidate(
                rid=f"candidate-gh-{category}-{_slug(repo_name)}",
                title=repo_name,
                url=repo_url,
                category=category,
                source="github",
                summary=summary[:240],
                evidence_text="Repository metadata (name/description/topics) includes Pashto markers.",
                evidence_url=repo_url,
                markers=["pashto"],
                tags=["pashto", "candidate", category, "github", *topics[:3]],
            )
        )
        if len(candidates) >= limit:
            break
    return candidates
241
+
242
+
243
  def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
244
  query = urllib.parse.urlencode(
245
  {"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
 
360
  sources_used: list[str] = []
361
 
362
  fetch_steps = [
363
+ ("kaggle-datasets", lambda: fetch_kaggle_datasets(args.limit)),
364
  ("huggingface-datasets", lambda: fetch_huggingface("datasets", args.limit)),
365
  ("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
366
+ ("huggingface-spaces", lambda: fetch_huggingface_spaces(args.limit)),
367
+ ("github-repositories", lambda: fetch_github_pashto_repos(args.limit)),
368
  ("arxiv", lambda: fetch_arxiv(args.limit)),
369
  ("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
370
  ]
scripts/validate_resource_catalog.py CHANGED
@@ -16,7 +16,7 @@ from typing import Any
16
  from urllib.parse import urlparse
17
 
18
 
19
- ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper"}
20
  ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
21
  ALLOWED_STATUS = {"verified", "candidate"}
22
  RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
 
16
  from urllib.parse import urlparse
17
 
18
 
19
+ ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project", "code"}
20
  ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
21
  ALLOWED_STATUS = {"verified", "candidate"}
22
  RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")