musaw committed on
Commit ·
081627f
1
Parent(s): 574cd8c
Expand resource cycle for projects/code and promote new Pashto sources
Browse files
- README.md +2 -0
- docs/resource_automation.md +8 -0
- docs/resource_catalog.md +2 -0
- docs/resource_cycle_runbook.md +2 -0
- docs/search/resources.json +179 -1
- resources/README.md +4 -2
- resources/catalog/pending_candidates.json +0 -0
- resources/catalog/resource.template.json +1 -1
- resources/catalog/resources.json +195 -0
- resources/codes/README.md +12 -0
- resources/datasets/README.md +3 -0
- resources/projects/README.md +14 -0
- resources/schema/resource.schema.json +3 -1
- scripts/README.md +1 -1
- scripts/generate_resource_views.py +4 -0
- scripts/sync_resources.py +135 -0
- scripts/validate_resource_catalog.py +1 -1
README.md
CHANGED
|
@@ -89,6 +89,8 @@ python -m pytest -q
|
|
| 89 |
- Benchmarks: [resources/benchmarks/README.md](resources/benchmarks/README.md)
|
| 90 |
- Tools: [resources/tools/README.md](resources/tools/README.md)
|
| 91 |
- Papers: [resources/papers/README.md](resources/papers/README.md)
|
|
|
|
|
|
|
| 92 |
|
| 93 |
## Workspaces
|
| 94 |
- [data/](data/README.md): datasets, curation, metadata, quality
|
|
|
|
| 89 |
- Benchmarks: [resources/benchmarks/README.md](resources/benchmarks/README.md)
|
| 90 |
- Tools: [resources/tools/README.md](resources/tools/README.md)
|
| 91 |
- Papers: [resources/papers/README.md](resources/papers/README.md)
|
| 92 |
+
- Projects: [resources/projects/README.md](resources/projects/README.md)
|
| 93 |
+
- Code: [resources/codes/README.md](resources/codes/README.md)
|
| 94 |
|
| 95 |
## Workspaces
|
| 96 |
- [data/](data/README.md): datasets, curation, metadata, quality
|
docs/resource_automation.md
CHANGED
|
@@ -7,6 +7,14 @@ This repository uses a semi-automated process to keep Pashto resources current w
|
|
| 7 |
- Keep a machine-readable canonical catalog.
|
| 8 |
- Prevent unreviewed low-confidence resources from directly entering verified lists.
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
## Files involved
|
| 11 |
- Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
|
| 12 |
- Candidate feed: [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json)
|
|
|
|
| 7 |
- Keep a machine-readable canonical catalog.
|
| 8 |
- Prevent unreviewed low-confidence resources from directly entering verified lists.
|
| 9 |
|
| 10 |
+
## Covered source types
|
| 11 |
+
- Kaggle datasets
|
| 12 |
+
- Hugging Face datasets
|
| 13 |
+
- Hugging Face models
|
| 14 |
+
- Hugging Face Spaces (projects)
|
| 15 |
+
- GitHub repositories (projects and code)
|
| 16 |
+
- Research-paper endpoints
|
| 17 |
+
|
| 18 |
## Files involved
|
| 19 |
- Canonical verified catalog: [../resources/catalog/resources.json](../resources/catalog/resources.json)
|
| 20 |
- Candidate feed: [../resources/catalog/pending_candidates.json](../resources/catalog/pending_candidates.json)
|
docs/resource_catalog.md
CHANGED
|
@@ -20,6 +20,8 @@ This index points to validated Pashto-related resources tracked in structured fi
|
|
| 20 |
- Benchmarks: [../resources/benchmarks/README.md](../resources/benchmarks/README.md)
|
| 21 |
- Tools: [../resources/tools/README.md](../resources/tools/README.md)
|
| 22 |
- Papers: [../resources/papers/README.md](../resources/papers/README.md)
|
|
|
|
|
|
|
| 23 |
|
| 24 |
## Search page
|
| 25 |
- GitHub Pages search UI: [search/index.html](search/index.html)
|
|
|
|
| 20 |
- Benchmarks: [../resources/benchmarks/README.md](../resources/benchmarks/README.md)
|
| 21 |
- Tools: [../resources/tools/README.md](../resources/tools/README.md)
|
| 22 |
- Papers: [../resources/papers/README.md](../resources/papers/README.md)
|
| 23 |
+
- Projects: [../resources/projects/README.md](../resources/projects/README.md)
|
| 24 |
+
- Code: [../resources/codes/README.md](../resources/codes/README.md)
|
| 25 |
|
| 26 |
## Search page
|
| 27 |
- GitHub Pages search UI: [search/index.html](search/index.html)
|
docs/resource_cycle_runbook.md
CHANGED
|
@@ -21,6 +21,8 @@ What it executes:
|
|
| 21 |
4. `python scripts/check_links.py`
|
| 22 |
5. `python -m pytest -q`
|
| 23 |
|
|
|
|
|
|
|
| 24 |
## Discovery-only mode
|
| 25 |
If you only want fresh candidates:
|
| 26 |
|
|
|
|
| 21 |
4. `python scripts/check_links.py`
|
| 22 |
5. `python -m pytest -q`
|
| 23 |
|
| 24 |
+
Candidate sources in the sync step include Kaggle datasets, Hugging Face datasets/models/spaces, GitHub repositories, and paper endpoints.
|
| 25 |
+
|
| 26 |
## Discovery-only mode
|
| 27 |
If you only want fresh candidates:
|
| 28 |
|
docs/search/resources.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"generated_on": "2026-02-15T00:00:00Z",
|
| 3 |
-
"count": 30,
|
| 4 |
"resources": [
|
| 5 |
{
|
| 6 |
"id": "dataset-common-voice-ps-v24",
|
|
@@ -717,6 +717,184 @@
|
|
| 717 |
"ps",
|
| 718 |
"ps_af"
|
| 719 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 720 |
}
|
| 721 |
]
|
| 722 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"generated_on": "2026-02-15T00:00:00Z",
|
| 3 |
+
"count": 37,
|
| 4 |
"resources": [
|
| 5 |
{
|
| 6 |
"id": "dataset-common-voice-ps-v24",
|
|
|
|
| 717 |
"ps",
|
| 718 |
"ps_af"
|
| 719 |
]
|
| 720 |
+
},
|
| 721 |
+
{
|
| 722 |
+
"id": "dataset-kaggle-pold-pashto-offensive",
|
| 723 |
+
"title": "POLD - Pashto Offensive Language Dataset",
|
| 724 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 725 |
+
"category": "dataset",
|
| 726 |
+
"source": "kaggle",
|
| 727 |
+
"status": "verified",
|
| 728 |
+
"summary": "Benchmark dataset for offensive content detection in Pashto social text.",
|
| 729 |
+
"primary_use": "Pashto toxicity and moderation NLP benchmarks",
|
| 730 |
+
"tasks": [
|
| 731 |
+
"nlp",
|
| 732 |
+
"classification"
|
| 733 |
+
],
|
| 734 |
+
"tags": [
|
| 735 |
+
"pashto",
|
| 736 |
+
"kaggle",
|
| 737 |
+
"nlp",
|
| 738 |
+
"toxicity"
|
| 739 |
+
],
|
| 740 |
+
"evidence_text": "Kaggle title and description explicitly state Pashto offensive language benchmark dataset.",
|
| 741 |
+
"evidence_url": "https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset",
|
| 742 |
+
"markers": [
|
| 743 |
+
"Pashto"
|
| 744 |
+
]
|
| 745 |
+
},
|
| 746 |
+
{
|
| 747 |
+
"id": "dataset-kaggle-pashto-english-sentiment-corpus",
|
| 748 |
+
"title": "Pashto English Bilingual Sentiment Corpus",
|
| 749 |
+
"url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 750 |
+
"category": "dataset",
|
| 751 |
+
"source": "kaggle",
|
| 752 |
+
"status": "verified",
|
| 753 |
+
"summary": "Pashto to English bilingual sentiment corpus useful for low-resource sentiment tasks.",
|
| 754 |
+
"primary_use": "Sentiment analysis and bilingual NLP experiments",
|
| 755 |
+
"tasks": [
|
| 756 |
+
"nlp",
|
| 757 |
+
"sentiment"
|
| 758 |
+
],
|
| 759 |
+
"tags": [
|
| 760 |
+
"pashto",
|
| 761 |
+
"kaggle",
|
| 762 |
+
"sentiment",
|
| 763 |
+
"bilingual"
|
| 764 |
+
],
|
| 765 |
+
"evidence_text": "Kaggle dataset title and description identify the corpus as Pashto-English sentiment data.",
|
| 766 |
+
"evidence_url": "https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus",
|
| 767 |
+
"markers": [
|
| 768 |
+
"Pashto"
|
| 769 |
+
]
|
| 770 |
+
},
|
| 771 |
+
{
|
| 772 |
+
"id": "dataset-kaggle-urdu-pashto-lexicon",
|
| 773 |
+
"title": "Urdu-Pashto Lexicon Dataset",
|
| 774 |
+
"url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 775 |
+
"category": "dataset",
|
| 776 |
+
"source": "kaggle",
|
| 777 |
+
"status": "verified",
|
| 778 |
+
"summary": "Lexicon of Urdu words with Pashto translations for dictionary and MT support.",
|
| 779 |
+
"primary_use": "Lexicon and translation lexeme mapping",
|
| 780 |
+
"tasks": [
|
| 781 |
+
"nlp",
|
| 782 |
+
"mt"
|
| 783 |
+
],
|
| 784 |
+
"tags": [
|
| 785 |
+
"pashto",
|
| 786 |
+
"kaggle",
|
| 787 |
+
"lexicon",
|
| 788 |
+
"translation"
|
| 789 |
+
],
|
| 790 |
+
"evidence_text": "Kaggle metadata describes 7,601 Urdu entries with Pashto translations.",
|
| 791 |
+
"evidence_url": "https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 792 |
+
"markers": [
|
| 793 |
+
"Pashto"
|
| 794 |
+
]
|
| 795 |
+
},
|
| 796 |
+
{
|
| 797 |
+
"id": "project-hf-space-ihanif-pashto-asr-v3",
|
| 798 |
+
"title": "Pashto ASR V3 Space",
|
| 799 |
+
"url": "https://huggingface.co/spaces/ihanif/pashto-asr-v3",
|
| 800 |
+
"category": "project",
|
| 801 |
+
"source": "huggingface",
|
| 802 |
+
"status": "verified",
|
| 803 |
+
"summary": "Interactive Hugging Face Space for Pashto automatic speech recognition demos.",
|
| 804 |
+
"primary_use": "Project demo for Pashto ASR user testing",
|
| 805 |
+
"tasks": [
|
| 806 |
+
"asr",
|
| 807 |
+
"demo"
|
| 808 |
+
],
|
| 809 |
+
"tags": [
|
| 810 |
+
"pashto",
|
| 811 |
+
"project",
|
| 812 |
+
"huggingface-space",
|
| 813 |
+
"asr"
|
| 814 |
+
],
|
| 815 |
+
"evidence_text": "Space card title is Pashto ASR V3 and short description states Pashto ASR.",
|
| 816 |
+
"evidence_url": "https://huggingface.co/api/spaces/ihanif/pashto-asr-v3",
|
| 817 |
+
"markers": [
|
| 818 |
+
"Pashto",
|
| 819 |
+
"ASR"
|
| 820 |
+
]
|
| 821 |
+
},
|
| 822 |
+
{
|
| 823 |
+
"id": "project-hf-space-pashto2english-dictionary",
|
| 824 |
+
"title": "Pashto to English Dictionary Space",
|
| 825 |
+
"url": "https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary",
|
| 826 |
+
"category": "project",
|
| 827 |
+
"source": "huggingface",
|
| 828 |
+
"status": "verified",
|
| 829 |
+
"summary": "Streamlit project for Pashto to English dictionary lookups.",
|
| 830 |
+
"primary_use": "Interactive bilingual lookup project",
|
| 831 |
+
"tasks": [
|
| 832 |
+
"dictionary",
|
| 833 |
+
"translation",
|
| 834 |
+
"demo"
|
| 835 |
+
],
|
| 836 |
+
"tags": [
|
| 837 |
+
"pashto",
|
| 838 |
+
"project",
|
| 839 |
+
"huggingface-space",
|
| 840 |
+
"dictionary"
|
| 841 |
+
],
|
| 842 |
+
"evidence_text": "Space metadata title states Pashto to English Dictionary.",
|
| 843 |
+
"evidence_url": "https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary",
|
| 844 |
+
"markers": [
|
| 845 |
+
"Pashto"
|
| 846 |
+
]
|
| 847 |
+
},
|
| 848 |
+
{
|
| 849 |
+
"id": "project-hf-space-umar4321-pashto-translator",
|
| 850 |
+
"title": "Pashto Translator Space",
|
| 851 |
+
"url": "https://huggingface.co/spaces/Umar4321/Pashto-Translator",
|
| 852 |
+
"category": "project",
|
| 853 |
+
"source": "huggingface",
|
| 854 |
+
"status": "verified",
|
| 855 |
+
"summary": "Streamlit translator project for Pashto to English and Urdu conversion.",
|
| 856 |
+
"primary_use": "Interactive translation project demo",
|
| 857 |
+
"tasks": [
|
| 858 |
+
"translation",
|
| 859 |
+
"demo"
|
| 860 |
+
],
|
| 861 |
+
"tags": [
|
| 862 |
+
"pashto",
|
| 863 |
+
"project",
|
| 864 |
+
"huggingface-space",
|
| 865 |
+
"translation"
|
| 866 |
+
],
|
| 867 |
+
"evidence_text": "Space title is Pashto Translator and description states Pashto to English and Urdu translation.",
|
| 868 |
+
"evidence_url": "https://huggingface.co/api/spaces/Umar4321/Pashto-Translator",
|
| 869 |
+
"markers": [
|
| 870 |
+
"Pashto"
|
| 871 |
+
]
|
| 872 |
+
},
|
| 873 |
+
{
|
| 874 |
+
"id": "code-github-ijazul-haq-nlpashto",
|
| 875 |
+
"title": "nlpashto Toolkit",
|
| 876 |
+
"url": "https://github.com/ijazul-haq/nlpashto",
|
| 877 |
+
"category": "code",
|
| 878 |
+
"source": "github",
|
| 879 |
+
"status": "verified",
|
| 880 |
+
"summary": "Pashto NLP toolkit codebase for tokenization, embeddings, and downstream NLP workflows.",
|
| 881 |
+
"primary_use": "Pashto NLP code integration and experimentation",
|
| 882 |
+
"tasks": [
|
| 883 |
+
"nlp",
|
| 884 |
+
"tooling"
|
| 885 |
+
],
|
| 886 |
+
"tags": [
|
| 887 |
+
"pashto",
|
| 888 |
+
"code",
|
| 889 |
+
"github",
|
| 890 |
+
"nlp"
|
| 891 |
+
],
|
| 892 |
+
"evidence_text": "Repository name and description explicitly identify a Pashto NLP toolkit.",
|
| 893 |
+
"evidence_url": "https://api.github.com/repos/ijazul-haq/nlpashto",
|
| 894 |
+
"markers": [
|
| 895 |
+
"Pashto",
|
| 896 |
+
"NLP"
|
| 897 |
+
]
|
| 898 |
}
|
| 899 |
]
|
| 900 |
}
|
resources/README.md
CHANGED
|
@@ -3,11 +3,13 @@
|
|
| 3 |
Structured, Pashto-focused resource tracking lives in this folder.
|
| 4 |
|
| 5 |
## Sections
|
| 6 |
-
- Datasets (11): [datasets/README.md](datasets/README.md)
|
| 7 |
- Models (9): [models/README.md](models/README.md)
|
| 8 |
- Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
|
| 9 |
- Tools (2): [tools/README.md](tools/README.md)
|
| 10 |
- Papers (4): [papers/README.md](papers/README.md)
|
|
|
|
|
|
|
| 11 |
|
| 12 |
## Machine-Readable Catalog
|
| 13 |
- Canonical catalog: [catalog/resources.json](catalog/resources.json)
|
|
@@ -20,4 +22,4 @@ Structured, Pashto-focused resource tracking lives in this folder.
|
|
| 20 |
- Run `python scripts/validate_resource_catalog.py` before opening a PR.
|
| 21 |
- Run `python scripts/generate_resource_views.py` after catalog changes.
|
| 22 |
|
| 23 |
-
Verified resource count: `30`
|
|
|
|
| 3 |
Structured, Pashto-focused resource tracking lives in this folder.
|
| 4 |
|
| 5 |
## Sections
|
| 6 |
+
- Datasets (14): [datasets/README.md](datasets/README.md)
|
| 7 |
- Models (9): [models/README.md](models/README.md)
|
| 8 |
- Benchmarks (4): [benchmarks/README.md](benchmarks/README.md)
|
| 9 |
- Tools (2): [tools/README.md](tools/README.md)
|
| 10 |
- Papers (4): [papers/README.md](papers/README.md)
|
| 11 |
+
- Projects (3): [projects/README.md](projects/README.md)
|
| 12 |
+
- Code (1): [codes/README.md](codes/README.md)
|
| 13 |
|
| 14 |
## Machine-Readable Catalog
|
| 15 |
- Canonical catalog: [catalog/resources.json](catalog/resources.json)
|
|
|
|
| 22 |
- Run `python scripts/validate_resource_catalog.py` before opening a PR.
|
| 23 |
- Run `python scripts/generate_resource_views.py` after catalog changes.
|
| 24 |
|
| 25 |
+
Verified resource count: `37`
|
resources/catalog/pending_candidates.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
resources/catalog/resource.template.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"id": "example-resource-id",
|
| 3 |
"title": "Example Resource Title",
|
| 4 |
"url": "https://example.org/resource",
|
| 5 |
-
"category": "
|
| 6 |
"source": "other",
|
| 7 |
"status": "verified",
|
| 8 |
"summary": "One-line summary explaining why this resource matters for Pashto in technology.",
|
|
|
|
| 2 |
"id": "example-resource-id",
|
| 3 |
"title": "Example Resource Title",
|
| 4 |
"url": "https://example.org/resource",
|
| 5 |
+
"category": "project",
|
| 6 |
"source": "other",
|
| 7 |
"status": "verified",
|
| 8 |
"summary": "One-line summary explaining why this resource matters for Pashto in technology.",
|
resources/catalog/resources.json
CHANGED
|
@@ -782,6 +782,201 @@
|
|
| 782 |
"whisper",
|
| 783 |
"fleurs"
|
| 784 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
}
|
| 786 |
]
|
| 787 |
}
|
|
|
|
| 782 |
"whisper",
|
| 783 |
"fleurs"
|
| 784 |
]
|
| 785 |
+
},
|
| 786 |
+
{
|
| 787 |
+
"id": "dataset-kaggle-pold-pashto-offensive",
|
| 788 |
+
"title": "POLD - Pashto Offensive Language Dataset",
|
| 789 |
+
"url": "https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset",
|
| 790 |
+
"category": "dataset",
|
| 791 |
+
"source": "kaggle",
|
| 792 |
+
"status": "verified",
|
| 793 |
+
"summary": "Benchmark dataset for offensive content detection in Pashto social text.",
|
| 794 |
+
"primary_use": "Pashto toxicity and moderation NLP benchmarks",
|
| 795 |
+
"license": "CC BY 4.0",
|
| 796 |
+
"tasks": [
|
| 797 |
+
"nlp",
|
| 798 |
+
"classification"
|
| 799 |
+
],
|
| 800 |
+
"pashto_evidence": {
|
| 801 |
+
"evidence_text": "Kaggle title and description explicitly state Pashto offensive language benchmark dataset.",
|
| 802 |
+
"evidence_url": "https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset",
|
| 803 |
+
"markers": [
|
| 804 |
+
"Pashto"
|
| 805 |
+
]
|
| 806 |
+
},
|
| 807 |
+
"tags": [
|
| 808 |
+
"pashto",
|
| 809 |
+
"kaggle",
|
| 810 |
+
"nlp",
|
| 811 |
+
"toxicity"
|
| 812 |
+
]
|
| 813 |
+
},
|
| 814 |
+
{
|
| 815 |
+
"id": "dataset-kaggle-pashto-english-sentiment-corpus",
|
| 816 |
+
"title": "Pashto English Bilingual Sentiment Corpus",
|
| 817 |
+
"url": "https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus",
|
| 818 |
+
"category": "dataset",
|
| 819 |
+
"source": "kaggle",
|
| 820 |
+
"status": "verified",
|
| 821 |
+
"summary": "Pashto to English bilingual sentiment corpus useful for low-resource sentiment tasks.",
|
| 822 |
+
"primary_use": "Sentiment analysis and bilingual NLP experiments",
|
| 823 |
+
"license": "CC BY-NC-SA 4.0",
|
| 824 |
+
"tasks": [
|
| 825 |
+
"nlp",
|
| 826 |
+
"sentiment"
|
| 827 |
+
],
|
| 828 |
+
"pashto_evidence": {
|
| 829 |
+
"evidence_text": "Kaggle dataset title and description identify the corpus as Pashto-English sentiment data.",
|
| 830 |
+
"evidence_url": "https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus",
|
| 831 |
+
"markers": [
|
| 832 |
+
"Pashto"
|
| 833 |
+
]
|
| 834 |
+
},
|
| 835 |
+
"tags": [
|
| 836 |
+
"pashto",
|
| 837 |
+
"kaggle",
|
| 838 |
+
"sentiment",
|
| 839 |
+
"bilingual"
|
| 840 |
+
]
|
| 841 |
+
},
|
| 842 |
+
{
|
| 843 |
+
"id": "dataset-kaggle-urdu-pashto-lexicon",
|
| 844 |
+
"title": "Urdu-Pashto Lexicon Dataset",
|
| 845 |
+
"url": "https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 846 |
+
"category": "dataset",
|
| 847 |
+
"source": "kaggle",
|
| 848 |
+
"status": "verified",
|
| 849 |
+
"summary": "Lexicon of Urdu words with Pashto translations for dictionary and MT support.",
|
| 850 |
+
"primary_use": "Lexicon and translation lexeme mapping",
|
| 851 |
+
"license": "CC0",
|
| 852 |
+
"tasks": [
|
| 853 |
+
"nlp",
|
| 854 |
+
"mt"
|
| 855 |
+
],
|
| 856 |
+
"pashto_evidence": {
|
| 857 |
+
"evidence_text": "Kaggle metadata describes 7,601 Urdu entries with Pashto translations.",
|
| 858 |
+
"evidence_url": "https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset",
|
| 859 |
+
"markers": [
|
| 860 |
+
"Pashto"
|
| 861 |
+
]
|
| 862 |
+
},
|
| 863 |
+
"tags": [
|
| 864 |
+
"pashto",
|
| 865 |
+
"kaggle",
|
| 866 |
+
"lexicon",
|
| 867 |
+
"translation"
|
| 868 |
+
]
|
| 869 |
+
},
|
| 870 |
+
{
|
| 871 |
+
"id": "project-hf-space-ihanif-pashto-asr-v3",
|
| 872 |
+
"title": "Pashto ASR V3 Space",
|
| 873 |
+
"url": "https://huggingface.co/spaces/ihanif/pashto-asr-v3",
|
| 874 |
+
"category": "project",
|
| 875 |
+
"source": "huggingface",
|
| 876 |
+
"status": "verified",
|
| 877 |
+
"summary": "Interactive Hugging Face Space for Pashto automatic speech recognition demos.",
|
| 878 |
+
"primary_use": "Project demo for Pashto ASR user testing",
|
| 879 |
+
"tasks": [
|
| 880 |
+
"asr",
|
| 881 |
+
"demo"
|
| 882 |
+
],
|
| 883 |
+
"pashto_evidence": {
|
| 884 |
+
"evidence_text": "Space card title is Pashto ASR V3 and short description states Pashto ASR.",
|
| 885 |
+
"evidence_url": "https://huggingface.co/api/spaces/ihanif/pashto-asr-v3",
|
| 886 |
+
"markers": [
|
| 887 |
+
"Pashto",
|
| 888 |
+
"ASR"
|
| 889 |
+
]
|
| 890 |
+
},
|
| 891 |
+
"tags": [
|
| 892 |
+
"pashto",
|
| 893 |
+
"project",
|
| 894 |
+
"huggingface-space",
|
| 895 |
+
"asr"
|
| 896 |
+
]
|
| 897 |
+
},
|
| 898 |
+
{
|
| 899 |
+
"id": "project-hf-space-pashto2english-dictionary",
|
| 900 |
+
"title": "Pashto to English Dictionary Space",
|
| 901 |
+
"url": "https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary",
|
| 902 |
+
"category": "project",
|
| 903 |
+
"source": "huggingface",
|
| 904 |
+
"status": "verified",
|
| 905 |
+
"summary": "Streamlit project for Pashto to English dictionary lookups.",
|
| 906 |
+
"primary_use": "Interactive bilingual lookup project",
|
| 907 |
+
"tasks": [
|
| 908 |
+
"dictionary",
|
| 909 |
+
"translation",
|
| 910 |
+
"demo"
|
| 911 |
+
],
|
| 912 |
+
"pashto_evidence": {
|
| 913 |
+
"evidence_text": "Space metadata title states Pashto to English Dictionary.",
|
| 914 |
+
"evidence_url": "https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary",
|
| 915 |
+
"markers": [
|
| 916 |
+
"Pashto"
|
| 917 |
+
]
|
| 918 |
+
},
|
| 919 |
+
"tags": [
|
| 920 |
+
"pashto",
|
| 921 |
+
"project",
|
| 922 |
+
"huggingface-space",
|
| 923 |
+
"dictionary"
|
| 924 |
+
]
|
| 925 |
+
},
|
| 926 |
+
{
|
| 927 |
+
"id": "project-hf-space-umar4321-pashto-translator",
|
| 928 |
+
"title": "Pashto Translator Space",
|
| 929 |
+
"url": "https://huggingface.co/spaces/Umar4321/Pashto-Translator",
|
| 930 |
+
"category": "project",
|
| 931 |
+
"source": "huggingface",
|
| 932 |
+
"status": "verified",
|
| 933 |
+
"summary": "Streamlit translator project for Pashto to English and Urdu conversion.",
|
| 934 |
+
"primary_use": "Interactive translation project demo",
|
| 935 |
+
"tasks": [
|
| 936 |
+
"translation",
|
| 937 |
+
"demo"
|
| 938 |
+
],
|
| 939 |
+
"pashto_evidence": {
|
| 940 |
+
"evidence_text": "Space title is Pashto Translator and description states Pashto to English and Urdu translation.",
|
| 941 |
+
"evidence_url": "https://huggingface.co/api/spaces/Umar4321/Pashto-Translator",
|
| 942 |
+
"markers": [
|
| 943 |
+
"Pashto"
|
| 944 |
+
]
|
| 945 |
+
},
|
| 946 |
+
"tags": [
|
| 947 |
+
"pashto",
|
| 948 |
+
"project",
|
| 949 |
+
"huggingface-space",
|
| 950 |
+
"translation"
|
| 951 |
+
]
|
| 952 |
+
},
|
| 953 |
+
{
|
| 954 |
+
"id": "code-github-ijazul-haq-nlpashto",
|
| 955 |
+
"title": "nlpashto Toolkit",
|
| 956 |
+
"url": "https://github.com/ijazul-haq/nlpashto",
|
| 957 |
+
"category": "code",
|
| 958 |
+
"source": "github",
|
| 959 |
+
"status": "verified",
|
| 960 |
+
"summary": "Pashto NLP toolkit codebase for tokenization, embeddings, and downstream NLP workflows.",
|
| 961 |
+
"primary_use": "Pashto NLP code integration and experimentation",
|
| 962 |
+
"tasks": [
|
| 963 |
+
"nlp",
|
| 964 |
+
"tooling"
|
| 965 |
+
],
|
| 966 |
+
"pashto_evidence": {
|
| 967 |
+
"evidence_text": "Repository name and description explicitly identify a Pashto NLP toolkit.",
|
| 968 |
+
"evidence_url": "https://api.github.com/repos/ijazul-haq/nlpashto",
|
| 969 |
+
"markers": [
|
| 970 |
+
"Pashto",
|
| 971 |
+
"NLP"
|
| 972 |
+
]
|
| 973 |
+
},
|
| 974 |
+
"tags": [
|
| 975 |
+
"pashto",
|
| 976 |
+
"code",
|
| 977 |
+
"github",
|
| 978 |
+
"nlp"
|
| 979 |
+
]
|
| 980 |
}
|
| 981 |
]
|
| 982 |
}
|
resources/codes/README.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code
|
| 2 |
+
|
| 3 |
+
## Verified Pashto Resources
|
| 4 |
+
|
| 5 |
+
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
+
|---|---|---|---|
|
| 7 |
+
| nlpashto Toolkit | [github](https://github.com/ijazul-haq/nlpashto) | [Repository name and description explicitly identify a Pashto NLP toolkit. (`Pashto`, `NLP`)](https://api.github.com/repos/ijazul-haq/nlpashto) | Pashto NLP code integration and experimentation |
|
| 8 |
+
|
| 9 |
+
## Maintenance
|
| 10 |
+
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
| 11 |
+
- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
|
| 12 |
+
- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
|
resources/datasets/README.md
CHANGED
|
@@ -10,9 +10,12 @@
|
|
| 10 |
| Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
|
| 11 |
| OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
|
| 12 |
| OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
|
|
|
|
| 13 |
| Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
|
| 14 |
| Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
|
| 15 |
| Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
|
|
|
|
|
|
|
| 16 |
| Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
|
| 17 |
| Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
|
| 18 |
|
|
|
|
| 10 |
| Google FLEURS | [huggingface](https://huggingface.co/datasets/google/fleurs) | [Dataset config includes ps_af. (`ps_af`)](https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py) | Speech benchmark and external evaluation |
|
| 11 |
| OPUS-100 | [huggingface](https://huggingface.co/datasets/Helsinki-NLP/opus-100) | [Dataset viewer includes en-ps split. (`en-ps`)](https://huggingface.co/datasets/Helsinki-NLP/opus-100/viewer/en-ps) | Machine translation training and evaluation |
|
| 12 |
| OSCAR Corpus | [huggingface](https://huggingface.co/datasets/oscar-corpus/oscar) | [Dataset includes unshuffled_deduplicated_ps split. (`unshuffled_deduplicated_ps`)](https://huggingface.co/datasets/oscar-corpus/oscar) | Language modeling and lexicon expansion |
|
| 13 |
+
| Pashto English Bilingual Sentiment Corpus | [kaggle](https://www.kaggle.com/datasets/farhadkhan66/pashto-translated-corpus) | [Kaggle dataset title and description identify the corpus as Pashto-English sentiment data. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/farhadkhan66/pashto-translated-corpus) | Sentiment analysis and bilingual NLP experiments |
|
| 14 |
| Pashto Isolated Words Speech Dataset | [kaggle](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | [Dataset title explicitly states Pashto speech dataset. (`Pashto`)](https://www.kaggle.com/datasets/engrirf/pashto-isolated-words-speech-dataset) | Keyword spotting and constrained ASR experiments |
|
| 15 |
| Pashto Wikipedia Corpus | [huggingface](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | [Dataset metadata includes language:ps and the title specifies Pashto corpus. (`ps`, `Pashto`)](https://huggingface.co/datasets/ihanif/pashto-wikipedia-corpus) | Pashto text corpus for NLP baselines |
|
| 16 |
| Pashto Word Embeddings | [kaggle](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | [Dataset description states pretrained Pashto embeddings. (`Pashto`)](https://www.kaggle.com/datasets/drijaz/pashto-word-embeddings) | Lexical semantics and lightweight NLP baselines |
|
| 17 |
+
| POLD - Pashto Offensive Language Dataset | [kaggle](https://www.kaggle.com/datasets/drijaz/pold-pashto-offensive-language-dataset) | [Kaggle title and description explicitly state Pashto offensive language benchmark dataset. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/drijaz/pold-pashto-offensive-language-dataset) | Pashto toxicity and moderation NLP benchmarks |
|
| 18 |
+
| Urdu-Pashto Lexicon Dataset | [kaggle](https://www.kaggle.com/datasets/shafeeqgigyani/urdu-pashto-lexicon-dataset) | [Kaggle metadata describes 7,601 Urdu entries with Pashto translations. (`Pashto`)](https://www.kaggle.com/api/v1/datasets/view/shafeeqgigyani/urdu-pashto-lexicon-dataset) | Lexicon and translation lexeme mapping |
|
| 19 |
| Wikimedia Wikipedia | [huggingface](https://huggingface.co/datasets/wikimedia/wikipedia) | [Dataset includes 20231101.ps subset. (`20231101.ps`)](https://huggingface.co/datasets/wikimedia/wikipedia) | Terminology and balanced text corpus |
|
| 20 |
| Zirak-AI PashtoOCR | [huggingface](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | [Dataset tags include language:ps and the dataset name is PashtoOCR. (`ps`, `PashtoOCR`)](https://huggingface.co/datasets/zirak-ai/PashtoOCR) | OCR and text extraction benchmarking |
|
| 21 |
|
resources/projects/README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Projects
|
| 2 |
+
|
| 3 |
+
## Verified Pashto Resources
|
| 4 |
+
|
| 5 |
+
| Resource | Link | Pashto Evidence | Primary Use |
|
| 6 |
+
|---|---|---|---|
|
| 7 |
+
| Pashto ASR V3 Space | [huggingface](https://huggingface.co/spaces/ihanif/pashto-asr-v3) | [Space card title is Pashto ASR V3 and short description states Pashto ASR. (`Pashto`, `ASR`)](https://huggingface.co/api/spaces/ihanif/pashto-asr-v3) | Project demo for Pashto ASR user testing |
|
| 8 |
+
| Pashto to English Dictionary Space | [huggingface](https://huggingface.co/spaces/EngrAamirBangash/Pashto2English-Dictionary) | [Space metadata title states Pashto to English Dictionary. (`Pashto`)](https://huggingface.co/api/spaces/EngrAamirBangash/Pashto2English-Dictionary) | Interactive bilingual lookup project |
|
| 9 |
+
| Pashto Translator Space | [huggingface](https://huggingface.co/spaces/Umar4321/Pashto-Translator) | [Space title is Pashto Translator and description states Pashto to English and Urdu translation. (`Pashto`)](https://huggingface.co/api/spaces/Umar4321/Pashto-Translator) | Interactive translation project demo |
|
| 10 |
+
|
| 11 |
+
## Maintenance
|
| 12 |
+
- Source of truth: [../catalog/resources.json](../catalog/resources.json)
|
| 13 |
+
- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)
|
| 14 |
+
- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)
|
resources/schema/resource.schema.json
CHANGED
|
@@ -62,7 +62,9 @@
|
|
| 62 |
"model",
|
| 63 |
"benchmark",
|
| 64 |
"tool",
|
| 65 |
-
"paper"
|
|
|
|
|
|
|
| 66 |
]
|
| 67 |
},
|
| 68 |
"source": {
|
|
|
|
| 62 |
"model",
|
| 63 |
"benchmark",
|
| 64 |
"tool",
|
| 65 |
+
"paper",
|
| 66 |
+
"project",
|
| 67 |
+
"code"
|
| 68 |
]
|
| 69 |
},
|
| 70 |
"source": {
|
scripts/README.md
CHANGED
|
@@ -7,7 +7,7 @@ Automation scripts for quality checks, resource catalog validation, and search i
|
|
| 7 |
- `check_links.py`: ensure markdown links are clickable (optional online reachability check).
|
| 8 |
- `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
|
| 9 |
- `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
|
| 10 |
-
- `sync_resources.py`: collect new candidate Pashto resources from
|
| 11 |
- `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
|
| 12 |
|
| 13 |
## Usage
|
|
|
|
| 7 |
- `check_links.py`: ensure markdown links are clickable (optional online reachability check).
|
| 8 |
- `validate_resource_catalog.py`: validate `resources/catalog/resources.json`.
|
| 9 |
- `generate_resource_views.py`: generate `resources/*/README.md`, `resources/README.md`, and `docs/search/resources.json` from the catalog.
|
| 10 |
+
- `sync_resources.py`: collect new candidate Pashto resources from Kaggle, Hugging Face (datasets/models/spaces), GitHub repositories, and paper endpoints into `resources/catalog/pending_candidates.json`.
|
| 11 |
- `run_resource_cycle.py`: run the full repeatable resource cycle with one command.
|
| 12 |
|
| 13 |
## Usage
|
scripts/generate_resource_views.py
CHANGED
|
@@ -17,6 +17,8 @@ CATEGORY_CONFIG = {
|
|
| 17 |
"benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
|
| 18 |
"tool": ("resources/tools/README.md", "Tools"),
|
| 19 |
"paper": ("resources/papers/README.md", "Papers"),
|
|
|
|
|
|
|
| 20 |
}
|
| 21 |
|
| 22 |
|
|
@@ -86,6 +88,8 @@ def _write_resources_home(path: Path, counts: dict[str, int], total_verified: in
|
|
| 86 |
f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
|
| 87 |
f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
|
| 88 |
f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
|
|
|
|
|
|
|
| 89 |
"",
|
| 90 |
"## Machine-Readable Catalog",
|
| 91 |
"- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
|
|
|
|
| 17 |
"benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
|
| 18 |
"tool": ("resources/tools/README.md", "Tools"),
|
| 19 |
"paper": ("resources/papers/README.md", "Papers"),
|
| 20 |
+
"project": ("resources/projects/README.md", "Projects"),
|
| 21 |
+
"code": ("resources/codes/README.md", "Code"),
|
| 22 |
}
|
| 23 |
|
| 24 |
|
|
|
|
| 88 |
f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
|
| 89 |
f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
|
| 90 |
f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
|
| 91 |
+
f"- Projects ({counts.get('project', 0)}): [projects/README.md](projects/README.md)",
|
| 92 |
+
f"- Code ({counts.get('code', 0)}): [codes/README.md](codes/README.md)",
|
| 93 |
"",
|
| 94 |
"## Machine-Readable Catalog",
|
| 95 |
"- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
|
scripts/sync_resources.py
CHANGED
|
@@ -108,6 +108,138 @@ def fetch_huggingface(kind: str, limit: int) -> list[dict[str, Any]]:
|
|
| 108 |
return out
|
| 109 |
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
|
| 112 |
query = urllib.parse.urlencode(
|
| 113 |
{"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
|
|
@@ -228,8 +360,11 @@ def main() -> int:
|
|
| 228 |
sources_used: list[str] = []
|
| 229 |
|
| 230 |
fetch_steps = [
|
|
|
|
| 231 |
("huggingface-datasets", lambda: fetch_huggingface("datasets", args.limit)),
|
| 232 |
("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
|
|
|
|
|
|
|
| 233 |
("arxiv", lambda: fetch_arxiv(args.limit)),
|
| 234 |
("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
|
| 235 |
]
|
|
|
|
| 108 |
return out
|
| 109 |
|
| 110 |
|
| 111 |
+
def fetch_huggingface_spaces(limit: int) -> list[dict[str, Any]]:
    """Collect Hugging Face Spaces matching the keyword "pashto" as project candidates.

    Args:
        limit: Forwarded to the Spaces listing endpoint as its ``limit``
            query parameter.

    Returns:
        One ``_candidate(...)`` record (category ``"project"``, source
        ``"huggingface"``) per listed Space that carries a non-empty ``id``.
    """
    params = urllib.parse.urlencode({"search": "pashto", "limit": str(limit)})
    # presumably _fetch_json returns the decoded JSON array of Space entries;
    # verify against its definition.
    listing = _fetch_json(f"https://huggingface.co/api/spaces?{params}")

    candidates: list[dict[str, Any]] = []
    for entry in listing:
        identifier = entry.get("id")
        if identifier:
            # The public Space page doubles as the resource URL and the evidence URL.
            page = f"https://huggingface.co/spaces/{identifier}"
            candidates.append(
                _candidate(
                    rid=f"candidate-hf-project-{_slug(identifier)}",
                    title=identifier,
                    url=page,
                    category="project",
                    source="huggingface",
                    summary="Candidate project app returned from Hugging Face Spaces Pashto search.",
                    evidence_text="Matched by Pashto keyword in Hugging Face Spaces search.",
                    evidence_url=page,
                    markers=["pashto"],
                    tags=["pashto", "candidate", "project", "space"],
                )
            )
    return candidates
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def fetch_kaggle_datasets(limit: int) -> list[dict[str, Any]]:
    """Collect Kaggle datasets whose title or subtitle mentions Pashto.

    Only the first page of the Kaggle dataset search for "pashto" is read.

    Args:
        limit: Collection stops as soon as this many candidates have been
            gathered (the check runs after each append).

    Returns:
        ``_candidate(...)`` records with category ``"dataset"`` and source
        ``"kaggle"``.
    """

    def _clean(record: dict[str, Any], key: str) -> str:
        # Missing or null fields collapse to "" before whitespace is trimmed.
        return (record.get(key) or "").strip()

    # Kaggle's dataset listing endpoint; the original author notes that list
    # responses are served without authentication (not verifiable from here).
    params = urllib.parse.urlencode({"search": "pashto", "page": "1"})
    listing = _fetch_json(f"https://www.kaggle.com/api/v1/datasets/list?{params}")

    candidates: list[dict[str, Any]] = []
    for record in listing:
        name = _clean(record, "titleNullable")
        link = _clean(record, "urlNullable")
        owner_ref = _clean(record, "ownerRefNullable")
        tagline = _clean(record, "subtitleNullable")
        if not (name and link):
            continue

        # Keep only hits that spell out the language in the title or subtitle.
        haystack = f"{name} {tagline}".lower()
        if not any(word in haystack for word in ("pashto", "pukhto")):
            continue

        # Prefix the owner, when known, so equal titles from different owners
        # are slugged from different strings.
        slug_source = f"{owner_ref}/{name}" if owner_ref else name
        candidates.append(
            _candidate(
                rid=f"candidate-kaggle-dataset-{_slug(slug_source)}",
                title=name,
                url=link,
                category="dataset",
                source="kaggle",
                summary=(tagline or "Candidate Kaggle dataset returned from Pashto search.")[:240],
                evidence_text="Kaggle dataset title/subtitle includes Pashto keyword.",
                evidence_url=link,
                markers=["Pashto"],
                tags=["pashto", "candidate", "dataset", "kaggle"],
            )
        )
        if len(candidates) >= limit:
            break
    return candidates
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def fetch_github_pashto_repos(limit: int) -> list[dict[str, Any]]:
    """Collect Pashto-related GitHub repositories as project or code candidates.

    Two repository searches are issued (topic match first, then keyword
    match), their results are merged by ``full_name``, and the merged set is
    walked in descending star order.

    Args:
        limit: Used as ``per_page`` for each search and as the cap on the
            number of candidates returned (checked after each append).

    Returns:
        ``_candidate(...)`` records with source ``"github"`` and category
        ``"code"`` or ``"project"``.
    """
    # Topic search for precision, keyword search for recall.
    searches = ("topic:pashto", "pashto in:name,description,readme")

    # Keyed by full_name so a repository returned by both searches is kept once;
    # the later search's payload overwrites the earlier one.
    repos_by_name: dict[str, dict[str, Any]] = {}
    for search in searches:
        params = urllib.parse.urlencode(
            {"q": search, "sort": "stars", "order": "desc", "per_page": str(limit)}
        )
        response = _fetch_json(f"https://api.github.com/search/repositories?{params}")
        for repo in response.get("items", []):
            repo_name = repo.get("full_name")
            if repo_name and repo.get("html_url"):
                repos_by_name[repo_name] = repo

    by_stars = sorted(
        repos_by_name.values(),
        key=lambda repo: repo.get("stargazers_count", 0),
        reverse=True,
    )

    candidates: list[dict[str, Any]] = []
    for repo in by_stars:
        repo_name = repo["full_name"]
        raw_description = repo.get("description") or ""
        topics = repo.get("topics") or []
        metadata = " ".join(
            [repo_name, repo.get("name") or "", raw_description, " ".join(topics)]
        ).lower()

        # The README-scoped search can return repositories whose own metadata
        # never names the language; those are dropped here.
        if not any(word in metadata for word in ("pashto", "pukhto")):
            continue

        link = repo["html_url"]
        # NOTE(review): these are substring checks, so e.g. "api" also matches
        # inside "rapid" and "code" inside "unicode".
        code_hints = ("toolkit", "library", "nlp", "asr", "tts", "ocr", "api", "code")
        category = "code" if any(hint in metadata for hint in code_hints) else "project"

        blurb = raw_description.strip() or "Candidate Pashto-related GitHub repository."
        candidates.append(
            _candidate(
                rid=f"candidate-gh-{category}-{_slug(repo_name)}",
                title=repo_name,
                url=link,
                category=category,
                source="github",
                summary=blurb[:240],
                evidence_text="Repository metadata (name/description/topics) includes Pashto markers.",
                evidence_url=link,
                markers=["pashto"],
                tags=["pashto", "candidate", category, "github", *topics[:3]],
            )
        )
        if len(candidates) >= limit:
            break
    return candidates
|
| 241 |
+
|
| 242 |
+
|
| 243 |
def fetch_arxiv(limit: int) -> list[dict[str, Any]]:
|
| 244 |
query = urllib.parse.urlencode(
|
| 245 |
{"search_query": "all:pashto", "start": "0", "max_results": str(limit)}
|
|
|
|
| 360 |
sources_used: list[str] = []
|
| 361 |
|
| 362 |
fetch_steps = [
|
| 363 |
+
("kaggle-datasets", lambda: fetch_kaggle_datasets(args.limit)),
|
| 364 |
("huggingface-datasets", lambda: fetch_huggingface("datasets", args.limit)),
|
| 365 |
("huggingface-models", lambda: fetch_huggingface("models", args.limit)),
|
| 366 |
+
("huggingface-spaces", lambda: fetch_huggingface_spaces(args.limit)),
|
| 367 |
+
("github-repositories", lambda: fetch_github_pashto_repos(args.limit)),
|
| 368 |
("arxiv", lambda: fetch_arxiv(args.limit)),
|
| 369 |
("semantic-scholar", lambda: fetch_semantic_scholar(args.limit)),
|
| 370 |
]
|
scripts/validate_resource_catalog.py
CHANGED
|
@@ -16,7 +16,7 @@ from typing import Any
|
|
| 16 |
from urllib.parse import urlparse
|
| 17 |
|
| 18 |
|
| 19 |
-
ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper"}
|
| 20 |
ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
|
| 21 |
ALLOWED_STATUS = {"verified", "candidate"}
|
| 22 |
RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
|
|
|
|
| 16 |
from urllib.parse import urlparse
|
| 17 |
|
| 18 |
|
| 19 |
+
ALLOWED_CATEGORIES = {"dataset", "model", "benchmark", "tool", "paper", "project", "code"}
|
| 20 |
ALLOWED_SOURCES = {"huggingface", "mozilla", "kaggle", "github", "arxiv", "meta", "other"}
|
| 21 |
ALLOWED_STATUS = {"verified", "candidate"}
|
| 22 |
RESOURCE_ID_RE = re.compile(r"^[a-z0-9][a-z0-9._-]*$")
|