"""Generate markdown resource views and search index from catalog JSON. Usage: python scripts/generate_resource_views.py """ from __future__ import annotations import json from pathlib import Path from typing import Any CATEGORY_CONFIG = { "dataset": ("resources/datasets/README.md", "Datasets"), "model": ("resources/models/README.md", "Models"), "benchmark": ("resources/benchmarks/README.md", "Benchmarks"), "tool": ("resources/tools/README.md", "Tools"), "paper": ("resources/papers/README.md", "Papers"), "project": ("resources/projects/README.md", "Projects"), "code": ("resources/codes/README.md", "Code"), } def _load_catalog(path: Path) -> dict[str, Any]: return json.loads(path.read_text(encoding="utf-8")) def _escape_cell(value: str) -> str: return value.replace("|", "\\|").strip() def _marker_text(markers: list[str]) -> str: return ", ".join(f"`{marker}`" for marker in markers) def _resource_row(resource: dict[str, Any]) -> str: evidence = resource["pashto_evidence"] evidence_text = _escape_cell(evidence["evidence_text"]) markers = _marker_text(evidence["markers"]) if markers: evidence_text = f"{evidence_text} ({markers})" return ( f"| {_escape_cell(resource['title'])} | " f"[{resource['source']}]({resource['url']}) | " f"[{evidence_text}]({evidence['evidence_url']}) | " f"{_escape_cell(resource['primary_use'])} |" ) def _write_markdown_table(path: Path, title: str, resources: list[dict[str, Any]]) -> None: lines = [ f"# {title}", "", "## Verified Pashto Resources", "", "| Resource | Link | Pashto Evidence | Primary Use |", "|---|---|---|---|", ] if resources: lines.extend(_resource_row(resource) for resource in resources) else: lines.append("| _None yet_ | - | - | - |") lines.extend( [ "", "## Maintenance", "- Source of truth: [../catalog/resources.json](../catalog/resources.json)", "- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)", "- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)", "", ] ) path.write_text("\n".join(lines), encoding="utf-8") def _write_resources_home(path: Path, counts: dict[str, int], total_verified: int) -> None: lines = [ "# Resources", "", "Structured, Pashto-focused resource tracking lives in this folder.", "", "## Sections", f"- Datasets ({counts.get('dataset', 0)}): [datasets/README.md](datasets/README.md)", f"- Models ({counts.get('model', 0)}): [models/README.md](models/README.md)", f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)", f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)", f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)", f"- Projects ({counts.get('project', 0)}): [projects/README.md](projects/README.md)", f"- Code ({counts.get('code', 0)}): [codes/README.md](codes/README.md)", "", "## Machine-Readable Catalog", "- Canonical catalog: [catalog/resources.json](catalog/resources.json)", "- Candidate feed: [catalog/pending_candidates.json](catalog/pending_candidates.json)", "- Schema: [schema/resource.schema.json](schema/resource.schema.json)", "", "## Update Rule", "- Add only validated resources with explicit Pashto relevance.", "- Keep every external reference clickable using markdown links.", "- Run `python scripts/validate_resource_catalog.py` before opening a PR.", "- Run `python scripts/generate_resource_views.py` after catalog changes.", "", f"Verified resource count: `{total_verified}`", "", ] path.write_text("\n".join(lines), encoding="utf-8") def _build_search_payload(resources: list[dict[str, Any]], updated_on: str) -> dict[str, Any]: search_items: list[dict[str, Any]] = [] for resource in resources: evidence = resource["pashto_evidence"] search_items.append( { "id": resource["id"], "title": resource["title"], "url": resource["url"], "category": resource["category"], "source": resource["source"], "status": resource["status"], "summary": resource["summary"], "primary_use": resource["primary_use"], "tasks": resource.get("tasks", []), "tags": resource["tags"], "evidence_text": evidence["evidence_text"], "evidence_url": evidence["evidence_url"], "markers": evidence["markers"], } ) return { "generated_on": f"{updated_on}T00:00:00Z", "count": len(search_items), "resources": search_items, } def main() -> int: catalog_path = Path("resources/catalog/resources.json") catalog = _load_catalog(catalog_path) resources: list[dict[str, Any]] = catalog.get("resources", []) updated_on = catalog.get("updated_on", "1970-01-01") verified = [resource for resource in resources if resource.get("status") == "verified"] grouped: dict[str, list[dict[str, Any]]] = {category: [] for category in CATEGORY_CONFIG} for resource in verified: category = resource.get("category") if category in grouped: grouped[category].append(resource) for category, (file_path, title) in CATEGORY_CONFIG.items(): output_path = Path(file_path) output_path.parent.mkdir(parents=True, exist_ok=True) rows = sorted(grouped[category], key=lambda item: item["title"].lower()) _write_markdown_table(output_path, title, rows) counts = {category: len(items) for category, items in grouped.items()} _write_resources_home(Path("resources/README.md"), counts, len(verified)) search_payload = _build_search_payload(resources, updated_on) search_json_path = Path("docs/search/resources.json") search_json_path.parent.mkdir(parents=True, exist_ok=True) search_json_path.write_text( json.dumps(search_payload, ensure_ascii=False, indent=2) + "\n", encoding="utf-8", ) print( "Generated resources markdown and search index: " f"{len(verified)} verified resources, {len(resources)} total resources" ) return 0 if __name__ == "__main__": raise SystemExit(main())