pashto-language-resources/scripts/generate_resource_views.py

musaw

Expand resource cycle for projects/code and promote new Pashto sources

081627f 4 months ago

6.73 kB

	"""Generate markdown resource views and search index from catalog JSON.

	Usage:
	python scripts/generate_resource_views.py
	"""

	from __future__ import annotations

	import json
	from pathlib import Path
	from typing import Any


	# Maps each catalog category key to a pair of
	# (path of the generated README, heading used as that page's title).
	# main() iterates this mapping to decide which category pages to write;
	# verified resources whose category is not listed here get no table row.
	CATEGORY_CONFIG: dict[str, tuple[str, str]] = {
	    "dataset": ("resources/datasets/README.md", "Datasets"),
	    "model": ("resources/models/README.md", "Models"),
	    "benchmark": ("resources/benchmarks/README.md", "Benchmarks"),
	    "tool": ("resources/tools/README.md", "Tools"),
	    "paper": ("resources/papers/README.md", "Papers"),
	    "project": ("resources/projects/README.md", "Projects"),
	    "code": ("resources/codes/README.md", "Code"),
	}


	def _load_catalog(path: Path) -> dict[str, Any]:
	return json.loads(path.read_text(encoding="utf-8"))


	def _escape_cell(value: str) -> str:
	return value.replace("\|", "\\\|").strip()


	def _marker_text(markers: list[str]) -> str:
	return ", ".join(f"`{marker}`" for marker in markers)


	def _resource_row(resource: dict[str, Any]) -> str:
	    """Build the Markdown table row describing one catalog resource.

	    The row has four cells: the escaped title, a link labelled with the
	    resource's ``source`` pointing at its ``url``, a link labelled with the
	    evidence text (followed by the markers in parentheses when there are
	    any) pointing at ``evidence_url``, and the escaped primary use.

	    Raises:
	        KeyError: if *resource* or its ``pashto_evidence`` mapping lacks one
	            of the keys read here.
	    """
	    evidence = resource["pashto_evidence"]
	    evidence_text = _escape_cell(evidence["evidence_text"])
	    markers = _marker_text(evidence["markers"])
	    if markers:
	        evidence_text = f"{evidence_text} ({markers})"
	    cells = (
	        _escape_cell(resource["title"]),
	        f"[{resource['source']}]({resource['url']})",
	        f"[{evidence_text}]({evidence['evidence_url']})",
	        _escape_cell(resource["primary_use"]),
	    )
	    # Cells are separated by plain pipes. Backslash-escaped pipes would be
	    # rendered as literal characters and the row would not form a table.
	    return "| " + " | ".join(cells) + " |"


	def _write_markdown_table(path: Path, title: str, resources: list[dict[str, Any]]) -> None:
	lines = [
	f"# {title}",
	"",
	"## Verified Pashto Resources",
	"",
	"\| Resource \| Link \| Pashto Evidence \| Primary Use \|",
	"\|---\|---\|---\|---\|",
	]

	if resources:
	lines.extend(_resource_row(resource) for resource in resources)
	else:
	lines.append("\| _None yet_ \| - \| - \| - \|")

	lines.extend(
	[
	"",
	"## Maintenance",
	"- Source of truth: [../catalog/resources.json](../catalog/resources.json)",
	"- Validation: [../../scripts/validate_resource_catalog.py](../../scripts/validate_resource_catalog.py)",
	"- Generated by: [../../scripts/generate_resource_views.py](../../scripts/generate_resource_views.py)",
	"",
	]
	)
	path.write_text("\n".join(lines), encoding="utf-8")


	def _write_resources_home(path: Path, counts: dict[str, int], total_verified: int) -> None:
	lines = [
	"# Resources",
	"",
	"Structured, Pashto-focused resource tracking lives in this folder.",
	"",
	"## Sections",
	f"- Datasets ({counts.get('dataset', 0)}): [datasets/README.md](datasets/README.md)",
	f"- Models ({counts.get('model', 0)}): [models/README.md](models/README.md)",
	f"- Benchmarks ({counts.get('benchmark', 0)}): [benchmarks/README.md](benchmarks/README.md)",
	f"- Tools ({counts.get('tool', 0)}): [tools/README.md](tools/README.md)",
	f"- Papers ({counts.get('paper', 0)}): [papers/README.md](papers/README.md)",
	f"- Projects ({counts.get('project', 0)}): [projects/README.md](projects/README.md)",
	f"- Code ({counts.get('code', 0)}): [codes/README.md](codes/README.md)",
	"",
	"## Machine-Readable Catalog",
	"- Canonical catalog: [catalog/resources.json](catalog/resources.json)",
	"- Candidate feed: [catalog/pending_candidates.json](catalog/pending_candidates.json)",
	"- Schema: [schema/resource.schema.json](schema/resource.schema.json)",
	"",
	"## Update Rule",
	"- Add only validated resources with explicit Pashto relevance.",
	"- Keep every external reference clickable using markdown links.",
	"- Run `python scripts/validate_resource_catalog.py` before opening a PR.",
	"- Run `python scripts/generate_resource_views.py` after catalog changes.",
	"",
	f"Verified resource count: `{total_verified}`",
	"",
	]
	path.write_text("\n".join(lines), encoding="utf-8")


	def _build_search_payload(resources: list[dict[str, Any]], updated_on: str) -> dict[str, Any]:
	search_items: list[dict[str, Any]] = []
	for resource in resources:
	evidence = resource["pashto_evidence"]
	search_items.append(
	{
	"id": resource["id"],
	"title": resource["title"],
	"url": resource["url"],
	"category": resource["category"],
	"source": resource["source"],
	"status": resource["status"],
	"summary": resource["summary"],
	"primary_use": resource["primary_use"],
	"tasks": resource.get("tasks", []),
	"tags": resource["tags"],
	"evidence_text": evidence["evidence_text"],
	"evidence_url": evidence["evidence_url"],
	"markers": evidence["markers"],
	}
	)

	return {
	"generated_on": f"{updated_on}T00:00:00Z",
	"count": len(search_items),
	"resources": search_items,
	}


	def main() -> int:
	    """Regenerate every derived view from the resource catalog.

	    Reads ``resources/catalog/resources.json`` relative to the current
	    working directory, writes one Markdown page per configured category
	    (verified resources only, sorted by lower-cased title), writes the
	    resources landing page, then writes the search index covering all
	    resources regardless of status. Prints a one-line summary and returns 0.
	    """
	    catalog = _load_catalog(Path("resources/catalog/resources.json"))
	    all_resources: list[dict[str, Any]] = catalog.get("resources", [])
	    stamp = catalog.get("updated_on", "1970-01-01")
	    verified = [entry for entry in all_resources if entry.get("status") == "verified"]

	    # Bucket verified resources by category; unknown categories are ignored.
	    by_category: dict[str, list[dict[str, Any]]] = {name: [] for name in CATEGORY_CONFIG}
	    for entry in verified:
	        bucket = by_category.get(entry.get("category"))
	        if bucket is not None:
	            bucket.append(entry)

	    for name, (target, heading) in CATEGORY_CONFIG.items():
	        destination = Path(target)
	        destination.parent.mkdir(parents=True, exist_ok=True)
	        ordered = sorted(by_category[name], key=lambda entry: entry["title"].lower())
	        _write_markdown_table(destination, heading, ordered)

	    totals = {name: len(bucket) for name, bucket in by_category.items()}
	    _write_resources_home(Path("resources/README.md"), totals, len(verified))

	    payload = _build_search_payload(all_resources, stamp)
	    index_path = Path("docs/search/resources.json")
	    index_path.parent.mkdir(parents=True, exist_ok=True)
	    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
	    index_path.write_text(serialized + "\n", encoding="utf-8")

	    print(
	        "Generated resources markdown and search index: "
	        f"{len(verified)} verified resources, {len(all_resources)} total resources"
	    )
	    return 0


	# Run the generator only when executed as a script; main()'s return value
	# becomes the process exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())