"""Promote valid candidate resources into the verified catalog. Usage: python scripts/promote_candidates.py python scripts/promote_candidates.py --max-promotions 10 """ from __future__ import annotations import argparse import json from datetime import date from pathlib import Path from typing import Any try: from scripts.validate_resource_catalog import validate_resource except ModuleNotFoundError: from validate_resource_catalog import validate_resource try: from scripts.review_existing_resources import probe_resource_url except ModuleNotFoundError: from review_existing_resources import probe_resource_url PLACEHOLDER_PRIMARY_USE = "Needs maintainer review before promotion to verified catalog." NOT_FOUND_PATTERNS = ( "repository not found", "model not found", "dataset not found", "space not found", "page not found", "not found", "this repository does not exist", "we couldn't find", ) def _canonical_url(value: str) -> str: return value.rstrip("/") def _normalized_tasks(value: Any) -> list[str]: if isinstance(value, list): return [item for item in value if isinstance(item, str) and item.strip()] return [] def _prepare_candidate(candidate: dict[str, Any]) -> dict[str, Any]: promoted = dict(candidate) promoted["status"] = "verified" promoted["tasks"] = _normalized_tasks(promoted.get("tasks")) primary_use = str(promoted.get("primary_use", "")).strip() if primary_use == PLACEHOLDER_PRIMARY_USE: promoted["primary_use"] = "Automated discovery entry for Pashto resource tracking." return promoted def _candidate_url_unavailable(url: str, timeout: float) -> bool: probe = probe_resource_url(url, timeout) if probe.hard_missing: return True if probe.content_sample: lowered = probe.content_sample.casefold() if any(pattern in lowered for pattern in NOT_FOUND_PATTERNS): return True return False def promote_candidates( catalog: dict[str, Any], pending_payload: dict[str, Any], *, max_promotions: int | None = None, verify_urls: bool = False, url_timeout: float = 10.0, ) -> tuple[list[dict[str, Any]], dict[str, int]]: resources = catalog.get("resources") if not isinstance(resources, list): raise ValueError("catalog.resources must be a list") candidates = pending_payload.get("candidates", []) if not isinstance(candidates, list): raise ValueError("pending candidates payload must include a 'candidates' list") seen_ids = { resource.get("id") for resource in resources if isinstance(resource, dict) and isinstance(resource.get("id"), str) } seen_urls = { _canonical_url(resource.get("url", "")) for resource in resources if isinstance(resource, dict) and isinstance(resource.get("url"), str) } promoted: list[dict[str, Any]] = [] stats = {"total": len(candidates), "promoted": 0, "duplicate": 0, "invalid": 0, "unavailable": 0} for candidate in candidates: if max_promotions is not None and len(promoted) >= max_promotions: break if not isinstance(candidate, dict): stats["invalid"] += 1 continue resource = _prepare_candidate(candidate) rid = resource.get("id") url = resource.get("url") if not isinstance(rid, str) or not isinstance(url, str): stats["invalid"] += 1 continue canonical_url = _canonical_url(url) if rid in seen_ids or canonical_url in seen_urls: stats["duplicate"] += 1 continue if verify_urls and _candidate_url_unavailable(url, url_timeout): stats["unavailable"] += 1 continue errors = validate_resource(resource, len(resources) + len(promoted)) if errors: stats["invalid"] += 1 continue seen_ids.add(rid) seen_urls.add(canonical_url) promoted.append(resource) if promoted: resources.extend(promoted) catalog["resources"] = resources catalog["updated_on"] = date.today().isoformat() stats["promoted"] = len(promoted) return promoted, stats def main() -> int: parser = argparse.ArgumentParser() parser.add_argument("--catalog", default="resources/catalog/resources.json") parser.add_argument("--candidates", default="resources/catalog/pending_candidates.json") parser.add_argument("--max-promotions", type=int, default=None) parser.add_argument("--skip-url-check", action="store_true") parser.add_argument("--url-timeout", type=float, default=10.0) args = parser.parse_args() catalog_path = Path(args.catalog) candidates_path = Path(args.candidates) if not catalog_path.exists(): print(f"Missing catalog file: {catalog_path}") return 1 if not candidates_path.exists(): print(f"Missing candidates file: {candidates_path}") return 1 try: catalog = json.loads(catalog_path.read_text(encoding="utf-8")) pending_payload = json.loads(candidates_path.read_text(encoding="utf-8")) except json.JSONDecodeError as exc: print(f"Invalid JSON input: {exc}") return 1 promoted, stats = promote_candidates( catalog, pending_payload, max_promotions=args.max_promotions, verify_urls=not args.skip_url_check, url_timeout=args.url_timeout, ) if not promoted: print( "Promotion complete: no new verified resources " f"(duplicates={stats['duplicate']}, invalid={stats['invalid']}, unavailable={stats['unavailable']})" ) return 0 catalog_path.write_text(json.dumps(catalog, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") print( "Promotion complete: " "promoted=" f"{stats['promoted']} duplicate={stats['duplicate']} invalid={stats['invalid']} " f"unavailable={stats['unavailable']}" ) return 0 if __name__ == "__main__": raise SystemExit(main())