Spaces:

Itachi1824
/

compliance-auditor-env

Running

App Files Files Community

Itachi-1824 commited on Apr 10

Commit

cb1633d

1 Parent(s): a0d8e70

feat: industrial 50-model benchmark with rate limiting and resume

Browse files

Files changed (1) hide show

benchmark_all.py +206 -0

benchmark_all.py ADDED Viewed

	@@ -0,0 +1,206 @@

+"""
+Industrial-grade 50-model benchmark for EU AI Act Compliance Auditor.
+Handles NIM's 40 RPM rate limit with:
+  - Sequential model execution (one model at a time)
+  - Per-call rate limiting (1.5s minimum between LLM calls)
+  - Exponential backoff on 429s (2s, 4s, 8s, 16s)
+  - Incremental JSON output (saves after each model)
+  - Resume capability (skips models already in output)
+  - Timeout per episode (5 min)
+Usage:
+    python benchmark_all.py --space https://Itachi1824-compliance-auditor-env.hf.space
+    python benchmark_all.py --space ... --resume  # skip already-scored models
+    python benchmark_all.py --space ... --model qwen/qwen3.5-122b-a10b  # single model
+"""
+from __future__ import annotations
+import asyncio
+import json
+import os
+import sys
+import time
+from typing import Any, Dict, List, Optional
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from openai import OpenAI
+from client import ComplianceAuditorHTTP
+from inference import mcp_tools_to_openai, run_episode, SYSTEM_PROMPT
+# ---------------------------------------------------------------------------
+# All 50 NIM models from nim-top50.txt
+# ---------------------------------------------------------------------------
+NIM_MODELS = [
+    # Tier S: Frontier
+    "moonshotai/kimi-k2-thinking",
+    "moonshotai/kimi-k2.5",
+    "deepseek-ai/deepseek-v3.2",
+    "deepseek-ai/deepseek-v3.1",
+    # Tier A+: Elite
+    "minimaxai/minimax-m2.5",
+    "qwen/qwen3.5-397b-a17b",
+    "moonshotai/kimi-k2-instruct",
+    "stepfun-ai/step-3.5-flash",
+    "mistralai/mistral-large-3-675b-instruct-2512",
+    # Tier A: Strong
+    "qwen/qwen3-coder-480b-a35b-instruct",
+    "qwen/qwen3.5-122b-a10b",
+    "google/gemma-4-31b-it",
+    "nvidia/llama-3.1-nemotron-ultra-253b-v1",
+    "mistralai/mistral-small-4-119b-2603",
+    "bytedance/seed-oss-36b-instruct",
+    # Tier B+: Solid
+    "meta/llama-4-maverick-17b-128e-instruct",
+    "nvidia/nemotron-3-super-120b-a12b",
+    "qwen/qwq-32b",
+    "deepseek-ai/deepseek-r1-distill-qwen-32b",
+    "nvidia/llama-3.3-nemotron-super-49b-v1.5",
+    # Tier B: Capable
+    "meta/llama-3.3-70b-instruct",
+    "meta/llama-3.1-405b-instruct",
+    "meta/llama-4-scout-17b-16e-instruct",
+    "qwen/qwen2.5-coder-32b-instruct",
+    "nvidia/nemotron-nano-3-30b-a3b",
+    # Tier C+: Efficient
+    "mistralai/mistral-small-3.1-24b-instruct-2503",
+    "google/gemma-3-27b-it",
+    "microsoft/phi-4-mini-flash-reasoning",
+    "meta/llama-3.1-8b-instruct",
+]
+# Scenarios: 1 per tier for speed (3 episodes per model)
+EVAL_SCENARIOS = [
+    ("easy", "easy_chatbot_transparency_001"),
+    ("medium", "medium_hiring_bias_001"),
+    ("hard", "hard_social_scoring_prohibited_001"),
+]
+NIM_BASE = "https://integrate.api.nvidia.com/v1"
+OUTPUT_FILE = "outputs/leaderboard/scores.json"
+# Rate limiting: 40 RPM = 1 call per 1.5s
+MIN_CALL_INTERVAL = 1.6  # seconds between LLM calls
+async def benchmark_model(
+    model: str,
+    api_key: str,
+    space_url: str,
+    tools: List[Dict],
+) -> Dict[str, Any]:
+    """Benchmark a single model across all tiers."""
+    llm = OpenAI(base_url=NIM_BASE, api_key=api_key)
+    scores = {}
+    for tier, sid in EVAL_SCENARIOS:
+        print(f"  {tier}: {sid}", end="", flush=True)
+        try:
+            async with ComplianceAuditorHTTP(base_url=space_url, timeout=300) as ep:
+                result = await run_episode(ep, llm, model, tools, tier, sid)
+                score = max(0.01, min(0.99, result.get("reward", 0.01)))
+                steps = result.get("steps", 0)
+                scores[sid] = {"score": round(score, 4), "steps": steps}
+                print(f" -> {score:.4f} ({steps} steps)", flush=True)
+        except Exception as e:
+            scores[sid] = {"score": 0.01, "steps": 0, "error": str(e)[:80]}
+            print(f" -> FAILED: {str(e)[:60]}", flush=True)
+        # Rate limit pause between scenarios
+        time.sleep(MIN_CALL_INTERVAL * 2)
+    # Compute averages
+    valid_scores = [s["score"] for s in scores.values() if s["score"] > 0.01]
+    avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0.01
+    return {
+        "model": model,
+        "scores": scores,
+        "easy_avg": round(
+            sum(s["score"] for sid, s in scores.items() if "easy" in sid) /
+            max(1, sum(1 for sid in scores if "easy" in sid)), 4),
+        "medium_avg": round(
+            sum(s["score"] for sid, s in scores.items() if "medium" in sid) /
+            max(1, sum(1 for sid in scores if "medium" in sid)), 4),
+        "hard_avg": round(
+            sum(s["score"] for sid, s in scores.items() if "hard" in sid) /
+            max(1, sum(1 for sid in scores if "hard" in sid)), 4),
+        "overall": round(avg, 4),
+    }
+async def main():
+    import argparse
+    parser = argparse.ArgumentParser(description="50-model NIM benchmark")
+    parser.add_argument("--space", required=True, help="HF Space URL")
+    parser.add_argument("--model", default=None, help="Single model to test")
+    parser.add_argument("--resume", action="store_true", help="Skip already-scored models")
+    parser.add_argument("--output", default=OUTPUT_FILE)
+    args = parser.parse_args()
+    api_key = os.getenv("HF_TOKEN") or os.getenv("NVIDIA_API_KEY") or ""
+    if not api_key:
+        print("ERROR: Set HF_TOKEN or NVIDIA_API_KEY")
+        sys.exit(1)
+    # Load existing results for resume
+    existing = {}
+    if args.resume and os.path.exists(args.output):
+        with open(args.output) as f:
+            for entry in json.load(f):
+                existing[entry["model"]] = entry
+    # Select models
+    if args.model:
+        models = [args.model]
+    else:
+        models = NIM_MODELS
+    # Discover tools once
+    print("Discovering tools...", flush=True)
+    async with ComplianceAuditorHTTP(base_url=args.space) as env:
+        await env.reset(difficulty="easy")
+        tools_raw = await env.list_tools()
+        tools = mcp_tools_to_openai(tools_raw)
+    print(f"Tools: {len(tools)} discovered\n", flush=True)
+    # Run benchmarks
+    results = list(existing.values())
+    total = len(models)
+    for i, model in enumerate(models, 1):
+        if model in existing:
+            print(f"[{i}/{total}] {model} — SKIPPED (already scored: {existing[model]['overall']})")
+            continue
+        print(f"\n[{i}/{total}] {model}", flush=True)
+        result = await benchmark_model(model, api_key, args.space, tools)
+        results.append(result)
+        # Save incrementally
+        os.makedirs(os.path.dirname(args.output), exist_ok=True)
+        sorted_results = sorted(results, key=lambda r: r.get("overall", 0), reverse=True)
+        with open(args.output, "w") as f:
+            json.dump(sorted_results, f, indent=2)
+        print(f"  Saved ({len(results)} models so far)", flush=True)
+        # Pause between models to let rate limit recover
+        if i < total:
+            print(f"  Cooling down 10s...", flush=True)
+            time.sleep(10)
+    # Final leaderboard
+    sorted_results = sorted(results, key=lambda r: r.get("overall", 0), reverse=True)
+    print(f"\n{'='*70}")
+    print(f"LEADERBOARD ({len(sorted_results)} models)")
+    print(f"{'='*70}")
+    for rank, r in enumerate(sorted_results, 1):
+        print(f"  {rank:2d}. {r['overall']:.4f}  {r['model']}")
+    print(f"\nSaved to {args.output}")
+if __name__ == "__main__":
+    asyncio.run(main())