Itachi-1824 committed on
Commit
cb1633d
·
1 Parent(s): a0d8e70

feat: industrial 50-model benchmark with rate limiting and resume

Browse files
Files changed (1) hide show
  1. benchmark_all.py +206 -0
benchmark_all.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Industrial-grade 50-model benchmark for EU AI Act Compliance Auditor.
3
+
4
+ Handles NIM's 40 RPM rate limit with:
5
+ - Sequential model execution (one model at a time)
6
+ - Per-call rate limiting (1.5s minimum between LLM calls)
7
+ - Exponential backoff on 429s (2s, 4s, 8s, 16s)
8
+ - Incremental JSON output (saves after each model)
9
+ - Resume capability (skips models already in output)
10
+ - Timeout per episode (5 min)
11
+
12
+ Usage:
13
+ python benchmark_all.py --space https://Itachi1824-compliance-auditor-env.hf.space
14
+ python benchmark_all.py --space ... --resume # skip already-scored models
15
+ python benchmark_all.py --space ... --model qwen/qwen3.5-122b-a10b # single model
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import asyncio
21
+ import json
22
+ import os
23
+ import sys
24
+ import time
25
+ from typing import Any, Dict, List, Optional
26
+
27
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
28
+
29
+ from openai import OpenAI
30
+ from client import ComplianceAuditorHTTP
31
+ from inference import mcp_tools_to_openai, run_episode, SYSTEM_PROMPT
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # NIM models to benchmark, taken from nim-top50.txt (29 are listed below)
35
+ # ---------------------------------------------------------------------------
36
+
37
+ NIM_MODELS = [
38
+ # Tier S: Frontier
39
+ "moonshotai/kimi-k2-thinking",
40
+ "moonshotai/kimi-k2.5",
41
+ "deepseek-ai/deepseek-v3.2",
42
+ "deepseek-ai/deepseek-v3.1",
43
+ # Tier A+: Elite
44
+ "minimaxai/minimax-m2.5",
45
+ "qwen/qwen3.5-397b-a17b",
46
+ "moonshotai/kimi-k2-instruct",
47
+ "stepfun-ai/step-3.5-flash",
48
+ "mistralai/mistral-large-3-675b-instruct-2512",
49
+ # Tier A: Strong
50
+ "qwen/qwen3-coder-480b-a35b-instruct",
51
+ "qwen/qwen3.5-122b-a10b",
52
+ "google/gemma-4-31b-it",
53
+ "nvidia/llama-3.1-nemotron-ultra-253b-v1",
54
+ "mistralai/mistral-small-4-119b-2603",
55
+ "bytedance/seed-oss-36b-instruct",
56
+ # Tier B+: Solid
57
+ "meta/llama-4-maverick-17b-128e-instruct",
58
+ "nvidia/nemotron-3-super-120b-a12b",
59
+ "qwen/qwq-32b",
60
+ "deepseek-ai/deepseek-r1-distill-qwen-32b",
61
+ "nvidia/llama-3.3-nemotron-super-49b-v1.5",
62
+ # Tier B: Capable
63
+ "meta/llama-3.3-70b-instruct",
64
+ "meta/llama-3.1-405b-instruct",
65
+ "meta/llama-4-scout-17b-16e-instruct",
66
+ "qwen/qwen2.5-coder-32b-instruct",
67
+ "nvidia/nemotron-nano-3-30b-a3b",
68
+ # Tier C+: Efficient
69
+ "mistralai/mistral-small-3.1-24b-instruct-2503",
70
+ "google/gemma-3-27b-it",
71
+ "microsoft/phi-4-mini-flash-reasoning",
72
+ "meta/llama-3.1-8b-instruct",
73
+ ]
74
+
75
+ # Scenarios: 1 per tier for speed (3 episodes per model)
76
+ EVAL_SCENARIOS = [
77
+ ("easy", "easy_chatbot_transparency_001"),
78
+ ("medium", "medium_hiring_bias_001"),
79
+ ("hard", "hard_social_scoring_prohibited_001"),
80
+ ]
81
+
82
+ NIM_BASE = "https://integrate.api.nvidia.com/v1"
83
+ OUTPUT_FILE = "outputs/leaderboard/scores.json"
84
+
85
+ # Rate limiting: 40 RPM = 1 call per 1.5s; 1.6s leaves a small safety margin
86
+ MIN_CALL_INTERVAL = 1.6 # seconds between LLM calls
87
+
88
+
89
async def benchmark_model(
    model: str,
    api_key: str,
    space_url: str,
    tools: List[Dict],
) -> Dict[str, Any]:
    """Benchmark a single model across every scenario in ``EVAL_SCENARIOS``.

    Scenarios run sequentially, each in its own ``ComplianceAuditorHTTP``
    session. A scenario that raises is recorded with the floor score and a
    truncated error message instead of aborting the whole model.

    Args:
        model: Model identifier forwarded to ``run_episode``.
        api_key: API key used to build the OpenAI-compatible client that
            points at ``NIM_BASE``.
        space_url: Base URL handed to ``ComplianceAuditorHTTP``.
        tools: Tool definitions forwarded to ``run_episode``.

    Returns:
        A dict with the keys ``model``, ``scores`` (per-scenario
        ``score``/``steps`` and, on failure, ``error``), ``easy_avg``,
        ``medium_avg``, ``hard_avg`` and ``overall``. ``overall`` averages
        only scores above the 0.01 floor, so failed scenarios do not drag it
        down; it is 0.01 when no scenario scored above the floor.
    """
    llm = OpenAI(base_url=NIM_BASE, api_key=api_key)
    scores: Dict[str, Dict[str, Any]] = {}
    # Bucket scores by the tier declared in EVAL_SCENARIOS rather than by
    # substring-matching the scenario id, which would mis-bucket an id such
    # as "easy_hardware_001" into both "easy" and "hard".
    scores_by_tier: Dict[str, List[float]] = {}

    for tier, sid in EVAL_SCENARIOS:
        print(f" {tier}: {sid}", end="", flush=True)
        try:
            async with ComplianceAuditorHTTP(base_url=space_url, timeout=300) as ep:
                result = await run_episode(ep, llm, model, tools, tier, sid)
            # Clamp the reward into [0.01, 0.99].
            score = max(0.01, min(0.99, result.get("reward", 0.01)))
            steps = result.get("steps", 0)
            scores[sid] = {"score": round(score, 4), "steps": steps}
            print(f" -> {score:.4f} ({steps} steps)", flush=True)
        except Exception as e:
            # Best-effort: one failing scenario must not stop the benchmark.
            scores[sid] = {"score": 0.01, "steps": 0, "error": str(e)[:80]}
            print(f" -> FAILED: {str(e)[:60]}", flush=True)

        scores_by_tier.setdefault(tier, []).append(scores[sid]["score"])

        # Rate limit pause between scenarios. Must be a non-blocking sleep:
        # time.sleep() here would stall the whole event loop.
        await asyncio.sleep(MIN_CALL_INTERVAL * 2)

    def _tier_avg(tier_name: str) -> float:
        """Return the mean score for one tier (0 when the tier has no runs)."""
        tier_values = scores_by_tier.get(tier_name, [])
        return round(sum(tier_values) / max(1, len(tier_values)), 4)

    # Overall average ignores scenarios stuck at the floor score.
    valid_scores = [s["score"] for s in scores.values() if s["score"] > 0.01]
    avg = sum(valid_scores) / len(valid_scores) if valid_scores else 0.01

    return {
        "model": model,
        "scores": scores,
        "easy_avg": _tier_avg("easy"),
        "medium_avg": _tier_avg("medium"),
        "hard_avg": _tier_avg("hard"),
        "overall": round(avg, 4),
    }
133
+
134
+
135
async def main() -> None:
    """Run the benchmark from the command line and print a leaderboard.

    Parses ``--space`` (required), ``--model``, ``--resume`` and ``--output``,
    reads the API key from ``HF_TOKEN`` or ``NVIDIA_API_KEY``, discovers the
    environment's tools once, then benchmarks each selected model in turn.
    The results file is rewritten after every model so an interrupted run can
    be continued with ``--resume``.

    Exits with status 1 when neither API-key environment variable is set.
    """
    import argparse

    parser = argparse.ArgumentParser(description="50-model NIM benchmark")
    parser.add_argument("--space", required=True, help="HF Space URL")
    parser.add_argument("--model", default=None, help="Single model to test")
    parser.add_argument("--resume", action="store_true", help="Skip already-scored models")
    parser.add_argument("--output", default=OUTPUT_FILE)
    args = parser.parse_args()

    api_key = os.getenv("HF_TOKEN") or os.getenv("NVIDIA_API_KEY") or ""
    if not api_key:
        print("ERROR: Set HF_TOKEN or NVIDIA_API_KEY")
        sys.exit(1)

    # Load existing results for resume, keyed by model name.
    existing = {}
    if args.resume and os.path.exists(args.output):
        with open(args.output, encoding="utf-8") as f:
            for entry in json.load(f):
                existing[entry["model"]] = entry

    # Select models
    models = [args.model] if args.model else NIM_MODELS

    # Discover tools once
    print("Discovering tools...", flush=True)
    async with ComplianceAuditorHTTP(base_url=args.space) as env:
        await env.reset(difficulty="easy")
        tools_raw = await env.list_tools()
    tools = mcp_tools_to_openai(tools_raw)
    print(f"Tools: {len(tools)} discovered\n", flush=True)

    # Run benchmarks
    results = list(existing.values())
    total = len(models)
    # dirname is "" for a bare filename; os.makedirs("") would raise.
    output_dir = os.path.dirname(args.output)
    tmp_output = f"{args.output}.tmp"

    for i, model in enumerate(models, 1):
        if model in existing:
            print(f"[{i}/{total}] {model} — SKIPPED (already scored: {existing[model].get('overall', 0)})")
            continue

        print(f"\n[{i}/{total}] {model}", flush=True)
        result = await benchmark_model(model, api_key, args.space, tools)
        results.append(result)

        # Save incrementally. Write to a temp file and swap it in so an
        # interruption mid-write cannot corrupt the file --resume relies on.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        sorted_results = sorted(results, key=lambda r: r.get("overall", 0), reverse=True)
        with open(tmp_output, "w", encoding="utf-8") as f:
            json.dump(sorted_results, f, indent=2)
        os.replace(tmp_output, args.output)
        print(f" Saved ({len(results)} models so far)", flush=True)

        # Pause between models to let rate limit recover (non-blocking so
        # the event loop is not stalled).
        if i < total:
            print(" Cooling down 10s...", flush=True)
            await asyncio.sleep(10)

    # Final leaderboard
    sorted_results = sorted(results, key=lambda r: r.get("overall", 0), reverse=True)
    print(f"\n{'='*70}")
    print(f"LEADERBOARD ({len(sorted_results)} models)")
    print(f"{'='*70}")
    for rank, r in enumerate(sorted_results, 1):
        # .get mirrors the sort key so an entry without "overall" cannot crash.
        print(f" {rank:2d}. {r.get('overall', 0):.4f} {r['model']}")
    print(f"\nSaved to {args.output}")
203
+
204
+
205
+ if __name__ == "__main__":
206
+ asyncio.run(main())