Spaces:

Itachi1824
/

compliance-auditor-env

Running

compliance-auditor-env / evaluate_models.py

Itachi-1824

feat: multi-model leaderboard evaluation (nim + hf)

ff2e396 2 months ago

5.57 kB

	"""
	Multi-model leaderboard evaluation for EU AI Act Compliance Auditor.

	Runs baseline episodes across multiple LLM models via NIM and HF Inference.
	Outputs a leaderboard JSON for the Gradio dashboard.

	Usage:
	# NIM models
	python evaluate_models.py --provider nim --space https://Itachi1824-compliance-auditor-env.hf.space

	# HF free models
	python evaluate_models.py --provider hf --space https://Itachi1824-compliance-auditor-env.hf.space

	# Single model test
	python evaluate_models.py --model google/gemma-4-31b-it --space https://...
	"""

	from __future__ import annotations

	import asyncio
	import json
	import os
	import sys
	import time
	from typing import Any, Dict, List

	from openai import OpenAI

	sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
	from client import ComplianceAuditorHTTP
	from inference import mcp_tools_to_openai, run_episode

	# ---------------------------------------------------------------------------
	# Model lists
	# ---------------------------------------------------------------------------

	# OpenAI-compatible endpoints for the two supported inference providers.
	NIM_BASE = "https://integrate.api.nvidia.com/v1"
	HF_BASE = "https://router.huggingface.co/v1"

	# NIM models picked for tool-calling (curated from nim-top50.txt).
	# List order is the order in which models are evaluated.
	NIM_MODELS = [
	    "google/gemma-4-31b-it",
	    "deepseek-ai/deepseek-v3.1",
	    "qwen/qwen3.5-122b-a10b",
	    "meta/llama-4-maverick-17b-128e-instruct",
	    "nvidia/llama-3.1-nemotron-ultra-253b-v1",
	    "nvidia/nemotron-3-super-120b-a12b",
	    "nvidia/llama-3.3-nemotron-super-49b-v1.5",
	    "mistralai/mistral-small-4-119b-2603",
	    "mistralai/mistral-large-3-675b-instruct-2512",
	    "stepfun-ai/step-3.5-flash",
	    "meta/llama-3.3-70b-instruct",
	    "meta/llama-3.1-8b-instruct",
	    "qwen/qwq-32b",
	    "deepseek-ai/deepseek-r1-distill-qwen-32b",
	    "bytedance/seed-oss-36b-instruct",
	    "mistralai/mistral-small-3.1-24b-instruct-2503",
	    "google/gemma-3-27b-it",
	]

	# Free-tier models served through router.huggingface.co.
	HF_MODELS = [
	    "Qwen/Qwen2.5-72B-Instruct",
	    "meta-llama/Llama-3.3-70B-Instruct",
	    "mistralai/Mistral-Small-24B-Instruct-2501",
	    "google/gemma-2-27b-it",
	    "Qwen/Qwen2.5-Coder-32B-Instruct",
	]

	# One scenario per difficulty tier keeps a full leaderboard run fast.
	# Maps tier name -> scenario id.
	EVAL_SCENARIOS = dict(
	    easy="easy_chatbot_transparency_001",
	    medium="medium_hiring_bias_001",
	    hard="hard_social_scoring_prohibited_001",
	)


	async def evaluate_model(
	    model: str,
	    base_url: str,
	    api_key: str,
	    space_url: str,
	) -> Dict[str, Any]:
	    """Evaluate a single model across all difficulty tiers.

	    Args:
	        model: Model identifier passed to the OpenAI-compatible endpoint.
	        base_url: Base URL of the OpenAI-compatible inference API.
	        api_key: API key used to construct the ``OpenAI`` client.
	        space_url: Base URL of the environment the episodes run against.

	    Returns:
	        On success, ``{"model", "scores", "average"}`` where ``scores`` maps
	        each scenario id in ``EVAL_SCENARIOS`` to a value clamped to
	        ``[0.01, 0.99]`` and rounded to 4 places. An episode that raises is
	        recorded as ``0.01``. If tool discovery fails, returns
	        ``{"model", "error", "scores": {}}`` with no ``"average"`` key.
	    """
	    llm = OpenAI(base_url=base_url, api_key=api_key)

	    # Discover tools once; the same tool schema is reused for every episode.
	    try:
	        async with ComplianceAuditorHTTP(base_url=space_url) as env:
	            await env.reset(difficulty="easy")
	            tools_raw = await env.list_tools()
	        tools = mcp_tools_to_openai(tools_raw)
	    except Exception as e:
	        print(f" [SKIP] {model}: tool discovery failed: {e}")
	        return {"model": model, "error": str(e), "scores": {}}

	    scores = {}
	    for tier, scenario_id in EVAL_SCENARIOS.items():
	        # Broad catch is deliberate: one failing episode must not abort the
	        # remaining tiers, it is just recorded at the score floor.
	        try:
	            async with ComplianceAuditorHTTP(base_url=space_url) as ep_env:
	                result = await run_episode(
	                    ep_env, llm, model, tools,
	                    difficulty=tier, scenario_id=scenario_id,
	                )
	            reward = result.get("reward", 0.01)
	            # Clamp with comparisons rather than min()/max(): with a NaN
	            # reward, min(0.99, nan) returns 0.99 and would hand the model a
	            # top score. Here NaN fails both tests and lands on the floor.
	            if reward >= 0.99:
	                score = 0.99
	            elif reward > 0.01:
	                score = reward
	            else:
	                score = 0.01
	            scores[scenario_id] = round(score, 4)
	            print(f" {tier}: {score:.4f} ({result.get('steps', 0)} steps)")
	        except Exception as e:
	            scores[scenario_id] = 0.01
	            print(f" {tier}: FAILED ({e})")

	    avg = sum(scores.values()) / len(scores) if scores else 0.0
	    return {"model": model, "scores": scores, "average": round(avg, 4)}


	async def main():
	    """Run the leaderboard evaluation from the command line.

	    Parses ``--provider``, ``--model``, ``--space`` and ``--output``,
	    evaluates each selected model in sequence with ``evaluate_model``,
	    rewrites the JSON results file after every model, and finally prints the
	    models sorted by average score (models without an average sort as 0).

	    API keys are read from the ``HF_TOKEN`` and ``NVIDIA_API_KEY``
	    environment variables. Each endpoint prefers its own variable and falls
	    back to the other, so a single exported key still works.
	    """
	    import argparse

	    parser = argparse.ArgumentParser()
	    parser.add_argument("--provider", choices=["nim", "hf", "both"], default="nim")
	    parser.add_argument("--model", default=None, help="Single model to test")
	    parser.add_argument("--space", required=True, help="HF Space URL")
	    parser.add_argument("--output", default="outputs/leaderboard/scores.json")
	    args = parser.parse_args()

	    hf_token = os.getenv("HF_TOKEN")
	    nvidia_key = os.getenv("NVIDIA_API_KEY")
	    # Pick the key per endpoint so that `--provider both` (or a NIM run with
	    # both variables exported) does not send the HF token to NVIDIA.
	    api_keys = {
	        NIM_BASE: nvidia_key or hf_token or "",
	        HF_BASE: hf_token or nvidia_key or "",
	    }

	    if args.model:
	        # Honour --provider for a single model; NIM stays the default.
	        single_base = HF_BASE if args.provider == "hf" else NIM_BASE
	        models = [(args.model, single_base)]
	    else:
	        models = []
	        if args.provider in ("nim", "both"):
	            models.extend([(m, NIM_BASE) for m in NIM_MODELS])
	        if args.provider in ("hf", "both"):
	            models.extend([(m, HF_BASE) for m in HF_MODELS])

	    print(f"Evaluating {len(models)} models against {args.space}")
	    print(f"Scenarios: {list(EVAL_SCENARIOS.values())}")
	    print("=" * 60)

	    # A bare filename has an empty dirname, and os.makedirs("") raises.
	    out_dir = os.path.dirname(args.output)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    all_results = []
	    for model, base_url in models:
	        print(f"\n{model} ({base_url.split('/')[2]})")
	        result = await evaluate_model(model, base_url, api_keys[base_url], args.space)
	        all_results.append(result)

	        # Save incrementally so a crash mid-run keeps the finished models.
	        with open(args.output, "w", encoding="utf-8") as f:
	            json.dump(all_results, f, indent=2)

	    # Print leaderboard
	    print(f"\n{'='*60}")
	    print("LEADERBOARD")
	    print(f"{'='*60}")
	    sorted_results = sorted(all_results, key=lambda r: r.get("average", 0), reverse=True)
	    for i, r in enumerate(sorted_results, 1):
	        avg = r.get("average", 0)
	        print(f" {i:2d}. {avg:.4f} {r['model']}")

	    print(f"\nSaved to {args.output}")


	if __name__ == "__main__":
	    asyncio.run(main())