"""
Multi-model leaderboard evaluation for EU AI Act Compliance Auditor.

Runs baseline episodes across multiple LLM models via NIM and HF Inference.
Outputs a leaderboard JSON for the Gradio dashboard.

Usage:
    # NIM models
    python evaluate_models.py --provider nim --space https://Itachi1824-compliance-auditor-env.hf.space

    # HF free models
    python evaluate_models.py --provider hf --space https://Itachi1824-compliance-auditor-env.hf.space

    # Single model test
    python evaluate_models.py --model google/gemma-4-31b-it --space https://...
"""

from __future__ import annotations

import asyncio
import json
import os
import sys
import time
from typing import Any, Dict, List

from openai import OpenAI

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from client import ComplianceAuditorHTTP
from inference import mcp_tools_to_openai, run_episode

# ---------------------------------------------------------------------------
# Model lists
# ---------------------------------------------------------------------------

# OpenAI-compatible base URLs, one per provider. main() pairs every model id
# with the base URL of the provider list it came from.
NIM_BASE = "https://integrate.api.nvidia.com/v1"
HF_BASE = "https://router.huggingface.co/v1"

# Top NIM models for tool-calling (curated from nim-top50.txt).
# Evaluated against NIM_BASE when --provider is "nim" or "both".
NIM_MODELS = [
    "google/gemma-4-31b-it",
    "deepseek-ai/deepseek-v3.1",
    "qwen/qwen3.5-122b-a10b",
    "meta/llama-4-maverick-17b-128e-instruct",
    "nvidia/llama-3.1-nemotron-ultra-253b-v1",
    "nvidia/nemotron-3-super-120b-a12b",
    "nvidia/llama-3.3-nemotron-super-49b-v1.5",
    "mistralai/mistral-small-4-119b-2603",
    "mistralai/mistral-large-3-675b-instruct-2512",
    "stepfun-ai/step-3.5-flash",
    "meta/llama-3.3-70b-instruct",
    "meta/llama-3.1-8b-instruct",
    "qwen/qwq-32b",
    "deepseek-ai/deepseek-r1-distill-qwen-32b",
    "bytedance/seed-oss-36b-instruct",
    "mistralai/mistral-small-3.1-24b-instruct-2503",
    "google/gemma-3-27b-it",
]

# HF free models (available via router.huggingface.co).
# Evaluated against HF_BASE when --provider is "hf" or "both".
HF_MODELS = [
    "Qwen/Qwen2.5-72B-Instruct",
    "meta-llama/Llama-3.3-70B-Instruct",
    "mistralai/Mistral-Small-24B-Instruct-2501",
    "google/gemma-2-27b-it",
    "Qwen/Qwen2.5-Coder-32B-Instruct",
]

# Scenarios to test (1 per tier for speed).
# Maps difficulty tier -> scenario id; evaluate_model() passes the key as
# `difficulty` and the value as `scenario_id` to run_episode(), and uses the
# scenario id as the key of the per-model score table.
EVAL_SCENARIOS = {
    "easy": "easy_chatbot_transparency_001",
    "medium": "medium_hiring_bias_001",
    "hard": "hard_social_scoring_prohibited_001",
}


async def evaluate_model(
    model: str,
    base_url: str,
    api_key: str,
    space_url: str,
) -> Dict[str, Any]:
    """Evaluate a single model across all difficulty tiers.

    One episode is run for every entry of ``EVAL_SCENARIOS``. A scenario that
    raises is recorded with the floor score instead of aborting the model.

    Args:
        model: Model identifier sent to the OpenAI-compatible endpoint.
        base_url: Base URL of the OpenAI-compatible inference API.
        api_key: API key used when constructing the ``OpenAI`` client.
        space_url: URL handed to ``ComplianceAuditorHTTP`` as its base URL.

    Returns:
        A record with the keys ``model``, ``scores`` (scenario id -> score,
        clamped to [0.01, 0.99] and rounded to 4 decimals) and ``average``
        (mean of the scores, rounded to 4 decimals). If tool discovery fails
        the record also carries ``error``, ``scores`` is empty and
        ``average`` is 0.0, so every record shares the same schema.
    """
    llm = OpenAI(base_url=base_url, api_key=api_key)

    # Discover tools once; the same tool list is reused for every scenario.
    try:
        async with ComplianceAuditorHTTP(base_url=space_url) as env:
            await env.reset(difficulty="easy")
            tools_raw = await env.list_tools()
            tools = mcp_tools_to_openai(tools_raw)
    except Exception as e:
        print(f"  [SKIP] {model}: tool discovery failed: {e}")
        # Include "average" so consumers of the leaderboard JSON never have
        # to special-case errored models.
        return {"model": model, "error": str(e), "scores": {}, "average": 0.0}

    scores: Dict[str, float] = {}
    for tier, scenario_id in EVAL_SCENARIOS.items():
        try:
            # A fresh environment connection per scenario keeps episodes isolated.
            async with ComplianceAuditorHTTP(base_url=space_url) as ep_env:
                result = await run_episode(
                    ep_env, llm, model, tools,
                    difficulty=tier, scenario_id=scenario_id,
                )
                # A missing or null reward counts as the floor score rather
                # than surfacing as a confusing comparison TypeError.
                reward = result.get("reward")
                if reward is None:
                    reward = 0.01
                score = max(0.01, min(0.99, reward))
                scores[scenario_id] = round(score, 4)
                print(f"  {tier}: {score:.4f} ({result.get('steps', 0)} steps)")
        except Exception as e:
            # Best-effort: one broken scenario must not sink the whole model.
            scores[scenario_id] = 0.01
            print(f"  {tier}: FAILED ({e})")

    avg = sum(scores.values()) / len(scores) if scores else 0.0
    return {"model": model, "scores": scores, "average": round(avg, 4)}


def _api_key_for(base_url: str) -> str:
    """Return the API key for *base_url*, preferring that provider's own variable.

    The other provider's variable is kept as a fallback so setups that export
    only one of ``HF_TOKEN`` / ``NVIDIA_API_KEY`` keep working as before.
    """
    hf_token = os.getenv("HF_TOKEN")
    nvidia_key = os.getenv("NVIDIA_API_KEY")
    if base_url == HF_BASE:
        return hf_token or nvidia_key or ""
    return nvidia_key or hf_token or ""


async def main():
    """Parse CLI arguments, evaluate the selected models and save a leaderboard.

    Models are evaluated sequentially. After each model the accumulated
    results are rewritten to ``--output`` as JSON, so a crash or interrupt
    keeps everything finished so far. A leaderboard sorted by average score
    is printed at the end.

    With ``--model``, the single model is sent to the HF router when
    ``--provider hf`` is given and to NIM otherwise (the default).
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--provider", choices=["nim", "hf", "both"], default="nim")
    parser.add_argument("--model", default=None, help="Single model to test")
    parser.add_argument("--space", required=True, help="HF Space URL")
    parser.add_argument("--output", default="outputs/leaderboard/scores.json")
    args = parser.parse_args()

    if args.model:
        # Honour an explicit "--provider hf"; "nim" and "both" keep using NIM.
        single_base = HF_BASE if args.provider == "hf" else NIM_BASE
        models = [(args.model, single_base)]
    else:
        models = []
        if args.provider in ("nim", "both"):
            models.extend((m, NIM_BASE) for m in NIM_MODELS)
        if args.provider in ("hf", "both"):
            models.extend((m, HF_BASE) for m in HF_MODELS)

    # Create the output directory once, up front. dirname() is "" for a bare
    # filename such as "scores.json", and os.makedirs("") would raise.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Evaluating {len(models)} models against {args.space}")
    print(f"Scenarios: {list(EVAL_SCENARIOS.values())}")
    print("=" * 60)

    all_results = []
    for model, base_url in models:
        print(f"\n{model} ({base_url.split('/')[2]})")
        # Pick the key per endpoint so an HF token is not sent to NIM (or the
        # reverse) when both environment variables are set.
        api_key = _api_key_for(base_url)
        result = await evaluate_model(model, base_url, api_key, args.space)
        all_results.append(result)

        # Save incrementally
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(all_results, f, indent=2)

    # Print leaderboard
    print(f"\n{'='*60}")
    print("LEADERBOARD")
    print(f"{'='*60}")
    sorted_results = sorted(all_results, key=lambda r: r.get("average", 0), reverse=True)
    for i, r in enumerate(sorted_results, 1):
        avg = r.get("average", 0)
        print(f"  {i:2d}. {avg:.4f}  {r['model']}")

    print(f"\nSaved to {args.output}")


# Script entry point: run the async CLI only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())