"""
Leaderboard benchmark — 10 models, 3 NIM keys, parallel execution.
Resilient to network errors with retries per episode.
"""
import asyncio
import json
import os
import sys
import time
from pathlib import Path

from openai import OpenAI
from client import ComplianceAuditorHTTP
from inference import run_episode, mcp_tools_to_openai
from scenarios.registry import SCENARIO_LIST

# Base URL handed to the OpenAI client for every model call.
API_BASE = "https://integrate.api.nvidia.com/v1"
# Base URL of the environment server that ComplianceAuditorHTTP connects to.
SPACE_URL = "https://Itachi1824-compliance-auditor-env.hf.space"
# Extra attempts after the first one; an episode is tried MAX_RETRIES + 1 times.
MAX_RETRIES = 2

# API keys by 1-based slot, read from NVIDIA_API_KEY_1..3. A missing variable
# becomes "" so the matching group can be detected as "not set" and skipped.
KEYS = {
    slot: os.environ.get(f"NVIDIA_API_KEY_{slot}", "")
    for slot in (1, 2, 3)
}

# Models partitioned by API key: the group at index i is run with KEYS[i + 1].
# Models inside a group run sequentially; the groups run concurrently.
GROUPS = [
    # Key 1
    [
        "stepfun-ai/step-3.5-flash",
        "deepseek-ai/deepseek-v3.1",
        "qwen/qwen3.5-122b-a10b",
    ],
    # Key 2
    [
        "meta/llama-4-maverick-17b-128e-instruct",
        "google/gemma-4-31b-it",
        "nvidia/llama-3.1-nemotron-ultra-253b-v1",
        "meta/llama-4-scout-17b-16e-instruct",
    ],
    # Key 3
    [
        "mistralai/mistral-large-3-675b-instruct-2512",
        "nvidia/nemotron-3-super-120b-a12b",
        "meta/llama-3.3-70b-instruct",
    ],
]

# (scenario id, difficulty) pairs for every registered scenario whose id does
# not start with "procedural".
SCENARIOS = [
    (scenario["id"], scenario["difficulty"])
    for scenario in SCENARIO_LIST
    if not scenario["id"].startswith("procedural")
]


async def run_one_episode(model, api_key, tools, sid, diff):
    """Run one scenario episode for ``model`` and return its clamped score.

    A fresh environment session is opened for every attempt. Any exception
    raised while opening the session, running the episode, or reading the
    reward counts as a failed attempt and is reported on stderr. The episode
    is attempted at most ``MAX_RETRIES + 1`` times, with a 3 second pause
    between attempts.

    Args:
        model: Model identifier forwarded to ``run_episode``.
        api_key: API key used to build the OpenAI client.
        tools: Tool definitions forwarded to ``run_episode``.
        sid: Scenario id forwarded to ``run_episode``.
        diff: Scenario difficulty forwarded to ``run_episode``.

    Returns:
        The episode's ``"reward"`` (``0.01`` when the key is absent) clamped
        to ``[0.001, 0.999]`` and rounded to 4 decimals, or ``0.01`` when
        every attempt failed.
    """
    llm = OpenAI(base_url=API_BASE, api_key=api_key, timeout=120.0)
    attempts = MAX_RETRIES + 1
    for attempt in range(1, attempts + 1):
        try:
            async with ComplianceAuditorHTTP(base_url=SPACE_URL) as env:
                result = await run_episode(env, llm, model, tools, diff, sid)
                score = max(0.001, min(0.999, result.get("reward", 0.01)))
                return round(score, 4)
        except Exception as exc:
            # Deliberately broad: a single bad episode must not abort the
            # whole benchmark. The failure is logged (stderr, so the stdout
            # progress table stays clean) instead of being silently dropped.
            print(
                f"  [warn] {model} | {sid} | attempt {attempt}/{attempts} "
                f"failed: {type(exc).__name__}: {exc}",
                file=sys.stderr,
                flush=True,
            )
            if attempt < attempts:
                await asyncio.sleep(3)
    # All attempts failed (or none were configured): fall back to the floor
    # score rather than returning None, which callers would fail to format.
    return 0.01


async def run_model(model, api_key, tools, progress):
    """Play every entry of ``SCENARIOS`` with ``model``, one at a time.

    After each episode the shared ``progress["done"]`` counter is bumped, a
    progress line is printed, and the coroutine pauses for 1.5 seconds before
    starting the next scenario.

    Returns:
        A dict mapping scenario id to the score from ``run_one_episode``.
    """
    label = model.split("/")[-1][:28]
    results = {}
    for scenario_id, difficulty in SCENARIOS:
        episode_score = await run_one_episode(
            model, api_key, tools, scenario_id, difficulty
        )
        results[scenario_id] = episode_score
        progress["done"] += 1
        # Percentage is relative to the shared total across all groups.
        percent = progress["done"] / progress["total"] * 100
        print(
            f"  [{percent:5.1f}%] {label:28s} | {scenario_id:50s} | {episode_score:.4f}",
            flush=True,
        )
        await asyncio.sleep(1.5)
    return results


async def run_group(group_idx, models, api_key, tools, progress):
    """Benchmark every model of one key group, one after another.

    Args:
        group_idx: Zero-based index of the group; shown 1-based in the output.
        models: Model identifiers to run with this group's key.
        api_key: API key for the group. When falsy the whole group is skipped.
        tools: Tool definitions forwarded to ``run_model``.
        progress: Shared progress dict forwarded to ``run_model``.

    Returns:
        One entry dict per model with the keys ``model``, ``scores``,
        ``overall``, ``tier_averages`` and ``elapsed_seconds``; an empty list
        when the group is skipped.
    """
    if not api_key:
        print(f"  Key {group_idx+1} not set — skipping {len(models)} models", flush=True)
        return []
    entries = []
    for model in models:
        short = model.split("/")[-1][:28]
        print(f"\n{'='*70}\n  KEY {group_idx+1} | {model}\n{'='*70}", flush=True)
        # Monotonic clock: the measured duration is unaffected by wall-clock
        # adjustments that may happen during a long run.
        started = time.monotonic()
        scores = await run_model(model, api_key, tools, progress)
        elapsed = time.monotonic() - started

        all_s = list(scores.values())
        avg = sum(all_s) / len(all_s) if all_s else 0
        # The three standard tiers are always present so the summary line
        # below (and the final leaderboard) can index them unconditionally.
        tiers = {"easy": [], "medium": [], "hard": []}
        for sid, diff in SCENARIOS:
            if sid in scores:
                # setdefault: a difficulty label outside the three standard
                # tiers gets its own bucket instead of raising KeyError and
                # discarding the scores this model just produced.
                tiers.setdefault(diff, []).append(scores[sid])
        tier_avgs = {t: (sum(v) / len(v) if v else 0) for t, v in tiers.items()}

        entries.append({
            "model": model,
            "scores": scores,
            "overall": round(avg, 4),
            "tier_averages": {k: round(v, 4) for k, v in tier_avgs.items()},
            "elapsed_seconds": round(elapsed, 1),
        })
        print(f"  DONE: {short:28s} | overall={avg:.4f} | e={tier_avgs['easy']:.4f} m={tier_avgs['medium']:.4f} h={tier_avgs['hard']:.4f} | {elapsed:.0f}s", flush=True)
    return entries


async def main():
    """Run the full benchmark, write ``scores.json`` and print the leaderboard.

    Steps:
      1. Open one environment session to discover the tool list.
      2. Run every group of ``GROUPS`` concurrently, group ``i`` using
         ``KEYS[i + 1]``.
      3. Merge the per-model entries, sort them by ``overall`` (descending),
         save them to ``outputs/leaderboard/scores.json`` and print them.

    Raises:
        SystemExit: With exit code 1, after the results of the other groups
            have been saved and printed, if any group raised an exception.
    """
    total_models = sum(len(g) for g in GROUPS)
    total_episodes = total_models * len(SCENARIOS)
    print(f"Benchmarking {total_models} models x {len(SCENARIOS)} scenarios = {total_episodes} episodes", flush=True)
    print(f"Space: {SPACE_URL}", flush=True)

    # Discover tools
    async with ComplianceAuditorHTTP(base_url=SPACE_URL) as env:
        await env.reset(difficulty="easy")
        tools = mcp_tools_to_openai(await env.list_tools())
    print(f"Tools: {len(tools)}\n", flush=True)

    # Pair each group with its key. KEYS.get keeps a group without a
    # configured slot on the "key not set" skip path instead of a KeyError.
    keyed_groups = [
        (idx, models, KEYS.get(idx + 1, ""))
        for idx, models in enumerate(GROUPS)
    ]

    # Progress is measured against the episodes that will actually run:
    # groups without a key are skipped by run_group, and counting them would
    # keep the percentage from ever reaching 100%.
    runnable_models = sum(len(models) for _, models, key in keyed_groups if key)
    progress = {"done": 0, "total": runnable_models * len(SCENARIOS)}

    # Run all groups in parallel. return_exceptions=True so that one crashing
    # group cannot discard the finished results of the others.
    tasks = [
        run_group(idx, models, key, tools, progress)
        for idx, models, key in keyed_groups
    ]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    # Flatten + sort
    all_entries = []
    failed_groups = 0
    for (idx, models, _), outcome in zip(keyed_groups, results):
        if isinstance(outcome, BaseException):
            failed_groups += 1
            print(
                f"  Key {idx+1} group failed ({len(models)} models): "
                f"{type(outcome).__name__}: {outcome}",
                file=sys.stderr,
                flush=True,
            )
            continue
        all_entries.extend(outcome)
    all_entries.sort(key=lambda e: e["overall"], reverse=True)

    # Save
    out = Path("outputs/leaderboard/scores.json")
    out.parent.mkdir(parents=True, exist_ok=True)
    with open(out, "w", encoding="utf-8") as f:
        json.dump(all_entries, f, indent=2)

    print(f"\n{'='*70}", flush=True)
    print("FINAL LEADERBOARD", flush=True)
    print(f"{'='*70}", flush=True)
    for i, e in enumerate(all_entries, 1):
        m = e["model"].split("/")[-1][:28]
        print(f"  {i:2d}. {m:28s} | {e['overall']:.4f} | e={e['tier_averages']['easy']:.4f} m={e['tier_averages']['medium']:.4f} h={e['tier_averages']['hard']:.4f}", flush=True)
    print(f"\nSaved to {out}", flush=True)

    if failed_groups:
        # Partial results are on disk, but the run must still be reported as
        # unsuccessful, as it was when the exception propagated directly.
        raise SystemExit(1)


if __name__ == "__main__":
    asyncio.run(main())