from __future__ import annotations

import gc
import json
import os
from pathlib import Path
from typing import Dict, Tuple

import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

try:
    import spaces
except Exception:  # Allows local CPU/dev testing outside Hugging Face Spaces.
    class _SpacesFallback:
        """No-op stand-in used when the `spaces` package cannot be imported.

        Only the `GPU` decorator is provided, and it hands the decorated
        function back untouched.
        """

        @staticmethod
        def GPU(func=None, duration=120, **_ignored):
            """Return the decorated function unchanged.

            Works both bare (`@spaces.GPU`) and parameterized
            (`@spaces.GPU(duration=180)`). `duration` and any other keyword
            options are accepted and ignored.
            """
            if callable(func):
                # Bare usage: the function itself arrives as the first argument.
                return func

            # Parameterized usage (including a positional duration such as
            # GPU(180)): hand back a pass-through decorator.
            def deco(fn):
                return fn

            return deco

    spaces = _SpacesFallback()

# Directory containing this file; the manifest and adapter directories are
# resolved relative to it.
ROOT = Path(__file__).resolve().parent
# Optional manifest describing the available adapters (read by
# load_manifest_models).
MANIFEST_PATH = ROOT / "manifest.json"

# System message placed first in every conversation sent to the model.
SYSTEM_PROMPT = """You are Benjamin Franklin: printer, writer, experimenter, civic improver, diplomat, and practical philosopher. Speak in clear modern English with a Franklin flavor: useful, warm, concise, witty when appropriate, and honest about uncertainty. Do not claim to be Qwen, Alibaba, or a generic assistant. Do not reveal tool-call tags or hidden reasoning."""

# Adapter entries returned by load_manifest_models when manifest.json does not
# exist. Unlike manifest entries, these are not checked against the files on
# disk. Each entry carries the display name, the base model id, the adapter
# directory (relative to ROOT) and a human-readable base-model description.
FALLBACK_MODELS = [
    {
        "name": "qwen2.5-7b-ben-franklin-v3-factual-coherence-r4-qv",
        "base_model": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        "adapter_dir": "adapters/qwen2.5-7b-ben-franklin-v3-factual-coherence-r4-qv",
        "base_class": "Qwen2.5 7B Instruct 4-bit",
    },
    {
        "name": "qwen2.5-7b-ben-franklin-v1-lite-r4-qv",
        "base_model": "unsloth/Qwen2.5-7B-Instruct-bnb-4bit",
        "adapter_dir": "adapters/qwen2.5-7b-ben-franklin-v1-lite-r4-qv",
        "base_class": "Qwen2.5 7B Instruct 4-bit",
    },
]


def load_manifest_models():
    """Return the adapter entries the demo can offer, preferred ones first.

    Reads ``manifest.json`` next to this file. An entry is kept only when it
    is a dict with non-empty string ``name``, ``base_model`` and
    ``adapter_dir`` values and its adapter directory contains
    ``adapter_config.json``. Anything else is skipped.

    Returns:
        ``FALLBACK_MODELS`` when the manifest is missing or cannot be read or
        parsed; otherwise the list of usable manifest entries (possibly
        empty), sorted by the preferred-name ranking, then ``base_class``,
        then ``name``.
    """
    import warnings  # local import: only needed on the unreadable-manifest path

    if not MANIFEST_PATH.exists():
        return FALLBACK_MODELS

    try:
        data = json.loads(MANIFEST_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError. An
        # empty or corrupt manifest is treated like a missing one instead of
        # crashing the app at import time.
        warnings.warn(
            f"Could not read {MANIFEST_PATH}: {err}; using built-in fallback models."
        )
        return FALLBACK_MODELS

    entries = data.get("models") if isinstance(data, dict) else None
    if not isinstance(entries, list):
        entries = []

    models = []
    for m in entries:
        if not isinstance(m, dict):
            continue
        required = (m.get("adapter_dir"), m.get("base_model"), m.get("name"))
        if not all(isinstance(value, str) and value for value in required):
            continue
        if not (ROOT / m["adapter_dir"] / "adapter_config.json").exists():
            continue
        models.append(m)

    # Put the strongest/lightest demo candidates first.
    preferred = [
        "qwen2.5-7b-ben-franklin-v3-factual-coherence-r4-qv",
        "qwen2.5-7b-ben-franklin-v2-coherence-r4-qv",
        "qwen2.5-7b-ben-franklin-v1-lite-r4-qv",
        "qwen3-4b-instruct-2507-ben-franklin-v5-english-lock-lora",
    ]
    rank = {name: i for i, name in enumerate(preferred)}
    # `or ""` + str(): a null or non-string base_class in the JSON must not
    # make tuple comparison raise TypeError when two ranks tie.
    models.sort(
        key=lambda m: (rank.get(m["name"], 100), str(m.get("base_class") or ""), m["name"])
    )
    return models

# Adapter entries available to the UI, in display order.
MODELS = load_manifest_models()

# Dropdown label -> adapter entry. The label shows the adapter name followed
# by its base_class (or base_model when base_class is absent).
MODEL_BY_LABEL = dict(
    (f"{entry['name']}  —  {entry.get('base_class', entry.get('base_model', ''))}", entry)
    for entry in MODELS
)

# First label in display order, or the empty string when nothing is available.
DEFAULT_LABEL = next(iter(MODEL_BY_LABEL), "")

# Loaded (tokenizer, model) pairs keyed by adapter name, plus the key of the
# most recently requested adapter (see unload_previous_if_needed).
_CACHE: Dict[str, Tuple[AutoTokenizer, AutoModelForCausalLM]] = {}
_LAST_KEY: str | None = None


def unload_previous_if_needed(key: str):
    """Release the cached model when a different adapter is requested.

    If an adapter was requested before and ``key`` differs from it, the cache
    is emptied, a garbage-collection pass is run and, when CUDA is available,
    the CUDA allocator cache is emptied. ``key`` is then recorded as the most
    recently requested adapter.

    Args:
        key: Name of the adapter that is about to be used.
    """
    global _LAST_KEY
    switching_adapters = bool(_LAST_KEY) and _LAST_KEY != key
    if switching_adapters:
        # Drop our references first so the collector has something to free.
        _CACHE.clear()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    _LAST_KEY = key


def load_model(label: str):
    """Return the ``(tokenizer, model)`` pair for a dropdown label, loading it if needed.

    A cached pair is reused when present. Otherwise the tokenizer and base
    model named by the entry are loaded, the LoRA adapter from the entry's
    ``adapter_dir`` is attached, the model is put in eval mode and the pair
    is cached under the adapter name.

    Args:
        label: A key of ``MODEL_BY_LABEL`` (the text shown in the dropdown).

    Returns:
        Tuple of ``(tokenizer, model)``.

    Raises:
        gr.Error: If ``label`` is unknown, or the adapter directory does not
            contain ``adapter_config.json``.
    """
    if label not in MODEL_BY_LABEL:
        raise gr.Error("Selected adapter is not in manifest.json.")
    m = MODEL_BY_LABEL[label]
    key = m["name"]
    adapter_path = ROOT / m["adapter_dir"]

    # Fail fast, before evicting the currently loaded model or loading a base
    # model: fallback entries are never checked against the disk, so this is
    # the first place a missing adapter can be noticed.
    if key not in _CACHE and not (adapter_path / "adapter_config.json").exists():
        raise gr.Error(
            f"Adapter files for '{key}' were not found at {m['adapter_dir']}."
        )

    unload_previous_if_needed(key)
    if key in _CACHE:
        return _CACHE[key]

    base_model = m["base_model"]

    # NOTE(review): trust_remote_code=True lets the base-model repository run
    # its own Python code; confirm every base_model in the manifest is trusted.
    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, use_fast=True)
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        # Reuse EOS as the padding token when the tokenizer defines none.
        tokenizer.pad_token = tokenizer.eos_token

    use_cuda = torch.cuda.is_available()
    # NOTE(review): without CUDA this selects float16; confirm half precision
    # is what is wanted for CPU runs.
    torch_dtype = torch.bfloat16 if use_cuda and torch.cuda.is_bf16_supported() else torch.float16
    model_kwargs = {
        "trust_remote_code": True,
        "device_map": "auto" if use_cuda else None,
        "torch_dtype": torch_dtype,
        "low_cpu_mem_usage": True,
    }
    if use_cuda:
        # 4-bit NF4 quantization is only requested when a CUDA device is present.
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch_dtype,
            bnb_4bit_use_double_quant=True,
        )

    base = AutoModelForCausalLM.from_pretrained(base_model, **model_kwargs)
    model = PeftModel.from_pretrained(base, adapter_path)
    model.eval()
    _CACHE[key] = (tokenizer, model)
    return tokenizer, model


def _message_text(content):
    """Return the plain text carried by one messages-style ``content`` value."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): presumably some Gradio versions deliver content as a
        # list of parts such as {"type": "text", "text": ...}; confirm against
        # the installed version. Non-text parts are ignored.
        texts = [
            part["text"]
            for part in content
            if isinstance(part, dict)
            and part.get("type") == "text"
            and isinstance(part.get("text"), str)
        ]
        return "\n".join(texts)
    return ""


def _history_to_messages(history):
    """Convert chat history (pairs or role/content dicts) to chat-template messages."""
    turns = []
    for item in history or []:
        if isinstance(item, dict):
            # Messages-style history: one dict per utterance.
            role = item.get("role")
            text = _message_text(item.get("content"))
            if role in ("user", "assistant") and text:
                turns.append({"role": role, "content": text})
            continue
        # Pair-style history: one (user, assistant) pair per exchange.
        user, assistant = item
        if user:
            turns.append({"role": "user", "content": user})
        if assistant:
            turns.append({"role": "assistant", "content": assistant})
    return turns


def build_prompt(tokenizer, message: str, history):
    """Render the system prompt, prior turns and the new message as one prompt string.

    Args:
        tokenizer: Tokenizer whose ``chat_template``, when set, is used to
            render the conversation via ``apply_chat_template``.
        message: The new user message.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            ``{"role": ..., "content": ...}`` dicts; ``None`` or empty means
            no history. Empty turns are skipped.

    Returns:
        The prompt text, ending with the generation prompt for the assistant.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})
    if getattr(tokenizer, "chat_template", None):
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # No chat template: fall back to a simple "Role: text" transcript.
    transcript = "".join(
        f"{msg['role'].title()}: {msg['content']}\n" for msg in messages[1:]
    )
    return SYSTEM_PROMPT.strip() + "\n\n" + transcript + "Assistant:"


def clean_response(text: str) -> str:
    """Remove end-of-turn markers and think/tool artifacts from generated text.

    End markers (``<|im_end|>``, ``<|endoftext|>``, ``</s>``) are removed.
    Complete ``<think>...</think>`` spans are removed together with their
    contents so hidden reasoning is not shown. ``<tool_call>`` tags and
    unpaired think tags are removed while the surrounding text is kept. If
    hiding the reasoning would leave nothing to show, only the tags are
    removed so the reply is not blank.

    Args:
        text: Decoded model output.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    import re  # local import: the file's import block does not bring in `re`

    for marker in ("<|im_end|>", "<|endoftext|>", "</s>"):
        text = text.replace(marker, "")

    # If a model exposes a think/tool artifact, hide it in the UI rather than showcasing it.
    tags = ("<tool_call>", "</tool_call>", "<think>", "</think>")
    without_reasoning = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)

    # Prefer the version with reasoning removed; fall back to tag-only
    # stripping when that leaves an empty reply.
    for candidate in (without_reasoning, text):
        for tag in tags:
            candidate = candidate.replace(tag, "")
        candidate = candidate.strip()
        if candidate:
            return candidate
    return ""


@spaces.GPU(duration=180)
def respond(message, history, model_label, temperature, max_new_tokens):
    """Generate one chat reply with the selected adapter.

    Args:
        message: The new user message; blank input gets a canned reply.
        history: Earlier turns, forwarded to ``build_prompt``.
        model_label: Dropdown label identifying the adapter to load.
        temperature: Sampling temperature; values of 0 or below disable sampling.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The cleaned reply text.
    """
    if not (message and message.strip()):
        return "Pray, give me a question worth setting in type."

    tokenizer, model = load_model(model_label)
    prompt_text = build_prompt(tokenizer, message.strip(), history)
    encoded = tokenizer(prompt_text, return_tensors="pt")
    if torch.cuda.is_available():
        encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    temp = float(temperature)
    generation_args = dict(
        max_new_tokens=int(max_new_tokens),
        do_sample=temp > 0,
        # Clamped so the value stays positive even when sampling is off.
        temperature=max(temp, 0.01),
        top_p=0.9,
        repetition_penalty=1.08,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    with torch.inference_mode():
        sequences = model.generate(**encoded, **generation_args)

    # The output starts with the prompt tokens; keep only what was generated.
    prompt_length = encoded["input_ids"].shape[-1]
    completion_ids = sequences[0][prompt_length:]
    return clean_response(tokenizer.decode(completion_ids, skip_special_tokens=False))


def model_info(label):
    """Return a Markdown summary of the adapter behind a dropdown label.

    Args:
        label: A key of ``MODEL_BY_LABEL``.

    Returns:
        Paragraphs separated by blank lines describing the adapter directory,
        base model, family, LoRA settings and model-card path, plus a
        benchmark line when the entry has a dict under ``benchmark``. An
        empty string when ``label`` is empty or unknown.
    """
    if not (label and label in MODEL_BY_LABEL):
        return ""

    entry = MODEL_BY_LABEL[label]
    card_path = f"model_cards/{entry['name']}.md"
    sections = [
        f"Adapter: `{entry['adapter_dir']}`",
        f"Base: `{entry['base_model']}`",
        f"Family: {entry.get('base_class', 'unknown')}",
        f"LoRA r={entry.get('lora_r', '?')} alpha={entry.get('lora_alpha', '?')}",
        f"Model card: `{card_path}`",
    ]

    benchmark = entry.get("benchmark")
    if isinstance(benchmark, dict):
        sections += [
            f"Offline benchmark score: {benchmark.get('score')} flags: `{benchmark.get('flags')}`"
        ]
    return "\n\n".join(sections)


def example_prompts():
    """Return a fresh list of the sample questions offered in the chat UI."""
    debugging = "Ben, give me one practical maxim for debugging a stubborn problem."
    identity = "Are you Qwen, or Benjamin Franklin? Answer briefly and stay in character."
    craven_street = "What is the Craven Street bones story, and what should I not exaggerate about it?"
    smartphones = "Explain smartphones as if they were a new postal invention in Philadelphia."
    return [debugging, identity, craven_street, smartphones]

# Page layout: intro text, then a row with the adapter picker and generation
# controls (left column) next to the chat (right column), then usage notes.
with gr.Blocks(title="Qwen (Ben) Franklin") as demo:
    gr.Markdown(
        """
# Qwen (Ben) Franklin

Try a local Benjamin Franklin LoRA from this model zoo. The adapters are experimental: they are good for persona demos and local-project prototypes, but hard historical facts still benefit from retrieval or explicit source context.
        """.strip()
    )
    with gr.Row():
        with gr.Column(scale=2):
            # DEFAULT_LABEL is the empty string when no adapters were found.
            model_dropdown = gr.Dropdown(
                choices=list(MODEL_BY_LABEL.keys()),
                value=DEFAULT_LABEL,
                label="Adapter",
            )
            # Adapter details, re-rendered by model_info whenever the selection changes.
            info = gr.Markdown(model_info(DEFAULT_LABEL))
            model_dropdown.change(model_info, model_dropdown, info)
            temperature = gr.Slider(0.0, 1.0, value=0.35, step=0.05, label="Temperature")
            max_new_tokens = gr.Slider(32, 512, value=220, step=16, label="Max new tokens")
        with gr.Column(scale=3):
            # The three controls above are passed to respond() after the
            # message and history, in this order.
            chat = gr.ChatInterface(
                fn=respond,
                additional_inputs=[model_dropdown, temperature, max_new_tokens],
                examples=example_prompts(),
                # NOTE(review): presumably disabled so examples do not run the
                # model at startup; confirm.
                cache_examples=False,
            )
    gr.Markdown(
        """
## ZeroGPU notes

This Space uses `@spaces.GPU` for generation. The first response after changing adapters may be slow because the selected public base model and local LoRA adapter have to load. For a faster public demo, choose one default 7B adapter and remove the full model selector.
        """.strip()
    )

# Launch only when run as a script, with the request queue configured with max_size=8.
if __name__ == "__main__":
    demo.queue(max_size=8).launch()