# ============================================================
# Open Model Playground
# Colab A100 Local + HF Space ZeroGPU FLUX.2 Profile
# Generated from Colab notebook cells.
# ============================================================

import os
import time
import random
import gc
import io
from typing import Tuple

import gradio as gr
from PIL import Image, ImageDraw

import torch

APP_TITLE = "Open Model Playground"

# Hub repository of the causal language model used by the text tab.
TEXT_MODEL_ID = os.getenv("TEXT_MODEL_ID", "Qwen/Qwen3-4B-Instruct-2507")

# IMAGE_BACKEND selects the image generation path:
# - "sdxl": default, intended for a local Colab A100 runtime
# - "flux2_4bit": primary target for a Hugging Face Space on ZeroGPU
# - "placeholder": fallback used to validate deployment / the API
IMAGE_BACKEND = os.getenv("IMAGE_BACKEND", "sdxl")

# Default repository per backend, used only when IMAGE_MODEL_ID is not set.
# Without this mapping, selecting "flux2_4bit" alone would hand the SDXL
# repository to the FLUX.2 loader. Unknown backends keep the SDXL default.
_DEFAULT_IMAGE_MODEL_IDS = {
    "sdxl": "stabilityai/stable-diffusion-xl-base-1.0",
    "flux2_4bit": "diffusers/FLUX.2-dev-bnb-4bit",
}
IMAGE_MODEL_ID = os.getenv(
    "IMAGE_MODEL_ID",
    _DEFAULT_IMAGE_MODEL_IDS.get(IMAGE_BACKEND, _DEFAULT_IMAGE_MODEL_IDS["sdxl"]),
)

# Initial state of the "use real model" checkboxes; only the value "1" enables.
DEFAULT_USE_REAL_TEXT_MODEL = os.getenv("DEFAULT_USE_REAL_TEXT_MODEL", "1") == "1"
DEFAULT_USE_REAL_IMAGE_MODEL = os.getenv("DEFAULT_USE_REAL_IMAGE_MODEL", "1") == "1"

# Lazily populated caches so each model is loaded at most once per process.
TEXT_MODEL = None
TEXT_TOKENIZER = None
IMAGE_PIPE = None

def _gpu_decorator(duration=360, size=None):
    """Return the ZeroGPU decorator when enabled, otherwise a no-op decorator.

    ``spaces.GPU`` is used only when the environment variable
    ``USE_ZERO_GPU_DECORATOR`` equals ``"1"`` (meant to be set as a Space
    variable on a Hugging Face ZeroGPU Space).

    Args:
        duration: Forwarded to ``spaces.GPU`` as ``duration``.
        size: Optional value forwarded to ``spaces.GPU`` as ``size``; it is
            omitted when falsy. Per the original author's notes, ``None``
            keeps the ZeroGPU default ("large") and ``"xlarge"`` requests a
            full H200, which uses more quota and may lengthen the queue.

    Returns:
        A decorator. When ZeroGPU is disabled, or ``spaces`` cannot be
        imported or configured, the decorator returns the function unchanged.
    """
    def passthrough(fn):
        return fn

    if os.getenv("USE_ZERO_GPU_DECORATOR", "0") != "1":
        return passthrough

    try:
        import spaces

        kwargs = {"duration": duration}
        if size:
            kwargs["size"] = size
        return spaces.GPU(**kwargs)
    except Exception as exc:
        # Deliberately best-effort: the app must still start without ZeroGPU,
        # but the fallback is reported instead of being swallowed silently.
        print(
            "ZeroGPU decorator unavailable; running undecorated "
            f"({type(exc).__name__}: {exc})"
        )
        return passthrough


def get_best_device() -> str:
    """Return "cuda", "mps" or "cpu", preferring them in that order."""
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        return "cuda"
    # Older torch builds have no `torch.backends.mps` attribute at all.
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cpu"


def get_torch_dtype(device: str):
    """Map a device name to the dtype used for it.

    "cuda" maps to bfloat16, "mps" to float16, anything else to float32.
    """
    if device == "cuda":
        return torch.bfloat16
    return torch.float16 if device == "mps" else torch.float32


def now_ms() -> float:
    """Return the current `time.time()` reading converted to milliseconds."""
    seconds = time.time()
    return seconds * 1000


def cleanup_cuda():
    """Run the garbage collector and, if CUDA is available, clear its caches."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

def make_placeholder_image(prompt: str, seed: int, width: int = 1024, height: int = 1024) -> Image.Image:
    """Draw a placeholder card that stands in for a generated image.

    The card has a seed-dependent pastel background, a border, the app title,
    the prompt (truncated to 120 characters, or "No prompt" when empty), the
    seed and the active ``IMAGE_BACKEND``.

    Args:
        prompt: Text shown on the card.
        seed: Seed for the background colour; the same seed gives the same colour.
        width: Image width in pixels (converted with ``int``).
        height: Image height in pixels (converted with ``int``).

    Returns:
        A new RGB ``PIL.Image.Image``.
    """
    # A private RNG yields the same colours as seeding the module-level one,
    # without resetting the process-wide `random` state for other callers.
    rng = random.Random(seed)
    bg = (
        rng.randint(230, 255),
        rng.randint(220, 245),
        rng.randint(190, 235),
    )

    width_px, height_px = int(width), int(height)
    img = Image.new("RGB", (width_px, height_px), color=bg)
    draw = ImageDraw.Draw(img)

    title = "Open Model Playground"
    prompt_text = (prompt or "No prompt")[:120]
    text_color = (20, 20, 20)

    draw.rectangle((32, 32, width_px - 32, height_px - 32), outline=(40, 40, 40), width=5)
    draw.text((64, 72), title, fill=text_color)
    draw.text((64, 140), f"Prompt: {prompt_text}", fill=text_color)
    draw.text((64, 210), f"Seed: {seed}", fill=text_color)
    # The backend label sits near the bottom edge in a lighter grey.
    draw.text((64, height_px - 92), f"Backend: {IMAGE_BACKEND}", fill=(80, 80, 80))
    return img

def load_text_model():
    """Load the tokenizer and causal LM for ``TEXT_MODEL_ID`` once and cache them.

    The pair is stored in the module globals ``TEXT_TOKENIZER`` and
    ``TEXT_MODEL``; later calls return the cached objects.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        RuntimeError: If the best available device is not CUDA.
    """
    global TEXT_MODEL, TEXT_TOKENIZER

    already_loaded = TEXT_MODEL is not None and TEXT_TOKENIZER is not None
    if not already_loaded:
        # Imported lazily so the module can be imported without transformers.
        from transformers import AutoModelForCausalLM, AutoTokenizer

        target_device = get_best_device()
        model_dtype = get_torch_dtype(target_device)

        if target_device != "cuda":
            raise RuntimeError("실제 텍스트 모델 실행은 CUDA GPU를 권장합니다.")

        load_started = time.time()
        print(f"Loading text model: {TEXT_MODEL_ID}")

        TEXT_TOKENIZER = AutoTokenizer.from_pretrained(TEXT_MODEL_ID, trust_remote_code=True)

        model_options = {
            "torch_dtype": model_dtype,
            "device_map": "auto",
            "trust_remote_code": True,
        }
        TEXT_MODEL = AutoModelForCausalLM.from_pretrained(TEXT_MODEL_ID, **model_options)

        # Inference only: switch the module to evaluation mode.
        TEXT_MODEL.eval()
        print(f"Loaded text model in {time.time() - load_started:.1f} sec")

    return TEXT_TOKENIZER, TEXT_MODEL

@_gpu_decorator(duration=120)
def generate_text(
    system_message: str,
    user_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    use_real_model: bool,
) -> Tuple[str, str]:
    """Answer ``user_prompt`` and return ``(answer, metadata)`` for the UI.

    Args:
        system_message: System prompt; a default tutor persona is used when empty.
        user_prompt: User text; an empty or blank prompt short-circuits.
        max_new_tokens: Generation length limit (converted with ``int``).
        temperature: Sampling temperature; sampling is enabled only when > 0.
        top_p: Nucleus sampling value, used only when sampling is enabled.
        use_real_model: When false, a canned fallback answer is returned
            without loading the model.

    Returns:
        The answer text and a newline-separated ``key=value`` metadata string.
        Any exception in the real-model path is reported as ``mode=error``
        metadata instead of being raised.
    """
    t_start = now_ms()

    def elapsed_sec() -> float:
        return (now_ms() - t_start) / 1000

    if not system_message:
        system_message = "당신은 친절하고 정확한 AI 튜터입니다."
    if not user_prompt:
        user_prompt = ""

    if not user_prompt.strip():
        return "프롬프트를 입력해주세요.", "status=empty_prompt"

    if not use_real_model:
        elapsed = elapsed_sec()
        answer = "\n\n".join(
            [
                "이것은 빠른 fallback 응답입니다.",
                f"사용자 프롬프트: {user_prompt}",
                "실제 모델을 사용하려면 Use real text model을 켜세요.",
            ]
        )
        fallback_fields = [
            "mode=fallback",
            f"model_id={TEXT_MODEL_ID}",
            f"device={get_best_device()}",
            f"elapsed_sec={elapsed:.2f}",
        ]
        # The fallback metadata ends with a trailing newline.
        return answer, "\n".join(fallback_fields) + "\n"

    try:
        tokenizer, model = load_text_model()
        device = get_best_device()

        chat = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
        ]

        has_template = hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None
        if not has_template:
            # Plain-text prompt for tokenizers without a chat template.
            text = f"{system_message}\n\nUser: {user_prompt}\nAssistant:"
        else:
            text = tokenizer.apply_chat_template(
                chat,
                tokenize=False,
                add_generation_prompt=True,
            )

        inputs = tokenizer(text, return_tensors="pt").to(model.device)
        do_sample = temperature > 0

        with torch.inference_mode():
            # Sampling knobs are passed as None when decoding greedily.
            gen_kwargs = dict(
                max_new_tokens=int(max_new_tokens),
                temperature=float(temperature) if do_sample else None,
                top_p=float(top_p) if do_sample else None,
                do_sample=do_sample,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )
            outputs = model.generate(**inputs, **gen_kwargs)

        # Drop the prompt tokens so only the newly generated part is decoded.
        prompt_length = inputs["input_ids"].shape[-1]
        answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        elapsed = elapsed_sec()
        real_fields = [
            "mode=real_transformers",
            f"model_id={TEXT_MODEL_ID}",
            f"device={device}",
            f"dtype={get_torch_dtype(device)}",
            f"elapsed_sec={elapsed:.2f}",
            f"max_new_tokens={max_new_tokens}",
            f"temperature={temperature}",
            f"top_p={top_p}",
        ]
        return answer, "\n".join(real_fields)

    except Exception as e:
        elapsed = elapsed_sec()
        error_meta = f"mode=error\nelapsed_sec={elapsed:.2f}\nerror={type(e).__name__}: {e}"
        return "실제 텍스트 모델 실행 중 오류가 발생했습니다.", error_meta

def load_sdxl_pipe():
    """Load the diffusers pipeline for ``IMAGE_MODEL_ID`` in float16 on CUDA.

    The fp16 weight variant is tried first; if that load fails for any
    reason, the load is retried without a variant.

    Returns:
        The pipeline, moved to CUDA with its progress bar disabled.

    Raises:
        RuntimeError: If the best available device is not CUDA.
    """
    from diffusers import DiffusionPipeline

    if get_best_device() != "cuda":
        raise RuntimeError("SDXL 실제 실행은 CUDA GPU를 권장합니다.")

    t0 = time.time()
    print(f"Loading SDXL image model: {IMAGE_MODEL_ID}")

    shared_options = dict(torch_dtype=torch.float16, use_safetensors=True)
    try:
        pipe = DiffusionPipeline.from_pretrained(IMAGE_MODEL_ID, variant="fp16", **shared_options)
    except Exception:
        # Retry without the fp16 variant.
        pipe = DiffusionPipeline.from_pretrained(IMAGE_MODEL_ID, **shared_options)

    pipe = pipe.to("cuda")

    # Attention slicing is optional: ignore pipelines that cannot enable it.
    try:
        pipe.enable_attention_slicing()
    except Exception:
        pass

    pipe.set_progress_bar_config(disable=True)

    print(f"Loaded SDXL in {time.time() - t0:.1f} sec")
    return pipe


def load_flux2_4bit_pipe():
    """Build the FLUX.2 4-bit pipeline used by the ``flux2_4bit`` backend.

    The repository is read from ``IMAGE_MODEL_ID``, which must point at a
    FLUX.2 4-bit repo such as ``diffusers/FLUX.2-dev-bnb-4bit``.

    Notes carried over from the original author (not verifiable from this
    file): that repo contains the NF4-quantized DiT and text encoder of
    FLUX.2-dev; the full original model is ``black-forest-labs/FLUX.2-dev``;
    check its access / gating conditions in advance.

    Returns:
        A ``Flux2Pipeline`` with model CPU offload enabled and the progress
        bar disabled.

    Raises:
        RuntimeError: If the best available device is not CUDA.
    """
    from transformers import Mistral3ForConditionalGeneration
    from diffusers import Flux2Pipeline, Flux2Transformer2DModel

    if get_best_device() != "cuda":
        raise RuntimeError("FLUX.2 실제 실행은 CUDA GPU가 필요합니다.")

    t0 = time.time()
    compute_dtype = torch.bfloat16

    print(f"Loading FLUX.2 4-bit image model: {IMAGE_MODEL_ID}")

    # Both sub-models are loaded onto the CPU first, transformer before
    # text encoder.
    components = {
        "transformer": Flux2Transformer2DModel.from_pretrained(
            IMAGE_MODEL_ID,
            subfolder="transformer",
            torch_dtype=compute_dtype,
            device_map="cpu",
        ),
        "text_encoder": Mistral3ForConditionalGeneration.from_pretrained(
            IMAGE_MODEL_ID,
            subfolder="text_encoder",
            dtype=compute_dtype,
            device_map="cpu",
        ),
    }

    pipe = Flux2Pipeline.from_pretrained(
        IMAGE_MODEL_ID,
        torch_dtype=compute_dtype,
        **components,
    )

    # 4-bit + CPU offload path. On ZeroGPU the queue/duration may get longer.
    pipe.enable_model_cpu_offload()
    pipe.set_progress_bar_config(disable=True)

    print(f"Loaded FLUX.2 4-bit in {time.time() - t0:.1f} sec")
    return pipe


def load_image_pipe():
    """Return the cached image pipeline, loading it for ``IMAGE_BACKEND`` if needed.

    Returns:
        The pipeline stored in ``IMAGE_PIPE``; ``None`` for the
        "placeholder" backend.

    Raises:
        ValueError: If ``IMAGE_BACKEND`` is not a known backend name.
    """
    global IMAGE_PIPE

    if IMAGE_PIPE is not None:
        return IMAGE_PIPE

    if IMAGE_BACKEND not in ("sdxl", "flux2_4bit", "placeholder"):
        raise ValueError(f"Unknown IMAGE_BACKEND: {IMAGE_BACKEND}")

    if IMAGE_BACKEND == "placeholder":
        # The placeholder backend has no pipeline to load.
        IMAGE_PIPE = None
    else:
        build_pipe = load_sdxl_pipe if IMAGE_BACKEND == "sdxl" else load_flux2_4bit_pipe
        IMAGE_PIPE = build_pipe()

    return IMAGE_PIPE

def _int_or_default(value, default):
    """Return ``value`` unchanged if ``int(value)`` succeeds, else ``default``."""
    try:
        int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    return value


@_gpu_decorator(duration=240)
def generate_image(
    prompt: str,
    negative_prompt: str,
    steps: int,
    guidance_scale: float,
    seed: int,
    width: int,
    height: int,
    use_real_image_model: bool,
):
    """Generate an image and return ``(image, metadata)`` for the UI.

    Args:
        prompt: Text prompt; a default prompt is used when empty.
        negative_prompt: Passed to the pipeline only for the "sdxl" backend.
        steps: Number of inference steps (converted with ``int``).
        guidance_scale: Guidance scale (converted with ``float``).
        seed: Seed for the CUDA generator and the placeholder colour. A value
            that cannot be converted with ``int`` (e.g. ``None``) becomes 42.
        width: Output width in pixels; unconvertible values become 1024.
        height: Output height in pixels; unconvertible values become 1024.
        use_real_image_model: When false, or when ``IMAGE_BACKEND`` is
            "placeholder", a placeholder card is returned instead.

    Returns:
        The image and a newline-separated ``key=value`` metadata string. Any
        exception in the real-model path yields a placeholder image with
        ``mode=error_fallback`` metadata instead of being raised.
    """
    started = now_ms()
    prompt = prompt or "A friendly robot teaching open-source AI"

    # Validate the numeric inputs once. Otherwise a bad value (a cleared
    # numeric field may arrive as None) fails again inside the error fallback
    # below, which also calls int() on them. Valid values pass through
    # untouched; the fallbacks mirror the UI defaults.
    seed = _int_or_default(seed, 42)
    width = _int_or_default(width, 1024)
    height = _int_or_default(height, 1024)

    if (not use_real_image_model) or IMAGE_BACKEND == "placeholder":
        img = make_placeholder_image(prompt, int(seed), int(width), int(height))
        elapsed = (now_ms() - started) / 1000
        meta = (
            f"mode=fallback\n"
            f"image_backend={IMAGE_BACKEND}\n"
            f"model_id={IMAGE_MODEL_ID}\n"
            f"device={get_best_device()}\n"
            f"elapsed_sec={elapsed:.2f}\n"
            f"seed={seed}\n"
            f"steps={steps}\n"
            f"guidance_scale={guidance_scale}\n"
            f"size={width}x{height}"
        )
        return img, meta

    try:
        pipe = load_image_pipe()
        device = get_best_device()
        generator = torch.Generator(device="cuda").manual_seed(int(seed))

        if IMAGE_BACKEND not in ("sdxl", "flux2_4bit"):
            raise ValueError(f"Unknown IMAGE_BACKEND: {IMAGE_BACKEND}")

        # Arguments shared by both backends.
        # Original author's note: for FLUX.2, guidance 2.5-4.0 and 28-50 steps
        # are a common starting point.
        kwargs = dict(
            prompt=prompt,
            num_inference_steps=int(steps),
            guidance_scale=float(guidance_scale),
            generator=generator,
            width=int(width),
            height=int(height),
        )
        if IMAGE_BACKEND == "sdxl":
            # Only the SDXL call receives a negative prompt.
            kwargs["negative_prompt"] = negative_prompt or None

        with torch.inference_mode():
            result = pipe(**kwargs)

        img = result.images[0]
        elapsed = (now_ms() - started) / 1000

        meta = (
            f"mode=real_diffusers\n"
            f"image_backend={IMAGE_BACKEND}\n"
            f"model_id={IMAGE_MODEL_ID}\n"
            f"device={device}\n"
            f"elapsed_sec={elapsed:.2f}\n"
            f"seed={seed}\n"
            f"steps={steps}\n"
            f"guidance_scale={guidance_scale}\n"
            f"size={width}x{height}"
        )
        return img, meta

    except Exception as e:
        elapsed = (now_ms() - started) / 1000
        # seed/width/height were validated above, so these conversions cannot
        # fail here.
        img = make_placeholder_image(
            f"Error fallback: {prompt}",
            int(seed),
            int(width),
            int(height),
        )
        meta = (
            f"mode=error_fallback\n"
            f"image_backend={IMAGE_BACKEND}\n"
            f"elapsed_sec={elapsed:.2f}\n"
            f"error={type(e).__name__}: {e}"
        )
        return img, meta

def _build_text_tab():
    """Create the "Text Generation" tab and wire its button to generate_text.

    Must be called inside an active ``gr.Blocks`` context.
    """
    with gr.Tab("Text Generation"):
        gr.Markdown("## Text Generation with Qwen3 + Transformers")

        system_box = gr.Textbox(
            label="System message",
            value="당신은 친절하고 정확한 AI 튜터입니다.",
            lines=2,
        )
        prompt_box = gr.Textbox(
            label="User prompt",
            value="Hugging Face Hub가 AI 시대에 중요한 이유를 세 문장으로 설명해줘.",
            lines=4,
        )

        with gr.Row():
            tokens_slider = gr.Slider(minimum=32, maximum=1024, value=256, step=32, label="max_new_tokens")
            temperature_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.7, step=0.1, label="temperature")
            top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="top_p")

        real_model_toggle = gr.Checkbox(
            label="Use real text model",
            value=DEFAULT_USE_REAL_TEXT_MODEL,
        )

        run_button = gr.Button("Generate Text")
        answer_box = gr.Textbox(label="Generated answer", lines=10)
        meta_box = gr.Textbox(label="Metadata", lines=7)

        # Input order matches the positional parameters of generate_text.
        run_button.click(
            fn=generate_text,
            inputs=[
                system_box,
                prompt_box,
                tokens_slider,
                temperature_slider,
                top_p_slider,
                real_model_toggle,
            ],
            outputs=[answer_box, meta_box],
            api_name="generate_text",
        )


def _build_image_tab(default_steps, default_guidance):
    """Create the "Image Generation" tab and wire its button to generate_image.

    Must be called inside an active ``gr.Blocks`` context.
    """
    with gr.Tab("Image Generation"):
        gr.Markdown("## Image Generation with Diffusers")

        prompt_box = gr.Textbox(
            label="Prompt",
            value="A friendly robot teaching open-source AI in a modern Korean research lab, cinematic lighting, highly detailed",
            lines=3,
        )
        negative_box = gr.Textbox(
            label="Negative prompt",
            value="low quality, blurry, distorted, watermark, text artifacts",
            lines=2,
        )

        with gr.Row():
            steps_slider = gr.Slider(minimum=4, maximum=50, value=default_steps, step=1, label="steps")
            guidance_slider = gr.Slider(minimum=0.0, maximum=15.0, value=default_guidance, step=0.5, label="guidance_scale")
            seed_input = gr.Number(value=42, label="seed")

        with gr.Row():
            width_slider = gr.Slider(minimum=512, maximum=1024, value=1024, step=64, label="width")
            height_slider = gr.Slider(minimum=512, maximum=1024, value=1024, step=64, label="height")

        real_model_toggle = gr.Checkbox(
            label="Use real image model",
            value=DEFAULT_USE_REAL_IMAGE_MODEL,
        )

        run_button = gr.Button("Generate Image")
        image_view = gr.Image(label="Generated image")
        meta_box = gr.Textbox(label="Metadata", lines=8)

        # Input order matches the positional parameters of generate_image.
        run_button.click(
            fn=generate_image,
            inputs=[
                prompt_box,
                negative_box,
                steps_slider,
                guidance_slider,
                seed_input,
                width_slider,
                height_slider,
                real_model_toggle,
            ],
            outputs=[image_view, meta_box],
            api_name="generate_image",
        )


def build_demo():
    """Assemble the Gradio Blocks UI with text and image generation tabs.

    Slider defaults depend on ``IMAGE_BACKEND``: 25 steps / guidance 7.0 for
    "sdxl", otherwise 28 steps / guidance 4.0.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    uses_sdxl = IMAGE_BACKEND == "sdxl"
    default_steps = 25 if uses_sdxl else 28
    default_guidance = 7.0 if uses_sdxl else 4.0

    # Markdown hard line breaks: two trailing spaces before each newline.
    model_summary = (
        f"Text model: `{TEXT_MODEL_ID}`  \n"
        f"Image backend: `{IMAGE_BACKEND}`  \n"
        f"Image model: `{IMAGE_MODEL_ID}`"
    )
    deployment_note = (
        "### Deployment note\n"
        "Colab A100에서는 `sdxl` 프로파일로 실제 생성 실습을 진행하고, "
        "Hugging Face Space ZeroGPU에서는 `flux2_4bit` 프로파일을 사용하도록 Space Variables를 설정합니다."
    )

    with gr.Blocks(title=APP_TITLE) as demo:
        gr.Markdown("# 🤗 Open Model Playground")
        gr.Markdown(model_summary)

        _build_text_tab()
        _build_image_tab(default_steps, default_guidance)

        gr.Markdown(deployment_note)

    return demo

# The UI is built at import time and exposed as the module-level name `demo`.
demo = build_demo()

if __name__ == "__main__":
    # Enable the request queue, then start the server.
    demo.queue().launch()