# ============================================================
# Open Model Playground
# Colab A100 Local + HF Space ZeroGPU FLUX.2 Profile
# Generated from Colab notebook cells.
# ============================================================

import os
import time
import random
import gc
import io
from typing import Tuple

import gradio as gr
from PIL import Image, ImageDraw
import torch


APP_TITLE = "Open Model Playground"

TEXT_MODEL_ID = os.getenv("TEXT_MODEL_ID", "Qwen/Qwen3-4B-Instruct-2507")

# IMAGE_BACKEND:
# - "sdxl": default for a local Colab A100 runtime
# - "flux2_4bit": default target for a HF Space on ZeroGPU
# - "placeholder": fallback used to validate deployment / the API
IMAGE_BACKEND = os.getenv("IMAGE_BACKEND", "sdxl")

# Default checkpoint per backend. Without this mapping, selecting
# IMAGE_BACKEND=flux2_4bit while leaving IMAGE_MODEL_ID unset would hand the
# SDXL repo to the FLUX.2 loader. An explicit IMAGE_MODEL_ID always wins.
_DEFAULT_IMAGE_MODEL_IDS = {
    "sdxl": "stabilityai/stable-diffusion-xl-base-1.0",
    "flux2_4bit": "diffusers/FLUX.2-dev-bnb-4bit",
}
IMAGE_MODEL_ID = os.getenv(
    "IMAGE_MODEL_ID",
    _DEFAULT_IMAGE_MODEL_IDS.get(IMAGE_BACKEND, _DEFAULT_IMAGE_MODEL_IDS["sdxl"]),
)

DEFAULT_USE_REAL_TEXT_MODEL = os.getenv("DEFAULT_USE_REAL_TEXT_MODEL", "1") == "1"
DEFAULT_USE_REAL_IMAGE_MODEL = os.getenv("DEFAULT_USE_REAL_IMAGE_MODEL", "1") == "1"

# Lazily populated singletons; see load_text_model() / load_image_pipe().
TEXT_MODEL = None
TEXT_TOKENIZER = None
IMAGE_PIPE = None


def _gpu_decorator(duration=360, size=None):
    """Return ``spaces.GPU(...)`` on a ZeroGPU Space, otherwise a no-op decorator.

    The real decorator is only used when the Space variable
    ``USE_ZERO_GPU_DECORATOR`` is set to ``"1"``.

    Args:
        duration: Forwarded to ``spaces.GPU`` as ``duration``.
        size: Forwarded as ``size`` when truthy.
            - ``None``: ZeroGPU default (large).
            - ``"xlarge"``: full H200; uses more quota and the queue may
              get longer.

    Returns:
        A decorator. If ``spaces`` cannot be imported or ``spaces.GPU``
        rejects the arguments, a warning is printed and a pass-through
        decorator is returned instead.
    """
    use_zero_gpu = os.getenv("USE_ZERO_GPU_DECORATOR", "0") == "1"

    if use_zero_gpu:
        try:
            import spaces

            kwargs = {"duration": duration}
            if size:
                kwargs["size"] = size
            return spaces.GPU(**kwargs)
        except Exception as exc:
            # Best-effort: keep the app usable, but do not hide the
            # misconfiguration from the logs.
            print(
                "USE_ZERO_GPU_DECORATOR=1 but spaces.GPU is unavailable "
                f"({type(exc).__name__}: {exc}); continuing without it."
            )

    def decorator(fn):
        return fn

    return decorator


def get_best_device() -> str:
    """Return ``"cuda"``, ``"mps"`` or ``"cpu"``, in that order of preference."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def get_torch_dtype(device: str):
    """Map a device name to the dtype used here: bf16 (cuda), fp16 (mps), fp32 (other)."""
    if device == "cuda":
        return torch.bfloat16
    if device == "mps":
        return torch.float16
    return torch.float32


def now_ms() -> float:
    """Return the current wall-clock time in milliseconds."""
    return time.time() * 1000


def cleanup_cuda():
    """Run the garbage collector and, if CUDA is available, release cached GPU memory."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


def make_placeholder_image(
    prompt: str, seed: int, width: int = 1024, height: int = 1024
) -> Image.Image:
    """Draw a deterministic placeholder card showing the prompt, seed and backend.

    Args:
        prompt: Text to print on the card; truncated to 120 characters and
            replaced by ``"No prompt"`` when empty.
        seed: Seeds the background colour, so equal seeds give equal colours.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        An RGB ``PIL.Image.Image`` of the requested size.
    """
    # A private generator yields the same colours as ``random.seed(seed)``
    # would, without resetting the process-wide RNG for other callers.
    rng = random.Random(seed)
    bg = (
        rng.randint(230, 255),
        rng.randint(220, 245),
        rng.randint(190, 235),
    )

    width = int(width)
    height = int(height)
    img = Image.new("RGB", (width, height), color=bg)
    draw = ImageDraw.Draw(img)

    title = "Open Model Playground"
    prompt_text = (prompt or "No prompt")[:120]

    draw.rectangle((32, 32, width - 32, height - 32), outline=(40, 40, 40), width=5)
    draw.text((64, 72), title, fill=(20, 20, 20))
    draw.text((64, 140), f"Prompt: {prompt_text}", fill=(20, 20, 20))
    draw.text((64, 210), f"Seed: {seed}", fill=(20, 20, 20))
    draw.text((64, height - 92), f"Backend: {IMAGE_BACKEND}", fill=(80, 80, 80))

    return img


def load_text_model():
    """Load (once) and return ``(tokenizer, model)`` for ``TEXT_MODEL_ID``.

    The pair is cached in the module globals ``TEXT_TOKENIZER`` /
    ``TEXT_MODEL``; later calls return the cached objects.

    Raises:
        RuntimeError: If the best available device is not CUDA. The message
            says CUDA is "recommended", but loading is refused outright.
    """
    global TEXT_MODEL, TEXT_TOKENIZER

    if TEXT_MODEL is not None and TEXT_TOKENIZER is not None:
        return TEXT_TOKENIZER, TEXT_MODEL

    from transformers import AutoModelForCausalLM, AutoTokenizer

    device = get_best_device()
    dtype = get_torch_dtype(device)

    if device != "cuda":
        raise RuntimeError("실제 텍스트 모델 실행은 CUDA GPU를 권장합니다.")

    started = time.time()
    print(f"Loading text model: {TEXT_MODEL_ID}")

    TEXT_TOKENIZER = AutoTokenizer.from_pretrained(
        TEXT_MODEL_ID,
        trust_remote_code=True,
    )
    TEXT_MODEL = AutoModelForCausalLM.from_pretrained(
        TEXT_MODEL_ID,
        torch_dtype=dtype,
        device_map="auto",
        trust_remote_code=True,
    )
    TEXT_MODEL.eval()

    print(f"Loaded text model in {time.time() - started:.1f} sec")
    return TEXT_TOKENIZER, TEXT_MODEL


@_gpu_decorator(duration=120)
def generate_text(
    system_message: str,
    user_prompt: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    use_real_model: bool,
) -> Tuple[str, str]:
    """Generate a chat answer and a metadata string for the Gradio text tab.

    Args:
        system_message: System prompt; a default tutor persona is used when empty.
        user_prompt: User prompt; a blank prompt short-circuits with a notice.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature; ``0`` switches to greedy decoding.
        top_p: Nucleus sampling threshold (ignored for greedy decoding).
        use_real_model: When false, return a canned fallback answer without
            loading the model.

    Returns:
        ``(answer, meta)``. Exceptions from loading or generation are caught
        and reported through ``meta`` with ``mode=error``.
    """
    started = now_ms()

    system_message = system_message or "당신은 친절하고 정확한 AI 튜터입니다."
    user_prompt = user_prompt or ""

    if not user_prompt.strip():
        return "프롬프트를 입력해주세요.", "status=empty_prompt"

    if not use_real_model:
        elapsed = (now_ms() - started) / 1000
        answer = (
            "이것은 빠른 fallback 응답입니다.\n\n"
            f"사용자 프롬프트: {user_prompt}\n\n"
            "실제 모델을 사용하려면 Use real text model을 켜세요."
        )
        meta = (
            "mode=fallback\n"
            f"model_id={TEXT_MODEL_ID}\n"
            f"device={get_best_device()}\n"
            f"elapsed_sec={elapsed:.2f}\n"
        )
        return answer, meta

    try:
        tokenizer, model = load_text_model()
        device = get_best_device()

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt},
        ]

        if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
        else:
            # Tokenizer ships no chat template: fall back to a plain transcript.
            text = f"{system_message}\n\nUser: {user_prompt}\nAssistant:"

        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        do_sample = temperature > 0

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens),
                # Sampling knobs are passed as None for greedy decoding.
                temperature=float(temperature) if do_sample else None,
                top_p=float(top_p) if do_sample else None,
                do_sample=do_sample,
                pad_token_id=tokenizer.eos_token_id,
                use_cache=True,
            )

        # Drop the prompt tokens so that only the completion is decoded.
        generated = outputs[0][inputs["input_ids"].shape[-1]:]
        answer = tokenizer.decode(generated, skip_special_tokens=True)

        elapsed = (now_ms() - started) / 1000
        meta = (
            "mode=real_transformers\n"
            f"model_id={TEXT_MODEL_ID}\n"
            f"device={device}\n"
            f"dtype={get_torch_dtype(device)}\n"
            f"elapsed_sec={elapsed:.2f}\n"
            f"max_new_tokens={max_new_tokens}\n"
            f"temperature={temperature}\n"
            f"top_p={top_p}"
        )
        return answer, meta

    except Exception as e:
        # UI boundary: report the failure in the metadata box instead of raising.
        elapsed = (now_ms() - started) / 1000
        return (
            "실제 텍스트 모델 실행 중 오류가 발생했습니다.",
            f"mode=error\nelapsed_sec={elapsed:.2f}\nerror={type(e).__name__}: {e}",
        )


def load_sdxl_pipe():
    """Build the SDXL ``DiffusionPipeline`` for ``IMAGE_MODEL_ID`` on CUDA.

    The fp16 weight variant is tried first; if that load fails for any
    reason, the default variant is loaded instead.

    Raises:
        RuntimeError: If the best available device is not CUDA.
    """
    from diffusers import DiffusionPipeline

    device = get_best_device()
    if device != "cuda":
        raise RuntimeError("SDXL 실제 실행은 CUDA GPU를 권장합니다.")

    started = time.time()
    print(f"Loading SDXL image model: {IMAGE_MODEL_ID}")

    try:
        pipe = DiffusionPipeline.from_pretrained(
            IMAGE_MODEL_ID,
            torch_dtype=torch.float16,
            variant="fp16",
            use_safetensors=True,
        )
    except Exception:
        # Retry without requesting the fp16 variant.
        pipe = DiffusionPipeline.from_pretrained(
            IMAGE_MODEL_ID,
            torch_dtype=torch.float16,
            use_safetensors=True,
        )

    pipe = pipe.to("cuda")

    try:
        pipe.enable_attention_slicing()
    except Exception:
        # Optional memory optimisation; deliberately best-effort.
        pass

    pipe.set_progress_bar_config(disable=True)

    print(f"Loaded SDXL in {time.time() - started:.1f} sec")
    return pipe


def load_flux2_4bit_pipe():
    """Build the FLUX.2 4-bit pipeline intended for a ZeroGPU Space.

    Default repo:
    - diffusers/FLUX.2-dev-bnb-4bit

    That repo contains the NF4-quantized DiT and text encoder of FLUX.2-dev.
    The full original model is black-forest-labs/FLUX.2-dev; check its access
    conditions / gated status beforehand.

    Raises:
        RuntimeError: If the best available device is not CUDA.
    """
    from transformers import Mistral3ForConditionalGeneration
    from diffusers import Flux2Pipeline, Flux2Transformer2DModel

    device = get_best_device()
    if device != "cuda":
        raise RuntimeError("FLUX.2 실제 실행은 CUDA GPU가 필요합니다.")

    started = time.time()
    repo_id = IMAGE_MODEL_ID
    torch_dtype = torch.bfloat16

    print(f"Loading FLUX.2 4-bit image model: {repo_id}")

    transformer = Flux2Transformer2DModel.from_pretrained(
        repo_id,
        subfolder="transformer",
        torch_dtype=torch_dtype,
        device_map="cpu",
    )
    text_encoder = Mistral3ForConditionalGeneration.from_pretrained(
        repo_id,
        subfolder="text_encoder",
        dtype=torch_dtype,
        device_map="cpu",
    )
    pipe = Flux2Pipeline.from_pretrained(
        repo_id,
        transformer=transformer,
        text_encoder=text_encoder,
        torch_dtype=torch_dtype,
    )

    # 4-bit + CPU offload path. On ZeroGPU the queue/duration may get longer.
    pipe.enable_model_cpu_offload()
    pipe.set_progress_bar_config(disable=True)

    print(f"Loaded FLUX.2 4-bit in {time.time() - started:.1f} sec")
    return pipe


def load_image_pipe():
    """Return the cached image pipeline, loading it for ``IMAGE_BACKEND`` if needed.

    Returns:
        The pipeline object, or ``None`` for the ``"placeholder"`` backend.

    Raises:
        ValueError: If ``IMAGE_BACKEND`` is not a known backend.
    """
    global IMAGE_PIPE

    if IMAGE_PIPE is not None:
        return IMAGE_PIPE

    if IMAGE_BACKEND == "sdxl":
        IMAGE_PIPE = load_sdxl_pipe()
    elif IMAGE_BACKEND == "flux2_4bit":
        IMAGE_PIPE = load_flux2_4bit_pipe()
    elif IMAGE_BACKEND == "placeholder":
        IMAGE_PIPE = None
    else:
        raise ValueError(f"Unknown IMAGE_BACKEND: {IMAGE_BACKEND}")

    return IMAGE_PIPE


@_gpu_decorator(duration=240)
def generate_image(
    prompt: str,
    negative_prompt: str,
    steps: int,
    guidance_scale: float,
    seed: int,
    width: int,
    height: int,
    use_real_image_model: bool,
) -> Tuple[Image.Image, str]:
    """Generate an image and a metadata string for the Gradio image tab.

    Args:
        prompt: Positive prompt; a default prompt is used when empty.
        negative_prompt: Negative prompt (only passed to the SDXL backend).
        steps: Number of inference steps.
        guidance_scale: Guidance scale.
        seed: RNG seed. ``None`` (an emptied number field) is treated as 0.
        width: Output width in pixels.
        height: Output height in pixels.
        use_real_image_model: When false, or when the backend is
            ``"placeholder"``, return a placeholder card instead.

    Returns:
        ``(image, meta)``. Exceptions from loading or inference are caught;
        a placeholder image is returned with ``mode=error_fallback`` in ``meta``.
    """
    started = now_ms()
    prompt = prompt or "A friendly robot teaching open-source AI"
    # Normalise once so an empty seed cannot crash the fallback path or the
    # error handler below (both need an int seed).
    seed = 0 if seed is None else int(seed)

    if (not use_real_image_model) or IMAGE_BACKEND == "placeholder":
        img = make_placeholder_image(prompt, seed, int(width), int(height))
        elapsed = (now_ms() - started) / 1000
        meta = (
            "mode=fallback\n"
            f"image_backend={IMAGE_BACKEND}\n"
            f"model_id={IMAGE_MODEL_ID}\n"
            f"device={get_best_device()}\n"
            f"elapsed_sec={elapsed:.2f}\n"
            f"seed={seed}\n"
            f"steps={steps}\n"
            f"guidance_scale={guidance_scale}\n"
            f"size={width}x{height}"
        )
        return img, meta

    try:
        pipe = load_image_pipe()
        device = get_best_device()
        generator = torch.Generator(device="cuda").manual_seed(seed)

        if IMAGE_BACKEND not in ("sdxl", "flux2_4bit"):
            raise ValueError(f"Unknown IMAGE_BACKEND: {IMAGE_BACKEND}")

        # For FLUX.2, guidance 2.5-4.0 and 28-50 steps are a typical
        # starting point.
        kwargs = dict(
            prompt=prompt,
            num_inference_steps=int(steps),
            guidance_scale=float(guidance_scale),
            generator=generator,
            width=int(width),
            height=int(height),
        )
        if IMAGE_BACKEND == "sdxl":
            # Only the SDXL call is given a negative prompt.
            kwargs["negative_prompt"] = negative_prompt or None

        with torch.inference_mode():
            result = pipe(**kwargs)

        img = result.images[0]
        elapsed = (now_ms() - started) / 1000
        meta = (
            "mode=real_diffusers\n"
            f"image_backend={IMAGE_BACKEND}\n"
            f"model_id={IMAGE_MODEL_ID}\n"
            f"device={device}\n"
            f"elapsed_sec={elapsed:.2f}\n"
            f"seed={seed}\n"
            f"steps={steps}\n"
            f"guidance_scale={guidance_scale}\n"
            f"size={width}x{height}"
        )
        return img, meta

    except Exception as e:
        # UI boundary: show a placeholder and surface the error in the metadata.
        elapsed = (now_ms() - started) / 1000
        img = make_placeholder_image(
            f"Error fallback: {prompt}",
            seed,
            int(width),
            int(height),
        )
        meta = (
            "mode=error_fallback\n"
            f"image_backend={IMAGE_BACKEND}\n"
            f"elapsed_sec={elapsed:.2f}\n"
            f"error={type(e).__name__}: {e}"
        )
        return img, meta


def build_demo():
    """Assemble the Gradio Blocks UI with a text tab and an image tab."""
    # Slider defaults depend on the selected image backend.
    default_steps = 25 if IMAGE_BACKEND == "sdxl" else 28
    default_guidance = 7.0 if IMAGE_BACKEND == "sdxl" else 4.0

    with gr.Blocks(title=APP_TITLE) as demo:
        gr.Markdown("# 🤗 Open Model Playground")
        gr.Markdown(
            f"Text model: `{TEXT_MODEL_ID}` \n"
            f"Image backend: `{IMAGE_BACKEND}` \n"
            f"Image model: `{IMAGE_MODEL_ID}`"
        )

        with gr.Tab("Text Generation"):
            gr.Markdown("## Text Generation with Qwen3 + Transformers")

            system_message = gr.Textbox(
                label="System message",
                value="당신은 친절하고 정확한 AI 튜터입니다.",
                lines=2,
            )
            user_prompt = gr.Textbox(
                label="User prompt",
                value="Hugging Face Hub가 AI 시대에 중요한 이유를 세 문장으로 설명해줘.",
                lines=4,
            )

            with gr.Row():
                max_new_tokens = gr.Slider(32, 1024, value=256, step=32, label="max_new_tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.7, step=0.1, label="temperature")
                top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p")

            use_real_text_model = gr.Checkbox(
                label="Use real text model",
                value=DEFAULT_USE_REAL_TEXT_MODEL,
            )

            text_btn = gr.Button("Generate Text")
            text_output = gr.Textbox(label="Generated answer", lines=10)
            text_meta = gr.Textbox(label="Metadata", lines=7)

            text_btn.click(
                fn=generate_text,
                inputs=[
                    system_message,
                    user_prompt,
                    max_new_tokens,
                    temperature,
                    top_p,
                    use_real_text_model,
                ],
                outputs=[text_output, text_meta],
                api_name="generate_text",
            )

        with gr.Tab("Image Generation"):
            gr.Markdown("## Image Generation with Diffusers")

            image_prompt = gr.Textbox(
                label="Prompt",
                value="A friendly robot teaching open-source AI in a modern Korean research lab, cinematic lighting, highly detailed",
                lines=3,
            )
            negative_prompt = gr.Textbox(
                label="Negative prompt",
                value="low quality, blurry, distorted, watermark, text artifacts",
                lines=2,
            )

            with gr.Row():
                steps = gr.Slider(4, 50, value=default_steps, step=1, label="steps")
                guidance_scale = gr.Slider(0.0, 15.0, value=default_guidance, step=0.5, label="guidance_scale")
                seed = gr.Number(value=42, label="seed")

            with gr.Row():
                width = gr.Slider(512, 1024, value=1024, step=64, label="width")
                height = gr.Slider(512, 1024, value=1024, step=64, label="height")

            use_real_image_model = gr.Checkbox(
                label="Use real image model",
                value=DEFAULT_USE_REAL_IMAGE_MODEL,
            )

            image_btn = gr.Button("Generate Image")
            image_output = gr.Image(label="Generated image")
            image_meta = gr.Textbox(label="Metadata", lines=8)

            image_btn.click(
                fn=generate_image,
                inputs=[
                    image_prompt,
                    negative_prompt,
                    steps,
                    guidance_scale,
                    seed,
                    width,
                    height,
                    use_real_image_model,
                ],
                outputs=[image_output, image_meta],
                api_name="generate_image",
            )

        gr.Markdown(
            "### Deployment note\n"
            "Colab A100에서는 `sdxl` 프로파일로 실제 생성 실습을 진행하고, "
            "Hugging Face Space ZeroGPU에서는 `flux2_4bit` 프로파일을 사용하도록 Space Variables를 설정합니다."
        )

    return demo


# Built at import time so the app object is available as a module attribute.
demo = build_demo()


if __name__ == "__main__":
    demo.queue()
    demo.launch()