# File size: 12,490 Bytes

# 8e2062d

"""
LucasArts Pixel Art Style — Hugging Face Space

Transform face photos into LucasArts adventure-game pixel art using:
  - SDXL base (AlbedoBase XL v2.1)
  - InstantID ControlNet for face identity preservation
  - ZoeDepth ControlNet for structural preservation
  - LucasArts LoRA (primerz/pixagram → lucasart.safetensors)
  - DPMSolver++ scheduler (traditional SDXL, not LCM)

Architecture inspired by fofr's face-to-many.
"""

import spaces
import gradio as gr
import torch
import time
import cv2
import numpy as np
from PIL import Image

# Replace torch.jit.script with an identity function so that any later
# torch.jit.script(fn) call simply returns fn unchanged. This runs before the
# imports below, so those modules see the patched attribute.
torch.jit.script = lambda f: f  # Disable JIT for compatibility

from huggingface_hub import hf_hub_download, snapshot_download
from diffusers.models import ControlNetModel
from diffusers import AutoencoderKL, DPMSolverMultistepScheduler
from controlnet_aux import ZoeDetector
from insightface.app import FaceAnalysis

from pipeline_stable_diffusion_xl_instantid_img2img import (
    StableDiffusionXLInstantIDImg2ImgPipeline,
    draw_kps,
)

# ============================================================
# CONFIGURATION
# ============================================================

# Text shown at the top of the Gradio page.
TITLE = "LucasArts Pixel Art Style"
DESCRIPTION = (
    "Transform any face photo into LucasArts adventure-game pixel art.\n"
    "Uses InstantID for face identity + ZoeDepth for structure + LucasArts LoRA style."
)

# Hugging Face Hub repositories used by the startup loading code.
BASE_MODEL_REPO = "frankjoshua/albedobaseXL_v21"
VAE_REPO = "madebyollin/sdxl-vae-fp16-fix"
INSTANTID_REPO = "InstantX/InstantID"
ZOEDEPTH_CN_REPO = "diffusers/controlnet-zoe-depth-sdxl-1.0"
ANNOTATOR_REPO = "lllyasviel/Annotators"
ANTELOPE_REPO = "DIAMONIK7777/antelopev2"

# LucasArts LoRA: where the weights live, the scale used when fusing them,
# and the trigger phrase generate() prepends to every prompt.
LORA_REPO = "primerz/pixagram"
LORA_FILENAME = "lucasart.safetensors"
LORA_STRENGTH = 0.9
TRIGGER_WORD = "lucasarts style"

# Initial values for the UI controls.
DEFAULT_PROMPT = "a person"
DEFAULT_NEGATIVE = ", ".join(
    (
        "ugly",
        "artifacts",
        "blurry",
        "deformed",
        "disfigured",
        "low quality",
        "watermark",
        "text",
        "photo-realistic",
        "photography",
        "realistic",
    )
)
DEFAULT_GUIDANCE_SCALE = 7.0
DEFAULT_STEPS = 20
DEFAULT_FACE_STRENGTH = 0.85
# generate() passes (1.0 - image strength) as the img2img `strength`.
DEFAULT_IMAGE_STRENGTH = 0.15
DEFAULT_DEPTH_STRENGTH = 0.8

# Device and dtype every model is loaded with / moved to.
DEVICE = "cuda"
DTYPE = torch.float16

# ============================================================
# MODEL LOADING (runs once at startup)
# ============================================================

# Startup banner for the Space logs.
print("=" * 60)
print("Loading LucasArts Pixel Art Space")
print("=" * 60)

# 1. InsightFace — face detection & embedding
# The antelopev2 snapshot is written to /data/models/antelopev2 and FaceAnalysis
# is given root="/data" with name="antelopev2" — presumably it resolves
# <root>/models/<name>; confirm against the insightface version in use.
print("\n[1/6] Loading InsightFace (antelopev2)...")
st = time.time()  # start of this step's timer; `st` is reassigned by each later step
snapshot_download(repo_id=ANTELOPE_REPO, local_dir="/data/models/antelopev2")
face_app = FaceAnalysis(
    name="antelopev2",
    root="/data",
    providers=["CPUExecutionProvider"],  # only the CPU execution provider is requested
)
face_app.prepare(ctx_id=0, det_size=(640, 640))
print(f"  [OK] InsightFace loaded ({time.time() - st:.1f}s)")

# 2. InstantID ControlNet
print("\n[2/6] Loading InstantID ControlNet...")
st = time.time()
# Fetch the ControlNet config, the ControlNet weights and the IP-Adapter
# checkpoint (in that order) into the shared /data/checkpoints directory.
for _instantid_file in (
    "ControlNetModel/config.json",
    "ControlNetModel/diffusion_pytorch_model.safetensors",
    "ip-adapter.bin",
):
    hf_hub_download(
        repo_id=INSTANTID_REPO,
        filename=_instantid_file,
        local_dir="/data/checkpoints",
    )
# Build the identity ControlNet from the folder the first two downloads filled.
identitynet = ControlNetModel.from_pretrained(
    "/data/checkpoints/ControlNetModel",
    torch_dtype=DTYPE,
)
print(f"  [OK] InstantID ControlNet loaded ({time.time() - st:.1f}s)")

# 3. ZoeDepth ControlNet
# Loaded directly by Hub repo id (no explicit local_dir, unlike the InstantID files).
print("\n[3/6] Loading ZoeDepth ControlNet...")
st = time.time()
zoedepthnet = ControlNetModel.from_pretrained(
    ZOEDEPTH_CN_REPO, torch_dtype=DTYPE
)
print(f"  [OK] ZoeDepth ControlNet loaded ({time.time() - st:.1f}s)")

# 4. SDXL Pipeline with dual ControlNet
print("\n[4/6] Loading SDXL Pipeline...")
st = time.time()
vae = AutoencoderKL.from_pretrained(VAE_REPO, torch_dtype=DTYPE)
# The controlnet list order [identitynet, zoedepthnet] is the same order
# generate() uses for control_image and controlnet_conditioning_scale.
pipe = StableDiffusionXLInstantIDImg2ImgPipeline.from_pretrained(
    BASE_MODEL_REPO,
    vae=vae,
    controlnet=[identitynet, zoedepthnet],
    torch_dtype=DTYPE,
)
# Swap in DPMSolverMultistepScheduler, reusing the existing scheduler config
# with Karras sigmas enabled.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config, use_karras_sigmas=True
)
# IP-Adapter weights come from the ip-adapter.bin file downloaded in step 2.
pipe.load_ip_adapter_instantid("/data/checkpoints/ip-adapter.bin")
pipe.set_ip_adapter_scale(0.8)
print(f"  [OK] Pipeline loaded ({time.time() - st:.1f}s)")

# 5. Load and fuse LucasArts LoRA
print("\n[5/6] Loading LucasArts LoRA...")
st = time.time()
pipe.load_lora_weights(LORA_REPO, weight_name=LORA_FILENAME)
# The scale must be passed by keyword: on diffusers pipelines the first
# positional parameter of fuse_lora() is not the scale (it is `fuse_unet` in
# older releases and `components` in newer ones), so a positional 0.9 would
# either be ignored as a scale or rejected outright.
pipe.fuse_lora(lora_scale=LORA_STRENGTH)
print(f"  [OK] LoRA fused at strength {LORA_STRENGTH} ({time.time() - st:.1f}s)")

# 6. ZoeDetector for depth maps
# This is the annotator generate() calls on the input photo to get the depth image.
print("\n[6/6] Loading ZoeDetector...")
st = time.time()
zoe = ZoeDetector.from_pretrained(ANNOTATOR_REPO)
zoe.to(DEVICE)
print(f"  [OK] ZoeDetector loaded ({time.time() - st:.1f}s)")

# Move pipeline to GPU
# Done last, after the scheduler swap, IP-Adapter load and LoRA fusion above.
pipe.to(DEVICE)

# Closing banner: everything above ran without raising.
print("\n" + "=" * 60)
print("All models loaded — ready to generate!")
print("=" * 60 + "\n")


# ============================================================
# HELPERS
# ============================================================

def center_crop_square(img: Image.Image) -> Image.Image:
    """Center-crop an image to a square.

    The side length is the shorter of the image's width and height. The crop
    box is computed with integer arithmetic, and the right/bottom edges are
    derived from the left/top edges, so the result is always exactly
    ``side x side``. With float edges (e.g. 0.5 and 3.5 for a 4x3 image) each
    edge is rounded on its own and the crop can end up one pixel off-square.

    Args:
        img: Source image; only ``size``, ``width``, ``height`` and ``crop``
            are used.

    Returns:
        The cropped square image, as returned by ``img.crop``.
    """
    side = min(img.size)
    # When the excess is odd, the extra pixel is trimmed from the right/bottom.
    left = (img.width - side) // 2
    top = (img.height - side) // 2
    return img.crop((left, top, left + side, top + side))


def extract_face(image: Image.Image):
    """
    Run InsightFace on `image` and describe its largest detected face.

    Returns a 2-tuple: the face's "embedding" entry, and the result of
    draw_kps(image, <the face's "kps" entry>).
    Raises gr.Error when the detector returns no faces.
    """

    def _bbox_area(face):
        # bbox is indexed as (x1, y1, x2, y2): width times height.
        box = face["bbox"]
        return (box[2] - box[0]) * (box[3] - box[1])

    # The detector is fed a BGR array converted from the RGB image.
    pixels_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    detections = face_app.get(pixels_bgr)
    if not detections:
        raise gr.Error(
            "No face detected in your image. Please upload a clear face photo."
        )

    # Ascending stable sort by area; the last entry is the biggest face.
    ranked = sorted(detections, key=_bbox_area)
    largest = ranked[-1]
    return largest["embedding"], draw_kps(image, largest["kps"])


# ============================================================
# GENERATION
# ============================================================

@spaces.GPU(duration=90)
def generate(
    face_image: Image.Image,
    prompt: str,
    negative_prompt: str,
    face_strength: float,
    image_strength: float,
    depth_strength: float,
    guidance_scale: float,
    num_steps: int,
) -> tuple:
    """Generate LucasArts-style pixel art from a face photo.

    Args:
        face_image: Uploaded photo, or None when nothing was uploaded.
        prompt: Subject description; TRIGGER_WORD is prepended to it. An empty
            prompt means only TRIGGER_WORD is used.
        negative_prompt: Negative prompt; an empty value is passed as None.
        face_strength: Conditioning scale for the first (InstantID) ControlNet.
        image_strength: How much of the input photo to keep; the pipeline is
            called with ``strength = 1.0 - image_strength``.
        depth_strength: Conditioning scale for the second (ZoeDepth) ControlNet.
        guidance_scale: Guidance scale forwarded to the pipeline.
        num_steps: Number of inference steps (coerced to int).

    Returns:
        ``(image, info)``. On success ``image`` is the first generated image and
        ``info`` summarises the settings. When no photo was given or generation
        failed, ``image`` is None and ``info`` carries the message.

    Raises:
        gr.Error: Re-raised unchanged when raised inside the generation steps
            (extract_face raises it when no face is detected).
    """

    if face_image is None:
        gr.Warning("Please upload a face photo first!")
        return None, "No image provided"

    try:
        # Slider values may arrive as floats; the step count must be an int.
        num_steps = int(num_steps)

        # Prepare image (square crop, 1024x1024)
        face_image = center_crop_square(face_image)
        face_image = face_image.resize((1024, 1024), Image.LANCZOS)

        # Extract face embedding + keypoints
        face_emb, face_kps = extract_face(face_image)

        # Generate depth map
        with torch.no_grad():
            depth_image = zoe(face_image)

        # Dual control images: [InstantID keypoints, ZoeDepth].
        # The depth map is resized to match the keypoint image.
        control_images = [face_kps, depth_image.resize(face_kps.size)]

        # Build prompt with trigger word
        full_prompt = f"{TRIGGER_WORD}, {prompt}" if prompt else TRIGGER_WORD
        neg = negative_prompt or None

        # Generate
        result = pipe(
            prompt=full_prompt,
            negative_prompt=neg,
            image_embeds=face_emb,
            image=face_image,
            control_image=control_images,
            strength=1.0 - image_strength,
            num_inference_steps=num_steps,
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=[face_strength, depth_strength],
            width=1024,
            height=1024,
        ).images[0]

        info = (
            f"Prompt: {full_prompt}\n"
            f"Steps: {num_steps} | Guidance: {guidance_scale}\n"
            f"Face: {face_strength} | Image: {image_strength} | Depth: {depth_strength}"
        )

        return result, info

    except gr.Error:
        # User-facing errors (e.g. no face found) go straight to Gradio.
        raise
    except Exception as e:
        # Keep the traceback in the logs instead of dropping it silently.
        import traceback

        traceback.print_exc()
        # A gr.Error that is only constructed (not raised) shows nothing to the
        # user. gr.Warning displays when called, which lets this handler keep
        # returning the error text for the info box.
        gr.Warning(f"Generation failed: {str(e)}")
        return None, f"Error: {str(e)}"


# ============================================================
# GRADIO UI
# ============================================================

# Build the page. Components are created in the order they are listed inside
# the nested Row/Column/Accordion contexts; the resulting Blocks app is bound
# to `demo`, which the __main__ guard below launches.
with gr.Blocks(
    title=TITLE,
    theme=gr.themes.Soft(primary_hue="amber", secondary_hue="orange"),
) as demo:

    gr.Markdown(f"# 🎮 {TITLE}")
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Left column: inputs and controls.
        with gr.Column(scale=1):
            # type="pil" so generate() receives a PIL image (or None when empty).
            input_image = gr.Image(
                label="📷 Upload a face photo",
                type="pil",
                height=400,
            )

            prompt = gr.Textbox(
                label="✨ Prompt",
                value=DEFAULT_PROMPT,
                placeholder="Describe the subject (e.g., a pirate captain, a wizard)...",
                lines=2,
            )

            generate_btn = gr.Button(
                "🎮 Generate LucasArts Style",
                variant="primary",
                size="lg",
            )

            # Collapsed by default; each control maps to one generate() parameter.
            with gr.Accordion("⚙️ Advanced Settings", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative Prompt",
                    value=DEFAULT_NEGATIVE,
                    lines=2,
                )

                # Conditioning scale for the InstantID ControlNet.
                face_strength = gr.Slider(
                    label="Face Identity Strength",
                    minimum=0.0,
                    maximum=2.0,
                    value=DEFAULT_FACE_STRENGTH,
                    step=0.01,
                    info="Higher = more face likeness, less creative freedom",
                )

                # generate() converts this to the pipeline's `strength`
                # as 1.0 - image_strength.
                image_strength = gr.Slider(
                    label="Image Strength",
                    minimum=0.0,
                    maximum=1.0,
                    value=DEFAULT_IMAGE_STRENGTH,
                    step=0.01,
                    info="Higher = more similarity to original photo structure/colors",
                )

                # Conditioning scale for the ZoeDepth ControlNet.
                depth_strength = gr.Slider(
                    label="Depth ControlNet Strength",
                    minimum=0.0,
                    maximum=1.0,
                    value=DEFAULT_DEPTH_STRENGTH,
                    step=0.01,
                    info="Higher = more structural preservation from depth map",
                )

                guidance_scale = gr.Slider(
                    label="Guidance Scale",
                    minimum=1.0,
                    maximum=15.0,
                    value=DEFAULT_GUIDANCE_SCALE,
                    step=0.1,
                    info="Higher = stronger prompt adherence",
                )

                num_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=10,
                    maximum=50,
                    value=DEFAULT_STEPS,
                    step=1,
                    info="More steps = higher quality but slower",
                )

        # Right column: the two outputs generate() returns (image, info text).
        with gr.Column(scale=1):
            output_image = gr.Image(
                label="🖼️ LucasArts Style Result",
                type="pil",
                height=400,
            )

            gen_info = gr.Textbox(
                label="📋 Generation Info",
                lines=4,
                interactive=False,
            )

    # Example prompts; each example supplies a value for the prompt box only.
    gr.Markdown("### 💡 Prompt Ideas")
    gr.Examples(
        examples=[
            ["a pirate captain"],
            ["a wizard in a dark tower"],
            ["a detective in a noir city"],
            ["a space adventurer"],
            ["a medieval knight"],
        ],
        inputs=[prompt],
        label="Click to use",
    )

    # Footer with credits.
    gr.Markdown(
        "---\n"
        "**Architecture:** SDXL + InstantID + ZoeDepth ControlNet + LucasArts LoRA  \n"
        "**Scheduler:** DPMSolver++ (Karras)  \n"
        "**Inspired by:** fofr's [face-to-many](https://github.com/fofr/cog-face-to-many)"
    )

    # Wire up
    # The inputs list must stay in the same order as generate()'s parameters,
    # and outputs in the order of its returned (image, info) tuple.
    generate_btn.click(
        fn=generate,
        inputs=[
            input_image,
            prompt,
            negative_prompt,
            face_strength,
            image_strength,
            depth_strength,
            guidance_scale,
            num_steps,
        ],
        outputs=[output_image, gen_info],
    )


if __name__ == "__main__":
    # Enable the request queue, then start the server with share=True.
    demo.queue().launch(share=True)