import gradio as gr
import spaces
import numpy as np
import random
import os
import torch
from diffusers import DiffusionPipeline
from transformers import pipeline, AutoTokenizer
from huggingface_hub import login
from PIL import Image
import shutil
import subprocess
import sys
import importlib.util
# Set unconditionally: a plain assignment overrides any value that is already
# present in the environment.
os.environ['SPCONV_ALGO'] = 'native'
# Everything below is applied with setdefault(), so a value already exported in
# the environment takes precedence over these defaults.
for _env_name, _env_default in (
    # Use PyTorch native scaled_dot_product_attention everywhere: xformers /
    # flash_attn have no kernels for the Blackwell (sm_120) GPUs now used by
    # ZeroGPU.
    ('ATTN_BACKEND', 'sdpa'),
    ('SPARSE_ATTN_BACKEND', 'sdpa'),
    # DINOv2 (the image conditioning model loaded via torch.hub) also calls
    # xformers directly; disabling it makes DINOv2 fall back to standard
    # PyTorch attention, which works on Blackwell (sm_120).
    ('XFORMERS_DISABLED', '1'),
    # Compile the runtime CUDA extensions (diff_gaussian_rasterization) for
    # sm_120, with PTX so the driver can JIT for newer archs.
    ('TORCH_CUDA_ARCH_LIST', '12.0+PTX'),
):
    os.environ.setdefault(_env_name, _env_default)
# Do not leave the loop variables behind as module attributes.
del _env_name, _env_default
# Workaround for a gradio_client 1.7.0 bug: boolean JSON schemas (e.g.
# additionalProperties: true, produced by gr.State) crash the /info endpoint
# with "TypeError: argument of type 'bool' is not iterable".
import gradio_client.utils as _gc_utils

# Keep references to the unpatched helpers so the wrappers can delegate to them.
_gc_orig_json_to_py = _gc_utils._json_schema_to_python_type
_gc_orig_get_type = _gc_utils.get_type


def _gc_safe_json_to_py(schema, defs=None):
    """Return "bool" for a boolean schema, otherwise defer to the original helper."""
    return "bool" if isinstance(schema, bool) else _gc_orig_json_to_py(schema, defs)


def _gc_safe_get_type(schema):
    """Return "Any" for a non-dict schema, otherwise defer to the original helper."""
    return _gc_orig_get_type(schema) if isinstance(schema, dict) else "Any"


# Install the tolerant wrappers in place of the library functions.
_gc_utils._json_schema_to_python_type = _gc_safe_json_to_py
_gc_utils.get_type = _gc_safe_get_type
def ensure_runtime_package(module_name: str, requirement: str) -> None:
    """Install *requirement* with pip unless *module_name* can already be found.

    Args:
        module_name: Import name probed with ``importlib.util.find_spec``.
        requirement: Requirement specifier handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    already_available = importlib.util.find_spec(module_name) is not None
    if not already_available:
        # Run pip through the current interpreter; --no-build-isolation is
        # passed so the build can see the packages of the running environment.
        pip_command = [
            sys.executable, "-m", "pip", "install", "--no-build-isolation", requirement,
        ]
        subprocess.run(pip_command, check=True)
def ensure_mip_gaussian_rasterization() -> None:
    """Build and install diff_gaussian_rasterization from the Mip-Splatting repo.

    TRELLIS needs the Mip-Splatting fork of diff-gaussian-rasterization (it
    exposes the kernel_size / subpixel_offset rasterization settings; the
    upstream graphdeco-inria build does not). The repository is cloned
    recursively (to fetch the third_party glm headers) and built from source so
    it links against the installed torch and the Blackwell (sm_120) arch.

    Does nothing when the module can already be found.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` or ``pip install``
            exits with a non-zero status.
    """
    if importlib.util.find_spec("diff_gaussian_rasterization") is None:
        checkout = "/tmp/mip-splatting"
        # Reuse an existing checkout rather than cloning again.
        if not os.path.isdir(checkout):
            clone_command = [
                "git", "clone", "--recursive",
                "https://github.com/autonomousvision/mip-splatting.git", checkout,
            ]
            subprocess.run(clone_command, check=True)
        extension_dir = os.path.join(checkout, "submodules", "diff-gaussian-rasterization")
        install_command = [
            sys.executable, "-m", "pip", "install", "--no-build-isolation", extension_dir,
        ]
        subprocess.run(install_command, check=True)
# diff_gaussian_rasterization and nvdiffrast are CUDA extensions that must be
# compiled against the installed torch version, so we build them at runtime
# instead of shipping torch-2.4 prebuilt wheels.
# Both helpers return immediately when the module can already be found, so the
# build cost is only paid when the extension is missing. They run at import
# time, ahead of the trellis imports that follow.
ensure_mip_gaussian_rasterization()
ensure_runtime_package("nvdiffrast", "git+https://github.com/NVlabs/nvdiffrast.git")
from typing import *
import imageio
from easydict import EasyDict as edict
from trellis.pipelines import TrellisImageTo3DPipeline
from trellis.representations import Gaussian, MeshExtractResult
from trellis.utils import render_utils, postprocessing_utils
# Authenticate with the Hugging Face Hub only when a token is configured.
# login(token=None) falls back to an interactive prompt, which cannot be
# answered when the app runs headless, so the call is skipped in that case.
hf_token = os.getenv("hf_token")
if hf_token:
    login(token=hf_token)
else:
    print("hf_token is not set; skipping Hugging Face login.")
# Global constants and default values
# Largest int32 value: upper bound of the seed sliders and of randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max
# Maximum of the width/height sliders; validate_dimensions() also caps the
# total pixel count at MAX_IMAGE_SIZE * MAX_IMAGE_SIZE.
MAX_IMAGE_SIZE = 2048
# Default system prompt for text generation
DEFAULT_SYSTEM_PROMPT = """You are a product designer with strong knowledge in text-to-image generation. You will receive a product request in the form of a brief description, and your mission will be to imagine a new product design that meets this need.
The deliverable (generated response) will be exclusively a text prompt for the FLUX.1-dev text-to-image AI.
This prompt should include a visual description of the object explicitly mentioning the essential aspects of its function.
Additionally, you should explicitly mention in this prompt the aesthetic/photo characteristics of the image rendering (e.g., photorealistic, high quality, focal length, grain, etc.), knowing that the image will be the main image of this object in the product catalog. The background of the generated image must be entirely white.
The prompt should be without narration."""
# Default Flux parameters
DEFAULT_SEED = 42
DEFAULT_RANDOMIZE_SEED = True
DEFAULT_WIDTH = 512
DEFAULT_HEIGHT = 512
# NOTE(review): 6 steps with guidance 0.0 look like FLUX.1-schnell-style
# settings, while get_image_gen_pipeline() loads FLUX.1-dev — confirm intended.
DEFAULT_NUM_INFERENCE_STEPS = 6
# Passed straight to the image pipeline by generate_image(); not exposed in the UI.
DEFAULT_GUIDANCE_SCALE = 0.0
# Initial value of the "Temperature" slider. NOTE(review): in the wiring visible
# in this file the slider is not passed to refine_prompt() — confirm.
DEFAULT_TEMPERATURE = 0.9
# Root for per-session output directories (one sub-directory per session hash),
# created next to this file at import time.
TMP_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tmp')
os.makedirs(TMP_DIR, exist_ok=True)
# Module-level caches, filled lazily by the get_*_pipeline() functions.
_text_gen_pipeline = None
_image_gen_pipeline = None
_trellis_pipeline = None
def start_session(req: gr.Request):
    """Create the scratch directory for the requesting session.

    The directory is ``TMP_DIR/<session_hash>``; calling this again for the
    same session is harmless because an existing directory is accepted.

    Args:
        req: Gradio request; only its ``session_hash`` is read.
    """
    os.makedirs(os.path.join(TMP_DIR, str(req.session_hash)), exist_ok=True)
def end_session(req: gr.Request):
    """Delete the scratch directory of the requesting session.

    Cleanup is best-effort with respect to a missing directory: if
    ``TMP_DIR/<session_hash>`` does not exist (never created, or already
    removed), the call is a no-op instead of raising.

    Args:
        req: Gradio request; only its ``session_hash`` is read.
    """
    user_dir = os.path.join(TMP_DIR, str(req.session_hash))
    try:
        shutil.rmtree(user_dir)
    except FileNotFoundError:
        # Nothing to clean up; other errors (e.g. permissions) still surface.
        pass
def preprocess_image(image: Image.Image) -> Image.Image:
    """Run the Trellis pipeline's image preprocessing on *image*.

    Args:
        image: Input image.

    Returns:
        The result of the pipeline's ``preprocess_image``; the input image
        unchanged when the Trellis pipeline could not be loaded.
    """
    trellis_pipe = get_trellis_pipeline()
    # Without a pipeline there is nothing to preprocess with: pass the image through.
    return image if trellis_pipe is None else trellis_pipe.preprocess_image(image)
@spaces.GPU()
def get_image_gen_pipeline():
    """Return the cached FLUX.1-dev pipeline, loading it on first use.

    The pipeline is loaded in bfloat16 and moved to CUDA when available,
    otherwise to the CPU.

    Returns:
        The pipeline, or None when loading failed (the error is printed).
    """
    global _image_gen_pipeline
    if _image_gen_pipeline is not None:
        return _image_gen_pipeline
    try:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"
        loaded = DiffusionPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-dev",
            torch_dtype=torch.bfloat16,
        )
        # Only cache once the move to the target device has succeeded.
        _image_gen_pipeline = loaded.to(target_device)
    except Exception as e:
        print(f"Error loading image generation model: {e}")
        return None
    return _image_gen_pipeline
@spaces.GPU()
def get_text_gen_pipeline():
    """Return the cached DeepSeek-R1 text-generation pipeline, loading it on first use.

    Returns:
        The transformers pipeline, or None when loading failed (the error is
        printed).
    """
    global _text_gen_pipeline
    if _text_gen_pipeline is not None:
        return _text_gen_pipeline
    try:
        target_device = "cuda" if torch.cuda.is_available() else "cpu"
        model_id = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
        tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        # Fall back to the EOS token when the tokenizer defines no pad token.
        if not tok.pad_token:
            tok.pad_token = tok.eos_token
        _text_gen_pipeline = pipeline(
            "text-generation",
            model=model_id,
            tokenizer=tok,
            max_new_tokens=2048,
            device=target_device,
            pad_token_id=tok.pad_token_id,
        )
    except Exception as e:
        print(f"Error loading text generation model: {e}")
        return None
    return _text_gen_pipeline
# @spaces.GPU()
def get_trellis_pipeline():
    """Return the cached Trellis image-to-3D pipeline, loading it on first use.

    Returns:
        The pipeline, or None when loading failed (the error is printed).
    """
    global _trellis_pipeline
    if _trellis_pipeline is not None:
        return _trellis_pipeline
    try:
        print("Loading Trellis pipeline...")
        _trellis_pipeline = TrellisImageTo3DPipeline.from_pretrained(
            "microsoft/TRELLIS-image-large"
        )
    except Exception as e:
        print(f"Error loading Trellis pipeline: {e}")
        return None
    return _trellis_pipeline
def split_reasoning(text: str) -> Tuple[str, str]:
    """Split a DeepSeek-R1 response into (reasoning, clean_prompt).

    DeepSeek-R1 emits its chain-of-thought inside <think>...</think> tags
    before the final answer. We keep only the text after </think> for FLUX,
    and surface the reasoning separately. If the closing tag is missing
    (the model occasionally omits it), we treat the whole text as the prompt
    and leave the reasoning empty.

    Args:
        text: Raw assistant message. ``None`` is treated as an empty string.

    Returns:
        ``(reasoning, prompt)``: the reasoning without its tags, and the prompt
        with a leading bold "prompt" header and wrapping quotes removed.
    """
    open_tag = "<think>"
    close_tag = "</think>"
    reasoning = ""
    prompt = (text or "").strip()
    if close_tag in prompt:
        reasoning, _, prompt = prompt.partition(close_tag)
        # Drop a leading opener if present.
        reasoning = reasoning.replace(open_tag, "").strip()
        prompt = prompt.strip()
    # Strip any stray opener (e.g. the model omitted the closing tag).
    prompt = prompt.replace(open_tag, "").strip()
    # Strip a leading bold header, whether on its own line ("**Prompt for
    # FLUX:**\n...") or inline ("**Prompt:** actual text"). Only strip when the
    # bold segment looks like a header (mentions "prompt") to avoid removing a
    # legitimate bold opening word.
    stripped = prompt.lstrip()
    if stripped.startswith("**"):
        end = stripped.find("**", 2)
        if end != -1 and "prompt" in stripped[2:end].lower():
            rest = stripped[end + 2:].lstrip()
            # Handles the "**Prompt**: text" spelling, colon outside the bold.
            if rest.startswith(":"):
                rest = rest[1:]
            prompt = rest.strip()
    # Remove wrapping quotation marks.
    if len(prompt) >= 2 and prompt[0] in "\"'" and prompt[-1] == prompt[0]:
        prompt = prompt[1:-1].strip()
    return reasoning, prompt
@spaces.GPU()
def refine_prompt(
    prompt,
    system_prompt=DEFAULT_SYSTEM_PROMPT,
    progress=gr.Progress(track_tqdm=True)
):
    """Expand a short product idea into a detailed image prompt.

    Args:
        prompt: The user's short description.
        system_prompt: System instructions sent ahead of the user message.
        progress: Gradio progress tracker.

    Returns:
        ``(reasoning, refined_prompt, status_message)``. On any failure the
        refined prompt is an empty string and the status message describes
        the problem.
    """
    generator = get_text_gen_pipeline()
    if generator is None:
        return "", "", "Text generation model is unavailable."
    try:
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ]
        progress(0, desc="Generating text")
        response = generator(chat)
        progress(1)
        try:
            conversation = response[0]['generated_text']
            # Walk the whole conversation and remember the last assistant turn.
            last_reply = None
            for turn in conversation:
                if turn['role'] == 'assistant':
                    last_reply = turn
            if last_reply is None:
                return "", "", "Error: No assistant response found"
            # Separate the model's reasoning from the final prompt.
            reasoning, clean_prompt = split_reasoning(last_reply['content'])
            if not clean_prompt:
                return reasoning, "", "Error: Model returned an empty prompt"
            return reasoning, clean_prompt, "Prompt refined successfully!"
        except (KeyError, IndexError):
            return "", "", "Error: Unexpected response format from the model"
    except Exception as e:
        print(f"Error in refine_prompt: {str(e)}")
        return "", "", f"Error refining prompt: {str(e)}"
def validate_dimensions(width, height):
    """Check that the requested image area fits the allowed pixel budget.

    Returns:
        ``(True, None)`` when ``width * height`` does not exceed
        ``MAX_IMAGE_SIZE * MAX_IMAGE_SIZE``; otherwise ``(False, message)``.
    """
    pixel_budget = MAX_IMAGE_SIZE * MAX_IMAGE_SIZE
    too_large = width * height > pixel_budget
    return (False, "Image dimensions too large") if too_large else (True, None)
@spaces.GPU()
def generate_image(prompt, seed=DEFAULT_SEED,
                   randomize_seed=DEFAULT_RANDOMIZE_SEED,
                   width=DEFAULT_WIDTH,
                   height=DEFAULT_HEIGHT,
                   num_inference_steps=DEFAULT_NUM_INFERENCE_STEPS,
                   progress=gr.Progress(track_tqdm=True)):
    """Render *prompt* to an image with the image generation pipeline.

    Args:
        prompt: Text prompt; must contain non-whitespace characters.
        seed: Seed used when *randomize_seed* is false.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` when true.
        width: Output width in pixels.
        height: Output height in pixels.
        num_inference_steps: Number of steps passed to the pipeline.
        progress: Gradio progress tracker.

    Returns:
        ``(image, status_message)``; ``image`` is None whenever generation did
        not succeed.
    """
    try:
        if not prompt or not prompt.strip():
            return None, "Please provide a valid prompt."

        progress(0.1, desc="Loading model")
        flux = get_image_gen_pipeline()
        if flux is None:
            return None, "Image generation model is unavailable."

        dimensions_ok, dimension_error = validate_dimensions(width, height)
        if not dimensions_ok:
            return None, dimension_error

        if randomize_seed:
            seed = random.randint(0, MAX_SEED)
        # Default torch generator rather than a cuda-specific one.
        rng = torch.Generator().manual_seed(seed)

        progress(0.3, desc="Running inference")
        result = flux(
            prompt=prompt,
            width=width,
            height=height,
            num_inference_steps=num_inference_steps,
            generator=rng,
            guidance_scale=DEFAULT_GUIDANCE_SCALE,
        )

        progress(0.8, desc="Processing output")
        first_image = result.images[0]
        progress(1.0, desc="Complete")
        return first_image, f"Image generated successfully with seed {seed}"
    except Exception as e:
        print(f"Error in generate_image: {str(e)}")
        return None, f"Error generating image: {str(e)}"
# Example prompts offered through gr.Examples in the live-demo tab.
examples = [
"a backpack for kids, flower style",
"medieval flip flops",
"cat shaped cake mold",
]
# Custom CSS handed to gr.Blocks: centres the main column (#col-container) and
# styles the bordered cards (.step-card) used for the steps and gallery items.
css="""
#col-container {
margin: 0 auto;
max-width: 720px;
}
.step-card {
border: 1px solid var(--border-color-primary);
border-radius: 12px;
padding: 12px;
}
"""
# Real gallery results produced by running the pipeline on the Space, captured
# by scripts/generate_gallery.py into assets/gallery/ + manifest.json. Each
# manifest entry is {prompt, refined_prompt, image, video, glb} with repo-relative
# asset paths.
# APP_DIR is the directory containing this file; load_gallery() resolves the
# manifest's asset paths against it.
APP_DIR = os.path.dirname(os.path.abspath(__file__))
GALLERY_DIR = os.path.join(APP_DIR, 'assets', 'gallery')
GALLERY_MANIFEST = os.path.join(GALLERY_DIR, 'manifest.json')
def load_gallery():
    """Load the gallery entries listed in GALLERY_MANIFEST.

    The function is best-effort: a missing, unreadable, or malformed manifest
    yields an empty list instead of raising, so the UI can still be built.

    Returns:
        A list of dicts, one per valid manifest entry. For the ``image``,
        ``video`` and ``glb`` keys, a non-empty value is replaced by its
        absolute path under APP_DIR when that file exists, and by None
        otherwise (including when the value is not a string). Entries that
        are not JSON objects are skipped.
    """
    import json

    if not os.path.exists(GALLERY_MANIFEST):
        return []
    try:
        with open(GALLERY_MANIFEST, encoding='utf-8') as f:
            items = json.load(f)
    except Exception as e:
        print(f"Error loading gallery manifest: {e}")
        return []
    # Valid JSON is not necessarily a valid manifest: require a list of objects.
    if not isinstance(items, list):
        print("Error loading gallery manifest: expected a JSON list of entries")
        return []
    resolved = []
    for it in items:
        if not isinstance(it, dict):
            print(f"Skipping malformed gallery entry: {it!r}")
            continue
        entry = dict(it)
        for key in ('image', 'video', 'glb'):
            rel_path = it.get(key)
            if not rel_path:
                continue
            if not isinstance(rel_path, str):
                # A non-string path cannot be resolved; treat it as missing.
                entry[key] = None
                continue
            abs_path = os.path.join(APP_DIR, rel_path)
            entry[key] = abs_path if os.path.exists(abs_path) else None
        resolved.append(entry)
    return resolved
def pack_state(gs: Gaussian, mesh: MeshExtractResult) -> dict:
    """Convert a Gaussian and a mesh into a dict of plain values and NumPy arrays.

    Args:
        gs: Gaussian whose ``init_params`` and tensor attributes are captured.
        mesh: Mesh whose ``vertices`` and ``faces`` are captured.

    Returns:
        ``{'gaussian': {...}, 'mesh': {...}}`` where every tensor has been
        moved to the CPU and converted with ``.numpy()``.
    """
    gaussian_state = {**gs.init_params}
    # Tensor attributes are stored after the init params, in this fixed order.
    for attr in ('_xyz', '_features_dc', '_scaling', '_rotation', '_opacity'):
        gaussian_state[attr] = getattr(gs, attr).cpu().numpy()
    mesh_state = {
        'vertices': mesh.vertices.cpu().numpy(),
        'faces': mesh.faces.cpu().numpy(),
    }
    return {'gaussian': gaussian_state, 'mesh': mesh_state}
def unpack_state(state: dict) -> Tuple[Gaussian, edict]:
    """Rebuild a Gaussian and a mesh on the GPU from a pack_state() dict.

    Args:
        state: Dict with a ``'gaussian'`` entry (constructor parameters plus
            the ``_xyz``, ``_features_dc``, ``_scaling``, ``_rotation`` and
            ``_opacity`` arrays) and a ``'mesh'`` entry (``vertices`` and
            ``faces`` arrays).

    Returns:
        ``(gs, mesh)``: the reconstructed Gaussian and an EasyDict holding the
        ``vertices`` and ``faces`` tensors, all placed on the ``cuda`` device.
    """
    gaussian_state = state['gaussian']
    gs = Gaussian(
        aabb=gaussian_state['aabb'],
        sh_degree=gaussian_state['sh_degree'],
        mininum_kernel_size=gaussian_state['mininum_kernel_size'],
        scaling_bias=gaussian_state['scaling_bias'],
        opacity_bias=gaussian_state['opacity_bias'],
        scaling_activation=gaussian_state['scaling_activation'],
    )
    # Restore the tensor attributes in the same order pack_state() stored them.
    gs._xyz = torch.tensor(gaussian_state['_xyz'], device='cuda')
    gs._features_dc = torch.tensor(gaussian_state['_features_dc'], device='cuda')
    gs._scaling = torch.tensor(gaussian_state['_scaling'], device='cuda')
    gs._rotation = torch.tensor(gaussian_state['_rotation'], device='cuda')
    gs._opacity = torch.tensor(gaussian_state['_opacity'], device='cuda')
    mesh = edict(
        vertices=torch.tensor(state['mesh']['vertices'], device='cuda'),
        faces=torch.tensor(state['mesh']['faces'], device='cuda'),
    )
    return gs, mesh
@spaces.GPU
def image_to_3d(
    image: Image.Image,
    seed: int,
    ss_guidance_strength: float,
    ss_sampling_steps: int,
    slat_guidance_strength: float,
    slat_sampling_steps: int,
    req: gr.Request,
) -> Tuple[dict, str, str, str, str]:
    """Turn an image into a 3D asset with the Trellis pipeline.

    Writes ``sample.mp4`` (colour render next to the normal render) and
    ``sample.glb`` into the session directory ``TMP_DIR/<session_hash>``.

    Args:
        image: Source image.
        seed: Seed passed to the pipeline.
        ss_guidance_strength: ``cfg_strength`` of the sparse-structure sampler.
        ss_sampling_steps: ``steps`` of the sparse-structure sampler.
        slat_guidance_strength: ``cfg_strength`` of the structured-latent sampler.
        slat_sampling_steps: ``steps`` of the structured-latent sampler.
        req: Gradio request; only its ``session_hash`` is read.

    Returns:
        ``(state, video_path, glb_path, status_message, glb_path)``. On failure
        every element except the status message is None.
    """
    try:
        trellis = get_trellis_pipeline()
        if trellis is None:
            return None, None, None, "Trellis pipeline is unavailable.", None
        trellis.cuda()

        # Preprocess here and tell the pipeline not to do it a second time.
        image = preprocess_image(image)
        outputs = trellis.run(
            image,
            seed=seed,
            formats=["gaussian", "mesh"],
            preprocess_image=False,
            sparse_structure_sampler_params={
                "steps": ss_sampling_steps,
                "cfg_strength": ss_guidance_strength,
            },
            slat_sampler_params={
                "steps": slat_sampling_steps,
                "cfg_strength": slat_guidance_strength,
            },
        )

        session_dir = os.path.join(TMP_DIR, str(req.session_hash))
        # The browser `load` event normally creates this via start_session, but
        # API callers (e.g. gradio_client) never trigger it, so ensure it exists.
        os.makedirs(session_dir, exist_ok=True)

        gaussian = outputs['gaussian'][0]
        color_frames = render_utils.render_video(gaussian, num_frames=120)['color']
        mesh = outputs['mesh'][0]
        normal_frames = render_utils.render_video(mesh, num_frames=120)['normal']
        # Put the colour frame and the normal frame of each index next to each other.
        combined_frames = [
            np.concatenate([color, normal_frames[idx]], axis=1)
            for idx, color in enumerate(color_frames)
        ]
        video_path = os.path.join(session_dir, 'sample.mp4')
        imageio.mimsave(video_path, combined_frames, fps=15)

        state = pack_state(gaussian, mesh)

        # Export an interactive, textured GLB for the 3D viewer and download.
        glb = postprocessing_utils.to_glb(
            gaussian,
            mesh,
            simplify=0.95,
            texture_size=1024,
            verbose=False,
        )
        glb_path = os.path.join(session_dir, 'sample.glb')
        glb.export(glb_path)

        torch.cuda.empty_cache()
        return state, video_path, glb_path, "3D model generated successfully!", glb_path
    except Exception as e:
        print(f"Error in image_to_3d: {str(e)}")
        import traceback
        traceback.print_exc()  # Full stack trace for debugging.
        return None, None, None, f"Error generating 3D model: {str(e)}", None
def process_example_pipeline(example_prompt):
    """Return the selected example prompt unchanged (handler for gr.Examples)."""
    selected_prompt = example_prompt
    return selected_prompt
# User-facing Markdown (in French) rendered with gr.Markdown in the
# "Comment ça marche" tab built by create_interface(). It is runtime text:
# edit it as content, not as a comment.
HOW_IT_WORKS_MD = """
## Comment ça marche
Ce Space transforme une **idée en une phrase** en un **objet 3D téléchargeable**,
en enchaînant trois modèles spécialisés.
```
Prompt simple ──▶ DeepSeek-R1 ──▶ FLUX.1-dev ──▶ TRELLIS ──▶ Vidéo + GLB
(texte) (raisonnement (image (objet 3D)
+ prompt riche) produit)
```
**Étape 1 — DeepSeek-R1-Distill-Llama-8B (texte → texte).**
Le modèle joue le rôle d'un designer produit : à partir d'une description courte,
il *réfléchit* (chaîne de raisonnement visible dans l'accordéon « Raisonnement du
modèle ») puis rédige un prompt visuel détaillé et photoréaliste pour FLUX. Seul
le prompt final — sans le raisonnement — est transmis à l'étape suivante.
**Étape 2 — FLUX.1-dev (texte → image).**
Le prompt détaillé est rendu en une image produit sur fond blanc, cadrée comme une
photo de catalogue.
**Étape 3 — TRELLIS (image → 3D).**
L'image est convertie en un asset 3D : une vidéo de prévisualisation (rendu couleur
+ normales) et un fichier **GLB** texturé, manipulable directement dans la
visionneuse interactive et téléchargeable.
> **Pourquoi passer par une image avant la 3D ?** TRELLIS est conditionné sur une
> image. Générer d'abord une image nette et bien cadrée donne un maillage et une
> texture bien plus propres qu'une génération 3D directe depuis du texte.
### Stack technique
- **DeepSeek-R1-Distill-Llama-8B** — raisonnement + ingénierie de prompt
- **FLUX.1-dev** — diffusion texte→image
- **TRELLIS (microsoft/TRELLIS-image-large)** — génération 3D (Gaussian + mesh)
- **ZeroGPU** sur GPU Blackwell (sm_120), attention `sdpa`, extensions CUDA
compilées au runtime
"""
def create_interface():
    """Build the Gradio Blocks UI and wire its event handlers.

    The app has three tabs: "Démo en direct" (three step cards, advanced
    settings and example prompts), "Galerie" (entries from load_gallery()) and
    "Comment ça marche" (HOW_IT_WORKS_MD). Each step can be replayed with its
    own button, and "Générer tout" chains refine -> image -> 3D.

    Returns:
        The constructed gr.Blocks app; the caller is responsible for launching it.
    """

    def _resolve_trellis_seed(randomize_seed, seed):
        # Honour the TRELLIS "Randomize Seed" checkbox: draw a fresh seed when it
        # is ticked, otherwise keep the slider value. The result is written back
        # to the slider so the seed actually used stays visible.
        return random.randint(0, MAX_SEED) if randomize_seed else seed

    theme = gr.themes.Soft(primary_hue="pink", secondary_hue="rose")
    with gr.Blocks(css=css, theme=theme, title="Text to 3D") as demo:
        # Session handlers must be registered inside the Blocks context.
        demo.load(fn=start_session)
        demo.unload(fn=end_session)
        # State for storing 3D model data
        output_state = gr.State(None)
        with gr.Column(elem_id="col-container"):
            gr.Markdown(
                "# Text to 3D\n"
                "De quelques mots à un objet 3D téléchargeable — "
                "**DeepSeek-R1 + FLUX.1-dev + TRELLIS**."
            )
            gr.Markdown(
                "> ⏳ **Démarrage à froid (ZeroGPU)** : au premier lancement, le "
                "chargement de DeepSeek-R1 (8B), FLUX et TRELLIS peut prendre "
                "plusieurs minutes. Les générations suivantes sont bien plus rapides. "
                "Pas envie d'attendre ? Voyez l'onglet **Galerie**."
            )
            with gr.Tabs():
                # ------------------------------------------------------ Demo
                with gr.Tab("Démo en direct"):
                    prompt = gr.Text(
                        label="Votre idée",
                        max_lines=1,
                        placeholder="Ex. : a backpack for kids, flower style",
                    )
                    run_all_button = gr.Button("✨ Générer tout", variant="primary")
                    # Step 1 — DeepSeek-R1
                    with gr.Group(elem_classes="step-card"):
                        gr.Markdown("### Étape 1 — DeepSeek-R1 · prompt design")
                        status1 = gr.Markdown("Étape 1 — en attente")
                        with gr.Accordion("Raisonnement du modèle", open=False):
                            reasoning_box = gr.Textbox(
                                show_label=False,
                                max_lines=20,
                                placeholder="La chaîne de raisonnement de DeepSeek-R1 apparaîtra ici",
                                interactive=False,
                            )
                        refined_prompt = gr.Text(
                            label="Prompt détaillé (envoyé à FLUX)",
                            max_lines=10,
                            placeholder="Detailed object prompt",
                            max_length=2048,
                        )
                        prompt_button = gr.Button("Rejouer l'étape 1 — Affiner le prompt")
                    # Step 2 — FLUX
                    with gr.Group(elem_classes="step-card"):
                        gr.Markdown("### Étape 2 — FLUX.1-dev · image produit")
                        status2 = gr.Markdown("Étape 2 — en attente")
                        generated_image = gr.Image(
                            label="Image générée",
                            format="png",
                            image_mode="RGBA",
                            type="pil",
                            height=300,
                        )
                        visual_button = gr.Button("Rejouer l'étape 2 — Générer l'image")
                    # Step 3 — TRELLIS
                    with gr.Group(elem_classes="step-card"):
                        gr.Markdown("### Étape 3 — TRELLIS · objet 3D")
                        status3 = gr.Markdown("Étape 3 — en attente")
                        with gr.Row():
                            video_output = gr.Video(
                                label="Prévisualisation (couleur + normales)",
                                autoplay=True, loop=True, height=300,
                            )
                            model_3d = gr.Model3D(label="Modèle 3D interactif", height=300)
                        download_glb = gr.DownloadButton("⬇️ Télécharger le .glb", value=None)
                        gen3d_button = gr.Button("Rejouer l'étape 3 — Générer la 3D")
                    message_box = gr.Textbox(
                        label="Messages de statut",
                        interactive=False,
                        placeholder="Les messages détaillés apparaîtront ici",
                    )
                    # Accordion sections for advanced settings
                    with gr.Accordion("Réglages avancés", open=False):
                        with gr.Tab("DeepSeek-R1"):
                            # NOTE(review): this slider is not passed to any handler
                            # below, so it currently has no effect on generation.
                            temperature = gr.Slider(
                                label="Temperature",
                                value=DEFAULT_TEMPERATURE,
                                minimum=0.0,
                                maximum=1.0,
                                step=0.05,
                                info="Higher values produce more diverse outputs",
                            )
                            system_prompt = gr.Textbox(
                                label="System Prompt",
                                value=DEFAULT_SYSTEM_PROMPT,
                                lines=10,
                                info="Instructions for the DeepSeek-R1 model"
                            )
                        with gr.Tab("Flux"):
                            flux_seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=DEFAULT_SEED)
                            flux_randomize_seed = gr.Checkbox(label="Randomize seed", value=DEFAULT_RANDOMIZE_SEED)
                            with gr.Row():
                                width = gr.Slider(label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=DEFAULT_WIDTH)
                                height = gr.Slider(label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=DEFAULT_HEIGHT)
                            num_inference_steps = gr.Slider(
                                label="Number of inference steps",
                                minimum=1,
                                maximum=50,
                                step=1,
                                value=DEFAULT_NUM_INFERENCE_STEPS,
                            )
                        with gr.Tab("3D Generation Settings"):
                            trellis_seed = gr.Slider(0, MAX_SEED, label="Seed", value=0, step=1)
                            trellis_randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                            gr.Markdown("Stage 1: Sparse Structure Generation")
                            with gr.Row():
                                ss_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=7.5, step=0.1)
                                ss_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
                            gr.Markdown("Stage 2: Structured Latent Generation")
                            with gr.Row():
                                slat_guidance_strength = gr.Slider(0.0, 10.0, label="Guidance Strength", value=3.0, step=0.1)
                                slat_sampling_steps = gr.Slider(1, 50, label="Sampling Steps", value=12, step=1)
                    gr.Examples(
                        examples=examples,
                        fn=process_example_pipeline,
                        inputs=[prompt],
                        outputs=[prompt],
                        cache_examples=True,
                    )
                # --------------------------------------------------- Gallery
                with gr.Tab("Galerie"):
                    gr.Markdown(
                        "De **vrais résultats** produits par le pipeline complet "
                        "(prompt → image → 3D). Affichage instantané, même quand le "
                        "GPU est endormi."
                    )
                    gallery_items = load_gallery()
                    if not gallery_items:
                        gr.Markdown(
                            "_Galerie en cours de génération — lancez "
                            "`python scripts/generate_gallery.py` puis committez "
                            "`assets/gallery/`._"
                        )
                    for item in gallery_items:
                        with gr.Group(elem_classes="step-card"):
                            gr.Markdown(f"**Prompt :** {item.get('prompt', '')}")
                            with gr.Row():
                                # Only render the assets that exist for this entry.
                                if item.get("image"):
                                    gr.Image(
                                        value=item["image"], label="Image (FLUX)",
                                        height=260, interactive=False,
                                    )
                                if item.get("video"):
                                    gr.Video(
                                        value=item["video"], label="Aperçu (TRELLIS)",
                                        autoplay=True, loop=True, height=260,
                                    )
                                if item.get("glb"):
                                    gr.Model3D(
                                        value=item["glb"],
                                        label="Modèle 3D interactif", height=260,
                                    )
                            if item.get("refined_prompt"):
                                with gr.Accordion("Prompt détaillé (DeepSeek-R1)", open=False):
                                    gr.Markdown(item["refined_prompt"])
                # ---------------------------------------------- How it works
                with gr.Tab("Comment ça marche"):
                    gr.Markdown(HOW_IT_WORKS_MD)

        # ------------------------------------------------------------ Wiring
        # Component lists shared by the per-step buttons and the one-click chain.
        refine_inputs = [prompt, system_prompt]
        refine_outputs = [reasoning_box, refined_prompt, status1]
        image_inputs = [refined_prompt, flux_seed, flux_randomize_seed, width, height, num_inference_steps]
        image_outputs = [generated_image, status2]
        seed_inputs = [trellis_randomize_seed, trellis_seed]
        trellis_inputs = [generated_image, trellis_seed, ss_guidance_strength, ss_sampling_steps, slat_guidance_strength, slat_sampling_steps]
        trellis_outputs = [output_state, video_output, model_3d, status3, download_glb]

        # Individual step buttons (replay one stage in isolation).
        gr.on(
            triggers=[prompt_button.click, prompt.submit],
            fn=refine_prompt,
            inputs=refine_inputs,
            outputs=refine_outputs,
        )
        gr.on(
            triggers=[visual_button.click],
            fn=generate_image,
            inputs=image_inputs,
            outputs=image_outputs,
        )
        # Resolve the seed first so the "Randomize Seed" checkbox takes effect.
        gen3d_button.click(
            fn=_resolve_trellis_seed,
            inputs=seed_inputs,
            outputs=[trellis_seed],
        ).then(
            fn=image_to_3d,
            inputs=trellis_inputs,
            outputs=trellis_outputs,
        )
        # One-click pipeline: refine → image → 3D, with a per-step status.
        run_all_button.click(
            fn=lambda: ("⏳ Étape 1 — en cours…", "Étape 2 — en attente", "Étape 3 — en attente"),
            inputs=None,
            outputs=[status1, status2, status3],
        ).then(
            fn=refine_prompt,
            inputs=refine_inputs,
            outputs=refine_outputs,
        ).then(
            fn=lambda: "⏳ Étape 2 — en cours…",
            inputs=None,
            outputs=[status2],
        ).then(
            fn=generate_image,
            inputs=image_inputs,
            outputs=image_outputs,
        ).then(
            fn=lambda: "⏳ Étape 3 — en cours…",
            inputs=None,
            outputs=[status3],
        ).then(
            fn=_resolve_trellis_seed,
            inputs=seed_inputs,
            outputs=[trellis_seed],
        ).then(
            fn=image_to_3d,
            inputs=trellis_inputs,
            outputs=trellis_outputs,
        )
    return demo
if __name__ == "__main__":
    # Build the UI and start the Gradio server when this file is run as a script.
    demo = create_interface()
    demo.launch()