Bernini-R-Lightning

Runtime error

App Files Files Community

signsur4739379373 committed 7 days ago

Commit

7c6328c

1 Parent(s): 45056bf

duration max 25, steps_per_chunk param

Browse files

Files changed (1) hide show

app.py +212 -167

app.py CHANGED Viewed

@@ -378,72 +378,92 @@ DEFAULT_NEGATIVE = (
 R2V_TEMPLATE = """You are an expert at writing subject-driven video generation prompts. I'm providing you with:
-1. {image_num} reference image(s) of the subject(s) that will appear in the video (referred to as image0, image1, image2, ... in order).
 2. An original video description text.
-Your task is to rewrite the original description into a new format with TWO parts concatenated together:
-**Part 1 - Short instruction**: A concise sentence describing who the subject(s) from the reference image(s) are, what they look like briefly, where they are, and what key action/motion they perform. Reference the subject(s) using "image0", "image1", etc. to link them to the provided reference images.
-**Part 2 - Long instruction**: A detailed "Generate a video where..." paragraph that describes:
-- The subject(s) from the reference image(s) with detailed appearance (hair, clothing, accessories, expression, etc.), referencing them as "the person/man/woman from image0" etc.
-- The scene/environment in detail (background, lighting, objects, atmosphere).
-- The motion and actions in a step-by-step temporal sequence (at the start..., then..., after that...).
-- The motion should remain natural and realistic.
 Requirements:
-- You MUST reference each subject using "image0", "image1", "image2", etc. to correspond to the provided reference images in order.
-- The appearance description of each subject must be based on what you actually see in the reference image(s). Do NOT hallucinate details not visible in the images.
-- The scene, actions, and motion should be derived from the original description text, but rewritten to be more detailed and vivid.
-- The output must be entirely in English.
-- Return ONLY a JSON object with one key: "rewritten_text". The value should be the full rewritten text (short instruction + long instruction concatenated as one string). No extra text.
 Original description:
 {original_text}
 """
-def _enhance_prompt_r2v(prompt: str, image_paths: list[str]) -> str | None:
-    """Call grok-4.3 via xAI API to enhance prompt for r2v. Returns None on failure."""
-    api_key = os.environ.get("XAI_API_KEY")
-    if not api_key:
-        print("[enhancer] XAI_API_KEY not set, skipping enhancement", flush=True)
-        return None
     try:
-        import base64
-        import json as _json
-        from openai import OpenAI
-        client = OpenAI(api_key=api_key, base_url="https://api.x.ai/v1")
-        image_num = len(image_paths)
-        user_text = R2V_TEMPLATE.format(image_num=max(image_num, 1), original_text=prompt)
-        content: list = [{"type": "text", "text": user_text}]
-        for i, path in enumerate(image_paths):
             if not path or not os.path.exists(path):
                 continue
             with open(path, "rb") as f:
                 b64 = base64.b64encode(f.read()).decode("utf-8")
-            content.append({"type": "text", "text": f"\n[Image {i}]:"})
-            content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}})
-        messages = [
-            {"role": "system", "content": "You are a helpful assistant."},
-            {"role": "user", "content": content},
-        ]
-        resp = client.chat.completions.create(
-            model="grok-4.3",
-            messages=messages,
-            max_completion_tokens=8192,
-            response_format={"type": "json_object"},
         )
-        text = resp.choices[0].message.content or ""
         enhanced = _json.loads(text).get("rewritten_text", "").strip()
         if enhanced:
-            print(f"[enhancer] enhanced prompt ({len(enhanced)} chars)", flush=True)
             return enhanced
         return None
     except Exception as e:
         print(f"[enhancer] failed: {e}", flush=True)
         return None
 def _load_workflow() -> dict[str, Any]:
     wf_path = ROOT / WORKFLOW_FILE
     return json.loads(wf_path.read_text(encoding="utf-8"))
@@ -856,41 +876,20 @@ def _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref
     return max(30, int(total))
 def _get_duration(
     prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
-    gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode, *args, **kwargs,
 ):
     if gpu_budget and int(gpu_budget) > 0:
         return int(gpu_budget)
     return _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref_aspect, rife_mode)
-def _chunk_get_duration(
-    prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
-    gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode, *args, **kwargs,
-):
-    """Each chunk is self-contained — request 120s for the GPU."""
-    if gpu_budget and int(gpu_budget) > 0:
-        return int(gpu_budget)
-    return 120
-def _coerce_gallery(g) -> list:
-    paths = []
-    for item in (g or []):
-        if isinstance(item, str):
-            paths.append(item)
-        elif isinstance(item, dict):
-            p = item.get("path") or item.get("name")
-            if p: paths.append(p)
-        elif isinstance(item, (list, tuple)) and item:
-            p = item[0]
-            if isinstance(p, str): paths.append(p)
-            elif isinstance(p, dict): paths.append(p.get("path",""))
-    return [p for p in paths if p and os.path.exists(p)][:5]
-@spaces.GPU(duration=_chunk_get_duration)
 def generate_chunk_handler(
     prompt: str,
     negative: str,
@@ -901,26 +900,26 @@ def generate_chunk_handler(
     gpu_budget: int = 120,
     num_steps: int = 6,
     duration_secs: float = 5.0,
-    sampler_name: str = "uni_pc",
     base_fps: int = 15,
-    rife_mode: str = "no rife",
     loras_enabled: bool = False,
     s_wamu_h: float = 1.0,
-    s_dreamly_h: float = 0.3,
     s_wamu_l: float = 0.5,
-    s_dreamly_l: float = 0.5,
-    enhance_prompt: bool = False,
-    ipnc_enabled: bool = False,
     ipnc_wamu_h: float = 100.0,
     ipnc_dreamly_h: float = 100.0,
     ipnc_wamu_l: float = 100.0,
     ipnc_dreamly_l: float = 100.0,
-    s_svicamera_h: float = 0.0,
-    s_svicamera_l: float = 0.0,
     ipnc_svicamera_h: float = 100.0,
     ipnc_svicamera_l: float = 100.0,
     # chunking params
     session_id: str = "",
     chunk_step: str = "0",
     progress=gr.Progress(track_tqdm=True),
 ):
@@ -952,7 +951,7 @@ def generate_chunk_handler(
         gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
         n_frames       = max(1, round(float(duration_secs) * int(base_fps)))
         steps          = int(num_steps)
-        STEPS_PER_CHUNK = 2
         chunk_idx       = int(float(chunk_step or 0))
         step_start      = chunk_idx * STEPS_PER_CHUNK
         step_end        = min(step_start + STEPS_PER_CHUNK, steps)
@@ -992,7 +991,7 @@ def generate_chunk_handler(
         sampler_id = "57" if use_high else "58"
         # Inject chunk nodes: save/load latent, save/load conditioning
-        # Batch multiple steps per chunk (STEPS_PER_CHUNK=2 for short videos)
         api_wf[sampler_id]["inputs"]["steps"]         = steps
         api_wf[sampler_id]["inputs"]["start_at_step"]  = step_start
         api_wf[sampler_id]["inputs"]["end_at_step"]    = step_end
@@ -1074,25 +1073,24 @@ def generate_handler(
     seed: int,
     aspect_ratio: str = "16:9",
     use_ref_aspect: bool = False,
-    gpu_budget: int = 0,  # 0 = auto-estimate
     num_steps: int = 6,
     duration_secs: float = 10.0,
-    sampler_name: str = "uni_pc",
     base_fps: int = 15,
-    rife_mode: str = "no rife",
     loras_enabled: bool = False,
     s_wamu_h: float = 1.0,
-    s_dreamly_h: float = 0.3,
     s_wamu_l: float = 0.5,
-    s_dreamly_l: float = 0.5,
-    enhance_prompt: bool = False,
-    ipnc_enabled: bool = False,
     ipnc_wamu_h: float = 100.0,
     ipnc_dreamly_h: float = 100.0,
     ipnc_wamu_l: float = 100.0,
     ipnc_dreamly_l: float = 100.0,
-    s_svicamera_h: float = 0.0,
-    s_svicamera_l: float = 0.0,
     ipnc_svicamera_h: float = 100.0,
     ipnc_svicamera_l: float = 100.0,
     progress=gr.Progress(track_tqdm=True),
@@ -1105,6 +1103,20 @@ def generate_handler(
         final_seed = int(seed) if seed else random.randint(0, MAX_SEED)
         negative = negative or DEFAULT_NEGATIVE
         image_paths = _coerce_gallery(image_input)
         if not image_paths:
             return None, "upload at least one reference image"
@@ -1113,17 +1125,12 @@ def generate_handler(
         dest_names = []
         for p in image_paths:
             dn = f"ref_{uuid.uuid4().hex[:8]}_{os.path.basename(p)}"
-            shutil.copy(p, INPUT / dn)
             dest_names.append(dn)
         gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
-        enhanced_prompt_text = None
-        if enhance_prompt:
-            progress(0.05, desc="enhancing prompt...")
-            enhanced_prompt_text = _enhance_prompt_r2v(prompt, image_paths)
-            if enhanced_prompt_text:
-                prompt = enhanced_prompt_text
         progress(0.1, desc="building workflow...")
         visual_wf = _load_workflow()
@@ -1168,12 +1175,10 @@ def generate_handler(
     ts = time.strftime("%Y%m%d_%H%M%S")
     out_path = os.path.join(SAVE_BASE, f"r2v_{ts}.mp4")
-    shutil.copy(output_video, out_path)
-    status = f"Done: {out_path}"
-    if enhanced_prompt_text:
-        status += f"\n\n--- Enhanced prompt ---\n{enhanced_prompt_text}"
-    return out_path, status
 with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
     gr.Markdown("# Bernini-R Wan 2.2 R2V Lightning")
@@ -1189,15 +1194,16 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
             prompt = gr.Textbox(
                 label="Prompt",
                 lines=3,
-                placeholder="Describe the subject's action in detail. Reference uploaded images by name (e.g. image0, image1). Example: the person in image0, wearing the outfit from image1, walks confidently through a busy train station.",
                 value="Keeping the exact identity and appearance the same as in image0, the person in image0 dances in a supermarket.",
             )
-            with gr.Accordion("Negative prompt", open=False):
-                negative = gr.Textbox(
-                    label="",
-                    lines=2,
-                    value=DEFAULT_NEGATIVE,
                 )
             with gr.Group():
                 aspect_ratio = gr.Radio(
                     choices=list(ASPECT_PRESETS.keys()),
@@ -1209,6 +1215,14 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
                     value=False,
                 )
             with gr.Group(elem_id="loras_9999"):
                 loras_enabled = gr.Checkbox(label="optional loras", value=False)
                 with gr.Column(visible=False) as loras_section:
@@ -1216,8 +1230,8 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
                         with gr.Group():
                             gr.Markdown("<div style='padding-left:8px'>High</div>")
                             with gr.Row():
-                                s_wamu_h = gr.Slider(-2, 2, value=0.9, step=0.05, label="wamu")
-                                s_dreamly_h = gr.Slider(-2, 2, value=0.9, step=0.05, label="dreamly")
                                 s_svicamera_h = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
                         with gr.Group():
                             gr.Markdown("<div style='padding-left:8px'>Low</div>")
@@ -1225,21 +1239,6 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
                                 s_wamu_l    = gr.Slider(-2, 2, value=0.5, step=0.05, label="wamu")
                                 s_dreamly_l = gr.Slider(-2, 2, value=0.7, step=0.05, label="dreamly")
                                 s_svicamera_l = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
-                        gr.Markdown("<hr style='margin:8px 0'>")
-                        ipnc_enabled = gr.Checkbox(label="IPNC", value=False)
-                        with gr.Column(visible=False) as ipnc_section:
-                            with gr.Group():
-                                gr.Markdown("<div style='padding-left:8px'>High</div>")
-                                with gr.Row():
-                                    ipnc_wamu_h    = gr.Slider(0, 200, value=100, step=1, label="wamu")
-                                    ipnc_dreamly_h = gr.Slider(0, 200, value=100, step=1, label="dreamly")
-                                    ipnc_svicamera_h = gr.Slider(0, 200, value=100, step=1, label="svicamera")
-                            with gr.Group():
-                                gr.Markdown("<div style='padding-left:8px'>Low</div>")
-                                with gr.Row():
-                                    ipnc_wamu_l    = gr.Slider(0, 200, value=100, step=1, label="wamu")
-                                    ipnc_dreamly_l = gr.Slider(0, 200, value=100, step=1, label="dreamly")
-                                    ipnc_svicamera_l = gr.Slider(0, 200, value=100, step=1, label="svicamera")
             loras_enabled.change(
                 fn=lambda x: gr.update(visible=x),
@@ -1247,32 +1246,101 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
                 outputs=loras_section,
             )
-            ipnc_enabled.change(
-                fn=lambda x: gr.update(visible=x),
-                inputs=ipnc_enabled,
-                outputs=ipnc_section,
-            )
-            with gr.Row():
-                seed = gr.Number(value=0, precision=0, label="Seed (0=random)")
-                gpu_budget = gr.Slider(0, 540, value=0, step=10, label="ZeroGPU budget (0=auto)")
-            with gr.Row():
-                num_steps = gr.Slider(4, 20, value=6, step=1, label="Steps")
-                duration_secs = gr.Slider(1, 15, value=5, step=0.5, label="Duration (s)")
-                sampler_name = gr.Dropdown(choices=["uni_pc", "lcm"], value="lcm", label="Sampler")
-                base_fps = gr.Number(value=15, precision=0, label="Combine FPS")
-                rife_mode = gr.Dropdown(
-                    choices=["no rife", "2x rife", "4x rife"],
-                    value="no rife",
-                    label="RIFE interpolation",
                 )
-            enhance_prompt = gr.Checkbox(label="enhance prompt", value=False)
             generate_btn = gr.Button("Generate", variant="primary", size="lg")
         with gr.Column(scale=1):
             output_video = gr.Video(label="Generated video")
-            output_status = gr.Textbox(label="Status", interactive=False)
     generate_btn.click(
         fn=generate_handler,
@@ -1282,39 +1350,16 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
             loras_enabled,
             s_wamu_h, s_dreamly_h,
             s_wamu_l, s_dreamly_l,
-            enhance_prompt,
-            ipnc_enabled,
-            ipnc_wamu_h, ipnc_dreamly_h,
-            ipnc_wamu_l, ipnc_dreamly_l,
             s_svicamera_h, s_svicamera_l,
-            ipnc_svicamera_h, ipnc_svicamera_l,
-        ],
-        outputs=[output_video, output_status],
-    )
-    _chunk_session_id = gr.Textbox(visible=False, value="")
-    _chunk_step_txt   = gr.Textbox(visible=False, value="0")
-    _chunk_btn        = gr.Button(visible=False)
-    _chunk_btn.click(
-        fn=generate_chunk_handler,
-        inputs=[
-            prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect, gpu_budget,
-            num_steps, duration_secs, sampler_name, base_fps, rife_mode,
-            loras_enabled,
-            s_wamu_h, s_dreamly_h,
-            s_wamu_l, s_dreamly_l,
-            enhance_prompt,
             ipnc_enabled,
             ipnc_wamu_h, ipnc_dreamly_h,
             ipnc_wamu_l, ipnc_dreamly_l,
-            s_svicamera_h, s_svicamera_l,
             ipnc_svicamera_h, ipnc_svicamera_l,
-            _chunk_session_id, _chunk_step_txt,
         ],
         outputs=[output_video, output_status],
-        api_name="generate_chunk",
     )
 if __name__ == "__main__":
     _ensure_comfy()
     _ensure_models()

 R2V_TEMPLATE = """You are an expert at writing subject-driven video generation prompts. I'm providing you with:
+1. {image_num} reference image(s) of the subject(s) (referred to as image0, image1, ... in order).
 2. An original video description text.
+Rewrite the description into TWO concatenated parts:
+Part 1 - Short: A concise sentence describing who appears (reference as image0/image1/etc.), where, and what key action/motion.
+Part 2 - Long: A detailed "Generate a video where..." paragraph with full appearance details referencing each subject as "the person from image0" etc., detailed scene/environment, and step-by-step temporal motion sequence.
 Requirements:
+- Reference each subject as image0/image1/etc., base appearance on what you see in the images (no hallucination), output entirely in English.
+- For every action or scenario described, identify and explicitly state all implied visual elements that are not mentioned but must be present for the scene to exist: who or what else is in the frame, what the subject is wearing or holding that the activity requires, what the environment necessarily contains, any other participants or objects involved.
+- For each action, explicitly state all visually relevant body states that are not mentioned but would be visible on camera: hand positions (open, clenched, raised, at side), facial expression, direction of gaze, posture, weight distribution, foot placement. Do not assume any body state is obvious -- state it explicitly.
+- The final prompt must be detailed enough that a complete mental image of the scene can be formed without seeing the reference images. Every visual element a film director would need to brief their crew on -- blocking, attire, props, environment, participant positions -- must appear in the text.
+- Do not robotically enumerate biomechanics. Write naturally while ensuring no visual element is left implicit.
+{extra_rule}
+- For any close physical interaction or insertion, you MUST explicitly define the spatial occlusion. State exactly what is physically connected, what is penetrating or entering a space, and what is visually hidden inside the other object/body versus what remains visible outside.
+Return ONLY a JSON object with one key: "rewritten_text".
 Original description:
 {original_text}
 """
+ENHANCE_EXTRA_RULE_NOREDESCRIBE = (
+    "Do not describe the inherent visual appearance of subjects from the reference images "
+    "(their face, hair, body type, baseline clothing) -- the model already sees those. "
+    "However, DO describe any scene-specific additions to reference subjects' appearance: "
+    "equipment, props, or attire added for this scene that would not be present in the reference image."
+)
+ENHANCE_EXTRA_RULE_DEFAULT = (
+    "Include full appearance details of reference subjects."
+)
+def _enhance_prompt_r2v(prompt: str, image_paths: list[str], no_redescribe: bool = False) -> str | None:
+    """Rewrite *prompt* into a detailed R2V prompt via the xAI chat-completions API.
+
+    Sends the formatted R2V_TEMPLATE plus up to five existing reference images
+    (as base64 data URLs) to grok-4.3 and returns the stripped
+    "rewritten_text" field of the JSON reply.
+
+    Args:
+        prompt: Original video description to rewrite.
+        image_paths: Reference image paths; empty or missing entries are skipped.
+        no_redescribe: Use ENHANCE_EXTRA_RULE_NOREDESCRIBE instead of
+            ENHANCE_EXTRA_RULE_DEFAULT as the template's extra rule.
+
+    Returns:
+        The enhanced prompt, or None when XAI_API_KEY is unset, the request or
+        response parsing fails, or the reply has no non-empty "rewritten_text".
+    """
+    import base64, mimetypes as _mt, json as _json
+    api_key = os.environ.get("XAI_API_KEY")
+    if not api_key:
+        # Without a key the request can only be rejected; skip it up front
+        # instead of encoding images and sending an empty bearer token.
+        print("[enhancer] XAI_API_KEY not set, skipping enhancement", flush=True)
+        return None
     try:
+        # Filter and cap once so the image count given to the model and the
+        # [Image i] labels both match the images actually attached.
+        valid_paths = [p for p in image_paths if p and os.path.exists(p)][:5]
+        extra_rule = ENHANCE_EXTRA_RULE_NOREDESCRIBE if no_redescribe else ENHANCE_EXTRA_RULE_DEFAULT
+        user_text = R2V_TEMPLATE.format(
+            image_num=max(len(valid_paths), 1),
+            extra_rule=extra_rule,
+            original_text=prompt,
+        )
+        content_msgs: list = [{"type": "text", "text": user_text}]
+        for i, path in enumerate(valid_paths):
             with open(path, "rb") as f:
                 b64 = base64.b64encode(f.read()).decode("utf-8")
+            # Fall back to JPEG when the extension gives no usable MIME type.
+            mime, _ = _mt.guess_type(path)
+            mime = mime or "image/jpeg"
+            # A real newline (not a literal backslash-n) precedes each label.
+            content_msgs.append({"type": "text", "text": f"\n[Image {i}]:"})
+            content_msgs.append({"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}})
+        payload = {
+            "model": "grok-4.3",
+            "messages": [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": content_msgs},
+            ],
+            "reasoning_effort": "xhigh",
+            "response_format": {"type": "json_object"},
+        }
+        import requests as _req
+        r = _req.post(
+            "https://api.x.ai/v1/chat/completions",
+            json=payload,
+            headers={
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
+            },
+            timeout=300,
         )
+        r.raise_for_status()
+        text = r.json()["choices"][0]["message"]["content"]
         enhanced = _json.loads(text).get("rewritten_text", "").strip()
         if enhanced:
+            print(f"[enhancer] enhanced ({len(enhanced)} chars)", flush=True)
             return enhanced
         return None
     except Exception as e:
+        # Best-effort feature: any failure is logged and reported as None.
         print(f"[enhancer] failed: {e}", flush=True)
         return None
 def _load_workflow() -> dict[str, Any]:
     wf_path = ROOT / WORKFLOW_FILE
     return json.loads(wf_path.read_text(encoding="utf-8"))
     return max(30, int(total))
 def _get_duration(
     prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
+    gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode,
+    ipnc_enabled, ipnc_wamu_h, ipnc_dreamly_h, ipnc_wamu_l, ipnc_dreamly_l,
+    ipnc_svicamera_h, ipnc_svicamera_l,
+    *args, **kwargs,
 ):
     if gpu_budget and int(gpu_budget) > 0:
         return int(gpu_budget)
     return _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref_aspect, rife_mode)
+@spaces.GPU(duration=_get_duration)
 def generate_chunk_handler(
     prompt: str,
     negative: str,
     gpu_budget: int = 120,
     num_steps: int = 6,
     duration_secs: float = 5.0,
+    sampler_name: str = "lcm",
     base_fps: int = 15,
+    rife_mode: str = "2x rife",
     loras_enabled: bool = False,
     s_wamu_h: float = 1.0,
+    s_dreamly_h: float = 1.0,
     s_wamu_l: float = 0.5,
+    s_dreamly_l: float = 0.7,
+    s_svicamera_h: float = 0.0,
+    s_svicamera_l: float = 0.0,
+    ipnc_enabled: bool = True,
     ipnc_wamu_h: float = 100.0,
     ipnc_dreamly_h: float = 100.0,
     ipnc_wamu_l: float = 100.0,
     ipnc_dreamly_l: float = 100.0,
     ipnc_svicamera_h: float = 100.0,
     ipnc_svicamera_l: float = 100.0,
     # chunking params
     session_id: str = "",
+    steps_per_chunk: str = "2",
     chunk_step: str = "0",
     progress=gr.Progress(track_tqdm=True),
 ):
         gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
         n_frames       = max(1, round(float(duration_secs) * int(base_fps)))
         steps          = int(num_steps)
+        STEPS_PER_CHUNK = max(1, int(float(steps_per_chunk or 2)))
         chunk_idx       = int(float(chunk_step or 0))
         step_start      = chunk_idx * STEPS_PER_CHUNK
         step_end        = min(step_start + STEPS_PER_CHUNK, steps)
         sampler_id = "57" if use_high else "58"
         # Inject chunk nodes: save/load latent, save/load conditioning
+        # Batch multiple steps per chunk (set by the caller)
         api_wf[sampler_id]["inputs"]["steps"]         = steps
         api_wf[sampler_id]["inputs"]["start_at_step"]  = step_start
         api_wf[sampler_id]["inputs"]["end_at_step"]    = step_end
     seed: int,
     aspect_ratio: str = "16:9",
     use_ref_aspect: bool = False,
+    gpu_budget: int = 0,
     num_steps: int = 6,
     duration_secs: float = 10.0,
+    sampler_name: str = "lcm",
     base_fps: int = 15,
+    rife_mode: str = "2x rife",
     loras_enabled: bool = False,
     s_wamu_h: float = 1.0,
+    s_dreamly_h: float = 1.0,
     s_wamu_l: float = 0.5,
+    s_dreamly_l: float = 0.7,
+    s_svicamera_h: float = 0.0,
+    s_svicamera_l: float = 0.0,
+    ipnc_enabled: bool = True,
     ipnc_wamu_h: float = 100.0,
     ipnc_dreamly_h: float = 100.0,
     ipnc_wamu_l: float = 100.0,
     ipnc_dreamly_l: float = 100.0,
     ipnc_svicamera_h: float = 100.0,
     ipnc_svicamera_l: float = 100.0,
     progress=gr.Progress(track_tqdm=True),
         final_seed = int(seed) if seed else random.randint(0, MAX_SEED)
         negative = negative or DEFAULT_NEGATIVE
+        def _coerce_gallery(g):
+            paths = []
+            for item in (g or []):
+                if isinstance(item, str):
+                    paths.append(item)
+                elif isinstance(item, dict):
+                    p = item.get("path") or item.get("name")
+                    if p: paths.append(p)
+                elif isinstance(item, (list, tuple)) and item:
+                    p = item[0]
+                    if isinstance(p, str): paths.append(p)
+                    elif isinstance(p, dict): paths.append(p.get("path",""))
+            return [p for p in paths if p and os.path.exists(p)][:5]
         image_paths = _coerce_gallery(image_input)
         if not image_paths:
             return None, "upload at least one reference image"
         dest_names = []
         for p in image_paths:
             dn = f"ref_{uuid.uuid4().hex[:8]}_{os.path.basename(p)}"
+            shutil.copy2(p, INPUT / dn)
             dest_names.append(dn)
         gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
+        print(prompt)
         progress(0.1, desc="building workflow...")
         visual_wf = _load_workflow()
     ts = time.strftime("%Y%m%d_%H%M%S")
     out_path = os.path.join(SAVE_BASE, f"r2v_{ts}.mp4")
+    shutil.copy2(output_video, out_path)
+    return out_path, f"Seed: {final_seed}\n{out_path}"
 with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
     gr.Markdown("# Bernini-R Wan 2.2 R2V Lightning")
             prompt = gr.Textbox(
                 label="Prompt",
                 lines=3,
+                placeholder="Describe the subject's action in detail...",
                 value="Keeping the exact identity and appearance the same as in image0, the person in image0 dances in a supermarket.",
             )
+            with gr.Row():
+                enhance_btn = gr.Button("Enhance prompt", variant="secondary", size="sm")
+                no_redescribe = gr.Checkbox(
+                    label="don't redescribe reference subjects",
+                    value=False,
                 )
             with gr.Group():
                 aspect_ratio = gr.Radio(
                     choices=list(ASPECT_PRESETS.keys()),
                     value=False,
                 )
+            with gr.Row():
+                duration_secs = gr.Slider(1, 20, value=5, step=0.5, label="Duration (s)")
+                base_fps = gr.Number(value=15, precision=0, label="Base FPS")
+            with gr.Row():
+                seed = gr.Number(value=0, precision=0, label="Seed (0=random)")
+                gpu_budget = gr.Slider(0, 540, value=0, step=10, label="ZeroGPU budget (0=auto)")
             with gr.Group(elem_id="loras_9999"):
                 loras_enabled = gr.Checkbox(label="optional loras", value=False)
                 with gr.Column(visible=False) as loras_section:
                         with gr.Group():
                             gr.Markdown("<div style='padding-left:8px'>High</div>")
                             with gr.Row():
+                                s_wamu_h = gr.Slider(-2, 2, value=1.0, step=0.05, label="wamu")
+                                s_dreamly_h = gr.Slider(-2, 2, value=1.0, step=0.05, label="dreamly")
                                 s_svicamera_h = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
                         with gr.Group():
                             gr.Markdown("<div style='padding-left:8px'>Low</div>")
                                 s_wamu_l    = gr.Slider(-2, 2, value=0.5, step=0.05, label="wamu")
                                 s_dreamly_l = gr.Slider(-2, 2, value=0.7, step=0.05, label="dreamly")
                                 s_svicamera_l = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
             loras_enabled.change(
                 fn=lambda x: gr.update(visible=x),
                 outputs=loras_section,
             )
+            with gr.Accordion("Advanced", open=False):
+                with gr.Row():
+                    num_steps = gr.Slider(4, 20, value=6, step=1, label="Steps")
+                    sampler_name = gr.Dropdown(choices=["uni_pc", "lcm"], value="lcm", label="Sampler")
+                    rife_mode = gr.Dropdown(
+                        choices=["no rife", "2x rife", "4x rife"],
+                        value="2x rife",
+                        label="RIFE interpolation",
+                    )
+                with gr.Accordion("Negative prompt", open=False):
+                    negative = gr.Textbox(
+                        label="",
+                        lines=2,
+                        value=DEFAULT_NEGATIVE,
+                    )
+                with gr.Accordion("IPNC", open=False):
+                    ipnc_enabled = gr.Checkbox(label="Enable", value=True)
+                    with gr.Column(visible=True) as ipnc_section:
+                        with gr.Group():
+                            gr.Markdown("<div style='padding-left:8px'>High</div>")
+                            with gr.Row():
+                                ipnc_wamu_h    = gr.Slider(0, 200, value=100, step=1, label="wamu")
+                                ipnc_dreamly_h = gr.Slider(0, 200, value=100, step=1, label="dreamly")
+                                ipnc_svicamera_h = gr.Slider(0, 200, value=100, step=1, label="svicamera")
+                        with gr.Group():
+                            gr.Markdown("<div style='padding-left:8px'>Low</div>")
+                            with gr.Row():
+                                ipnc_wamu_l    = gr.Slider(0, 200, value=100, step=1, label="wamu")
+                                ipnc_dreamly_l = gr.Slider(0, 200, value=100, step=1, label="dreamly")
+                                ipnc_svicamera_l = gr.Slider(0, 200, value=100, step=1, label="svicamera")
+                ipnc_enabled.change(
+                    fn=lambda x: gr.update(visible=x),
+                    inputs=ipnc_enabled,
+                    outputs=ipnc_section,
                 )
             generate_btn = gr.Button("Generate", variant="primary", size="lg")
         with gr.Column(scale=1):
+            # Create the output components first: chunk_btn.click() below lists
+            # them in `outputs`, so they must exist before the event is wired.
             output_video = gr.Video(label="Generated video")
+            output_status = gr.Textbox(label="Status", interactive=False, lines=4)
+            # hidden: chunked generation endpoint
+            chunk_session_id = gr.Textbox(visible=False, value="")
+            chunk_steps_per = gr.Textbox(visible=False, value="2")
+            chunk_step_idx = gr.Textbox(visible=False, value="0")
+            chunk_btn = gr.Button(visible=False)
+            chunk_btn.click(
+                fn=generate_chunk_handler,
+                inputs=[
+                    prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect, gpu_budget,
+                    num_steps, duration_secs, sampler_name, base_fps, rife_mode,
+                    loras_enabled,
+                    s_wamu_h, s_dreamly_h,
+                    s_wamu_l, s_dreamly_l,
+                    s_svicamera_h, s_svicamera_l,
+                    ipnc_enabled,
+                    ipnc_wamu_h, ipnc_dreamly_h,
+                    ipnc_wamu_l, ipnc_dreamly_l,
+                    ipnc_svicamera_h, ipnc_svicamera_l,
+                    chunk_session_id, chunk_steps_per, chunk_step_idx,
+                ],
+                outputs=[output_video, output_status],
+            )
+    # ── enhance handler ──────────────────────────────────────────────────
+    def enhance_handler(prompt: str, image_input: Any, no_redescribe: bool,
+                        progress=gr.Progress()):
+        """Return an enhanced version of *prompt* for the prompt textbox.
+
+        Extracts up to five existing file paths from the gallery value, then
+        calls _enhance_prompt_r2v. Raises gr.Error when no usable image is
+        found, the prompt is blank, or the enhancer returns nothing.
+        """
+        def _candidate(entry):
+            # Gallery entries may be a path string, a dict with "path"/"name",
+            # or a list/tuple whose first element is either of those.
+            if isinstance(entry, str):
+                return entry
+            if isinstance(entry, dict):
+                return entry.get("path") or entry.get("name")
+            if isinstance(entry, (list, tuple)) and entry:
+                head = entry[0]
+                if isinstance(head, str):
+                    return head
+                if isinstance(head, dict):
+                    return head.get("path")
+            return None
+        candidates = (_candidate(entry) for entry in (image_input or []))
+        paths = [p for p in candidates if p and os.path.exists(p)][:5]
+        if not paths:
+            raise gr.Error("upload at least one reference image")
+        if not (prompt or "").strip():
+            raise gr.Error("enter a prompt")
+        result = _enhance_prompt_r2v(prompt, paths, no_redescribe=bool(no_redescribe))
+        if not result:
+            raise gr.Error("enhancement failed")
+        return result
+    enhance_btn.click(
+        fn=enhance_handler,
+        inputs=[prompt, image_input, no_redescribe],
+        outputs=[prompt],
+    )
     generate_btn.click(
         fn=generate_handler,
             loras_enabled,
             s_wamu_h, s_dreamly_h,
             s_wamu_l, s_dreamly_l,
             s_svicamera_h, s_svicamera_l,
             ipnc_enabled,
             ipnc_wamu_h, ipnc_dreamly_h,
             ipnc_wamu_l, ipnc_dreamly_l,
             ipnc_svicamera_h, ipnc_svicamera_l,
         ],
         outputs=[output_video, output_status],
     )
 if __name__ == "__main__":
     _ensure_comfy()
     _ensure_models()