signsur4739379373 committed on
Commit
7c6328c
·
1 Parent(s): 45056bf

duration max 25, steps_per_chunk param

Browse files
Files changed (1) hide show
  1. app.py +212 -167
app.py CHANGED
@@ -378,72 +378,92 @@ DEFAULT_NEGATIVE = (
378
 
379
 
380
  R2V_TEMPLATE = """You are an expert at writing subject-driven video generation prompts. I'm providing you with:
381
- 1. {image_num} reference image(s) of the subject(s) that will appear in the video (referred to as image0, image1, image2, ... in order).
382
  2. An original video description text.
383
 
384
- Your task is to rewrite the original description into a new format with TWO parts concatenated together:
385
-
386
- **Part 1 - Short instruction**: A concise sentence describing who the subject(s) from the reference image(s) are, what they look like briefly, where they are, and what key action/motion they perform. Reference the subject(s) using "image0", "image1", etc. to link them to the provided reference images.
387
-
388
- **Part 2 - Long instruction**: A detailed "Generate a video where..." paragraph that describes:
389
- - The subject(s) from the reference image(s) with detailed appearance (hair, clothing, accessories, expression, etc.), referencing them as "the person/man/woman from image0" etc.
390
- - The scene/environment in detail (background, lighting, objects, atmosphere).
391
- - The motion and actions in a step-by-step temporal sequence (at the start..., then..., after that...).
392
- - The motion should remain natural and realistic.
393
 
394
  Requirements:
395
- - You MUST reference each subject using "image0", "image1", "image2", etc. to correspond to the provided reference images in order.
396
- - The appearance description of each subject must be based on what you actually see in the reference image(s). Do NOT hallucinate details not visible in the images.
397
- - The scene, actions, and motion should be derived from the original description text, but rewritten to be more detailed and vivid.
398
- - The output must be entirely in English.
399
- - Return ONLY a JSON object with one key: "rewritten_text". The value should be the full rewritten text (short instruction + long instruction concatenated as one string). No extra text.
 
 
 
400
 
401
  Original description:
402
  {original_text}
403
  """
404
 
405
 
406
- def _enhance_prompt_r2v(prompt: str, image_paths: list[str]) -> str | None:
407
- """Call grok-4.3 via xAI API to enhance prompt for r2v. Returns None on failure."""
408
- api_key = os.environ.get("XAI_API_KEY")
409
- if not api_key:
410
- print("[enhancer] XAI_API_KEY not set, skipping enhancement", flush=True)
411
- return None
 
 
 
 
 
 
 
 
 
412
  try:
413
- import base64
414
- import json as _json
415
- from openai import OpenAI
416
- client = OpenAI(api_key=api_key, base_url="https://api.x.ai/v1")
417
- image_num = len(image_paths)
418
- user_text = R2V_TEMPLATE.format(image_num=max(image_num, 1), original_text=prompt)
419
- content: list = [{"type": "text", "text": user_text}]
420
- for i, path in enumerate(image_paths):
 
421
  if not path or not os.path.exists(path):
422
  continue
423
  with open(path, "rb") as f:
424
  b64 = base64.b64encode(f.read()).decode("utf-8")
425
- content.append({"type": "text", "text": f"\n[Image {i}]:"})
426
- content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}})
427
- messages = [
428
- {"role": "system", "content": "You are a helpful assistant."},
429
- {"role": "user", "content": content},
430
- ]
431
- resp = client.chat.completions.create(
432
- model="grok-4.3",
433
- messages=messages,
434
- max_completion_tokens=8192,
435
- response_format={"type": "json_object"},
 
 
 
 
 
 
 
 
 
 
 
436
  )
437
- text = resp.choices[0].message.content or ""
 
438
  enhanced = _json.loads(text).get("rewritten_text", "").strip()
439
  if enhanced:
440
- print(f"[enhancer] enhanced prompt ({len(enhanced)} chars)", flush=True)
441
  return enhanced
442
  return None
443
  except Exception as e:
444
  print(f"[enhancer] failed: {e}", flush=True)
445
  return None
446
 
 
447
  def _load_workflow() -> dict[str, Any]:
448
  wf_path = ROOT / WORKFLOW_FILE
449
  return json.loads(wf_path.read_text(encoding="utf-8"))
@@ -856,41 +876,20 @@ def _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref
856
  return max(30, int(total))
857
 
858
 
 
859
  def _get_duration(
860
  prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
861
- gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode, *args, **kwargs,
 
 
 
862
  ):
863
  if gpu_budget and int(gpu_budget) > 0:
864
  return int(gpu_budget)
865
  return _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref_aspect, rife_mode)
866
 
867
 
868
- def _chunk_get_duration(
869
- prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
870
- gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode, *args, **kwargs,
871
- ):
872
- """Each chunk is self-contained — request 120s for the GPU."""
873
- if gpu_budget and int(gpu_budget) > 0:
874
- return int(gpu_budget)
875
- return 120
876
-
877
-
878
- def _coerce_gallery(g) -> list:
879
- paths = []
880
- for item in (g or []):
881
- if isinstance(item, str):
882
- paths.append(item)
883
- elif isinstance(item, dict):
884
- p = item.get("path") or item.get("name")
885
- if p: paths.append(p)
886
- elif isinstance(item, (list, tuple)) and item:
887
- p = item[0]
888
- if isinstance(p, str): paths.append(p)
889
- elif isinstance(p, dict): paths.append(p.get("path",""))
890
- return [p for p in paths if p and os.path.exists(p)][:5]
891
-
892
-
893
- @spaces.GPU(duration=_chunk_get_duration)
894
  def generate_chunk_handler(
895
  prompt: str,
896
  negative: str,
@@ -901,26 +900,26 @@ def generate_chunk_handler(
901
  gpu_budget: int = 120,
902
  num_steps: int = 6,
903
  duration_secs: float = 5.0,
904
- sampler_name: str = "uni_pc",
905
  base_fps: int = 15,
906
- rife_mode: str = "no rife",
907
  loras_enabled: bool = False,
908
  s_wamu_h: float = 1.0,
909
- s_dreamly_h: float = 0.3,
910
  s_wamu_l: float = 0.5,
911
- s_dreamly_l: float = 0.5,
912
- enhance_prompt: bool = False,
913
- ipnc_enabled: bool = False,
 
914
  ipnc_wamu_h: float = 100.0,
915
  ipnc_dreamly_h: float = 100.0,
916
  ipnc_wamu_l: float = 100.0,
917
  ipnc_dreamly_l: float = 100.0,
918
- s_svicamera_h: float = 0.0,
919
- s_svicamera_l: float = 0.0,
920
  ipnc_svicamera_h: float = 100.0,
921
  ipnc_svicamera_l: float = 100.0,
922
  # chunking params
923
  session_id: str = "",
 
924
  chunk_step: str = "0",
925
  progress=gr.Progress(track_tqdm=True),
926
  ):
@@ -952,7 +951,7 @@ def generate_chunk_handler(
952
  gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
953
  n_frames = max(1, round(float(duration_secs) * int(base_fps)))
954
  steps = int(num_steps)
955
- STEPS_PER_CHUNK = 2
956
  chunk_idx = int(float(chunk_step or 0))
957
  step_start = chunk_idx * STEPS_PER_CHUNK
958
  step_end = min(step_start + STEPS_PER_CHUNK, steps)
@@ -992,7 +991,7 @@ def generate_chunk_handler(
992
  sampler_id = "57" if use_high else "58"
993
 
994
  # Inject chunk nodes: save/load latent, save/load conditioning
995
- # Batch multiple steps per chunk (STEPS_PER_CHUNK=2 for short videos)
996
  api_wf[sampler_id]["inputs"]["steps"] = steps
997
  api_wf[sampler_id]["inputs"]["start_at_step"] = step_start
998
  api_wf[sampler_id]["inputs"]["end_at_step"] = step_end
@@ -1074,25 +1073,24 @@ def generate_handler(
1074
  seed: int,
1075
  aspect_ratio: str = "16:9",
1076
  use_ref_aspect: bool = False,
1077
- gpu_budget: int = 0, # 0 = auto-estimate
1078
  num_steps: int = 6,
1079
  duration_secs: float = 10.0,
1080
- sampler_name: str = "uni_pc",
1081
  base_fps: int = 15,
1082
- rife_mode: str = "no rife",
1083
  loras_enabled: bool = False,
1084
  s_wamu_h: float = 1.0,
1085
- s_dreamly_h: float = 0.3,
1086
  s_wamu_l: float = 0.5,
1087
- s_dreamly_l: float = 0.5,
1088
- enhance_prompt: bool = False,
1089
- ipnc_enabled: bool = False,
 
1090
  ipnc_wamu_h: float = 100.0,
1091
  ipnc_dreamly_h: float = 100.0,
1092
  ipnc_wamu_l: float = 100.0,
1093
  ipnc_dreamly_l: float = 100.0,
1094
- s_svicamera_h: float = 0.0,
1095
- s_svicamera_l: float = 0.0,
1096
  ipnc_svicamera_h: float = 100.0,
1097
  ipnc_svicamera_l: float = 100.0,
1098
  progress=gr.Progress(track_tqdm=True),
@@ -1105,6 +1103,20 @@ def generate_handler(
1105
  final_seed = int(seed) if seed else random.randint(0, MAX_SEED)
1106
  negative = negative or DEFAULT_NEGATIVE
1107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1108
  image_paths = _coerce_gallery(image_input)
1109
  if not image_paths:
1110
  return None, "upload at least one reference image"
@@ -1113,17 +1125,12 @@ def generate_handler(
1113
  dest_names = []
1114
  for p in image_paths:
1115
  dn = f"ref_{uuid.uuid4().hex[:8]}_{os.path.basename(p)}"
1116
- shutil.copy(p, INPUT / dn)
1117
  dest_names.append(dn)
1118
 
1119
  gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
1120
 
1121
- enhanced_prompt_text = None
1122
- if enhance_prompt:
1123
- progress(0.05, desc="enhancing prompt...")
1124
- enhanced_prompt_text = _enhance_prompt_r2v(prompt, image_paths)
1125
- if enhanced_prompt_text:
1126
- prompt = enhanced_prompt_text
1127
 
1128
  progress(0.1, desc="building workflow...")
1129
  visual_wf = _load_workflow()
@@ -1168,12 +1175,10 @@ def generate_handler(
1168
 
1169
  ts = time.strftime("%Y%m%d_%H%M%S")
1170
  out_path = os.path.join(SAVE_BASE, f"r2v_{ts}.mp4")
1171
- shutil.copy(output_video, out_path)
 
 
1172
 
1173
- status = f"Done: {out_path}"
1174
- if enhanced_prompt_text:
1175
- status += f"\n\n--- Enhanced prompt ---\n{enhanced_prompt_text}"
1176
- return out_path, status
1177
 
1178
  with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
1179
  gr.Markdown("# Bernini-R Wan 2.2 R2V Lightning")
@@ -1189,15 +1194,16 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
1189
  prompt = gr.Textbox(
1190
  label="Prompt",
1191
  lines=3,
1192
- placeholder="Describe the subject's action in detail. Reference uploaded images by name (e.g. image0, image1). Example: the person in image0, wearing the outfit from image1, walks confidently through a busy train station.",
1193
  value="Keeping the exact identity and appearance the same as in image0, the person in image0 dances in a supermarket.",
1194
  )
1195
- with gr.Accordion("Negative prompt", open=False):
1196
- negative = gr.Textbox(
1197
- label="",
1198
- lines=2,
1199
- value=DEFAULT_NEGATIVE,
1200
  )
 
1201
  with gr.Group():
1202
  aspect_ratio = gr.Radio(
1203
  choices=list(ASPECT_PRESETS.keys()),
@@ -1209,6 +1215,14 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
1209
  value=False,
1210
  )
1211
 
 
 
 
 
 
 
 
 
1212
  with gr.Group(elem_id="loras_9999"):
1213
  loras_enabled = gr.Checkbox(label="optional loras", value=False)
1214
  with gr.Column(visible=False) as loras_section:
@@ -1216,8 +1230,8 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
1216
  with gr.Group():
1217
  gr.Markdown("<div style='padding-left:8px'>High</div>")
1218
  with gr.Row():
1219
- s_wamu_h = gr.Slider(-2, 2, value=0.9, step=0.05, label="wamu")
1220
- s_dreamly_h = gr.Slider(-2, 2, value=0.9, step=0.05, label="dreamly")
1221
  s_svicamera_h = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
1222
  with gr.Group():
1223
  gr.Markdown("<div style='padding-left:8px'>Low</div>")
@@ -1225,21 +1239,6 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
1225
  s_wamu_l = gr.Slider(-2, 2, value=0.5, step=0.05, label="wamu")
1226
  s_dreamly_l = gr.Slider(-2, 2, value=0.7, step=0.05, label="dreamly")
1227
  s_svicamera_l = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
1228
- gr.Markdown("<hr style='margin:8px 0'>")
1229
- ipnc_enabled = gr.Checkbox(label="IPNC", value=False)
1230
- with gr.Column(visible=False) as ipnc_section:
1231
- with gr.Group():
1232
- gr.Markdown("<div style='padding-left:8px'>High</div>")
1233
- with gr.Row():
1234
- ipnc_wamu_h = gr.Slider(0, 200, value=100, step=1, label="wamu")
1235
- ipnc_dreamly_h = gr.Slider(0, 200, value=100, step=1, label="dreamly")
1236
- ipnc_svicamera_h = gr.Slider(0, 200, value=100, step=1, label="svicamera")
1237
- with gr.Group():
1238
- gr.Markdown("<div style='padding-left:8px'>Low</div>")
1239
- with gr.Row():
1240
- ipnc_wamu_l = gr.Slider(0, 200, value=100, step=1, label="wamu")
1241
- ipnc_dreamly_l = gr.Slider(0, 200, value=100, step=1, label="dreamly")
1242
- ipnc_svicamera_l = gr.Slider(0, 200, value=100, step=1, label="svicamera")
1243
 
1244
  loras_enabled.change(
1245
  fn=lambda x: gr.update(visible=x),
@@ -1247,32 +1246,101 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
1247
  outputs=loras_section,
1248
  )
1249
 
1250
- ipnc_enabled.change(
1251
- fn=lambda x: gr.update(visible=x),
1252
- inputs=ipnc_enabled,
1253
- outputs=ipnc_section,
1254
- )
1255
-
1256
- with gr.Row():
1257
- seed = gr.Number(value=0, precision=0, label="Seed (0=random)")
1258
- gpu_budget = gr.Slider(0, 540, value=0, step=10, label="ZeroGPU budget (0=auto)")
1259
- with gr.Row():
1260
- num_steps = gr.Slider(4, 20, value=6, step=1, label="Steps")
1261
- duration_secs = gr.Slider(1, 15, value=5, step=0.5, label="Duration (s)")
1262
- sampler_name = gr.Dropdown(choices=["uni_pc", "lcm"], value="lcm", label="Sampler")
1263
- base_fps = gr.Number(value=15, precision=0, label="Combine FPS")
1264
- rife_mode = gr.Dropdown(
1265
- choices=["no rife", "2x rife", "4x rife"],
1266
- value="no rife",
1267
- label="RIFE interpolation",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1268
  )
1269
 
1270
- enhance_prompt = gr.Checkbox(label="enhance prompt", value=False)
1271
  generate_btn = gr.Button("Generate", variant="primary", size="lg")
1272
 
1273
  with gr.Column(scale=1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1274
  output_video = gr.Video(label="Generated video")
1275
- output_status = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1276
 
1277
  generate_btn.click(
1278
  fn=generate_handler,
@@ -1282,39 +1350,16 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
1282
  loras_enabled,
1283
  s_wamu_h, s_dreamly_h,
1284
  s_wamu_l, s_dreamly_l,
1285
- enhance_prompt,
1286
- ipnc_enabled,
1287
- ipnc_wamu_h, ipnc_dreamly_h,
1288
- ipnc_wamu_l, ipnc_dreamly_l,
1289
  s_svicamera_h, s_svicamera_l,
1290
- ipnc_svicamera_h, ipnc_svicamera_l,
1291
- ],
1292
- outputs=[output_video, output_status],
1293
- )
1294
-
1295
- _chunk_session_id = gr.Textbox(visible=False, value="")
1296
- _chunk_step_txt = gr.Textbox(visible=False, value="0")
1297
- _chunk_btn = gr.Button(visible=False)
1298
- _chunk_btn.click(
1299
- fn=generate_chunk_handler,
1300
- inputs=[
1301
- prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect, gpu_budget,
1302
- num_steps, duration_secs, sampler_name, base_fps, rife_mode,
1303
- loras_enabled,
1304
- s_wamu_h, s_dreamly_h,
1305
- s_wamu_l, s_dreamly_l,
1306
- enhance_prompt,
1307
  ipnc_enabled,
1308
  ipnc_wamu_h, ipnc_dreamly_h,
1309
  ipnc_wamu_l, ipnc_dreamly_l,
1310
- s_svicamera_h, s_svicamera_l,
1311
  ipnc_svicamera_h, ipnc_svicamera_l,
1312
- _chunk_session_id, _chunk_step_txt,
1313
  ],
1314
  outputs=[output_video, output_status],
1315
- api_name="generate_chunk",
1316
  )
1317
 
 
1318
  if __name__ == "__main__":
1319
  _ensure_comfy()
1320
  _ensure_models()
 
378
 
379
 
380
  R2V_TEMPLATE = """You are an expert at writing subject-driven video generation prompts. I'm providing you with:
381
+ 1. {image_num} reference image(s) of the subject(s) (referred to as image0, image1, ... in order).
382
  2. An original video description text.
383
 
384
+ Rewrite the description into TWO concatenated parts:
385
+ Part 1 - Short: A concise sentence describing who appears (reference as image0/image1/etc.), where, and what key action/motion.
386
+ Part 2 - Long: A detailed "Generate a video where..." paragraph with full appearance details referencing each subject as "the person from image0" etc., detailed scene/environment, and step-by-step temporal motion sequence.
 
 
 
 
 
 
387
 
388
  Requirements:
389
+ - Reference each subject as image0/image1/etc., base appearance on what you see in the images (no hallucination), output entirely in English.
390
+ - For every action or scenario described, identify and explicitly state all implied visual elements that are not mentioned but must be present for the scene to exist: who or what else is in the frame, what the subject is wearing or holding that the activity requires, what the environment necessarily contains, any other participants or objects involved.
391
+ - For each action, explicitly state all visually relevant body states that are not mentioned but would be visible on camera: hand positions (open, clenched, raised, at side), facial expression, direction of gaze, posture, weight distribution, foot placement. Do not assume any body state is obvious -- state it explicitly.
392
+ - The final prompt must be detailed enough that a complete mental image of the scene can be formed without seeing the reference images. Every visual element a film director would need to brief their crew on -- blocking, attire, props, environment, participant positions -- must appear in the text.
393
+ - Do not robotically enumerate biomechanics. Write naturally while ensuring no visual element is left implicit.
394
+ {extra_rule}
395
+ - For any close physical interaction or insertion, you MUST explicitly define the spatial occlusion. State exactly what is physically connected, what is penetrating or entering a space, and what is visually hidden inside the other object/body versus what remains visible outside.
396
+ Return ONLY a JSON object with one key: "rewritten_text".
397
 
398
  Original description:
399
  {original_text}
400
  """
401
 
402
 
403
+ ENHANCE_EXTRA_RULE_NOREDESCRIBE = (
404
+ "Do not describe the inherent visual appearance of subjects from the reference images "
405
+ "(their face, hair, body type, baseline clothing) -- the model already sees those. "
406
+ "However, DO describe any scene-specific additions to reference subjects' appearance: "
407
+ "equipment, props, or attire added for this scene that would not be present in the reference image."
408
+ )
409
+
410
+ ENHANCE_EXTRA_RULE_DEFAULT = (
411
+ "Include full appearance details of reference subjects."
412
+ )
413
+
414
+
415
+ def _enhance_prompt_r2v(prompt: str, image_paths: list[str], no_redescribe: bool = False) -> str | None:
416
+ """Call grok-4.3 via xAI API. Direct connection."""
417
+ import base64, mimetypes as _mt, json as _json
418
  try:
419
+ image_num = len([p for p in image_paths if p and os.path.exists(p)])
420
+ extra_rule = ENHANCE_EXTRA_RULE_NOREDESCRIBE if no_redescribe else ENHANCE_EXTRA_RULE_DEFAULT
421
+ user_text = R2V_TEMPLATE.format(
422
+ image_num=max(image_num, 1),
423
+ extra_rule=extra_rule,
424
+ original_text=prompt,
425
+ )
426
+ content_msgs: list = [{"type": "text", "text": user_text}]
427
+ for i, path in enumerate(image_paths[:5]):
428
  if not path or not os.path.exists(path):
429
  continue
430
  with open(path, "rb") as f:
431
  b64 = base64.b64encode(f.read()).decode("utf-8")
432
+ mime, _ = _mt.guess_type(path)
433
+ mime = mime or "image/jpeg"
434
+ content_msgs.append({"type": "text", "text": f"\\n[Image {i}]:"})
435
+ content_msgs.append({"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}})
436
+ payload = {
437
+ "model": "grok-4.3",
438
+ "messages": [
439
+ {"role": "system", "content": "You are a helpful assistant."},
440
+ {"role": "user", "content": content_msgs},
441
+ ],
442
+ "reasoning_effort": "xhigh",
443
+ "response_format": {"type": "json_object"},
444
+ }
445
+ import requests as _req
446
+ r = _req.post(
447
+ "https://api.x.ai/v1/chat/completions",
448
+ json=payload,
449
+ headers={
450
+ "Authorization": f"Bearer {os.environ.get('XAI_API_KEY', '')}",
451
+ "Content-Type": "application/json",
452
+ },
453
+ timeout=300,
454
  )
455
+ r.raise_for_status()
456
+ text = r.json()["choices"][0]["message"]["content"]
457
  enhanced = _json.loads(text).get("rewritten_text", "").strip()
458
  if enhanced:
459
+ print(f"[enhancer] enhanced ({len(enhanced)} chars)", flush=True)
460
  return enhanced
461
  return None
462
  except Exception as e:
463
  print(f"[enhancer] failed: {e}", flush=True)
464
  return None
465
 
466
+
467
  def _load_workflow() -> dict[str, Any]:
468
  wf_path = ROOT / WORKFLOW_FILE
469
  return json.loads(wf_path.read_text(encoding="utf-8"))
 
876
  return max(30, int(total))
877
 
878
 
879
+
880
def _get_duration(
    prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
    gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode,
    ipnc_enabled=None, ipnc_wamu_h=None, ipnc_dreamly_h=None, ipnc_wamu_l=None,
    ipnc_dreamly_l=None, ipnc_svicamera_h=None, ipnc_svicamera_l=None,
    *args, **kwargs,
):
    """Return the number of GPU seconds to request for a generation call.

    Used as the ``duration`` callable of ``@spaces.GPU``. A positive
    ``gpu_budget`` is returned as an int; otherwise the value comes from
    ``_estimate_duration``.

    Only ``gpu_budget``, ``num_steps``, ``duration_secs``, ``base_fps``,
    ``aspect_ratio``, ``use_ref_aspect`` and ``rife_mode`` are read. The
    ``ipnc_*`` parameters are accepted but unused, and default to None so a
    call that stops after ``rife_mode`` still works.

    NOTE(review): the handler signature lists ``loras_enabled``, ``s_wamu_h``,
    ... right after ``rife_mode``, so in a positional call the ``ipnc_*`` names
    here presumably receive those values instead -- confirm before reading them.
    """
    if gpu_budget and int(gpu_budget) > 0:
        return int(gpu_budget)
    return _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref_aspect, rife_mode)
890
 
891
 
892
+ @spaces.GPU(duration=_get_duration)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
893
  def generate_chunk_handler(
894
  prompt: str,
895
  negative: str,
 
900
  gpu_budget: int = 120,
901
  num_steps: int = 6,
902
  duration_secs: float = 5.0,
903
+ sampler_name: str = "lcm",
904
  base_fps: int = 15,
905
+ rife_mode: str = "2x rife",
906
  loras_enabled: bool = False,
907
  s_wamu_h: float = 1.0,
908
+ s_dreamly_h: float = 1.0,
909
  s_wamu_l: float = 0.5,
910
+ s_dreamly_l: float = 0.7,
911
+ s_svicamera_h: float = 0.0,
912
+ s_svicamera_l: float = 0.0,
913
+ ipnc_enabled: bool = True,
914
  ipnc_wamu_h: float = 100.0,
915
  ipnc_dreamly_h: float = 100.0,
916
  ipnc_wamu_l: float = 100.0,
917
  ipnc_dreamly_l: float = 100.0,
 
 
918
  ipnc_svicamera_h: float = 100.0,
919
  ipnc_svicamera_l: float = 100.0,
920
  # chunking params
921
  session_id: str = "",
922
+ steps_per_chunk: str = "2",
923
  chunk_step: str = "0",
924
  progress=gr.Progress(track_tqdm=True),
925
  ):
 
951
  gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
952
  n_frames = max(1, round(float(duration_secs) * int(base_fps)))
953
  steps = int(num_steps)
954
+ STEPS_PER_CHUNK = max(1, int(float(steps_per_chunk or 2)))
955
  chunk_idx = int(float(chunk_step or 0))
956
  step_start = chunk_idx * STEPS_PER_CHUNK
957
  step_end = min(step_start + STEPS_PER_CHUNK, steps)
 
991
  sampler_id = "57" if use_high else "58"
992
 
993
  # Inject chunk nodes: save/load latent, save/load conditioning
994
+ # Batch multiple steps per chunk (set by the caller)
995
  api_wf[sampler_id]["inputs"]["steps"] = steps
996
  api_wf[sampler_id]["inputs"]["start_at_step"] = step_start
997
  api_wf[sampler_id]["inputs"]["end_at_step"] = step_end
 
1073
  seed: int,
1074
  aspect_ratio: str = "16:9",
1075
  use_ref_aspect: bool = False,
1076
+ gpu_budget: int = 0,
1077
  num_steps: int = 6,
1078
  duration_secs: float = 10.0,
1079
+ sampler_name: str = "lcm",
1080
  base_fps: int = 15,
1081
+ rife_mode: str = "2x rife",
1082
  loras_enabled: bool = False,
1083
  s_wamu_h: float = 1.0,
1084
+ s_dreamly_h: float = 1.0,
1085
  s_wamu_l: float = 0.5,
1086
+ s_dreamly_l: float = 0.7,
1087
+ s_svicamera_h: float = 0.0,
1088
+ s_svicamera_l: float = 0.0,
1089
+ ipnc_enabled: bool = True,
1090
  ipnc_wamu_h: float = 100.0,
1091
  ipnc_dreamly_h: float = 100.0,
1092
  ipnc_wamu_l: float = 100.0,
1093
  ipnc_dreamly_l: float = 100.0,
 
 
1094
  ipnc_svicamera_h: float = 100.0,
1095
  ipnc_svicamera_l: float = 100.0,
1096
  progress=gr.Progress(track_tqdm=True),
 
1103
  final_seed = int(seed) if seed else random.randint(0, MAX_SEED)
1104
  negative = negative or DEFAULT_NEGATIVE
1105
 
1106
+ def _coerce_gallery(g):
1107
+ paths = []
1108
+ for item in (g or []):
1109
+ if isinstance(item, str):
1110
+ paths.append(item)
1111
+ elif isinstance(item, dict):
1112
+ p = item.get("path") or item.get("name")
1113
+ if p: paths.append(p)
1114
+ elif isinstance(item, (list, tuple)) and item:
1115
+ p = item[0]
1116
+ if isinstance(p, str): paths.append(p)
1117
+ elif isinstance(p, dict): paths.append(p.get("path",""))
1118
+ return [p for p in paths if p and os.path.exists(p)][:5]
1119
+
1120
  image_paths = _coerce_gallery(image_input)
1121
  if not image_paths:
1122
  return None, "upload at least one reference image"
 
1125
  dest_names = []
1126
  for p in image_paths:
1127
  dn = f"ref_{uuid.uuid4().hex[:8]}_{os.path.basename(p)}"
1128
+ shutil.copy2(p, INPUT / dn)
1129
  dest_names.append(dn)
1130
 
1131
  gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
1132
 
1133
+ print(prompt)
 
 
 
 
 
1134
 
1135
  progress(0.1, desc="building workflow...")
1136
  visual_wf = _load_workflow()
 
1175
 
1176
  ts = time.strftime("%Y%m%d_%H%M%S")
1177
  out_path = os.path.join(SAVE_BASE, f"r2v_{ts}.mp4")
1178
+ shutil.copy2(output_video, out_path)
1179
+
1180
+ return out_path, f"Seed: {final_seed}\n{out_path}"
1181
 
 
 
 
 
1182
 
1183
  with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
1184
  gr.Markdown("# Bernini-R Wan 2.2 R2V Lightning")
 
1194
  prompt = gr.Textbox(
1195
  label="Prompt",
1196
  lines=3,
1197
+ placeholder="Describe the subject's action in detail...",
1198
  value="Keeping the exact identity and appearance the same as in image0, the person in image0 dances in a supermarket.",
1199
  )
1200
+ with gr.Row():
1201
+ enhance_btn = gr.Button("Enhance prompt", variant="secondary", size="sm")
1202
+ no_redescribe = gr.Checkbox(
1203
+ label="don't redescribe reference subjects",
1204
+ value=False,
1205
  )
1206
+
1207
  with gr.Group():
1208
  aspect_ratio = gr.Radio(
1209
  choices=list(ASPECT_PRESETS.keys()),
 
1215
  value=False,
1216
  )
1217
 
1218
+ with gr.Row():
1219
+ duration_secs = gr.Slider(1, 20, value=5, step=0.5, label="Duration (s)")
1220
+ base_fps = gr.Number(value=15, precision=0, label="Base FPS")
1221
+
1222
+ with gr.Row():
1223
+ seed = gr.Number(value=0, precision=0, label="Seed (0=random)")
1224
+ gpu_budget = gr.Slider(0, 540, value=0, step=10, label="ZeroGPU budget (0=auto)")
1225
+
1226
  with gr.Group(elem_id="loras_9999"):
1227
  loras_enabled = gr.Checkbox(label="optional loras", value=False)
1228
  with gr.Column(visible=False) as loras_section:
 
1230
  with gr.Group():
1231
  gr.Markdown("<div style='padding-left:8px'>High</div>")
1232
  with gr.Row():
1233
+ s_wamu_h = gr.Slider(-2, 2, value=1.0, step=0.05, label="wamu")
1234
+ s_dreamly_h = gr.Slider(-2, 2, value=1.0, step=0.05, label="dreamly")
1235
  s_svicamera_h = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
1236
  with gr.Group():
1237
  gr.Markdown("<div style='padding-left:8px'>Low</div>")
 
1239
  s_wamu_l = gr.Slider(-2, 2, value=0.5, step=0.05, label="wamu")
1240
  s_dreamly_l = gr.Slider(-2, 2, value=0.7, step=0.05, label="dreamly")
1241
  s_svicamera_l = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1242
 
1243
  loras_enabled.change(
1244
  fn=lambda x: gr.update(visible=x),
 
1246
  outputs=loras_section,
1247
  )
1248
 
1249
+ with gr.Accordion("Advanced", open=False):
1250
+ with gr.Row():
1251
+ num_steps = gr.Slider(4, 20, value=6, step=1, label="Steps")
1252
+ sampler_name = gr.Dropdown(choices=["uni_pc", "lcm"], value="lcm", label="Sampler")
1253
+ rife_mode = gr.Dropdown(
1254
+ choices=["no rife", "2x rife", "4x rife"],
1255
+ value="2x rife",
1256
+ label="RIFE interpolation",
1257
+ )
1258
+
1259
+ with gr.Accordion("Negative prompt", open=False):
1260
+ negative = gr.Textbox(
1261
+ label="",
1262
+ lines=2,
1263
+ value=DEFAULT_NEGATIVE,
1264
+ )
1265
+
1266
+ with gr.Accordion("IPNC", open=False):
1267
+ ipnc_enabled = gr.Checkbox(label="Enable", value=True)
1268
+ with gr.Column(visible=True) as ipnc_section:
1269
+ with gr.Group():
1270
+ gr.Markdown("<div style='padding-left:8px'>High</div>")
1271
+ with gr.Row():
1272
+ ipnc_wamu_h = gr.Slider(0, 200, value=100, step=1, label="wamu")
1273
+ ipnc_dreamly_h = gr.Slider(0, 200, value=100, step=1, label="dreamly")
1274
+ ipnc_svicamera_h = gr.Slider(0, 200, value=100, step=1, label="svicamera")
1275
+ with gr.Group():
1276
+ gr.Markdown("<div style='padding-left:8px'>Low</div>")
1277
+ with gr.Row():
1278
+ ipnc_wamu_l = gr.Slider(0, 200, value=100, step=1, label="wamu")
1279
+ ipnc_dreamly_l = gr.Slider(0, 200, value=100, step=1, label="dreamly")
1280
+ ipnc_svicamera_l = gr.Slider(0, 200, value=100, step=1, label="svicamera")
1281
+
1282
+ ipnc_enabled.change(
1283
+ fn=lambda x: gr.update(visible=x),
1284
+ inputs=ipnc_enabled,
1285
+ outputs=ipnc_section,
1286
  )
1287
 
 
1288
  generate_btn = gr.Button("Generate", variant="primary", size="lg")
1289
 
1290
  with gr.Column(scale=1):
1291
# Output widgets are created first: the click wiring below lists them in
# `outputs`, so both names must already be bound when that call runs.
output_video = gr.Video(label="Generated video")
output_status = gr.Textbox(label="Status", interactive=False, lines=4)

# hidden: chunked generation endpoint
# The three hidden textboxes carry session_id, steps_per_chunk and chunk_step,
# the last three parameters of generate_chunk_handler.
chunk_session_id = gr.Textbox(visible=False, value="")
chunk_steps_per = gr.Textbox(visible=False, value="2")
chunk_step_idx = gr.Textbox(visible=False, value="0")
chunk_btn = gr.Button(visible=False)
# NOTE(review): no api_name is set here, so the endpoint name is whatever
# Gradio derives by default -- confirm external callers do not rely on a
# fixed name such as "generate_chunk".
chunk_btn.click(
    fn=generate_chunk_handler,
    # Order must follow generate_chunk_handler's positional parameters.
    inputs=[
        prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect, gpu_budget,
        num_steps, duration_secs, sampler_name, base_fps, rife_mode,
        loras_enabled,
        s_wamu_h, s_dreamly_h,
        s_wamu_l, s_dreamly_l,
        s_svicamera_h, s_svicamera_l,
        ipnc_enabled,
        ipnc_wamu_h, ipnc_dreamly_h,
        ipnc_wamu_l, ipnc_dreamly_l,
        ipnc_svicamera_h, ipnc_svicamera_l,
        chunk_session_id, chunk_steps_per, chunk_step_idx,
    ],
    outputs=[output_video, output_status],
)
1315
+
1316
+ # ── enhance handler ──────────────────────────────────────────────────
1317
+ def enhance_handler(prompt: str, image_input: Any, no_redescribe: bool,
1318
+ progress=gr.Progress()):
1319
+ paths = []
1320
+ for item in (image_input or []):
1321
+ if isinstance(item, str) and os.path.exists(item):
1322
+ paths.append(item)
1323
+ elif isinstance(item, dict):
1324
+ p = item.get("path") or item.get("name")
1325
+ if p and os.path.exists(p): paths.append(p)
1326
+ elif isinstance(item, (list, tuple)) and item:
1327
+ p = item[0] if isinstance(item[0], str) else (item[0].get("path") if isinstance(item[0], dict) else None)
1328
+ if p and os.path.exists(p): paths.append(p)
1329
+ paths = [p for p in paths if p and os.path.exists(p)][:5]
1330
+ if not paths:
1331
+ raise gr.Error("upload at least one reference image")
1332
+ if not (prompt or "").strip():
1333
+ raise gr.Error("enter a prompt")
1334
+ result = _enhance_prompt_r2v(prompt, paths, no_redescribe=bool(no_redescribe))
1335
+ if result:
1336
+ return result
1337
+ raise gr.Error("enhancement failed")
1338
+
1339
+ enhance_btn.click(
1340
+ fn=enhance_handler,
1341
+ inputs=[prompt, image_input, no_redescribe],
1342
+ outputs=[prompt],
1343
+ )
1344
 
1345
  generate_btn.click(
1346
  fn=generate_handler,
 
1350
  loras_enabled,
1351
  s_wamu_h, s_dreamly_h,
1352
  s_wamu_l, s_dreamly_l,
 
 
 
 
1353
  s_svicamera_h, s_svicamera_l,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1354
  ipnc_enabled,
1355
  ipnc_wamu_h, ipnc_dreamly_h,
1356
  ipnc_wamu_l, ipnc_dreamly_l,
 
1357
  ipnc_svicamera_h, ipnc_svicamera_l,
 
1358
  ],
1359
  outputs=[output_video, output_status],
 
1360
  )
1361
 
1362
+
1363
  if __name__ == "__main__":
1364
  _ensure_comfy()
1365
  _ensure_models()