Spaces:
Runtime error
Runtime error
Commit ·
7c6328c
1
Parent(s): 45056bf
duration max 25, steps_per_chunk param
Browse files
app.py
CHANGED
|
@@ -378,72 +378,92 @@ DEFAULT_NEGATIVE = (
|
|
| 378 |
|
| 379 |
|
| 380 |
R2V_TEMPLATE = """You are an expert at writing subject-driven video generation prompts. I'm providing you with:
|
| 381 |
-
1. {image_num} reference image(s) of the subject(s)
|
| 382 |
2. An original video description text.
|
| 383 |
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
**Part 2 - Long instruction**: A detailed "Generate a video where..." paragraph that describes:
|
| 389 |
-
- The subject(s) from the reference image(s) with detailed appearance (hair, clothing, accessories, expression, etc.), referencing them as "the person/man/woman from image0" etc.
|
| 390 |
-
- The scene/environment in detail (background, lighting, objects, atmosphere).
|
| 391 |
-
- The motion and actions in a step-by-step temporal sequence (at the start..., then..., after that...).
|
| 392 |
-
- The motion should remain natural and realistic.
|
| 393 |
|
| 394 |
Requirements:
|
| 395 |
-
-
|
| 396 |
-
-
|
| 397 |
-
-
|
| 398 |
-
- The
|
| 399 |
-
-
|
|
|
|
|
|
|
|
|
|
| 400 |
|
| 401 |
Original description:
|
| 402 |
{original_text}
|
| 403 |
"""
|
| 404 |
|
| 405 |
|
| 406 |
-
|
| 407 |
-
"
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
try:
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
|
|
|
| 421 |
if not path or not os.path.exists(path):
|
| 422 |
continue
|
| 423 |
with open(path, "rb") as f:
|
| 424 |
b64 = base64.b64encode(f.read()).decode("utf-8")
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
{"
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
)
|
| 437 |
-
|
|
|
|
| 438 |
enhanced = _json.loads(text).get("rewritten_text", "").strip()
|
| 439 |
if enhanced:
|
| 440 |
-
print(f"[enhancer] enhanced
|
| 441 |
return enhanced
|
| 442 |
return None
|
| 443 |
except Exception as e:
|
| 444 |
print(f"[enhancer] failed: {e}", flush=True)
|
| 445 |
return None
|
| 446 |
|
|
|
|
| 447 |
def _load_workflow() -> dict[str, Any]:
    """Read the workflow file at ``ROOT / WORKFLOW_FILE`` and return it parsed.

    The file is decoded as UTF-8 and parsed as JSON on every call; nothing
    is cached here.
    """
    workflow_text = (ROOT / WORKFLOW_FILE).read_text(encoding="utf-8")
    return json.loads(workflow_text)
|
|
@@ -856,41 +876,20 @@ def _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref
|
|
| 856 |
return max(30, int(total))
|
| 857 |
|
| 858 |
|
|
|
|
| 859 |
def _get_duration(
    prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
    gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode,
    *args, **kwargs,
):
    """Pick the GPU-time budget for one generation request.

    A positive ``gpu_budget`` wins outright; otherwise the budget is whatever
    ``_estimate_duration`` computes from the step count, clip length, frame
    rate, aspect settings and RIFE mode.

    Only the twelve named parameters are inspected.  ``*args`` / ``**kwargs``
    absorb any further handler inputs (LoRA strengths, IPNC settings, ...)
    so this function can be called with a handler's complete argument list
    without raising ``TypeError``.

    Returns:
        int: the budget (presumably seconds -- the unit is not stated here).
    """
    # Convert once; falsy values (0, None, "") mean "no explicit budget".
    requested = int(gpu_budget) if gpu_budget else 0
    if requested > 0:
        return requested
    return _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref_aspect, rife_mode)
|
| 866 |
|
| 867 |
|
| 868 |
-
|
| 869 |
-
prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
|
| 870 |
-
gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode, *args, **kwargs,
|
| 871 |
-
):
|
| 872 |
-
"""Each chunk is self-contained β request 120s for the GPU."""
|
| 873 |
-
if gpu_budget and int(gpu_budget) > 0:
|
| 874 |
-
return int(gpu_budget)
|
| 875 |
-
return 120
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
def _coerce_gallery(g) -> list:
    """Normalise a gallery value into at most five existing file paths.

    Each entry of *g* may be:

    * a path string,
    * a dict carrying the path under ``"path"`` (preferred) or ``"name"``,
    * a list/tuple whose first element is either of the above.

    Anything else is ignored, as are empty values and paths that do not exist
    on disk.  Input order is preserved.

    Args:
        g: The gallery value; ``None`` and empty sequences yield ``[]``.

    Returns:
        The first five usable paths.
    """
    def _path_of(entry):
        # Dicts get the same "path" -> "name" fallback whether they appear
        # bare or wrapped inside a tuple/list.
        if isinstance(entry, dict):
            return entry.get("path") or entry.get("name")
        return entry

    candidates = []
    for item in (g or []):
        if isinstance(item, (list, tuple)):
            # Only the first element of a wrapped entry names the file.
            item = item[0] if item else None
        candidates.append(_path_of(item))

    # Restrict to real path types: os.path.exists() would treat a stray int
    # as a file descriptor and report it as "existing".
    return [
        p for p in candidates
        if isinstance(p, (str, os.PathLike)) and p and os.path.exists(p)
    ][:5]
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
@spaces.GPU(duration=_chunk_get_duration)
|
| 894 |
def generate_chunk_handler(
|
| 895 |
prompt: str,
|
| 896 |
negative: str,
|
|
@@ -901,26 +900,26 @@ def generate_chunk_handler(
|
|
| 901 |
gpu_budget: int = 120,
|
| 902 |
num_steps: int = 6,
|
| 903 |
duration_secs: float = 5.0,
|
| 904 |
-
sampler_name: str = "
|
| 905 |
base_fps: int = 15,
|
| 906 |
-
rife_mode: str = "
|
| 907 |
loras_enabled: bool = False,
|
| 908 |
s_wamu_h: float = 1.0,
|
| 909 |
-
s_dreamly_h: float =
|
| 910 |
s_wamu_l: float = 0.5,
|
| 911 |
-
s_dreamly_l: float = 0.
|
| 912 |
-
|
| 913 |
-
|
|
|
|
| 914 |
ipnc_wamu_h: float = 100.0,
|
| 915 |
ipnc_dreamly_h: float = 100.0,
|
| 916 |
ipnc_wamu_l: float = 100.0,
|
| 917 |
ipnc_dreamly_l: float = 100.0,
|
| 918 |
-
s_svicamera_h: float = 0.0,
|
| 919 |
-
s_svicamera_l: float = 0.0,
|
| 920 |
ipnc_svicamera_h: float = 100.0,
|
| 921 |
ipnc_svicamera_l: float = 100.0,
|
| 922 |
# chunking params
|
| 923 |
session_id: str = "",
|
|
|
|
| 924 |
chunk_step: str = "0",
|
| 925 |
progress=gr.Progress(track_tqdm=True),
|
| 926 |
):
|
|
@@ -952,7 +951,7 @@ def generate_chunk_handler(
|
|
| 952 |
gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
|
| 953 |
n_frames = max(1, round(float(duration_secs) * int(base_fps)))
|
| 954 |
steps = int(num_steps)
|
| 955 |
-
STEPS_PER_CHUNK = 2
|
| 956 |
chunk_idx = int(float(chunk_step or 0))
|
| 957 |
step_start = chunk_idx * STEPS_PER_CHUNK
|
| 958 |
step_end = min(step_start + STEPS_PER_CHUNK, steps)
|
|
@@ -992,7 +991,7 @@ def generate_chunk_handler(
|
|
| 992 |
sampler_id = "57" if use_high else "58"
|
| 993 |
|
| 994 |
# Inject chunk nodes: save/load latent, save/load conditioning
|
| 995 |
-
# Batch multiple steps per chunk (
|
| 996 |
api_wf[sampler_id]["inputs"]["steps"] = steps
|
| 997 |
api_wf[sampler_id]["inputs"]["start_at_step"] = step_start
|
| 998 |
api_wf[sampler_id]["inputs"]["end_at_step"] = step_end
|
|
@@ -1074,25 +1073,24 @@ def generate_handler(
|
|
| 1074 |
seed: int,
|
| 1075 |
aspect_ratio: str = "16:9",
|
| 1076 |
use_ref_aspect: bool = False,
|
| 1077 |
-
gpu_budget: int = 0,
|
| 1078 |
num_steps: int = 6,
|
| 1079 |
duration_secs: float = 10.0,
|
| 1080 |
-
sampler_name: str = "
|
| 1081 |
base_fps: int = 15,
|
| 1082 |
-
rife_mode: str = "
|
| 1083 |
loras_enabled: bool = False,
|
| 1084 |
s_wamu_h: float = 1.0,
|
| 1085 |
-
s_dreamly_h: float =
|
| 1086 |
s_wamu_l: float = 0.5,
|
| 1087 |
-
s_dreamly_l: float = 0.
|
| 1088 |
-
|
| 1089 |
-
|
|
|
|
| 1090 |
ipnc_wamu_h: float = 100.0,
|
| 1091 |
ipnc_dreamly_h: float = 100.0,
|
| 1092 |
ipnc_wamu_l: float = 100.0,
|
| 1093 |
ipnc_dreamly_l: float = 100.0,
|
| 1094 |
-
s_svicamera_h: float = 0.0,
|
| 1095 |
-
s_svicamera_l: float = 0.0,
|
| 1096 |
ipnc_svicamera_h: float = 100.0,
|
| 1097 |
ipnc_svicamera_l: float = 100.0,
|
| 1098 |
progress=gr.Progress(track_tqdm=True),
|
|
@@ -1105,6 +1103,20 @@ def generate_handler(
|
|
| 1105 |
final_seed = int(seed) if seed else random.randint(0, MAX_SEED)
|
| 1106 |
negative = negative or DEFAULT_NEGATIVE
|
| 1107 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1108 |
image_paths = _coerce_gallery(image_input)
|
| 1109 |
if not image_paths:
|
| 1110 |
return None, "upload at least one reference image"
|
|
@@ -1113,17 +1125,12 @@ def generate_handler(
|
|
| 1113 |
dest_names = []
|
| 1114 |
for p in image_paths:
|
| 1115 |
dn = f"ref_{uuid.uuid4().hex[:8]}_{os.path.basename(p)}"
|
| 1116 |
-
shutil.
|
| 1117 |
dest_names.append(dn)
|
| 1118 |
|
| 1119 |
gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
|
| 1120 |
|
| 1121 |
-
|
| 1122 |
-
if enhance_prompt:
|
| 1123 |
-
progress(0.05, desc="enhancing prompt...")
|
| 1124 |
-
enhanced_prompt_text = _enhance_prompt_r2v(prompt, image_paths)
|
| 1125 |
-
if enhanced_prompt_text:
|
| 1126 |
-
prompt = enhanced_prompt_text
|
| 1127 |
|
| 1128 |
progress(0.1, desc="building workflow...")
|
| 1129 |
visual_wf = _load_workflow()
|
|
@@ -1168,12 +1175,10 @@ def generate_handler(
|
|
| 1168 |
|
| 1169 |
ts = time.strftime("%Y%m%d_%H%M%S")
|
| 1170 |
out_path = os.path.join(SAVE_BASE, f"r2v_{ts}.mp4")
|
| 1171 |
-
shutil.
|
|
|
|
|
|
|
| 1172 |
|
| 1173 |
-
status = f"Done: {out_path}"
|
| 1174 |
-
if enhanced_prompt_text:
|
| 1175 |
-
status += f"\n\n--- Enhanced prompt ---\n{enhanced_prompt_text}"
|
| 1176 |
-
return out_path, status
|
| 1177 |
|
| 1178 |
with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
|
| 1179 |
gr.Markdown("# Bernini-R Wan 2.2 R2V Lightning")
|
|
@@ -1189,15 +1194,16 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
|
|
| 1189 |
prompt = gr.Textbox(
|
| 1190 |
label="Prompt",
|
| 1191 |
lines=3,
|
| 1192 |
-
placeholder="Describe the subject's action in detail.
|
| 1193 |
value="Keeping the exact identity and appearance the same as in image0, the person in image0 dances in a supermarket.",
|
| 1194 |
)
|
| 1195 |
-
with gr.
|
| 1196 |
-
|
| 1197 |
-
|
| 1198 |
-
|
| 1199 |
-
value=
|
| 1200 |
)
|
|
|
|
| 1201 |
with gr.Group():
|
| 1202 |
aspect_ratio = gr.Radio(
|
| 1203 |
choices=list(ASPECT_PRESETS.keys()),
|
|
@@ -1209,6 +1215,14 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
|
|
| 1209 |
value=False,
|
| 1210 |
)
|
| 1211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1212 |
with gr.Group(elem_id="loras_9999"):
|
| 1213 |
loras_enabled = gr.Checkbox(label="optional loras", value=False)
|
| 1214 |
with gr.Column(visible=False) as loras_section:
|
|
@@ -1216,8 +1230,8 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
|
|
| 1216 |
with gr.Group():
|
| 1217 |
gr.Markdown("<div style='padding-left:8px'>High</div>")
|
| 1218 |
with gr.Row():
|
| 1219 |
-
s_wamu_h = gr.Slider(-2, 2, value=
|
| 1220 |
-
s_dreamly_h = gr.Slider(-2, 2, value=
|
| 1221 |
s_svicamera_h = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
|
| 1222 |
with gr.Group():
|
| 1223 |
gr.Markdown("<div style='padding-left:8px'>Low</div>")
|
|
@@ -1225,21 +1239,6 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
|
|
| 1225 |
s_wamu_l = gr.Slider(-2, 2, value=0.5, step=0.05, label="wamu")
|
| 1226 |
s_dreamly_l = gr.Slider(-2, 2, value=0.7, step=0.05, label="dreamly")
|
| 1227 |
s_svicamera_l = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
|
| 1228 |
-
gr.Markdown("<hr style='margin:8px 0'>")
|
| 1229 |
-
ipnc_enabled = gr.Checkbox(label="IPNC", value=False)
|
| 1230 |
-
with gr.Column(visible=False) as ipnc_section:
|
| 1231 |
-
with gr.Group():
|
| 1232 |
-
gr.Markdown("<div style='padding-left:8px'>High</div>")
|
| 1233 |
-
with gr.Row():
|
| 1234 |
-
ipnc_wamu_h = gr.Slider(0, 200, value=100, step=1, label="wamu")
|
| 1235 |
-
ipnc_dreamly_h = gr.Slider(0, 200, value=100, step=1, label="dreamly")
|
| 1236 |
-
ipnc_svicamera_h = gr.Slider(0, 200, value=100, step=1, label="svicamera")
|
| 1237 |
-
with gr.Group():
|
| 1238 |
-
gr.Markdown("<div style='padding-left:8px'>Low</div>")
|
| 1239 |
-
with gr.Row():
|
| 1240 |
-
ipnc_wamu_l = gr.Slider(0, 200, value=100, step=1, label="wamu")
|
| 1241 |
-
ipnc_dreamly_l = gr.Slider(0, 200, value=100, step=1, label="dreamly")
|
| 1242 |
-
ipnc_svicamera_l = gr.Slider(0, 200, value=100, step=1, label="svicamera")
|
| 1243 |
|
| 1244 |
loras_enabled.change(
|
| 1245 |
fn=lambda x: gr.update(visible=x),
|
|
@@ -1247,32 +1246,101 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
|
|
| 1247 |
outputs=loras_section,
|
| 1248 |
)
|
| 1249 |
|
| 1250 |
-
|
| 1251 |
-
|
| 1252 |
-
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
-
|
| 1256 |
-
|
| 1257 |
-
|
| 1258 |
-
|
| 1259 |
-
|
| 1260 |
-
|
| 1261 |
-
|
| 1262 |
-
|
| 1263 |
-
|
| 1264 |
-
|
| 1265 |
-
|
| 1266 |
-
|
| 1267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1268 |
)
|
| 1269 |
|
| 1270 |
-
enhance_prompt = gr.Checkbox(label="enhance prompt", value=False)
|
| 1271 |
generate_btn = gr.Button("Generate", variant="primary", size="lg")
|
| 1272 |
|
| 1273 |
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1274 |
output_video = gr.Video(label="Generated video")
|
| 1275 |
-
output_status = gr.Textbox(label="Status", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1276 |
|
| 1277 |
generate_btn.click(
|
| 1278 |
fn=generate_handler,
|
|
@@ -1282,39 +1350,16 @@ with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
|
|
| 1282 |
loras_enabled,
|
| 1283 |
s_wamu_h, s_dreamly_h,
|
| 1284 |
s_wamu_l, s_dreamly_l,
|
| 1285 |
-
enhance_prompt,
|
| 1286 |
-
ipnc_enabled,
|
| 1287 |
-
ipnc_wamu_h, ipnc_dreamly_h,
|
| 1288 |
-
ipnc_wamu_l, ipnc_dreamly_l,
|
| 1289 |
s_svicamera_h, s_svicamera_l,
|
| 1290 |
-
ipnc_svicamera_h, ipnc_svicamera_l,
|
| 1291 |
-
],
|
| 1292 |
-
outputs=[output_video, output_status],
|
| 1293 |
-
)
|
| 1294 |
-
|
| 1295 |
-
_chunk_session_id = gr.Textbox(visible=False, value="")
|
| 1296 |
-
_chunk_step_txt = gr.Textbox(visible=False, value="0")
|
| 1297 |
-
_chunk_btn = gr.Button(visible=False)
|
| 1298 |
-
_chunk_btn.click(
|
| 1299 |
-
fn=generate_chunk_handler,
|
| 1300 |
-
inputs=[
|
| 1301 |
-
prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect, gpu_budget,
|
| 1302 |
-
num_steps, duration_secs, sampler_name, base_fps, rife_mode,
|
| 1303 |
-
loras_enabled,
|
| 1304 |
-
s_wamu_h, s_dreamly_h,
|
| 1305 |
-
s_wamu_l, s_dreamly_l,
|
| 1306 |
-
enhance_prompt,
|
| 1307 |
ipnc_enabled,
|
| 1308 |
ipnc_wamu_h, ipnc_dreamly_h,
|
| 1309 |
ipnc_wamu_l, ipnc_dreamly_l,
|
| 1310 |
-
s_svicamera_h, s_svicamera_l,
|
| 1311 |
ipnc_svicamera_h, ipnc_svicamera_l,
|
| 1312 |
-
_chunk_session_id, _chunk_step_txt,
|
| 1313 |
],
|
| 1314 |
outputs=[output_video, output_status],
|
| 1315 |
-
api_name="generate_chunk",
|
| 1316 |
)
|
| 1317 |
|
|
|
|
| 1318 |
if __name__ == "__main__":
|
| 1319 |
_ensure_comfy()
|
| 1320 |
_ensure_models()
|
|
|
|
| 378 |
|
| 379 |
|
| 380 |
R2V_TEMPLATE = """You are an expert at writing subject-driven video generation prompts. I'm providing you with:
|
| 381 |
+
1. {image_num} reference image(s) of the subject(s) (referred to as image0, image1, ... in order).
|
| 382 |
2. An original video description text.
|
| 383 |
|
| 384 |
+
Rewrite the description into TWO concatenated parts:
|
| 385 |
+
Part 1 - Short: A concise sentence describing who appears (reference as image0/image1/etc.), where, and what key action/motion.
|
| 386 |
+
Part 2 - Long: A detailed "Generate a video where..." paragraph with full appearance details referencing each subject as "the person from image0" etc., detailed scene/environment, and step-by-step temporal motion sequence.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
Requirements:
|
| 389 |
+
- Reference each subject as image0/image1/etc., base appearance on what you see in the images (no hallucination), output entirely in English.
|
| 390 |
+
- For every action or scenario described, identify and explicitly state all implied visual elements that are not mentioned but must be present for the scene to exist: who or what else is in the frame, what the subject is wearing or holding that the activity requires, what the environment necessarily contains, any other participants or objects involved.
|
| 391 |
+
- For each action, explicitly state all visually relevant body states that are not mentioned but would be visible on camera: hand positions (open, clenched, raised, at side), facial expression, direction of gaze, posture, weight distribution, foot placement. Do not assume any body state is obvious -- state it explicitly.
|
| 392 |
+
- The final prompt must be detailed enough that a complete mental image of the scene can be formed without seeing the reference images. Every visual element a film director would need to brief their crew on -- blocking, attire, props, environment, participant positions -- must appear in the text.
|
| 393 |
+
- Do not robotically enumerate biomechanics. Write naturally while ensuring no visual element is left implicit.
|
| 394 |
+
{extra_rule}
|
| 395 |
+
- For any close physical interaction or insertion, you MUST explicitly define the spatial occlusion. State exactly what is physically connected, what is penetrating or entering a space, and what is visually hidden inside the other object/body versus what remains visible outside.
|
| 396 |
+
Return ONLY a JSON object with one key: "rewritten_text".
|
| 397 |
|
| 398 |
Original description:
|
| 399 |
{original_text}
|
| 400 |
"""
|
| 401 |
|
| 402 |
|
| 403 |
+
ENHANCE_EXTRA_RULE_NOREDESCRIBE = (
|
| 404 |
+
"Do not describe the inherent visual appearance of subjects from the reference images "
|
| 405 |
+
"(their face, hair, body type, baseline clothing) -- the model already sees those. "
|
| 406 |
+
"However, DO describe any scene-specific additions to reference subjects' appearance: "
|
| 407 |
+
"equipment, props, or attire added for this scene that would not be present in the reference image."
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
ENHANCE_EXTRA_RULE_DEFAULT = (
|
| 411 |
+
"Include full appearance details of reference subjects."
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
def _enhance_prompt_r2v(prompt: str, image_paths: list[str], no_redescribe: bool = False) -> str | None:
|
| 416 |
+
"""Call grok-4.3 via xAI API. Direct connection."""
|
| 417 |
+
import base64, mimetypes as _mt, json as _json
|
| 418 |
try:
|
| 419 |
+
image_num = len([p for p in image_paths if p and os.path.exists(p)])
|
| 420 |
+
extra_rule = ENHANCE_EXTRA_RULE_NOREDESCRIBE if no_redescribe else ENHANCE_EXTRA_RULE_DEFAULT
|
| 421 |
+
user_text = R2V_TEMPLATE.format(
|
| 422 |
+
image_num=max(image_num, 1),
|
| 423 |
+
extra_rule=extra_rule,
|
| 424 |
+
original_text=prompt,
|
| 425 |
+
)
|
| 426 |
+
content_msgs: list = [{"type": "text", "text": user_text}]
|
| 427 |
+
for i, path in enumerate(image_paths[:5]):
|
| 428 |
if not path or not os.path.exists(path):
|
| 429 |
continue
|
| 430 |
with open(path, "rb") as f:
|
| 431 |
b64 = base64.b64encode(f.read()).decode("utf-8")
|
| 432 |
+
mime, _ = _mt.guess_type(path)
|
| 433 |
+
mime = mime or "image/jpeg"
|
| 434 |
+
content_msgs.append({"type": "text", "text": f"\\n[Image {i}]:"})
|
| 435 |
+
content_msgs.append({"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}})
|
| 436 |
+
payload = {
|
| 437 |
+
"model": "grok-4.3",
|
| 438 |
+
"messages": [
|
| 439 |
+
{"role": "system", "content": "You are a helpful assistant."},
|
| 440 |
+
{"role": "user", "content": content_msgs},
|
| 441 |
+
],
|
| 442 |
+
"reasoning_effort": "xhigh",
|
| 443 |
+
"response_format": {"type": "json_object"},
|
| 444 |
+
}
|
| 445 |
+
import requests as _req
|
| 446 |
+
r = _req.post(
|
| 447 |
+
"https://api.x.ai/v1/chat/completions",
|
| 448 |
+
json=payload,
|
| 449 |
+
headers={
|
| 450 |
+
"Authorization": f"Bearer {os.environ.get('XAI_API_KEY', '')}",
|
| 451 |
+
"Content-Type": "application/json",
|
| 452 |
+
},
|
| 453 |
+
timeout=300,
|
| 454 |
)
|
| 455 |
+
r.raise_for_status()
|
| 456 |
+
text = r.json()["choices"][0]["message"]["content"]
|
| 457 |
enhanced = _json.loads(text).get("rewritten_text", "").strip()
|
| 458 |
if enhanced:
|
| 459 |
+
print(f"[enhancer] enhanced ({len(enhanced)} chars)", flush=True)
|
| 460 |
return enhanced
|
| 461 |
return None
|
| 462 |
except Exception as e:
|
| 463 |
print(f"[enhancer] failed: {e}", flush=True)
|
| 464 |
return None
|
| 465 |
|
| 466 |
+
|
| 467 |
def _load_workflow() -> dict[str, Any]:
    """Read the workflow file at ``ROOT / WORKFLOW_FILE`` and return it parsed.

    The file is decoded as UTF-8 and parsed as JSON on every call; nothing
    is cached here.
    """
    workflow_text = (ROOT / WORKFLOW_FILE).read_text(encoding="utf-8")
    return json.loads(workflow_text)
|
|
|
|
| 876 |
return max(30, int(total))
|
| 877 |
|
| 878 |
|
| 879 |
+
|
| 880 |
def _get_duration(
    prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect,
    gpu_budget, num_steps, duration_secs, sampler_name, base_fps, rife_mode,
    ipnc_enabled, ipnc_wamu_h, ipnc_dreamly_h, ipnc_wamu_l, ipnc_dreamly_l,
    ipnc_svicamera_h, ipnc_svicamera_l,
    *args, **kwargs,
):
    """Pick the GPU-time budget for one generation request.

    A positive ``gpu_budget`` is returned as-is (converted to ``int``);
    otherwise the value comes from ``_estimate_duration``.  Every other
    parameter exists only so the signature can accept a handler's argument
    list; extras land in ``*args`` / ``**kwargs`` and are ignored.

    NOTE(review): if this is called positionally with the chunk handler's
    arguments, the slots named ``ipnc_*`` here appear to receive that
    handler's LoRA inputs instead.  They are unused, so it is harmless --
    confirm before relying on those names.

    Returns:
        int: the budget (presumably seconds -- the unit is not stated here).
    """
    explicit_budget = int(gpu_budget) if gpu_budget else 0
    if explicit_budget > 0:
        return explicit_budget
    return _estimate_duration(num_steps, duration_secs, base_fps, aspect_ratio, use_ref_aspect, rife_mode)
|
| 890 |
|
| 891 |
|
| 892 |
+
@spaces.GPU(duration=_get_duration)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 893 |
def generate_chunk_handler(
|
| 894 |
prompt: str,
|
| 895 |
negative: str,
|
|
|
|
| 900 |
gpu_budget: int = 120,
|
| 901 |
num_steps: int = 6,
|
| 902 |
duration_secs: float = 5.0,
|
| 903 |
+
sampler_name: str = "lcm",
|
| 904 |
base_fps: int = 15,
|
| 905 |
+
rife_mode: str = "2x rife",
|
| 906 |
loras_enabled: bool = False,
|
| 907 |
s_wamu_h: float = 1.0,
|
| 908 |
+
s_dreamly_h: float = 1.0,
|
| 909 |
s_wamu_l: float = 0.5,
|
| 910 |
+
s_dreamly_l: float = 0.7,
|
| 911 |
+
s_svicamera_h: float = 0.0,
|
| 912 |
+
s_svicamera_l: float = 0.0,
|
| 913 |
+
ipnc_enabled: bool = True,
|
| 914 |
ipnc_wamu_h: float = 100.0,
|
| 915 |
ipnc_dreamly_h: float = 100.0,
|
| 916 |
ipnc_wamu_l: float = 100.0,
|
| 917 |
ipnc_dreamly_l: float = 100.0,
|
|
|
|
|
|
|
| 918 |
ipnc_svicamera_h: float = 100.0,
|
| 919 |
ipnc_svicamera_l: float = 100.0,
|
| 920 |
# chunking params
|
| 921 |
session_id: str = "",
|
| 922 |
+
steps_per_chunk: str = "2",
|
| 923 |
chunk_step: str = "0",
|
| 924 |
progress=gr.Progress(track_tqdm=True),
|
| 925 |
):
|
|
|
|
| 951 |
gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
|
| 952 |
n_frames = max(1, round(float(duration_secs) * int(base_fps)))
|
| 953 |
steps = int(num_steps)
|
| 954 |
+
STEPS_PER_CHUNK = max(1, int(float(steps_per_chunk or 2)))
|
| 955 |
chunk_idx = int(float(chunk_step or 0))
|
| 956 |
step_start = chunk_idx * STEPS_PER_CHUNK
|
| 957 |
step_end = min(step_start + STEPS_PER_CHUNK, steps)
|
|
|
|
| 991 |
sampler_id = "57" if use_high else "58"
|
| 992 |
|
| 993 |
# Inject chunk nodes: save/load latent, save/load conditioning
|
| 994 |
+
# Batch multiple steps per chunk (set by the caller)
|
| 995 |
api_wf[sampler_id]["inputs"]["steps"] = steps
|
| 996 |
api_wf[sampler_id]["inputs"]["start_at_step"] = step_start
|
| 997 |
api_wf[sampler_id]["inputs"]["end_at_step"] = step_end
|
|
|
|
| 1073 |
seed: int,
|
| 1074 |
aspect_ratio: str = "16:9",
|
| 1075 |
use_ref_aspect: bool = False,
|
| 1076 |
+
gpu_budget: int = 0,
|
| 1077 |
num_steps: int = 6,
|
| 1078 |
duration_secs: float = 10.0,
|
| 1079 |
+
sampler_name: str = "lcm",
|
| 1080 |
base_fps: int = 15,
|
| 1081 |
+
rife_mode: str = "2x rife",
|
| 1082 |
loras_enabled: bool = False,
|
| 1083 |
s_wamu_h: float = 1.0,
|
| 1084 |
+
s_dreamly_h: float = 1.0,
|
| 1085 |
s_wamu_l: float = 0.5,
|
| 1086 |
+
s_dreamly_l: float = 0.7,
|
| 1087 |
+
s_svicamera_h: float = 0.0,
|
| 1088 |
+
s_svicamera_l: float = 0.0,
|
| 1089 |
+
ipnc_enabled: bool = True,
|
| 1090 |
ipnc_wamu_h: float = 100.0,
|
| 1091 |
ipnc_dreamly_h: float = 100.0,
|
| 1092 |
ipnc_wamu_l: float = 100.0,
|
| 1093 |
ipnc_dreamly_l: float = 100.0,
|
|
|
|
|
|
|
| 1094 |
ipnc_svicamera_h: float = 100.0,
|
| 1095 |
ipnc_svicamera_l: float = 100.0,
|
| 1096 |
progress=gr.Progress(track_tqdm=True),
|
|
|
|
| 1103 |
final_seed = int(seed) if seed else random.randint(0, MAX_SEED)
|
| 1104 |
negative = negative or DEFAULT_NEGATIVE
|
| 1105 |
|
| 1106 |
+
def _coerce_gallery(g):
    """Normalise a gallery value into at most five existing file paths.

    Each entry of *g* may be:

    * a path string,
    * a dict carrying the path under ``"path"`` (preferred) or ``"name"``,
    * a list/tuple whose first element is either of the above.

    Anything else is ignored, as are empty values and paths that do not exist
    on disk.  Input order is preserved.

    Args:
        g: The gallery value; ``None`` and empty sequences yield ``[]``.

    Returns:
        The first five usable paths.
    """
    def _path_of(entry):
        # Dicts get the same "path" -> "name" fallback whether they appear
        # bare or wrapped inside a tuple/list.
        if isinstance(entry, dict):
            return entry.get("path") or entry.get("name")
        return entry

    candidates = []
    for item in (g or []):
        if isinstance(item, (list, tuple)):
            # Only the first element of a wrapped entry names the file.
            item = item[0] if item else None
        candidates.append(_path_of(item))

    # Restrict to real path types: os.path.exists() would treat a stray int
    # as a file descriptor and report it as "existing".
    return [
        p for p in candidates
        if isinstance(p, (str, os.PathLike)) and p and os.path.exists(p)
    ][:5]
|
| 1119 |
+
|
| 1120 |
image_paths = _coerce_gallery(image_input)
|
| 1121 |
if not image_paths:
|
| 1122 |
return None, "upload at least one reference image"
|
|
|
|
| 1125 |
dest_names = []
|
| 1126 |
for p in image_paths:
|
| 1127 |
dn = f"ref_{uuid.uuid4().hex[:8]}_{os.path.basename(p)}"
|
| 1128 |
+
shutil.copy2(p, INPUT / dn)
|
| 1129 |
dest_names.append(dn)
|
| 1130 |
|
| 1131 |
gen_w, gen_h = _compute_dims(str(aspect_ratio), image_paths[0], bool(use_ref_aspect))
|
| 1132 |
|
| 1133 |
+
print(prompt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1134 |
|
| 1135 |
progress(0.1, desc="building workflow...")
|
| 1136 |
visual_wf = _load_workflow()
|
|
|
|
| 1175 |
|
| 1176 |
ts = time.strftime("%Y%m%d_%H%M%S")
|
| 1177 |
out_path = os.path.join(SAVE_BASE, f"r2v_{ts}.mp4")
|
| 1178 |
+
shutil.copy2(output_video, out_path)
|
| 1179 |
+
|
| 1180 |
+
return out_path, f"Seed: {final_seed}\n{out_path}"
|
| 1181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1182 |
|
| 1183 |
with gr.Blocks(title="Bernini-R Wan 2.2 R2V Lightning") as demo:
|
| 1184 |
gr.Markdown("# Bernini-R Wan 2.2 R2V Lightning")
|
|
|
|
| 1194 |
prompt = gr.Textbox(
|
| 1195 |
label="Prompt",
|
| 1196 |
lines=3,
|
| 1197 |
+
placeholder="Describe the subject's action in detail...",
|
| 1198 |
value="Keeping the exact identity and appearance the same as in image0, the person in image0 dances in a supermarket.",
|
| 1199 |
)
|
| 1200 |
+
with gr.Row():
|
| 1201 |
+
enhance_btn = gr.Button("Enhance prompt", variant="secondary", size="sm")
|
| 1202 |
+
no_redescribe = gr.Checkbox(
|
| 1203 |
+
label="don't redescribe reference subjects",
|
| 1204 |
+
value=False,
|
| 1205 |
)
|
| 1206 |
+
|
| 1207 |
with gr.Group():
|
| 1208 |
aspect_ratio = gr.Radio(
|
| 1209 |
choices=list(ASPECT_PRESETS.keys()),
|
|
|
|
| 1215 |
value=False,
|
| 1216 |
)
|
| 1217 |
|
| 1218 |
+
with gr.Row():
|
| 1219 |
+
duration_secs = gr.Slider(1, 20, value=5, step=0.5, label="Duration (s)")
|
| 1220 |
+
base_fps = gr.Number(value=15, precision=0, label="Base FPS")
|
| 1221 |
+
|
| 1222 |
+
with gr.Row():
|
| 1223 |
+
seed = gr.Number(value=0, precision=0, label="Seed (0=random)")
|
| 1224 |
+
gpu_budget = gr.Slider(0, 540, value=0, step=10, label="ZeroGPU budget (0=auto)")
|
| 1225 |
+
|
| 1226 |
with gr.Group(elem_id="loras_9999"):
|
| 1227 |
loras_enabled = gr.Checkbox(label="optional loras", value=False)
|
| 1228 |
with gr.Column(visible=False) as loras_section:
|
|
|
|
| 1230 |
with gr.Group():
|
| 1231 |
gr.Markdown("<div style='padding-left:8px'>High</div>")
|
| 1232 |
with gr.Row():
|
| 1233 |
+
s_wamu_h = gr.Slider(-2, 2, value=1.0, step=0.05, label="wamu")
|
| 1234 |
+
s_dreamly_h = gr.Slider(-2, 2, value=1.0, step=0.05, label="dreamly")
|
| 1235 |
s_svicamera_h = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
|
| 1236 |
with gr.Group():
|
| 1237 |
gr.Markdown("<div style='padding-left:8px'>Low</div>")
|
|
|
|
| 1239 |
s_wamu_l = gr.Slider(-2, 2, value=0.5, step=0.05, label="wamu")
|
| 1240 |
s_dreamly_l = gr.Slider(-2, 2, value=0.7, step=0.05, label="dreamly")
|
| 1241 |
s_svicamera_l = gr.Slider(-2, 2, value=0.0, step=0.05, label="svicamera")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1242 |
|
| 1243 |
loras_enabled.change(
|
| 1244 |
fn=lambda x: gr.update(visible=x),
|
|
|
|
| 1246 |
outputs=loras_section,
|
| 1247 |
)
|
| 1248 |
|
| 1249 |
+
with gr.Accordion("Advanced", open=False):
|
| 1250 |
+
with gr.Row():
|
| 1251 |
+
num_steps = gr.Slider(4, 20, value=6, step=1, label="Steps")
|
| 1252 |
+
sampler_name = gr.Dropdown(choices=["uni_pc", "lcm"], value="lcm", label="Sampler")
|
| 1253 |
+
rife_mode = gr.Dropdown(
|
| 1254 |
+
choices=["no rife", "2x rife", "4x rife"],
|
| 1255 |
+
value="2x rife",
|
| 1256 |
+
label="RIFE interpolation",
|
| 1257 |
+
)
|
| 1258 |
+
|
| 1259 |
+
with gr.Accordion("Negative prompt", open=False):
|
| 1260 |
+
negative = gr.Textbox(
|
| 1261 |
+
label="",
|
| 1262 |
+
lines=2,
|
| 1263 |
+
value=DEFAULT_NEGATIVE,
|
| 1264 |
+
)
|
| 1265 |
+
|
| 1266 |
+
with gr.Accordion("IPNC", open=False):
|
| 1267 |
+
ipnc_enabled = gr.Checkbox(label="Enable", value=True)
|
| 1268 |
+
with gr.Column(visible=True) as ipnc_section:
|
| 1269 |
+
with gr.Group():
|
| 1270 |
+
gr.Markdown("<div style='padding-left:8px'>High</div>")
|
| 1271 |
+
with gr.Row():
|
| 1272 |
+
ipnc_wamu_h = gr.Slider(0, 200, value=100, step=1, label="wamu")
|
| 1273 |
+
ipnc_dreamly_h = gr.Slider(0, 200, value=100, step=1, label="dreamly")
|
| 1274 |
+
ipnc_svicamera_h = gr.Slider(0, 200, value=100, step=1, label="svicamera")
|
| 1275 |
+
with gr.Group():
|
| 1276 |
+
gr.Markdown("<div style='padding-left:8px'>Low</div>")
|
| 1277 |
+
with gr.Row():
|
| 1278 |
+
ipnc_wamu_l = gr.Slider(0, 200, value=100, step=1, label="wamu")
|
| 1279 |
+
ipnc_dreamly_l = gr.Slider(0, 200, value=100, step=1, label="dreamly")
|
| 1280 |
+
ipnc_svicamera_l = gr.Slider(0, 200, value=100, step=1, label="svicamera")
|
| 1281 |
+
|
| 1282 |
+
ipnc_enabled.change(
|
| 1283 |
+
fn=lambda x: gr.update(visible=x),
|
| 1284 |
+
inputs=ipnc_enabled,
|
| 1285 |
+
outputs=ipnc_section,
|
| 1286 |
)
|
| 1287 |
|
|
|
|
| 1288 |
generate_btn = gr.Button("Generate", variant="primary", size="lg")
|
| 1289 |
|
| 1290 |
with gr.Column(scale=1):
|
| 1291 |
+
# hidden: chunked generation endpoint
|
| 1292 |
+
chunk_session_id = gr.Textbox(visible=False, value="")
|
| 1293 |
+
chunk_steps_per = gr.Textbox(visible=False, value="2")
|
| 1294 |
+
chunk_step_idx = gr.Textbox(visible=False, value="0")
|
| 1295 |
+
chunk_btn = gr.Button(visible=False)
|
| 1296 |
+
chunk_btn.click(
|
| 1297 |
+
fn=generate_chunk_handler,
|
| 1298 |
+
inputs=[
|
| 1299 |
+
prompt, negative, image_input, seed, aspect_ratio, use_ref_aspect, gpu_budget,
|
| 1300 |
+
num_steps, duration_secs, sampler_name, base_fps, rife_mode,
|
| 1301 |
+
loras_enabled,
|
| 1302 |
+
s_wamu_h, s_dreamly_h,
|
| 1303 |
+
s_wamu_l, s_dreamly_l,
|
| 1304 |
+
s_svicamera_h, s_svicamera_l,
|
| 1305 |
+
ipnc_enabled,
|
| 1306 |
+
ipnc_wamu_h, ipnc_dreamly_h,
|
| 1307 |
+
ipnc_wamu_l, ipnc_dreamly_l,
|
| 1308 |
+
ipnc_svicamera_h, ipnc_svicamera_l,
|
| 1309 |
+
chunk_session_id, chunk_steps_per, chunk_step_idx,
|
| 1310 |
+
],
|
| 1311 |
+
outputs=[output_video, output_status],
|
| 1312 |
+
)
|
| 1313 |
output_video = gr.Video(label="Generated video")
|
| 1314 |
+
output_status = gr.Textbox(label="Status", interactive=False, lines=4)
|
| 1315 |
+
|
| 1316 |
+
# ── enhance handler ──────────────────────────────────────────────────
|
| 1317 |
+
def enhance_handler(prompt: str, image_input: Any, no_redescribe: bool,
                    progress=gr.Progress()):
    """Click handler that replaces the prompt with its enhanced rewrite.

    Args:
        prompt: Current prompt text.
        image_input: Gallery value.  Entries may be path strings, dicts with
            a ``"path"`` or ``"name"`` key, or lists/tuples whose first
            element is one of those.
        no_redescribe: Forwarded to ``_enhance_prompt_r2v`` as a bool.
        progress: Gradio progress tracker; accepted but not used here.

    Returns:
        The enhanced prompt text.

    Raises:
        gr.Error: no usable reference image, blank prompt, or the enhancer
            produced nothing.
    """
    def _entry_path(entry):
        # Unwrap list/tuple entries, then apply the same "path" -> "name"
        # lookup to dicts whether or not they were wrapped.
        if isinstance(entry, (list, tuple)):
            entry = entry[0] if entry else None
        if isinstance(entry, dict):
            return entry.get("path") or entry.get("name")
        return entry if isinstance(entry, str) else None

    # Each candidate is checked against the filesystem exactly once.
    candidates = (_entry_path(item) for item in (image_input or []))
    paths = [p for p in candidates if p and os.path.exists(p)][:5]

    if not paths:
        raise gr.Error("upload at least one reference image")
    if not (prompt or "").strip():
        raise gr.Error("enter a prompt")

    result = _enhance_prompt_r2v(prompt, paths, no_redescribe=bool(no_redescribe))
    if not result:
        # The enhancer logs its own failure reason and returns None.
        raise gr.Error("enhancement failed")
    return result
|
| 1338 |
+
|
| 1339 |
+
enhance_btn.click(
|
| 1340 |
+
fn=enhance_handler,
|
| 1341 |
+
inputs=[prompt, image_input, no_redescribe],
|
| 1342 |
+
outputs=[prompt],
|
| 1343 |
+
)
|
| 1344 |
|
| 1345 |
generate_btn.click(
|
| 1346 |
fn=generate_handler,
|
|
|
|
| 1350 |
loras_enabled,
|
| 1351 |
s_wamu_h, s_dreamly_h,
|
| 1352 |
s_wamu_l, s_dreamly_l,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1353 |
s_svicamera_h, s_svicamera_l,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1354 |
ipnc_enabled,
|
| 1355 |
ipnc_wamu_h, ipnc_dreamly_h,
|
| 1356 |
ipnc_wamu_l, ipnc_dreamly_l,
|
|
|
|
| 1357 |
ipnc_svicamera_h, ipnc_svicamera_l,
|
|
|
|
| 1358 |
],
|
| 1359 |
outputs=[output_video, output_status],
|
|
|
|
| 1360 |
)
|
| 1361 |
|
| 1362 |
+
|
| 1363 |
if __name__ == "__main__":
|
| 1364 |
_ensure_comfy()
|
| 1365 |
_ensure_models()
|