from __future__ import annotations import math from pathlib import Path from typing import Final import gradio as gr from video_to_colmap import ( ConversionOutputs, VideoMetadata, convert_video_to_colmap_archive, infer_target_frames, read_video_metadata, ) APP_DIR: Final[Path] = Path(__file__).resolve().parent OUTPUTS_DIR: Final[Path] = APP_DIR / "outputs" OUTPUTS_DIR.mkdir(parents=True, exist_ok=True) gr.set_static_paths(paths=[str(OUTPUTS_DIR)]) DEFAULT_TARGET_FRAMES: Final[int] = 24 TARGET_FRAME_CHOICES: Final[list[str]] = ["Auto", "16", "24", "32", "48"] CSS: Final[str] = """ html { scrollbar-gutter: stable; } body { overflow: auto; } .gradio-container { max-width: none; width: 100%; margin: 0; padding: 0.75rem 1rem 1rem; } #main-row { gap: 1rem; align-items: stretch; } #controls-panel { display: flex; flex-direction: column; gap: 0.75rem; } #preview-panel { min-height: 540px; } .preview-placeholder { position: relative; overflow: hidden; width: 100%; min-height: 540px; display: flex; align-items: center; justify-content: center; border-radius: 14px; background: radial-gradient(circle at top left, rgba(250, 204, 21, 0.22), transparent 32%), linear-gradient(135deg, #0f172a 0%, #172554 48%, #1e293b 100%); border: 1px solid rgba(148, 163, 184, 0.24); color: #f8fafc; } .preview-placeholder::before { content: ""; position: absolute; inset: 0; background: linear-gradient(180deg, rgba(15, 23, 42, 0.18), rgba(15, 23, 42, 0.4)); } .preview-inner { position: relative; z-index: 1; max-width: 460px; padding: 32px; text-align: center; border-radius: 20px; background: rgba(15, 23, 42, 0.44); box-shadow: 0 18px 50px rgba(15, 23, 42, 0.24); backdrop-filter: blur(10px); } .preview-placeholder .preview-title { color: #f8fafc !important; font-size: 28px; font-weight: 600; margin-bottom: 8px; letter-spacing: 0.01em; text-shadow: 0 1px 2px rgba(15, 23, 42, 0.4); } .preview-placeholder .preview-desc { color: rgba(226, 232, 240, 0.96) !important; font-size: 16px; line-height: 1.5; text-shadow: 0 1px 2px rgba(15, 23, 42, 0.35); } #status-text { font-size: 13px; opacity: 0.92; } #video-summary { margin-top: -0.15rem; padding: 0.8rem 0.9rem; border-radius: 12px; background: rgba(15, 23, 42, 0.04); border: 1px solid rgba(148, 163, 184, 0.18); } @media (max-width: 900px) { #main-row { flex-direction: column; } #preview-panel, .preview-placeholder { min-height: 420px; } } """ def preview_placeholder_html(title: str, description: str) -> str: return f"""
{title}
{description}
""" def default_video_summary() -> str: return "Upload a video to infer the target keyframe count automatically, or choose a manual max." def fallback_video_summary() -> str: return "Could not inspect video metadata on upload. Using the default target of **24 keyframes**." def manual_video_summary(target_frames: int, metadata: VideoMetadata | None = None) -> str: if metadata is None: return f"Using a manual limit of **{target_frames} keyframes**." return ( f"Using a manual limit of **{target_frames} keyframes** for a **{metadata.duration_seconds:.1f}s** clip " f"at **{metadata.width}x{metadata.height}** and **{metadata.fps:.1f} fps**." ) def video_summary_text(metadata: VideoMetadata, target_frames: int) -> str: return ( f"Auto-selecting **{target_frames} keyframes** from a **{metadata.duration_seconds:.1f}s** clip " f"at **{metadata.width}x{metadata.height}** and **{metadata.fps:.1f} fps**." ) def start_generation(target_frames: int) -> tuple[object, object, str]: return ( gr.update(interactive=False, value="Converting..."), gr.update(interactive=False), preview_placeholder_html( "Preparing Video for COLMAP", f"Normalizing the clip, selecting {target_frames} sharp overlapping keyframes, and running sparse reconstruction.", ), ) def _status_text(outputs: ConversionOutputs) -> str: coverage = 0.0 if outputs.selected_frames: coverage = outputs.registered_frames / outputs.selected_frames return ( f"Prepared **{outputs.scene_name}** from a **{outputs.duration_seconds:.1f}s** clip. " f"Selected **{outputs.selected_frames}** keyframes, COLMAP registered **{outputs.registered_frames}**, " f"and the reconstruction quality is **{outputs.quality_label}** " f"({math.floor(coverage * 100)}% registration)." ) def run_conversion( video_path: str | None, target_frames: int, sampling_profile: str, max_edge: str, ) -> tuple[object, object, object, str]: if not video_path: raise gr.Error("Upload a video first.") try: outputs = convert_video_to_colmap_archive( video_path=video_path, target_frames=target_frames, profile_key=sampling_profile, max_image_edge=int(max_edge), ) return ( gr.update(value=str(outputs.archive_path), visible=True, interactive=True), gr.update(value=str(outputs.report_path), visible=True, interactive=True), gr.update(value=str(outputs.contact_sheet_path), visible=True), _status_text(outputs), ) except gr.Error: raise except Exception as exc: raise gr.Error(f"Conversion failed: {type(exc).__name__}: {exc}") from exc def clear_all() -> tuple[None, object, object, object, str, str, int, object]: return ( None, gr.update(value=None, visible=False), gr.update(value=None, visible=False), gr.update(value=None, visible=False), "", default_video_summary(), DEFAULT_TARGET_FRAMES, gr.update(value="Auto"), ) def update_target_settings(video_path: str | None, target_mode: str) -> tuple[object, object, str, int]: has_video = bool(video_path) generate_update = gr.update(interactive=has_video, value="Build COLMAP Archive") clear_update = gr.update(interactive=has_video) if not video_path: if target_mode != "Auto": manual_frames = int(target_mode) return ( generate_update, clear_update, manual_video_summary(manual_frames), manual_frames, ) return ( generate_update, clear_update, default_video_summary(), DEFAULT_TARGET_FRAMES, ) try: metadata = read_video_metadata(Path(video_path)) if target_mode == "Auto": target_frames = infer_target_frames(metadata) summary = video_summary_text(metadata, target_frames) else: target_frames = int(target_mode) summary = manual_video_summary(target_frames, metadata) except Exception: if target_mode == "Auto": target_frames = DEFAULT_TARGET_FRAMES summary = fallback_video_summary() else: target_frames = int(target_mode) summary = manual_video_summary(target_frames) return ( generate_update, clear_update, summary, target_frames, ) def build_demo() -> gr.Blocks: with gr.Blocks( css=CSS, title="Video to COLMAP for tttLRM", theme=gr.themes.Origin(), ) as demo: gr.Markdown("## Video to COLMAP for tttLRM") gr.Markdown( "Upload a single video. The Space will pick sharp overlapping keyframes, run COLMAP, and export a raw scene archive ready for the `tttLRM` Space." ) with gr.Row(elem_id="main-row", equal_height=True): with gr.Column(scale=3, min_width=320, elem_id="controls-panel"): video_in = gr.File( label="Input Video", type="filepath", file_types=[".mp4", ".mov", ".webm", ".mkv", ".avi"], ) target_frames_state = gr.State(value=DEFAULT_TARGET_FRAMES) target_mode = gr.Dropdown( label="Max Angles / Keyframes", choices=TARGET_FRAME_CHOICES, value="Auto", ) video_summary = gr.Markdown(default_video_summary(), elem_id="video-summary") sampling_profile = gr.Dropdown( label="Sampling Profile", choices=["balanced", "dense", "sparse"], value="balanced", ) max_edge = gr.Dropdown( label="Max Frame Edge", choices=["960", "1280", "1600"], value="1280", ) with gr.Row(): generate_btn = gr.Button("Build COLMAP Archive", variant="primary", interactive=False) clear_btn = gr.Button("Clear", interactive=False) archive_download = gr.File(label="Download Raw COLMAP Archive", visible=False) report_download = gr.File(label="Download Reconstruction Report", visible=False) status_text = gr.Markdown(elem_id="status-text") with gr.Column(scale=7, min_width=520): preview_html = gr.HTML( value=preview_placeholder_html( "Keyframe Selection Preview", "After conversion, the selected frames contact sheet will appear here so you can check overlap and viewpoint coverage.", ), elem_id="preview-panel", ) contact_sheet = gr.Image(label="Selected Keyframes", visible=False, type="filepath") video_in.change( update_target_settings, inputs=[video_in, target_mode], outputs=[generate_btn, clear_btn, video_summary, target_frames_state], queue=False, ) target_mode.change( update_target_settings, inputs=[video_in, target_mode], outputs=[generate_btn, clear_btn, video_summary, target_frames_state], queue=False, ) generate_btn.click( start_generation, inputs=[target_frames_state], outputs=[generate_btn, clear_btn, preview_html], queue=False, ).then( run_conversion, inputs=[video_in, target_frames_state, sampling_profile, max_edge], outputs=[archive_download, report_download, contact_sheet, status_text], ).then( lambda: ( gr.update(interactive=True, value="Build COLMAP Archive"), gr.update(interactive=True), preview_placeholder_html( "Keyframe Selection Complete", "Review the contact sheet below and download the raw COLMAP archive for the `tttLRM` Space.", ), ), outputs=[generate_btn, clear_btn, preview_html], queue=False, ) clear_btn.click( clear_all, outputs=[video_in, archive_download, report_download, contact_sheet, status_text, video_summary, target_frames_state, target_mode], queue=False, ).then( lambda: ( gr.update(interactive=False), gr.update(interactive=False), preview_placeholder_html( "Keyframe Selection Preview", "After conversion, the selected frames contact sheet will appear here so you can check overlap and viewpoint coverage.", ), ), outputs=[generate_btn, clear_btn, preview_html], queue=False, ) demo.queue(max_size=4) return demo if __name__ == "__main__": build_demo().launch()