notaneimu commited on
Commit
66dec57
·
0 Parent(s):

Initial Space app

Browse files
Files changed (6) hide show
  1. .gitignore +4 -0
  2. README.md +39 -0
  3. app.py +258 -0
  4. packages.txt +2 -0
  5. requirements.txt +4 -0
  6. video_to_colmap.py +632 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ .DS_Store
4
+ outputs/
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Video to COLMAP for tttLRM
3
+ emoji: 🎞️
4
+ colorFrom: amber
5
+ colorTo: slate
6
+ sdk: gradio
7
+ sdk_version: 6.2.0
8
+ python_version: 3.12
9
+ app_file: app.py
10
+ pinned: false
11
+ suggested_hardware: cpu-upgrade
12
+ ---
13
+
14
+ # Video to COLMAP for tttLRM
15
+
16
+ Standalone Hugging Face Space that converts an uploaded video into a raw COLMAP scene archive suitable for the `tttLRM` inference Space.
17
+
18
+ ## What it does
19
+
20
+ - Normalizes uploaded video with `ffmpeg`
21
+ - Samples candidate frames across the clip
22
+ - Scores sharpness, scene cuts, and inter-frame motion
23
+ - Chooses an ordered keyframe set with overlap appropriate for COLMAP
24
+ - Runs CPU COLMAP (`feature_extractor`, `sequential_matcher`, `mapper`)
25
+ - Packages a raw scene archive with:
26
+ - `images/`
27
+ - `sparse/0/`
28
+ - `report.json`
29
+
30
+ ## Recommended workflow
31
+
32
+ 1. Upload a short orbit or slow pan video of a single object or scene.
33
+ 2. Download the generated raw COLMAP `.zip`.
34
+ 3. Upload that archive into the companion `tttLRM` inference Space.
35
+
36
+ ## Notes
37
+
38
+ - This Space is CPU-oriented because COLMAP runs on CPU in standard Hugging Face Spaces more reliably than GPU-specific builds.
39
+ - Best results come from a stable single-shot orbit video with limited lighting changes and no jump cuts.
app.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from pathlib import Path
5
+ from typing import Final
6
+
7
+ import gradio as gr
8
+
9
+ from video_to_colmap import ConversionOutputs, convert_video_to_colmap_archive
10
+
11
# Directory that contains this file; generated artifacts live in "outputs" beneath it.
APP_DIR: Final[Path] = Path(__file__).resolve().parent
OUTPUTS_DIR: Final[Path] = APP_DIR / "outputs"

# Import-time side effects: create the outputs directory and register it with
# Gradio via set_static_paths.
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
gr.set_static_paths(paths=[str(OUTPUTS_DIR)])
16
+
17
# Stylesheet for the app. The selectors target the elem_id values assigned in
# build_demo (#main-row, #controls-panel, #preview-panel, #status-text) and the
# classes emitted by preview_placeholder_html (.preview-placeholder,
# .preview-inner, .preview-title, .preview-desc).
CSS: Final[str] = """
html { scrollbar-gutter: stable; }
body { overflow: auto; }
.gradio-container {
max-width: none;
width: 100%;
margin: 0;
padding: 0.75rem 1rem 1rem;
}
#main-row {
gap: 1rem;
align-items: stretch;
}
#controls-panel {
display: flex;
flex-direction: column;
gap: 0.75rem;
}
#preview-panel {
min-height: 540px;
}
.preview-placeholder {
width: 100%;
min-height: 540px;
display: flex;
align-items: center;
justify-content: center;
border-radius: 14px;
background: linear-gradient(135deg, #111827 0%, #1f2937 100%);
border: 1px solid rgba(148, 163, 184, 0.2);
color: #e5e7eb;
}
.preview-inner {
max-width: 460px;
padding: 32px;
text-align: center;
}
.preview-title {
font-size: 20px;
font-weight: 600;
margin-bottom: 8px;
}
.preview-desc {
font-size: 14px;
line-height: 1.5;
opacity: 0.82;
}
#status-text {
font-size: 13px;
opacity: 0.92;
}
@media (max-width: 900px) {
#main-row {
flex-direction: column;
}
#preview-panel,
.preview-placeholder {
min-height: 420px;
}
}
"""
78
+
79
+
80
def preview_placeholder_html(title: str, description: str) -> str:
    """Return the HTML card shown in the preview panel.

    Both arguments are interpolated verbatim (no HTML escaping), so callers
    should pass trusted text only.
    """
    markup = f"""
<div class="preview-placeholder">
<div class="preview-inner">
<div class="preview-title">{title}</div>
<div class="preview-desc">{description}</div>
</div>
</div>
"""
    return markup
89
+
90
+
91
def start_generation() -> tuple[object, object, str]:
    """Return UI updates for the moment a conversion starts.

    The tuple holds, in order: an update that disables the generate button and
    relabels it "Converting...", an update that disables the clear button, and
    the placeholder HTML describing the work in progress.
    """
    busy_generate = gr.update(interactive=False, value="Converting...")
    locked_clear = gr.update(interactive=False)
    progress_card = preview_placeholder_html(
        "Preparing Video for COLMAP",
        "Normalizing the clip, selecting sharp overlapping keyframes, and running sparse reconstruction.",
    )
    return busy_generate, locked_clear, progress_card
100
+
101
+
102
def _status_text(outputs: ConversionOutputs) -> str:
    """Format the Markdown status line summarising a finished conversion.

    The registration percentage is ``registered_frames / selected_frames``
    rounded down; it is reported as 0% when no frames were selected.
    """
    selected = outputs.selected_frames
    coverage = outputs.registered_frames / selected if selected else 0.0
    percent = math.floor(coverage * 100)

    pieces = [
        f"Prepared **{outputs.scene_name}** from a **{outputs.duration_seconds:.1f}s** clip. ",
        f"Selected **{outputs.selected_frames}** keyframes, COLMAP registered **{outputs.registered_frames}**, ",
        f"and the reconstruction quality is **{outputs.quality_label}** ",
        f"({percent}% registration).",
    ]
    return "".join(pieces)
113
+
114
+
115
def run_conversion(
    video_path: str | None,
    target_frames: str,
    sampling_profile: str,
    max_edge: str,
) -> tuple[object, object, object, str]:
    """Run the video-to-COLMAP pipeline for the Gradio click handler.

    ``target_frames`` and ``max_edge`` arrive as strings from the dropdowns and
    are converted to ``int`` here.

    Returns updates for the archive download, the report download and the
    contact-sheet image, followed by the Markdown status text.

    Raises:
        gr.Error: when no video was uploaded, or wrapping any exception raised
            while converting (an existing ``gr.Error`` is re-raised untouched).
    """
    if not video_path:
        raise gr.Error("Upload a video first.")

    try:
        result = convert_video_to_colmap_archive(
            video_path=video_path,
            target_frames=int(target_frames),
            profile_key=sampling_profile,
            max_image_edge=int(max_edge),
        )
        archive_update = gr.update(value=str(result.archive_path), visible=True, interactive=True)
        report_update = gr.update(value=str(result.report_path), visible=True, interactive=True)
        sheet_update = gr.update(value=str(result.contact_sheet_path), visible=True)
        return archive_update, report_update, sheet_update, _status_text(result)
    except gr.Error:
        # Already user-facing; pass it through without re-wrapping.
        raise
    except Exception as exc:
        raise gr.Error(f"Conversion failed: {type(exc).__name__}: {exc}") from exc
141
+
142
+
143
def clear_all() -> tuple[None, object, object, object, str]:
    """Return the values that reset the form.

    In order: ``None`` for the video input, three updates that clear and hide
    the archive download, report download and contact sheet, and an empty
    status string.
    """
    hidden_archive = gr.update(value=None, visible=False)
    hidden_report = gr.update(value=None, visible=False)
    hidden_sheet = gr.update(value=None, visible=False)
    return None, hidden_archive, hidden_report, hidden_sheet, ""
151
+
152
+
153
def on_video_change(video_path: str | None) -> tuple[object, object]:
    """Enable the action buttons only while a video is present.

    Returns updates for the generate button (whose label is also reset to
    "Build COLMAP Archive") and for the clear button.
    """
    enabled = bool(video_path)
    generate_update = gr.update(interactive=enabled, value="Build COLMAP Archive")
    clear_update = gr.update(interactive=enabled)
    return generate_update, clear_update
159
+
160
+
161
def build_demo() -> gr.Blocks:
    """Assemble the Gradio UI and wire its event handlers.

    Layout: a controls column (video upload, three dropdowns, action buttons,
    two download slots, status text) beside a preview column (placeholder HTML
    and the contact-sheet image).

    App-level styling (``CSS`` and the theme) is intentionally NOT passed to
    ``gr.Blocks`` here: the pinned Gradio 6 release takes those options on
    ``launch()``, so they are supplied by the ``__main__`` entry point below.

    Returns:
        The configured ``gr.Blocks`` app with its queue enabled (max 4 pending).
    """
    with gr.Blocks(title="Video to COLMAP for tttLRM") as demo:
        gr.Markdown("## Video to COLMAP for tttLRM")
        gr.Markdown(
            "Upload a single video. The Space will pick sharp overlapping keyframes, run COLMAP, and export a raw scene archive ready for the `tttLRM` Space."
        )

        with gr.Row(elem_id="main-row", equal_height=True):
            with gr.Column(scale=3, min_width=320, elem_id="controls-panel"):
                video_in = gr.File(
                    label="Input Video",
                    type="filepath",
                    file_types=[".mp4", ".mov", ".webm", ".mkv", ".avi"],
                )
                target_frames = gr.Dropdown(
                    label="Target Keyframes",
                    choices=["16", "24", "32", "48"],
                    value="24",
                )
                sampling_profile = gr.Dropdown(
                    label="Sampling Profile",
                    choices=["balanced", "dense", "sparse"],
                    value="balanced",
                )
                max_edge = gr.Dropdown(
                    label="Max Frame Edge",
                    choices=["960", "1280", "1600"],
                    value="1280",
                )
                with gr.Row():
                    generate_btn = gr.Button("Build COLMAP Archive", variant="primary", interactive=False)
                    clear_btn = gr.Button("Clear", interactive=False)
                archive_download = gr.File(label="Download Raw COLMAP Archive", visible=False)
                report_download = gr.File(label="Download Reconstruction Report", visible=False)
                status_text = gr.Markdown(elem_id="status-text")

            with gr.Column(scale=7, min_width=520):
                preview_html = gr.HTML(
                    value=preview_placeholder_html(
                        "Keyframe Selection Preview",
                        "After conversion, the selected frames contact sheet will appear here so you can check overlap and viewpoint coverage.",
                    ),
                    elem_id="preview-panel",
                )
                contact_sheet = gr.Image(label="Selected Keyframes", visible=False, type="filepath")

        # Buttons are only usable while a video is loaded.
        video_in.change(
            on_video_change,
            inputs=[video_in],
            outputs=[generate_btn, clear_btn],
        )
        # Three-step chain: lock the UI, run the (queued) conversion, unlock the UI.
        # NOTE(review): the final step is attached with .then(), so it appears to
        # run even when run_conversion raises; the "Complete" card may therefore
        # show after a failure. Confirm whether that is intended.
        generate_btn.click(
            start_generation,
            outputs=[generate_btn, clear_btn, preview_html],
            queue=False,
        ).then(
            run_conversion,
            inputs=[video_in, target_frames, sampling_profile, max_edge],
            outputs=[archive_download, report_download, contact_sheet, status_text],
        ).then(
            lambda: (
                gr.update(interactive=True, value="Build COLMAP Archive"),
                gr.update(interactive=True),
                preview_placeholder_html(
                    "Keyframe Selection Complete",
                    "Review the contact sheet below and download the raw COLMAP archive for the `tttLRM` Space.",
                ),
            ),
            outputs=[generate_btn, clear_btn, preview_html],
            queue=False,
        )
        # Clearing resets the form, then disables the buttons and restores the
        # initial placeholder card.
        clear_btn.click(
            clear_all,
            outputs=[video_in, archive_download, report_download, contact_sheet, status_text],
            queue=False,
        ).then(
            lambda: (
                gr.update(interactive=False),
                gr.update(interactive=False),
                preview_placeholder_html(
                    "Keyframe Selection Preview",
                    "After conversion, the selected frames contact sheet will appear here so you can check overlap and viewpoint coverage.",
                ),
            ),
            outputs=[generate_btn, clear_btn, preview_html],
            queue=False,
        )

    demo.queue(max_size=4)
    return demo


if __name__ == "__main__":
    # Gradio 6 moved app-level options such as css and theme from the Blocks
    # constructor to launch(); passing them here is what actually applies them.
    build_demo().launch(css=CSS, theme=gr.themes.Origin())
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ colmap
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==6.2.0
2
+ numpy==2.2.6
3
+ opencv-python-headless==4.12.0.88
4
+ Pillow==12.0.0
video_to_colmap.py ADDED
@@ -0,0 +1,632 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import math
5
+ import re
6
+ import shutil
7
+ import struct
8
+ import subprocess
9
+ import time
10
+ import uuid
11
+ import zipfile
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+ from typing import Final
15
+
16
+ import cv2
17
+ import numpy as np
18
+
19
# Directory that contains this file; scratch and output directories live beneath it.
APP_DIR: Final[Path] = Path(__file__).resolve().parent
WORK_DIR: Final[Path] = APP_DIR / "work"  # per-job scratch directories
OUTPUTS_DIR: Final[Path] = APP_DIR / "outputs"  # archives, reports and contact sheets
THUMB_SIZE: Final[tuple[int, int]] = (96, 96)  # size passed to cv2.resize for motion thumbnails
JPEG_QUALITY: Final[int] = 95  # quality used when writing candidate frames
FONT = cv2.FONT_HERSHEY_SIMPLEX  # font for contact-sheet labels

# Import-time side effect: make sure both directories exist.
WORK_DIR.mkdir(parents=True, exist_ok=True)
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)
28
+
29
+
30
@dataclass(frozen=True)
class ProfileConfig:
    """Tuning knobs for one sampling profile."""

    # Candidate frames to sample per requested keyframe (see extract_candidates).
    candidate_multiplier: int
    # A candidate whose cut_score is >= this value starts a new segment.
    cut_threshold: float
    # Percentile of segment blur scores below which a frame is penalised.
    min_blur_percentile: float
    # Upper bound for COLMAP's SequentialMatching.overlap option.
    sequential_overlap: int
    # Segments with fewer candidates than this receive a score penalty.
    min_segment_frames: int
37
+
38
+
39
# Sampling profiles selectable by key. The keys match the choices offered by
# the "Sampling Profile" dropdown in app.py.
PROFILES: Final[dict[str, ProfileConfig]] = {
    "balanced": ProfileConfig(
        candidate_multiplier=6,
        cut_threshold=0.42,
        min_blur_percentile=35.0,
        sequential_overlap=8,
        min_segment_frames=14,
    ),
    "dense": ProfileConfig(
        candidate_multiplier=8,
        cut_threshold=0.38,
        min_blur_percentile=30.0,
        sequential_overlap=12,
        min_segment_frames=18,
    ),
    "sparse": ProfileConfig(
        candidate_multiplier=5,
        cut_threshold=0.48,
        min_blur_percentile=40.0,
        sequential_overlap=6,
        min_segment_frames=12,
    ),
}
62
+
63
+
64
@dataclass(frozen=True)
class VideoMetadata:
    """Basic properties of a video as reported by OpenCV (see read_video_metadata)."""

    fps: float  # frames per second; read_video_metadata substitutes 24.0 when unreported
    frame_count: int  # total frame count reported by the container
    duration_seconds: float  # frame_count / fps
    width: int  # frame width in pixels
    height: int  # frame height in pixels
71
+
72
+
73
@dataclass(frozen=True)
class FrameCandidate:
    """One sampled frame plus the scores used for keyframe selection."""

    candidate_index: int  # position among the sampled candidates
    frame_index: int  # index of the frame in the source video
    timestamp_seconds: float  # frame_index / fps
    path: Path  # JPEG written for this candidate
    blur_score: float  # variance of the Laplacian of the grayscale frame
    motion_score: float  # mean absolute thumbnail difference vs. the previous candidate (0.0 for the first)
    cut_score: float  # Bhattacharyya distance between HSV histograms vs. the previous candidate (0.0 for the first)
    thumb: np.ndarray  # grayscale thumbnail as float32 scaled by 1/255
83
+
84
+
85
@dataclass(frozen=True)
class ConversionOutputs:
    """Result of convert_video_to_colmap_archive."""

    archive_path: Path  # zip archive written to OUTPUTS_DIR
    report_path: Path  # copy of report.json in OUTPUTS_DIR
    contact_sheet_path: Path  # contact-sheet JPEG in OUTPUTS_DIR
    scene_name: str  # slugified source stem plus a millisecond timestamp
    selected_frames: int  # number of keyframes exported to COLMAP
    registered_frames: int  # image count read from the COLMAP model
    duration_seconds: float  # duration of the normalized video
    quality_label: str  # value returned by quality_label()
95
+
96
+
97
def _now_ms() -> int:
    """Return the current wall-clock time in whole milliseconds since the epoch."""
    millis = time.time() * 1000
    return int(millis)
99
+
100
+
101
def _ensure_dir(path: Path) -> Path:
    """Create ``path`` (and any missing parents) if needed and return it."""
    Path.mkdir(path, parents=True, exist_ok=True)
    return path
104
+
105
+
106
def _unique_dir(parent: Path, prefix: str) -> Path:
    """Create and return a new directory under ``parent``.

    The directory name is ``<prefix>-<epoch milliseconds>-<8 hex characters>``.
    """
    token = uuid.uuid4().hex[:8]
    target = parent / f"{prefix}-{_now_ms()}-{token}"
    target.mkdir(parents=True, exist_ok=True)
    return target
110
+
111
+
112
def _slugify(value: str) -> str:
    """Turn ``value`` into a lowercase, dash-separated ASCII slug.

    Each run of characters outside ``[a-zA-Z0-9]`` becomes a single dash, and
    leading/trailing dashes are removed. Returns "scene" when nothing is left.
    """
    dashed = re.sub(r"[^a-zA-Z0-9]+", "-", value)
    slug = dashed.strip("-").lower()
    if not slug:
        return "scene"
    return slug
115
+
116
+
117
def _run(cmd: list[str], cwd: Path | None = None) -> None:
    """Run ``cmd`` and raise if it exits with a non-zero status.

    stdout and stderr are captured together as text; on failure the combined
    output is included in the exception message.

    Raises:
        RuntimeError: when the command's return code is not 0.
    """
    working_dir = None if not cwd else str(cwd)
    completed = subprocess.run(
        cmd,
        cwd=working_dir,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        check=False,
    )
    if completed.returncode == 0:
        return
    raise RuntimeError(
        f"Command failed ({completed.returncode}): {' '.join(cmd)}\n{completed.stdout.strip()}"
    )
130
+
131
+
132
def _require_binary(binary_name: str) -> None:
    """Raise ``RuntimeError`` unless ``binary_name`` can be found on PATH."""
    located = shutil.which(binary_name)
    if located is not None:
        return
    raise RuntimeError(f"Required executable not found: {binary_name}")
135
+
136
+
137
def normalize_video_input(video_path: Path, work_dir: Path) -> Path:
    """Re-encode ``video_path`` to an H.264 MP4 and return the new file's path.

    The output is written to ``work_dir / "normalized.mp4"`` with audio
    stripped (``-an``), ``yuv420p`` pixel format and the ``+faststart`` flag.

    Raises:
        RuntimeError: if ``ffmpeg`` is not on PATH or the encode fails.
    """
    _require_binary("ffmpeg")
    normalized_path = work_dir / "normalized.mp4"
    _run(
        [
            "ffmpeg",
            "-y",
            "-i",
            str(video_path),
            "-an",
            # libx264 with yuv420p rejects odd frame dimensions; round both
            # down to the nearest even value so such inputs still encode.
            "-vf",
            "scale=trunc(iw/2)*2:trunc(ih/2)*2",
            "-movflags",
            "+faststart",
            "-pix_fmt",
            "yuv420p",
            "-c:v",
            "libx264",
            str(normalized_path),
        ],
        cwd=work_dir,
    )
    return normalized_path
158
+
159
+
160
def read_video_metadata(video_path: Path) -> VideoMetadata:
    """Read frame rate, frame count and frame size from ``video_path``.

    A missing or non-positive frame rate is replaced by 24.0, and the duration
    is derived as ``frame_count / fps``.

    Raises:
        RuntimeError: if the video cannot be opened, or if the frame count,
            width or height is reported as zero or negative.
    """
    reader = cv2.VideoCapture(str(video_path))
    if not reader.isOpened():
        raise RuntimeError(f"Failed to open video: {video_path}")

    raw_fps = reader.get(cv2.CAP_PROP_FPS)
    raw_count = reader.get(cv2.CAP_PROP_FRAME_COUNT)
    raw_width = reader.get(cv2.CAP_PROP_FRAME_WIDTH)
    raw_height = reader.get(cv2.CAP_PROP_FRAME_HEIGHT)
    reader.release()

    fps = float(raw_fps or 0.0)
    frame_count = int(raw_count or 0)
    width = int(raw_width or 0)
    height = int(raw_height or 0)

    if min(frame_count, width, height) <= 0:
        raise RuntimeError("Video metadata could not be read from the uploaded file.")

    # Fall back to a nominal rate when the container does not report one.
    if fps <= 0:
        fps = 24.0

    duration = frame_count / fps
    return VideoMetadata(
        fps=fps,
        frame_count=frame_count,
        duration_seconds=duration,
        width=width,
        height=height,
    )
184
+
185
+
186
def _resize_max_edge(frame: np.ndarray, max_edge: int) -> np.ndarray:
    """Downscale ``frame`` so its longer side is at most ``max_edge`` pixels.

    Frames that already fit are returned unchanged (same object). Otherwise the
    aspect ratio is kept, each side is at least 2 pixels, and area
    interpolation is used.
    """
    rows, cols = frame.shape[:2]
    longest = max(rows, cols)
    if longest > max_edge:
        factor = max_edge / longest
        target_width = max(2, int(round(cols * factor)))
        target_height = max(2, int(round(rows * factor)))
        return cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)
    return frame
195
+
196
+
197
def _compute_histogram(frame: np.ndarray) -> np.ndarray:
    """Return a normalized 16x16 hue/saturation histogram of a BGR frame."""
    hsv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
    channels = [0, 1]
    bins = [16, 16]
    ranges = [0, 180, 0, 256]
    histogram = cv2.calcHist([hsv_frame], channels, None, bins, ranges)
    # Normalizes in place: the histogram is both source and destination.
    cv2.normalize(histogram, histogram)
    return histogram
202
+
203
+
204
def _compute_thumb(gray_frame: np.ndarray) -> np.ndarray:
    """Return a ``THUMB_SIZE`` float32 thumbnail of ``gray_frame`` scaled by 1/255."""
    shrunk = cv2.resize(gray_frame, THUMB_SIZE, interpolation=cv2.INTER_AREA)
    as_float = shrunk.astype(np.float32)
    return as_float / 255.0
207
+
208
+
209
def extract_candidates(
    video_path: Path,
    metadata: VideoMetadata,
    candidates_dir: Path,
    target_frames: int,
    max_image_edge: int,
    profile: ProfileConfig,
) -> list[FrameCandidate]:
    """Sample evenly spaced frames from the video and score each one.

    Every ``stride``-th frame is resized to ``max_image_edge``, written to
    ``candidates_dir`` as ``candidate_NNNN.jpg`` and scored for sharpness
    (variance of the Laplacian), motion (mean absolute thumbnail difference to
    the previous candidate) and scene change (Bhattacharyya distance between
    HSV histograms of consecutive candidates).

    Args:
        video_path: Video to read.
        metadata: Frame count and fps of that video.
        candidates_dir: Existing directory that receives the candidate JPEGs.
        target_frames: Number of keyframes the caller ultimately wants.
        max_image_edge: Maximum length of the longer image side, in pixels.
        profile: Supplies ``candidate_multiplier``.

    Returns:
        The candidates in video order.

    Raises:
        RuntimeError: if the video cannot be opened, a candidate image cannot
            be written, or fewer than ``max(8, target_frames // 2)`` candidates
            were produced.
    """
    # Aim for multiplier x target candidates (at least target + 8), capped at 240.
    desired_candidates = min(max(target_frames * profile.candidate_multiplier, target_frames + 8), 240)
    stride = max(1, metadata.frame_count // desired_candidates)

    capture = cv2.VideoCapture(str(video_path))
    if not capture.isOpened():
        raise RuntimeError(f"Failed to open video for frame extraction: {video_path}")

    candidates: list[FrameCandidate] = []
    previous_hist: np.ndarray | None = None
    previous_thumb: np.ndarray | None = None
    frame_index = 0
    try:
        while True:
            ok, frame = capture.read()
            if not ok:
                break
            if frame_index % stride != 0:
                frame_index += 1
                continue

            frame = _resize_max_edge(frame, max_image_edge)
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            blur_score = float(cv2.Laplacian(gray, cv2.CV_32F).var())
            thumb = _compute_thumb(gray)
            hist = _compute_histogram(frame)

            # The first candidate has no predecessor, so both scores are 0.0.
            motion_score = float(np.mean(np.abs(thumb - previous_thumb))) if previous_thumb is not None else 0.0
            cut_score = (
                float(cv2.compareHist(previous_hist, hist, cv2.HISTCMP_BHATTACHARYYA))
                if previous_hist is not None
                else 0.0
            )

            candidate_index = len(candidates)
            output_path = candidates_dir / f"candidate_{candidate_index:04d}.jpg"
            # cv2.imwrite reports failure through its return value; surface it
            # here rather than as a missing-file error during export.
            if not cv2.imwrite(str(output_path), frame, [int(cv2.IMWRITE_JPEG_QUALITY), JPEG_QUALITY]):
                raise RuntimeError(f"Failed to write candidate frame: {output_path}")

            candidates.append(
                FrameCandidate(
                    candidate_index=candidate_index,
                    frame_index=frame_index,
                    timestamp_seconds=frame_index / metadata.fps,
                    path=output_path,
                    blur_score=blur_score,
                    motion_score=motion_score,
                    cut_score=cut_score,
                    thumb=thumb,
                )
            )

            previous_hist = hist
            previous_thumb = thumb
            frame_index += 1
    finally:
        # Release the decoder even when scoring or writing raises.
        capture.release()

    if len(candidates) < max(8, target_frames // 2):
        raise RuntimeError(
            f"Video yielded only {len(candidates)} usable candidates; upload a longer or slower video."
        )
    return candidates
276
+
277
+
278
def segment_candidates(candidates: list[FrameCandidate], profile: ProfileConfig) -> list[list[FrameCandidate]]:
    """Split ``candidates`` into consecutive runs separated by scene cuts.

    A new run starts at every candidate (other than the first) whose
    ``cut_score`` is at least ``profile.cut_threshold``. Returns an empty list
    for empty input.
    """
    if not candidates:
        return []

    total = len(candidates)
    cut_points = [
        position
        for position in range(1, total)
        if candidates[position].cut_score >= profile.cut_threshold
    ]
    edges = [0, *cut_points, total]
    pieces = [candidates[begin:end] for begin, end in zip(edges, edges[1:])]
    return [piece for piece in pieces if piece]
290
+
291
+
292
def choose_best_segment(
    segments: list[list[FrameCandidate]],
    target_frames: int,
    profile: ProfileConfig,
) -> list[FrameCandidate]:
    """Return the highest-scoring segment.

    A segment's score grows with its time span, its candidate count, how well
    it covers ``target_frames`` (capped at 1.5x) and the log of its median blur
    score; segments shorter than ``profile.min_segment_frames`` lose 0.6. Ties
    go to the earliest segment.

    Raises:
        RuntimeError: if ``segments`` is empty.
    """
    if not segments:
        raise RuntimeError("No coherent video segment was found for reconstruction.")

    def _score(frames: list[FrameCandidate]) -> float:
        count = len(frames)
        span = frames[-1].timestamp_seconds - frames[0].timestamp_seconds if count > 1 else 0.0
        typical_blur = float(np.median([frame.blur_score for frame in frames]))
        coverage = min(count / max(target_frames, 1), 1.5)
        shortfall = 0.0 if count >= profile.min_segment_frames else 0.6
        return (span + count * 0.12) * coverage * math.log1p(max(typical_blur, 1.0)) - shortfall

    ranked = sorted(
        [(_score(frames), frames) for frames in segments],
        key=lambda pair: pair[0],
        reverse=True,
    )
    return ranked[0][1]
311
+
312
+
313
def select_keyframes(
    segment: list[FrameCandidate],
    target_frames: int,
    profile: ProfileConfig,
) -> list[FrameCandidate]:
    """Pick up to ``target_frames`` candidates from ``segment``, in order.

    Target positions ("marks") are spaced evenly along the cumulative motion of
    the segment (or along the candidate index when there is effectively no
    motion). For each mark the best-scoring candidate near it is chosen, never
    going backwards. If that yields too few frames, the remainder is filled
    with the sharpest unused candidates.

    Returns the whole segment unchanged when it has no more than
    ``target_frames`` candidates.
    """
    if len(segment) <= target_frames:
        return segment

    blur_scores = np.array([candidate.blur_score for candidate in segment], dtype=np.float32)
    # Frames with a blur score below this percentile value are penalised below.
    blur_threshold = float(np.percentile(blur_scores, profile.min_blur_percentile))
    normalized_blur = blur_scores / max(float(blur_scores.max()), 1e-6)

    # The first candidate contributes no motion; later ones contribute at least
    # 1e-6 each.
    motion = np.array([0.0] + [max(candidate.motion_score, 1e-6) for candidate in segment[1:]], dtype=np.float32)
    cumulative_motion = np.cumsum(motion)

    selected_indices: list[int] = []
    # Half-width (in candidates) of the window searched around each mark.
    neighborhood = max(2, len(segment) // max(target_frames * 2, 1))

    if float(cumulative_motion[-1]) <= 1e-5:
        # Essentially static clip: space the marks by candidate index instead.
        marks = np.linspace(0, len(segment) - 1, target_frames)
        mark_distances = np.arange(len(segment), dtype=np.float32)
    else:
        marks = np.linspace(float(cumulative_motion[0]), float(cumulative_motion[-1]), target_frames)
        mark_distances = cumulative_motion

    for mark in marks:
        center = int(np.searchsorted(mark_distances, mark))
        best_index: int | None = None
        best_score = float("inf")
        # Only indices after the previous pick are eligible.
        min_allowed = selected_indices[-1] + 1 if selected_indices else 0
        lower = max(min_allowed, center - neighborhood)
        upper = min(len(segment), center + neighborhood + 1)
        # Try the local window first; fall back to everything still eligible.
        search_ranges = [(lower, upper), (min_allowed, len(segment))]

        for range_start, range_end in search_ranges:
            for idx in range(range_start, range_end):
                candidate = segment[idx]
                # Lower score is better: distance to the mark, plus penalties
                # for blur and for sitting right next to the previous pick,
                # minus a small bonus for sharpness.
                mark_penalty = abs(float(mark_distances[idx]) - float(mark))
                blur_penalty = 0.25 if candidate.blur_score < blur_threshold else 0.0
                spacing_penalty = 0.15 if selected_indices and idx - selected_indices[-1] < 2 else 0.0
                sharpness_bonus = 0.08 * float(normalized_blur[idx])
                score = mark_penalty + blur_penalty + spacing_penalty - sharpness_bonus
                if score < best_score:
                    best_score = score
                    best_index = idx
            if best_index is not None:
                break

        if best_index is not None and (not selected_indices or best_index > selected_indices[-1]):
            selected_indices.append(best_index)

    selected_indices = sorted(set(selected_indices))
    if len(selected_indices) < target_frames:
        # Top up with unused candidates: sharpest first, ties broken by the
        # greater distance from any already-selected index.
        remaining = [idx for idx in range(len(segment)) if idx not in selected_indices]
        remaining.sort(
            key=lambda idx: (
                -segment[idx].blur_score,
                -(min(abs(idx - chosen) for chosen in selected_indices) if selected_indices else float("inf")),
            )
        )
        for idx in remaining:
            if len(selected_indices) >= target_frames:
                break
            selected_indices.append(idx)
        selected_indices.sort()

    trimmed = selected_indices[:target_frames]
    return [segment[idx] for idx in trimmed]
381
+
382
+
383
def export_selected_images(scene_dir: Path, selected_frames: list[FrameCandidate]) -> list[Path]:
    """Copy the selected candidate images into ``scene_dir / "images"``.

    Files are renamed to ``frame_NNNN.jpg`` in selection order. Returns the
    destination paths in that same order.
    """
    images_dir = scene_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    copied: list[Path] = []
    position = 0
    for frame in selected_frames:
        target = images_dir / f"frame_{position:04d}.jpg"
        shutil.copy2(frame.path, target)
        copied.append(target)
        position += 1
    return copied
391
+
392
+
393
def run_colmap(scene_dir: Path, selected_count: int, profile: ProfileConfig, max_image_edge: int) -> Path:
    """Run COLMAP feature extraction, sequential matching and mapping.

    All three steps run inside ``scene_dir`` with GPU use disabled, reading
    images from ``scene_dir / "images"`` and writing ``database.db`` and the
    ``sparse`` directory there.

    Returns:
        The first (sorted) model directory created under ``sparse``.

    Raises:
        RuntimeError: if ``colmap`` is not on PATH, a step fails, or no model
            directory was produced.
    """
    _require_binary("colmap")
    database_path = scene_dir / "database.db"
    images_dir = scene_dir / "images"
    sparse_dir = _ensure_dir(scene_dir / "sparse")

    # Matching overlap and minimum model size both scale with the frame count.
    overlap = min(profile.sequential_overlap, max(selected_count - 1, 1))
    min_model_size = min(8, max(selected_count // 3, 4))

    extract_cmd = [
        "colmap",
        "feature_extractor",
        "--database_path",
        str(database_path),
        "--image_path",
        str(images_dir),
        "--ImageReader.single_camera",
        "1",
        "--ImageReader.camera_model",
        "SIMPLE_RADIAL",
        "--SiftExtraction.use_gpu",
        "0",
        "--SiftExtraction.max_image_size",
        str(max_image_edge),
    ]
    match_cmd = [
        "colmap",
        "sequential_matcher",
        "--database_path",
        str(database_path),
        "--SiftMatching.use_gpu",
        "0",
        "--SequentialMatching.overlap",
        str(overlap),
        "--SequentialMatching.quadratic_overlap",
        "1",
        "--SequentialMatching.loop_detection",
        "0",
    ]
    map_cmd = [
        "colmap",
        "mapper",
        "--database_path",
        str(database_path),
        "--image_path",
        str(images_dir),
        "--output_path",
        str(sparse_dir),
        "--Mapper.multiple_models",
        "0",
        "--Mapper.extract_colors",
        "0",
        "--Mapper.min_model_size",
        str(min_model_size),
    ]

    for command in (extract_cmd, match_cmd, map_cmd):
        _run(command, cwd=scene_dir)

    model_dirs = sorted(entry for entry in sparse_dir.iterdir() if entry.is_dir())
    if model_dirs:
        return model_dirs[0]
    raise RuntimeError("COLMAP did not produce a sparse reconstruction.")
459
+
460
+
461
def count_registered_images(model_dir: Path) -> int:
    """Return the number of images recorded in a COLMAP model directory.

    ``images.bin`` is preferred: its first 8 bytes are read as a little-endian
    unsigned 64-bit count. Otherwise ``images.txt`` is used, counting two
    non-comment, non-blank lines per image. Returns 0 when neither file exists
    or when ``images.bin`` is too short to contain the count.
    """
    image_bin = model_dir / "images.bin"
    image_txt = model_dir / "images.txt"
    if image_bin.exists():
        with image_bin.open("rb") as handle:
            header = handle.read(8)
        # An empty or truncated file cannot be unpacked; report no images
        # instead of raising struct.error.
        if len(header) < 8:
            return 0
        return int(struct.unpack("<Q", header)[0])

    if image_txt.exists():
        lines = [line.strip() for line in image_txt.read_text(encoding="utf-8").splitlines()]
        payload = [line for line in lines if line and not line.startswith("#")]
        return len(payload) // 2

    return 0
475
+
476
+
477
def quality_label(registered_frames: int, selected_frames: int) -> str:
    """Classify a reconstruction by the share of selected frames registered.

    Returns "unknown" when ``selected_frames`` is not positive, "strong" at a
    ratio of 0.85 or more, "usable" at 0.6 or more, and "weak" otherwise.
    """
    if selected_frames <= 0:
        return "unknown"

    share = registered_frames / selected_frames
    for minimum, label in ((0.85, "strong"), (0.6, "usable")):
        if share >= minimum:
            return label
    return "weak"
487
+
488
+
489
def create_contact_sheet(selected_frames: list[FrameCandidate], output_path: Path) -> Path:
    """Render the selected frames as a labelled grid and save it as a JPEG.

    Each frame is resized to at most 320 pixels on its longer side and gets a
    darkened banner showing its timestamp and blur score. Tiles are laid out
    row by row, up to four per row, on a dark canvas. Frames whose image file
    cannot be read are skipped.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: if ``selected_frames`` is empty or none of the frame
            images could be read.
    """
    if not selected_frames:
        raise RuntimeError("No selected frames were available for the contact sheet.")

    thumbs: list[np.ndarray] = []
    for candidate in selected_frames:
        image = cv2.imread(str(candidate.path), cv2.IMREAD_COLOR)
        if image is None:
            continue
        image = _resize_max_edge(image, 320)
        overlay = image.copy()
        label = f"{candidate.timestamp_seconds:0.2f}s | blur {candidate.blur_score:0.0f}"
        # Blend a filled strip over the top of the tile so the label stays legible.
        cv2.rectangle(overlay, (0, 0), (image.shape[1], 32), (12, 18, 28), -1)
        image = cv2.addWeighted(overlay, 0.72, image, 0.28, 0.0)
        cv2.putText(image, label, (10, 22), FONT, 0.55, (230, 235, 240), 1, cv2.LINE_AA)
        thumbs.append(image)

    # Without this guard an all-unreadable selection would divide by zero below.
    if not thumbs:
        raise RuntimeError("None of the selected frame images could be read for the contact sheet.")

    cols = min(4, len(thumbs))
    rows = int(math.ceil(len(thumbs) / cols))
    cell_height = max(image.shape[0] for image in thumbs)
    cell_width = max(image.shape[1] for image in thumbs)
    canvas = np.full((rows * cell_height, cols * cell_width, 3), 18, dtype=np.uint8)

    for index, image in enumerate(thumbs):
        row, col = divmod(index, cols)
        y = row * cell_height
        x = col * cell_width
        canvas[y : y + image.shape[0], x : x + image.shape[1]] = image

    cv2.imwrite(str(output_path), canvas, [int(cv2.IMWRITE_JPEG_QUALITY), 92])
    return output_path
521
+
522
+
523
def write_report(
    scene_dir: Path,
    metadata: VideoMetadata,
    selected_frames: list[FrameCandidate],
    registered_frames: int,
    profile_key: str,
    max_image_edge: int,
) -> Path:
    """Write ``report.json`` into ``scene_dir`` and return its path.

    The report has four top-level keys: ``scene_name`` (the directory name),
    ``video`` (source metadata), ``selection`` (settings, frame counts and the
    quality label) and ``frames`` (one entry per selected frame, in order).
    """
    selected_count = len(selected_frames)

    video_section = {
        "fps": metadata.fps,
        "frame_count": metadata.frame_count,
        "duration_seconds": metadata.duration_seconds,
        "width": metadata.width,
        "height": metadata.height,
    }
    selection_section = {
        "profile": profile_key,
        "max_image_edge": max_image_edge,
        "selected_frames": selected_count,
        "registered_frames": registered_frames,
        "quality_label": quality_label(registered_frames, selected_count),
    }

    frame_entries = []
    for position, frame in enumerate(selected_frames):
        frame_entries.append(
            {
                "filename": f"images/frame_{position:04d}.jpg",
                "timestamp_seconds": frame.timestamp_seconds,
                "source_frame_index": frame.frame_index,
                "blur_score": frame.blur_score,
                "motion_score": frame.motion_score,
                "cut_score": frame.cut_score,
            }
        )

    report = {
        "scene_name": scene_dir.name,
        "video": video_section,
        "selection": selection_section,
        "frames": frame_entries,
    }
    report_path = scene_dir / "report.json"
    serialized = json.dumps(report, indent=2)
    report_path.write_text(serialized, encoding="utf-8")
    return report_path
562
+
563
+
564
def build_archive(scene_dir: Path, output_archive: Path) -> Path:
    """Zip the deliverable parts of ``scene_dir`` into ``output_archive``.

    The archive contains ``images/``, ``sparse/`` and, when present,
    ``report.json``, all under a top-level folder named after ``scene_dir``.
    Other files in the scene directory are left out. Files are added straight
    from ``scene_dir`` so no staging copy is left behind on disk.

    Returns:
        ``output_archive``.

    Raises:
        FileNotFoundError: if ``images`` or ``sparse`` is missing from
            ``scene_dir``.
    """
    members: list[Path] = []
    for folder_name in ("images", "sparse"):
        folder = scene_dir / folder_name
        if not folder.is_dir():
            raise FileNotFoundError(f"Scene directory is missing required folder: {folder}")
        members.extend(path for path in folder.rglob("*") if path.is_file())

    report_path = scene_dir / "report.json"
    if report_path.exists():
        members.append(report_path)

    root_name = Path(scene_dir.name)
    with zipfile.ZipFile(output_archive, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        # Sorted so the member order does not depend on directory listing order.
        for path in sorted(members):
            arcname = root_name / path.relative_to(scene_dir)
            archive.write(path, arcname.as_posix())
    return output_archive
578
+
579
+
580
def convert_video_to_colmap_archive(
    video_path: str | Path,
    target_frames: int,
    profile_key: str,
    max_image_edge: int,
) -> ConversionOutputs:
    """Convert a video into a zipped COLMAP scene plus a report and contact sheet.

    Steps: normalize the video with ffmpeg, sample and score candidate frames,
    keep the best segment, select keyframes, run COLMAP, then write the
    contact sheet, archive and report into ``OUTPUTS_DIR``.

    Args:
        video_path: Path of the uploaded video.
        target_frames: Desired number of keyframes.
        profile_key: Key into ``PROFILES``.
        max_image_edge: Maximum length of the longer image side, in pixels.

    Returns:
        Paths of the generated files and summary statistics.

    Raises:
        ValueError: if ``profile_key`` is not a known profile.
        FileNotFoundError: if ``video_path`` does not exist.
        RuntimeError: if any pipeline step fails.
    """
    if profile_key not in PROFILES:
        raise ValueError(f"Unknown sampling profile: {profile_key}")

    source_path = Path(video_path)
    if not source_path.exists():
        raise FileNotFoundError(f"Input video not found: {source_path}")

    profile = PROFILES[profile_key]
    job_dir = _unique_dir(WORK_DIR, "video-job")
    try:
        normalized_path = normalize_video_input(source_path, job_dir)
        metadata = read_video_metadata(normalized_path)

        candidates_dir = _ensure_dir(job_dir / "candidates")
        candidates = extract_candidates(
            video_path=normalized_path,
            metadata=metadata,
            candidates_dir=candidates_dir,
            target_frames=target_frames,
            max_image_edge=max_image_edge,
            profile=profile,
        )
        segment = choose_best_segment(segment_candidates(candidates, profile), target_frames, profile)
        selected = select_keyframes(segment, target_frames, profile)

        scene_name = f"{_slugify(source_path.stem)}-{_now_ms()}"
        scene_dir = _ensure_dir(job_dir / scene_name)
        export_selected_images(scene_dir, selected)
        model_dir = run_colmap(scene_dir, len(selected), profile, max_image_edge)
        registered_frames = count_registered_images(model_dir)
        report_path = write_report(scene_dir, metadata, selected, registered_frames, profile_key, max_image_edge)

        # Everything the caller receives is written or copied into OUTPUTS_DIR
        # before the job directory is removed below.
        output_stem = f"{scene_name}-{profile_key}-{len(selected)}"
        contact_sheet_path = create_contact_sheet(selected, OUTPUTS_DIR / f"{output_stem}.jpg")
        archive_path = build_archive(scene_dir, OUTPUTS_DIR / f"{output_stem}.zip")
        output_report_path = OUTPUTS_DIR / f"{output_stem}.report.json"
        shutil.copy2(report_path, output_report_path)
    finally:
        # The job directory holds the re-encoded video, every candidate JPEG
        # and the COLMAP database; remove it on success and on failure so
        # repeated runs do not fill the disk.
        shutil.rmtree(job_dir, ignore_errors=True)

    return ConversionOutputs(
        archive_path=archive_path,
        report_path=output_report_path,
        contact_sheet_path=contact_sheet_path,
        scene_name=scene_name,
        selected_frames=len(selected),
        registered_frames=registered_frames,
        duration_seconds=metadata.duration_seconds,
        quality_label=quality_label(registered_frames, len(selected)),
    )