Segment-Anything-2-video-tracking

Runtime error

Segment-Anything-2-video-tracking / app.py

Mirko Trasciatti

Visualize SAM2 ball trajectory using mask centroids

11e7a5f 7 months ago

51.7 kB

	import colorsys
	import gc
	from copy import deepcopy
	import base64
	from pathlib import Path
	# Base64 text asset holding the example clip, and the MP4 file it is decoded
	# into by ensure_example_video(). Both paths are relative to the working directory.
	BASE64_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.b64")
	EXAMPLE_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.mp4")


	def ensure_example_video() -> str:
	    """Return the path of the example MP4, decoding it from base64 on first use.

	    When EXAMPLE_VIDEO_PATH is missing, the text in BASE64_VIDEO_PATH is
	    base64-decoded and written there.

	    Raises:
	        FileNotFoundError: neither the MP4 nor the base64 asset exists.
	    """
	    if not EXAMPLE_VIDEO_PATH.exists():
	        if not BASE64_VIDEO_PATH.exists():
	            raise FileNotFoundError("Base64 video asset not found.")
	        encoded_text = BASE64_VIDEO_PATH.read_text()
	        video_bytes = base64.b64decode(encoded_text)
	        EXAMPLE_VIDEO_PATH.write_bytes(video_bytes)
	    return str(EXAMPLE_VIDEO_PATH)

	from types import SimpleNamespace
	from typing import Optional

	import cv2
	import gradio as gr
	import numpy as np
	import spaces
	import torch
	from gradio.themes import Soft
	from PIL import Image, ImageDraw

	from transformers import AutoModel, Sam2VideoProcessor
	from ultralytics import YOLO
	from huggingface_hub import hf_hub_download

	# Loaded YOLO models keyed by weight filename; filled lazily by get_yolo_model().
	YOLO_MODEL_CACHE: dict[str, YOLO] = {}
	# Default weight file, downloaded from YOLO_REPO_ID on the Hugging Face Hub.
	YOLO_DEFAULT_MODEL = "yolov13n.pt"
	YOLO_REPO_ID = "atalaydenknalbant/Yolov13"
	# Class name (matched against the lower-cased model class names) that
	# detect_ball_center() restricts detection to.
	YOLO_TARGET_NAME = "sports ball"
	# Defaults forwarded as `conf` and `iou` to model.predict() in detect_ball_center().
	YOLO_CONF_THRESHOLD = 0.0
	YOLO_IOU_THRESHOLD = 0.02


	def get_yolo_model(model_filename: str = YOLO_DEFAULT_MODEL) -> YOLO:
	    """Return the YOLO model for ``model_filename``, loading it at most once.

	    On a cache miss the weight file is fetched from YOLO_REPO_ID via
	    hf_hub_download, wrapped in a YOLO instance and stored in YOLO_MODEL_CACHE.
	    """
	    try:
	        return YOLO_MODEL_CACHE[model_filename]
	    except KeyError:
	        pass

	    weights_path = hf_hub_download(repo_id=YOLO_REPO_ID, filename=model_filename)
	    loaded = YOLO(weights_path)
	    YOLO_MODEL_CACHE[model_filename] = loaded
	    return loaded


	def detect_ball_center(
	    frame: Image.Image,
	    model_filename: str = YOLO_DEFAULT_MODEL,
	    conf_threshold: float = YOLO_CONF_THRESHOLD,
	    iou_threshold: float = YOLO_IOU_THRESHOLD,
	) -> Optional[tuple[int, int, int, int, float]]:
	    """Locate a sports ball in one frame with YOLO.

	    Prediction is restricted to the classes whose lower-cased name equals
	    YOLO_TARGET_NAME and to a single detection (``max_det=1``), on CPU.

	    Returns:
	        (x_center, y_center, width, height, confidence) with the box values
	        rounded to ints, or None when the model has no matching class or
	        nothing was detected.
	    """
	    detector = get_yolo_model(model_filename)

	    target_classes = []
	    for class_idx, class_name in detector.names.items():
	        if class_name.lower() == YOLO_TARGET_NAME:
	            target_classes.append(class_idx)
	    if not target_classes:
	        return None

	    predictions = detector.predict(
	        source=frame,
	        conf=conf_threshold,
	        iou=iou_threshold,
	        max_det=1,
	        classes=target_classes,
	        imgsz=640,
	        device="cpu",
	        verbose=False,
	    )
	    if not predictions:
	        return None

	    detections = predictions[0].boxes
	    if detections is None or len(detections) == 0:
	        return None

	    best = detections[0]
	    # xywh format: x_center, y_center, width, height
	    box_values = best.xywh[0].cpu().tolist()
	    if best.conf is not None:
	        confidence = float(best.conf[0].cpu().item())
	    else:
	        confidence = 0.0
	    cx, cy, box_w, box_h = (int(round(component)) for component in box_values)
	    return cx, cy, box_w, box_h, confidence


	def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
	    """Return a deterministic pastel (R, G, B) colour for ``obj_id``.

	    The hue is ``obj_id`` times the golden-ratio conjugate, wrapped into
	    [0, 1); saturation is fixed at 0.45 and value at 1.0. Channels are
	    truncated to ints in 0..255.
	    """
	    hue = (obj_id * 0.61803398875) % 1.0
	    red, green, blue = (
	        int(channel * 255) for channel in colorsys.hsv_to_rgb(hue, 0.45, 1.0)
	    )
	    return red, green, blue


	def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], dict]:
	    """Decode every frame of a video into RGB PIL images using OpenCV.

	    Args:
	        video_path_or_url: anything accepted by ``cv2.VideoCapture``.

	    Returns:
	        ``(frames, info)`` where ``info`` holds ``"num_frames"`` and ``"fps"``.
	        ``"fps"`` is None when OpenCV reports no positive frame rate. A source
	        that cannot be opened yields an empty frame list rather than an error.
	    """
	    cap = cv2.VideoCapture(video_path_or_url)
	    frames: list[Image.Image] = []
	    fps_val = None
	    print("loading video frames")
	    try:
	        while cap.isOpened():
	            ret, frame = cap.read()
	            if not ret:
	                break
	            # OpenCV decodes to BGR; PIL expects RGB.
	            frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
	        # Gather fps if available
	        fps_val = cap.get(cv2.CAP_PROP_FPS)
	    finally:
	        # Release the capture even when decoding or conversion raises.
	        cap.release()
	    print("loaded video frames")
	    info = {
	        "num_frames": len(frames),
	        "fps": float(fps_val) if fps_val and fps_val > 0 else None,
	    }
	    return frames, info


	def overlay_masks_on_frame(
	    frame: Image.Image,
	    masks_per_object: dict[int, np.ndarray],
	    color_by_obj: dict[int, tuple[int, int, int]],
	    alpha: float = 0.5,
	) -> Image.Image:
	    """Alpha-blend each object's mask, in its colour, over a copy of ``frame``.

	    masks_per_object: obj_id -> mask; values are cast to float32, a 3-D mask is
	        squeezed, and values are clipped to [0, 1]. ``None`` entries are skipped.
	    color_by_obj: obj_id -> (R, G, B); objects without an entry are drawn red.
	    alpha: blend strength applied on top of the mask value.
	    """
	    canvas = np.array(frame).astype(np.float32) / 255.0  # H, W, 3 in [0,1]

	    for obj_id, mask in masks_per_object.items():
	        if mask is None:
	            continue
	        soft = mask if mask.dtype == np.float32 else mask.astype(np.float32)
	        # Ensure shape is H x W
	        if soft.ndim == 3:
	            soft = soft.squeeze()
	        soft = np.clip(soft, 0.0, 1.0)
	        rgb = np.array(color_by_obj.get(obj_id, (255, 0, 0)), dtype=np.float32) / 255.0
	        # Per-pixel weight of the object colour; the frame keeps the remainder.
	        weight = soft[..., None]
	        canvas = (1.0 - alpha * weight) * canvas + (alpha * weight) * rgb

	    blended = np.clip(canvas * 255.0, 0, 255).astype(np.uint8)
	    return Image.fromarray(blended)


	def get_device_and_dtype() -> tuple[str, torch.dtype]:
	    """Return the fixed (device, dtype) pair used when loading the model: CPU and bfloat16."""
	    return "cpu", torch.bfloat16


	class AppState:
	    """Mutable per-session container for the loaded video, model objects and prompts.

	    Every field is created by reset(), which __init__ delegates to.
	    """

	    def __init__(self):
	        self.reset()

	    def reset(self):
	        """Set every field back to its initial value (no video, no model, no prompts)."""
	        self.video_frames: list[Image.Image] = []
	        self.inference_session = None
	        self.model: Optional[AutoModel] = None
	        self.processor: Optional[Sam2VideoProcessor] = None
	        self.device: str = "cpu"
	        self.dtype: torch.dtype = torch.bfloat16
	        self.video_fps: float | None = None
	        # frame index -> {object id -> mask array}
	        self.masks_by_frame: dict[int, dict[int, np.ndarray]] = {}
	        self.color_by_obj: dict[int, tuple[int, int, int]] = {}
	        # frame index -> {object id -> [(x, y, label), ...]}
	        self.clicks_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int]]]] = {}
	        # frame index -> {object id -> [(x1, y1, x2, y2), ...]}
	        self.boxes_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int, int]]]] = {}
	        # Cache of composited frames (original + masks + clicks)
	        self.composited_frames: dict[int, Image.Image] = {}
	        # UI state for click handler
	        self.current_frame_idx: int = 0
	        self.current_obj_id: int = 1
	        self.current_label: str = "positive"
	        self.current_clear_old: bool = True
	        self.current_prompt_type: str = "Points"  # or "Boxes"
	        # First corner of a box that is still waiting for its second click
	        self.pending_box_start: tuple[int, int] | None = None
	        self.pending_box_start_frame_idx: int | None = None
	        self.pending_box_start_obj_id: int | None = None
	        self.is_switching_model: bool = False
	        # object id -> {frame index -> (cx, cy) mask centroid}
	        self.ball_centers: dict[int, dict[int, tuple[int, int]]] = {}
	        # Model selection
	        self.model_repo_key: str = "tiny"
	        self.model_repo_id: str | None = None
	        self.session_repo_id: str | None = None

	    def __repr__(self):
	        # Report sizes instead of contents for the bulky fields: formatting every
	        # frame image and mask array makes the repr enormous and slow to build.
	        return (
	            "AppState("
	            f"num_frames={len(self.video_frames)}, "
	            f"inference_session={self.inference_session is not None}, "
	            f"model={self.model is not None}, "
	            f"processor={self.processor is not None}, "
	            f"device={self.device}, "
	            f"dtype={self.dtype}, "
	            f"video_fps={self.video_fps}, "
	            f"frames_with_masks={len(self.masks_by_frame)}, "
	            f"color_by_obj={self.color_by_obj}, "
	            f"clicks_by_frame_obj={self.clicks_by_frame_obj}, "
	            f"boxes_by_frame_obj={self.boxes_by_frame_obj}, "
	            f"composited_frames={len(self.composited_frames)}, "
	            f"objects_with_centers={len(self.ball_centers)}, "
	            f"current_frame_idx={self.current_frame_idx}, "
	            f"current_obj_id={self.current_obj_id}, "
	            f"current_label={self.current_label}, "
	            f"current_clear_old={self.current_clear_old}, "
	            f"current_prompt_type={self.current_prompt_type}, "
	            f"pending_box_start={self.pending_box_start}, "
	            f"pending_box_start_frame_idx={self.pending_box_start_frame_idx}, "
	            f"pending_box_start_obj_id={self.pending_box_start_obj_id}, "
	            f"is_switching_model={self.is_switching_model}, "
	            f"model_repo_key={self.model_repo_key}, "
	            f"model_repo_id={self.model_repo_id}, "
	            f"session_repo_id={self.session_repo_id})"
	        )

	    @property
	    def num_frames(self) -> int:
	        """Number of frames currently loaded."""
	        return len(self.video_frames)


	def _model_repo_from_key(key: str) -> str:
	mapping = {
	"tiny": "facebook/sam2.1-hiera-tiny",
	"small": "facebook/sam2.1-hiera-small",
	"base_plus": "facebook/sam2.1-hiera-base-plus",
	"large": "facebook/sam2.1-hiera-large",
	}
	return mapping.get(key, mapping["base_plus"])


	def load_model_if_needed(GLOBAL_STATE: gr.State) -> tuple[AutoModel, Sam2VideoProcessor, str, torch.dtype]:
	    """Make sure the model and processor for the selected checkpoint are loaded.

	    The repo is derived from ``GLOBAL_STATE.model_repo_key``. When a model and
	    processor for that repo are already stored on the state they are reused;
	    otherwise both are loaded with ``from_pretrained`` and stored on the state
	    together with the device, dtype and repo id.

	    Returns:
	        (model, processor, device, dtype) on both the cached and the fresh-load
	        path.
	    """
	    desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
	    if GLOBAL_STATE.model is not None and GLOBAL_STATE.processor is not None:
	        if GLOBAL_STATE.model_repo_id == desired_repo:
	            return GLOBAL_STATE.model, GLOBAL_STATE.processor, GLOBAL_STATE.device, GLOBAL_STATE.dtype
	        # Different repo requested: drop the references before loading the new
	        # checkpoint so the old weights are no longer kept alive by the state.
	        GLOBAL_STATE.model = None
	        GLOBAL_STATE.processor = None
	    print(f"Loading model from {desired_repo}")
	    device, dtype = get_device_and_dtype()
	    model = AutoModel.from_pretrained(desired_repo)
	    processor = Sam2VideoProcessor.from_pretrained(desired_repo)
	    model.to(device, dtype=dtype)

	    GLOBAL_STATE.model = model
	    GLOBAL_STATE.processor = processor
	    GLOBAL_STATE.device = device
	    GLOBAL_STATE.dtype = dtype
	    GLOBAL_STATE.model_repo_id = desired_repo
	    # Previously this path fell off the end and returned None, contradicting
	    # both the annotation and the cached path above.
	    return model, processor, device, dtype


	def ensure_session_for_current_model(GLOBAL_STATE: gr.State) -> None:
	    """Ensure the model/processor match the selected repo and inference_session exists.

	    If a video is already loaded, re-initialize the inference session when needed,
	    i.e. when there is no session or the existing one was created for a different
	    repo. Re-initializing also clears the stored masks, clicks, boxes and the
	    composited-frame cache. Nothing is created when no video is loaded.
	    """
	    # Loads (or swaps) the model/processor for the currently selected checkpoint key.
	    load_model_if_needed(GLOBAL_STATE)
	    desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
	    if GLOBAL_STATE.inference_session is None or GLOBAL_STATE.session_repo_id != desired_repo:
	        if GLOBAL_STATE.video_frames:
	            # Clear session-related UI caches when switching model
	            GLOBAL_STATE.masks_by_frame.clear()
	            GLOBAL_STATE.clicks_by_frame_obj.clear()
	            GLOBAL_STATE.boxes_by_frame_obj.clear()
	            GLOBAL_STATE.composited_frames.clear()
	            # Drop the old session before creating its replacement.
	            GLOBAL_STATE.inference_session = None
	            GLOBAL_STATE.inference_session = GLOBAL_STATE.processor.init_video_session(
	                inference_device=GLOBAL_STATE.device,
	                video_storage_device="cpu",
	                dtype=GLOBAL_STATE.dtype,
	            )
	            # Remember which repo this session belongs to for the check above.
	            GLOBAL_STATE.session_repo_id = desired_repo


	def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppState, int, int, Image.Image, str]:
	    """Gradio handler: load video, init session, return state, slider bounds, and first frame.

	    Args:
	        GLOBAL_STATE: the session's AppState; video-related fields are replaced,
	            the loaded model is kept.
	        video: a file path, or a dict carrying the path under "name", "path"
	            or "data".

	    Returns:
	        (state, slider minimum, slider maximum, first frame, status text).

	    Raises:
	        gr.Error: the input holds no usable path, or no frame could be decoded.
	    """
	    # Enforce max duration of 8 seconds (trim if longer)
	    MAX_SECONDS = 8.0
	    # Frame rate assumed ONLY for the trimming bound when the loader reports none.
	    FALLBACK_FPS = 30.0

	    # Reset ONLY video-related fields, keep model loaded
	    GLOBAL_STATE.video_frames = []
	    GLOBAL_STATE.inference_session = None
	    GLOBAL_STATE.masks_by_frame = {}
	    GLOBAL_STATE.color_by_obj = {}
	    GLOBAL_STATE.ball_centers = {}
	    # Prompts and cached composites belong to the previous video as well; left
	    # in place, update_frame_display() would keep serving the old video's frames.
	    GLOBAL_STATE.clicks_by_frame_obj = {}
	    GLOBAL_STATE.boxes_by_frame_obj = {}
	    GLOBAL_STATE.composited_frames = {}
	    GLOBAL_STATE.pending_box_start = None
	    GLOBAL_STATE.pending_box_start_frame_idx = None
	    GLOBAL_STATE.pending_box_start_obj_id = None

	    load_model_if_needed(GLOBAL_STATE)

	    # Gradio Video may provide a dict with 'name' or a direct file path
	    video_path: Optional[str] = None
	    if isinstance(video, dict):
	        video_path = video.get("name") or video.get("path") or video.get("data")
	    elif isinstance(video, str):
	        video_path = video

	    if not video_path:
	        raise gr.Error("Invalid video input.")

	    frames, info = try_load_video_frames(video_path)
	    if len(frames) == 0:
	        raise gr.Error("No frames could be loaded from the video.")

	    # The loader reports fps=None when OpenCV gives no positive rate; multiplying
	    # or float()-ing None used to raise TypeError here.
	    fps_in = info.get("fps")
	    trimmed_note = ""
	    max_frames_allowed = int(MAX_SECONDS * (fps_in or FALLBACK_FPS))
	    if len(frames) > max_frames_allowed:
	        frames = frames[:max_frames_allowed]
	        if fps_in:
	            trimmed_note = f" (trimmed to {int(MAX_SECONDS)}s = {len(frames)} frames)"
	        else:
	            trimmed_note = f" (trimmed to {len(frames)} frames; source FPS unknown)"
	        if isinstance(info, dict):
	            info["num_frames"] = len(frames)
	    GLOBAL_STATE.video_frames = frames
	    # Keep the original FPS when the loader provided one
	    GLOBAL_STATE.video_fps = float(fps_in) if fps_in else None
	    # Initialize session
	    GLOBAL_STATE.inference_session = GLOBAL_STATE.processor.init_video_session(
	        inference_device=GLOBAL_STATE.device,
	        video_storage_device="cpu",
	        dtype=GLOBAL_STATE.dtype,
	    )

	    first_frame = frames[0]
	    max_idx = len(frames) - 1
	    # Report the dtype actually in use rather than a hard-coded name.
	    dtype_name = str(GLOBAL_STATE.dtype).replace("torch.", "")
	    status = (
	        f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps{trimmed_note}. "
	        f"Device: {GLOBAL_STATE.device}, dtype: {dtype_name}"
	    )
	    return GLOBAL_STATE, 0, max_idx, first_frame, status


	def compose_frame(state: AppState, frame_idx: int, remove_bg: bool = False) -> Optional[Image.Image]:
	    """Render one frame with its masks and prompt markers, and cache the result.

	    Args:
	        state: session state; None or a state without frames yields None.
	        frame_idx: frame to render, clamped to the valid range.
	        remove_bg: when True and masks exist, pixels outside the union of all
	            masks are blacked out instead of overlaying coloured masks.

	    Returns:
	        The composited image (also stored in ``state.composited_frames``), or
	        None when there is nothing to render.
	    """
	    if state is None or state.video_frames is None or len(state.video_frames) == 0:
	        return None
	    frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
	    frame = state.video_frames[frame_idx]
	    masks = state.masks_by_frame.get(frame_idx, {})
	    out_img = frame

	    if len(masks) != 0:
	        if remove_bg:
	            # Remove background - show only tracked objects
	            frame_np = np.array(frame)
	            # Create combined mask for all objects
	            combined_mask = np.zeros(frame_np.shape[:2], dtype=np.float32)
	            for mask in masks.values():
	                if mask is None:
	                    continue
	                if mask.dtype != np.float32:
	                    mask = mask.astype(np.float32)
	                if mask.ndim == 3:
	                    mask = mask.squeeze()
	                combined_mask = np.maximum(combined_mask, np.clip(mask, 0.0, 1.0))

	            # Apply mask - black background where mask is 0
	            mask_3d = np.repeat(combined_mask[:, :, np.newaxis], 3, axis=2)
	            out_img = Image.fromarray((frame_np * mask_3d).astype(np.uint8))
	        else:
	            # Original behavior - overlay colored masks
	            out_img = overlay_masks_on_frame(out_img, masks, state.color_by_obj, alpha=0.65)

	    clicks_map = state.clicks_by_frame_obj.get(frame_idx)
	    box_map = state.boxes_by_frame_obj.get(frame_idx)
	    has_pending_corner = (
	        state.pending_box_start is not None
	        and state.pending_box_start_frame_idx == frame_idx
	        and state.pending_box_start_obj_id is not None
	    )

	    # Without masks out_img is still the stored video frame. ImageDraw paints in
	    # place, so drawing on it would burn markers into state.video_frames for
	    # good (and into anything later computed from that frame). Draw on a copy.
	    if out_img is frame and (clicks_map or has_pending_corner or box_map or state.ball_centers):
	        out_img = frame.copy()

	    # Draw crosses for conditioning frames only (frames with recorded clicks)
	    if clicks_map:
	        draw = ImageDraw.Draw(out_img)
	        for pts in clicks_map.values():
	            for x, y, lbl in pts:
	                # green = positive click, red = negative click
	                color = (0, 255, 0) if int(lbl) == 1 else (255, 0, 0)
	                _draw_cross_marker(draw, x, y, 6, color)
	    # Draw temporary cross for first corner in box mode
	    if has_pending_corner:
	        draw = ImageDraw.Draw(out_img)
	        x, y = state.pending_box_start
	        color = state.color_by_obj.get(state.pending_box_start_obj_id, (255, 255, 255))
	        _draw_cross_marker(draw, x, y, 6, color)
	    # Draw boxes for conditioning frames
	    if box_map:
	        draw = ImageDraw.Draw(out_img)
	        for obj_id, boxes in box_map.items():
	            color = state.color_by_obj.get(obj_id, (255, 255, 255))
	            for x1, y1, x2, y2 in boxes:
	                draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=2)
	    # Draw trajectory centers (all frames)
	    if state.ball_centers:
	        draw = ImageDraw.Draw(out_img)
	        for obj_id, centers in state.ball_centers.items():
	            color = state.color_by_obj.get(obj_id, (255, 255, 0))
	            for cx, cy in centers.values():
	                _draw_cross_marker(draw, cx, cy, 4, color)
	    # Save to cache and return
	    # NOTE(review): the cache key ignores remove_bg, so a background-removed
	    # render is later served by update_frame_display() too - confirm intended.
	    state.composited_frames[frame_idx] = out_img
	    return out_img


	def _draw_cross_marker(draw, x, y, half, color):
	    """Draw a 2 px wide plus-shaped marker with arm length ``half`` centred on (x, y)."""
	    draw.line([(x - half, y), (x + half, y)], fill=color, width=2)
	    draw.line([(x, y - half), (x, y + half)], fill=color, width=2)


	def update_frame_display(state: AppState, frame_idx: int) -> Image.Image:
	    """Return the preview for ``frame_idx``, preferring the composited-frame cache.

	    Yields None when there is no state or no loaded frames. The index is
	    clamped to the valid range; a cache miss is rendered via compose_frame().
	    """
	    if state is None or state.video_frames is None or len(state.video_frames) == 0:
	        return None
	    last_idx = len(state.video_frames) - 1
	    idx = int(np.clip(frame_idx, 0, last_idx))
	    # Serve from cache when available
	    cached = state.composited_frames.get(idx)
	    return compose_frame(state, idx) if cached is None else cached


	def _ensure_color_for_obj(state: AppState, obj_id: int):
	    """Assign ``obj_id`` a pastel colour in ``state.color_by_obj`` unless it already has one."""
	    palette = state.color_by_obj
	    if obj_id in palette:
	        return
	    palette[obj_id] = pastel_color_for_object(obj_id)


	def _compute_mask_centroid(mask: np.ndarray) -> tuple[int, int] | None:
	    """Return the intensity-weighted centroid (cx, cy) of ``mask``.

	    The mask is squeezed when 3-D, clipped to [0, 1] and cast to float32 before
	    its image moments are taken. None is returned for a missing or empty mask,
	    or when the mask sums to zero. Coordinates are truncated to ints.
	    """
	    if mask is None:
	        return None
	    arr = np.array(mask)
	    if arr.ndim == 3:
	        arr = arr.squeeze()
	    if arr.size == 0:
	        return None
	    weights = np.clip(arr, 0.0, 1.0).astype(np.float32)
	    m = cv2.moments(weights)
	    total = m["m00"]
	    if total == 0:
	        return None
	    return int(m["m10"] / total), int(m["m01"] / total)


	def _update_centroids_for_frame(state: AppState, frame_idx: int):
	    """Refresh ``state.ball_centers`` for one frame from that frame's masks.

	    Each object with a mask on the frame gets its centroid stored (or removed
	    when no centroid can be computed) and is guaranteed a colour. Objects
	    without a mask on this frame lose their entry for it.
	    """
	    if state is None:
	        return
	    frame_key = int(frame_idx)
	    objects_on_frame: set[int] = set()
	    for raw_id, mask in state.masks_by_frame.get(frame_key, {}).items():
	        obj_key = int(raw_id)
	        point = _compute_mask_centroid(mask)
	        per_frame = state.ball_centers.setdefault(obj_key, {})
	        if point is None:
	            per_frame.pop(frame_key, None)
	        else:
	            per_frame[frame_key] = point
	        objects_on_frame.add(obj_key)
	        _ensure_color_for_obj(state, obj_key)
	    # Remove frames for objects without masks at this frame
	    for obj_key, per_frame in state.ball_centers.items():
	        if obj_key in objects_on_frame:
	            continue
	        per_frame.pop(frame_key, None)


	def on_image_click(
	    img: Image.Image | np.ndarray,
	    state: AppState,
	    frame_idx: int,
	    obj_id: int,
	    label: str,
	    clear_old: bool,
	    evt: gr.SelectData,
	) -> Image.Image:
	    """Gradio click handler: register a point or box prompt and segment that frame.

	    In "Points" mode each click adds one positive/negative point. In "Boxes"
	    mode the first click stores a corner and the second click, on the same
	    frame and object, completes the box. After a prompt is added the model is
	    run on the frame and the resulting masks and centroids are stored on
	    ``state``.

	    Args:
	        img: the currently displayed preview, returned unchanged when no
	            session exists yet.
	        state: session state holding the model, processor and inference session.
	        frame_idx: frame the prompt applies to.
	        obj_id: object the prompt belongs to.
	        label: point label; anything starting with "pos" (case-insensitive)
	            counts as positive, everything else as negative.
	        clear_old: in Points mode, replace earlier prompts of this object on
	            this frame instead of adding to them.
	        evt: event carrying the click position in ``index`` (x, y) or in
	            ``value`` as a dict with "x" and "y".

	    Returns:
	        The updated preview image for ``frame_idx``.

	    Raises:
	        gr.Error: the click position cannot be read, or ``frame_idx`` is outside
	            the loaded video.
	    """
	    if state is None or state.inference_session is None:
	        return img  # no-op preview when not ready
	    frame_idx = int(frame_idx)
	    if state.is_switching_model:
	        # Gracefully ignore input during model switch; return current preview unchanged
	        return update_frame_display(state, frame_idx)

	    # Parse click coordinates from event; accept both known event shapes
	    x = y = None
	    if evt is not None:
	        index = getattr(evt, "index", None)
	        value = getattr(evt, "value", None)
	        try:
	            if isinstance(index, (list, tuple)) and len(index) == 2:
	                x, y = int(index[0]), int(index[1])
	            elif isinstance(value, dict) and "x" in value and "y" in value:
	                x, y = int(value["x"]), int(value["y"])
	        except (TypeError, ValueError):
	            x = y = None

	    if x is None or y is None:
	        raise gr.Error("Could not read click coordinates.")

	    # Reject indices outside the video explicitly: a too-large index used to
	    # surface as a bare IndexError and a negative one silently wrapped around.
	    if not 0 <= frame_idx < len(state.video_frames):
	        raise gr.Error(
	            f"Frame index {frame_idx} is outside the loaded video "
	            f"(valid range 0-{len(state.video_frames) - 1})."
	        )

	    obj_id = int(obj_id)
	    _ensure_color_for_obj(state, obj_id)

	    processor = state.processor
	    model = state.model
	    inference_session = state.inference_session
	    is_box_mode = state.current_prompt_type == "Boxes"

	    if is_box_mode:
	        # A stored corner only counts when it was placed on this frame for this
	        # object; otherwise the box would mix coordinates from two contexts.
	        corner_matches = (
	            state.pending_box_start is not None
	            and state.pending_box_start_frame_idx == frame_idx
	            and state.pending_box_start_obj_id == obj_id
	        )
	        if not corner_matches:
	            stale_frame = state.pending_box_start_frame_idx
	            if stale_frame is not None:
	                # Drop the composite that still shows the abandoned corner.
	                state.composited_frames.pop(stale_frame, None)
	            # For boxes, always clear old inputs (points) for this object on this frame
	            state.clicks_by_frame_obj.setdefault(frame_idx, {})[obj_id] = []
	            state.pending_box_start = (x, y)
	            state.pending_box_start_frame_idx = frame_idx
	            state.pending_box_start_obj_id = obj_id
	            # Invalidate cache so temporary cross is drawn
	            state.composited_frames.pop(frame_idx, None)
	            return update_frame_display(state, frame_idx)

	    # Preprocess the frame only when the session has not seen it yet. This runs
	    # after the first-corner early return so no work is thrown away there.
	    original_size = None
	    pixel_values = None
	    if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
	        inputs = processor(images=state.video_frames[frame_idx], device=state.device, return_tensors="pt")
	        original_size = inputs.original_sizes[0]
	        pixel_values = inputs.pixel_values[0]

	    if is_box_mode:
	        # Second click: complete the box
	        x1, y1 = state.pending_box_start
	        state.pending_box_start = None
	        state.pending_box_start_frame_idx = None
	        state.pending_box_start_obj_id = None
	        # The two corners may arrive in any order
	        x_min, y_min = min(x1, x), min(y1, y)
	        x_max, y_max = max(x1, x), max(y1, y)

	        processor.add_inputs_to_inference_session(
	            inference_session=inference_session,
	            frame_idx=frame_idx,
	            obj_ids=obj_id,
	            input_boxes=[[[x_min, y_min, x_max, y_max]]],
	            clear_old_inputs=True,  # For boxes, always clear old inputs
	            original_size=original_size,
	        )

	        obj_boxes = state.boxes_by_frame_obj.setdefault(frame_idx, {}).setdefault(obj_id, [])
	        obj_boxes.clear()
	        obj_boxes.append((x_min, y_min, x_max, y_max))
	    else:
	        # Points mode
	        label_int = 1 if str(label).lower().startswith("pos") else 0
	        replace_existing = bool(clear_old)
	        # If clear_old is enabled, clear prior boxes for this object on this frame
	        if replace_existing:
	            state.boxes_by_frame_obj.setdefault(frame_idx, {})[obj_id] = []
	        processor.add_inputs_to_inference_session(
	            inference_session=inference_session,
	            frame_idx=frame_idx,
	            obj_ids=obj_id,
	            input_points=[[[[x, y]]]],
	            input_labels=[[[label_int]]],
	            original_size=original_size,
	            clear_old_inputs=replace_existing,
	        )

	        obj_clicks = state.clicks_by_frame_obj.setdefault(frame_idx, {}).setdefault(obj_id, [])
	        if replace_existing:
	            obj_clicks.clear()
	        obj_clicks.append((x, y, label_int))

	    # Forward on that frame
	    with torch.inference_mode():
	        outputs = model(inference_session=inference_session, frame=pixel_values, frame_idx=frame_idx)

	    H = inference_session.video_height
	    W = inference_session.video_width
	    # Detach and move to CPU as early as possible to reduce device memory pressure
	    pred_masks = outputs.pred_masks.detach().cpu()
	    video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]

	    # Map returned masks to object ids by iterating the session's obj_ids order;
	    # each mask is squeezed to 2-D.
	    masks_for_frame: dict[int, np.ndarray] = {
	        int(oid): video_res_masks[i].cpu().numpy().squeeze()
	        for i, oid in enumerate(list(inference_session.obj_ids))
	    }

	    state.masks_by_frame[frame_idx] = masks_for_frame
	    _update_centroids_for_frame(state, frame_idx)
	    # Invalidate cache for this frame to force recomposition
	    state.composited_frames.pop(frame_idx, None)

	    # Return updated preview
	    return update_frame_display(state, frame_idx)


	@spaces.GPU()
	def propagate_masks(GLOBAL_STATE: gr.State):
	    """Generator handler: run the model over every frame and store the masks.

	    Works on deep copies of the processor, model and inference session that
	    are switched to CUDA; results are written to ``GLOBAL_STATE.masks_by_frame``
	    and the centroid table.

	    Yields:
	        (state, status text, slider update) - once at the start, then every
	        ``PREVIEW_EVERY`` processed frames and on the last frame, and a final
	        summary. When no session exists a single "Load a video first." tuple
	        is yielded.
	    """
	    # How often (in processed frames) the slider is moved to refresh the preview.
	    PREVIEW_EVERY = 30

	    if GLOBAL_STATE is None or GLOBAL_STATE.inference_session is None:
	        # This function is a generator: a plain `return <value>` would hand the
	        # consumer nothing, so the message must be yielded before stopping.
	        yield GLOBAL_STATE, "Load a video first.", gr.update()
	        return

	    processor = deepcopy(GLOBAL_STATE.processor)
	    model = deepcopy(GLOBAL_STATE.model)
	    inference_session = deepcopy(GLOBAL_STATE.inference_session)
	    # set inference device to cuda to use zero gpu
	    inference_session.inference_device = "cuda"
	    inference_session.cache.inference_device = "cuda"
	    model.to("cuda")

	    total = max(1, GLOBAL_STATE.num_frames)
	    processed = 0

	    # Initial status; no slider change yet
	    yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update()

	    last_frame_idx = 0
	    with torch.inference_mode():
	        for frame_idx, frame in enumerate(GLOBAL_STATE.video_frames):
	            # Frames the session already holds are referenced by index only.
	            pixel_values = None
	            if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
	                pixel_values = processor(images=frame, device="cuda", return_tensors="pt").pixel_values[0]
	            sam2_video_output = model(inference_session=inference_session, frame=pixel_values, frame_idx=frame_idx)
	            H = inference_session.video_height
	            W = inference_session.video_width
	            pred_masks = sam2_video_output.pred_masks.detach().cpu()
	            video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
	            last_frame_idx = frame_idx
	            masks_for_frame: dict[int, np.ndarray] = {
	                int(oid): video_res_masks[i].cpu().numpy().squeeze()
	                for i, oid in enumerate(list(inference_session.obj_ids))
	            }
	            GLOBAL_STATE.masks_by_frame[frame_idx] = masks_for_frame
	            _update_centroids_for_frame(GLOBAL_STATE, frame_idx)
	            # Invalidate cache for that frame to force recomposition
	            GLOBAL_STATE.composited_frames.pop(frame_idx, None)

	            processed += 1
	            # Every PREVIEW_EVERY-th frame (or last), move slider to current frame
	            # to update preview via slider binding
	            if processed % PREVIEW_EVERY == 0 or processed == total:
	                yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)

	    text = f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."

	    # Final status; ensure slider points to last processed frame
	    yield GLOBAL_STATE, text, gr.update(value=last_frame_idx)


	def reset_session(GLOBAL_STATE: gr.State) -> tuple:
	    """Clear all prompts and results while keeping the uploaded video and model.

	    Masks, clicks, boxes, cached composites, the pending box corner and the
	    centroid table are emptied, and a fresh inference session is created via
	    ensure_session_for_current_model().

	    Returns:
	        A 6-tuple on every path: (state, preview image, slider bounds, slider
	        value, status text, update that hides and empties a further component).
	    """
	    hide_and_clear = gr.update(visible=False, value="")

	    # Reset only session-related state, keep uploaded video and model
	    if not GLOBAL_STATE.video_frames:
	        # Nothing loaded. This path used to return five values while the path
	        # below returns six; both must supply the same number of outputs.
	        return GLOBAL_STATE, None, 0, 0, "Session reset. Load a new video.", hide_and_clear

	    # Clear prompts and caches
	    GLOBAL_STATE.masks_by_frame.clear()
	    GLOBAL_STATE.clicks_by_frame_obj.clear()
	    GLOBAL_STATE.boxes_by_frame_obj.clear()
	    GLOBAL_STATE.composited_frames.clear()
	    GLOBAL_STATE.pending_box_start = None
	    GLOBAL_STATE.pending_box_start_frame_idx = None
	    GLOBAL_STATE.pending_box_start_obj_id = None
	    GLOBAL_STATE.ball_centers.clear()

	    # Dispose and re-init inference session for current model with existing frames
	    if GLOBAL_STATE.inference_session is not None:
	        try:
	            GLOBAL_STATE.inference_session.reset_inference_session()
	        except Exception as exc:
	            # Best effort: the session is discarded right below anyway, but the
	            # failure is reported instead of vanishing silently.
	            print(f"Warning: resetting the inference session failed: {exc}")
	    GLOBAL_STATE.inference_session = None
	    gc.collect()
	    ensure_session_for_current_model(GLOBAL_STATE)

	    # Keep current slider index if possible
	    current_idx = int(getattr(GLOBAL_STATE, "current_frame_idx", 0))
	    current_idx = max(0, min(current_idx, GLOBAL_STATE.num_frames - 1))
	    preview_img = update_frame_display(GLOBAL_STATE, current_idx)
	    slider_minmax = gr.update(minimum=0, maximum=max(GLOBAL_STATE.num_frames - 1, 0), interactive=True)
	    slider_value = gr.update(value=current_idx)
	    status = "Session reset. Prompts cleared; video preserved."
	    return GLOBAL_STATE, preview_img, slider_minmax, slider_value, status, hide_and_clear


	def create_annotation_preview(video_file, annotations):
	    """
	    Create a preview image showing annotation points on video frames.

	    At most the three lowest annotated frame indices are rendered and pasted
	    side by side.

	    Args:
	        video_file: Path to video file
	        annotations: List of annotation dicts; the keys read here are "frame",
	            "x", "y" and "object_id". A missing or null "frame" counts as 0.

	    Returns:
	        PIL Image with annotations visualized, or None when the video cannot be
	        opened or none of the selected frames can be read.
	    """
	    MAX_PREVIEW_FRAMES = 3

	    # Group annotations by frame
	    frames_to_show = {}
	    for ann in annotations:
	        # `or 0` also covers an explicit null, which would otherwise put None
	        # among the int keys and break sorted() below.
	        frame_idx = int(ann.get("frame") or 0)
	        frames_to_show.setdefault(frame_idx, []).append(ann)

	    cap = cv2.VideoCapture(video_file)
	    annotated_frames = []
	    try:
	        if not cap.isOpened():
	            return None

	        # Read and annotate frames
	        for frame_idx in sorted(frames_to_show)[:MAX_PREVIEW_FRAMES]:
	            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
	            ret, frame = cap.read()
	            if not ret:
	                continue

	            # Convert BGR to RGB
	            pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
	            draw = ImageDraw.Draw(pil_img)

	            # Draw annotations
	            for ann in frames_to_show[frame_idx]:
	                x, y = ann.get("x", 0), ann.get("y", 0)
	                obj_id = ann.get("object_id", 1)

	                # Color based on object ID
	                color = pastel_color_for_object(obj_id)

	                # Draw crosshair
	                size = 20
	                draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
	                draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
	                draw.ellipse([(x - 10, y - 10), (x + 10, y + 10)], outline=color, width=3)

	                # Draw label
	                draw.text((x + 15, y - 15), f"Obj{obj_id} F{frame_idx}", fill=color)

	            # Add frame number label
	            draw.text((10, 10), f"Frame {frame_idx}", fill=(255, 255, 255))

	            annotated_frames.append(pil_img)
	    finally:
	        # Release the capture on every path, including errors while drawing.
	        cap.release()

	    # Combine frames horizontally
	    if not annotated_frames:
	        return None

	    total_width = sum(img.width for img in annotated_frames)
	    max_height = max(img.height for img in annotated_frames)

	    combined = Image.new('RGB', (total_width, max_height))
	    x_offset = 0
	    for img in annotated_frames:
	        combined.paste(img, (x_offset, 0))
	        x_offset += img.width

	    return combined


	@spaces.GPU(duration=120)  # Allocate GPU for up to 2 minutes
	def process_video_api(
	    video_file,
	    annotations_json_str: str,
	    checkpoint: str = "base_plus",
	    remove_background: bool = True
	):
	    """
	    Single-endpoint API: load a video, apply point annotations, propagate, render.

	    Args:
	        video_file: Uploaded video file
	        annotations_json_str: JSON object with an "annotations" list and an
	            optional "fps" entry. Each annotation may carry "object_id",
	            "frame" and/or "timestamp_ms", "x", "y" and "label", e.g.
	            {"annotations": [
	                {"object_id": 1, "frame": 139, "x": 369, "y": 652, "label": "positive"},
	                {"object_id": 2, "frame": 156, "x": 374, "y": 257, "label": "positive"}
	            ]}
	            When "timestamp_ms" is present and the video FPS is known, the frame
	            is derived from the timestamp and overrides "frame".
	        checkpoint: SAM2 model checkpoint (tiny, small, base_plus, large)
	        remove_background: Whether to remove background (default: True)

	    Returns:
	        Tuple of (preview_image, processed_video_path)

	    Raises:
	        gr.Error: any failure during processing, after it was printed with its
	            traceback.
	    """
	    import json

	    try:
	        # Parse annotations
	        payload = json.loads(annotations_json_str)
	        annotations = payload.get("annotations", [])
	        client_fps = payload.get("fps", None)  # FPS used by iOS app to calculate frame indices
	        annotation_count = len(annotations)

	        print(f"[API] Processing video with {annotation_count} annotations")
	        print(f"[API] Client FPS: {client_fps}")
	        print(f"[API] Checkpoint: {checkpoint}")
	        print(f"[API] Remove background: {remove_background}")

	        # Create preview of annotation points
	        preview_img = create_annotation_preview(video_file, annotations)

	        # Create a temporary state for this API call
	        api_state = AppState()
	        api_state.model_repo_key = checkpoint

	        # Step 1: Initialize session with video
	        api_state, _min_idx, _max_idx, first_frame, status = init_video_session(api_state, video_file)
	        space_fps = api_state.video_fps
	        print(f"[API] Video loaded: {status}")
	        print(f"[API] ⚠️ FPS mismatch check: Client={client_fps}, Space={space_fps}")

	        # If FPS mismatch, warn about potential frame offset
	        if client_fps and space_fps and abs(client_fps - space_fps) > 0.5:
	            clip_seconds = api_state.num_frames / client_fps
	            offset_estimate = abs(int((client_fps - space_fps) * clip_seconds))
	            print(f"[API] ⚠️ FPS mismatch detected! Frame indices may be off by ~{offset_estimate} frames")
	            print("[API] ℹ️ Recommendation: Use timestamps instead of frame indices for accuracy")

	        # Step 2: Apply each annotation
	        for position, ann in enumerate(annotations, start=1):
	            object_id = ann.get("object_id", 1)
	            timestamp_ms = ann.get("timestamp_ms", None)
	            frame_idx = ann.get("frame", None)
	            x = ann.get("x", 0)
	            y = ann.get("y", 0)
	            label = ann.get("label", "positive")

	            # Calculate frame from timestamp using Space's FPS (more accurate)
	            if timestamp_ms is not None and space_fps and space_fps > 0:
	                calculated_frame = int((timestamp_ms / 1000.0) * space_fps)
	                if frame_idx is not None and calculated_frame != frame_idx:
	                    print(f"[API] ✅ Using timestamp: {timestamp_ms}ms → Frame {calculated_frame} (client sent frame {frame_idx})")
	                else:
	                    print(f"[API] ✅ Calculated frame from timestamp: {timestamp_ms}ms → Frame {calculated_frame}")
	                frame_idx = calculated_frame
	            elif frame_idx is None:
	                print("[API] ⚠️ Warning: No timestamp or frame provided, using frame 0")
	                frame_idx = 0

	            print(
	                f"[API] Adding annotation {position}/{annotation_count}: "
	                f"Object {object_id}, Frame {frame_idx}, ({x}, {y}), {label}"
	            )

	            # Sync state
	            api_state.current_frame_idx = int(frame_idx)
	            api_state.current_obj_id = int(object_id)
	            api_state.current_label = str(label)

	            # Stand-in for the Gradio select event: only `index` is populated
	            click_event = SimpleNamespace(index=(x, y))

	            # Add the point annotation
	            preview_img = on_image_click(
	                first_frame,
	                api_state,
	                frame_idx,
	                object_id,
	                label,
	                clear_old=False,
	                evt=click_event,
	            )

	        # Step 3: Propagate masks across all frames
	        print("[API] Propagating masks across video...")
	        # The propagation handler is a generator and has to be drained
	        for state_update, status_msg, _slider_update in propagate_masks(api_state):
	            api_state = state_update
	            print(f"[API] Progress: {status_msg}")

	        # Step 4: Render the final video
	        print(f"[API] Rendering video with remove_background={remove_background}...")
	        result_video_path = _render_video(api_state, remove_background)

	        print(f"[API] ✅ Processing complete: {result_video_path}")
	        return preview_img, result_video_path

	    except Exception as e:
	        print(f"[API] ❌ Error: {str(e)}")
	        import traceback
	        traceback.print_exc()
	        raise gr.Error(f"Processing failed: {str(e)}")


	# Gradio "Soft" theme with custom hues; passed to gr.Blocks(theme=...) below.
	theme = Soft(primary_hue="blue", secondary_hue="rose", neutral_hue="slate")

	with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", theme=theme) as demo:
	GLOBAL_STATE = gr.State(AppState())

	gr.Markdown(
	"""
	### SAM2 Video Tracking · powered by Hugging Face 🤗 Transformers
	Segment and track objects across a video with SAM2 (Segment Anything 2). This demo runs the official implementation from the Hugging Face Transformers library for interactive, promptable video segmentation.
	"""
	)
	with gr.Row():
	with gr.Column():
	gr.Markdown(
	"""
	Quick start
	- Load a video: Upload your own or pick an example below.
	- Checkpoint: Tiny / Small / Base+ / Large (trade speed vs. accuracy).
	- Points mode: Select an Object ID and point label (positive/negative), then click the frame to add guidance. You can add multiple points per object and define multiple objects across frames.
	- Boxes mode: Click two opposite corners to draw a box. Old inputs for that object are cleared automatically.
	"""
	)
	with gr.Column():
	gr.Markdown(
	"""
	Working with results
	- Preview: Use the slider to navigate frames and see the current masks.
	- Propagate: Click “Propagate across video” to track all defined objects through the entire video. The preview follows progress periodically to keep things responsive.
	- Export: Render an MP4 for smooth playback using the original video FPS.
	- Note: More info on the Hugging Face 🤗 Transformers implementation of SAM2 can be found [here](https://huggingface.co/docs/transformers/en/main/en/model_doc/sam2_video).
	"""
	)

	with gr.Row():
	with gr.Column(scale=1):
	video_in = gr.Video(label="Upload video", sources=["upload", "webcam"], interactive=True)
	ckpt_radio = gr.Radio(
	choices=["tiny", "small", "base_plus", "large"],
	value="tiny",
	label="SAM2.1 checkpoint",
	)
	ckpt_progress = gr.Markdown(visible=False)
	load_status = gr.Markdown(visible=True)
	reset_btn = gr.Button("Reset Session", variant="secondary")
	with gr.Column(scale=2):
	preview = gr.Image(label="Preview", interactive=True)
	with gr.Row():
	frame_slider = gr.Slider(label="Frame", minimum=0, maximum=0, step=1, value=0, interactive=True)
	with gr.Column(scale=0):
	detect_ball_btn = gr.Button("Detect Ball", variant="secondary")
	ball_status = gr.Markdown(visible=False)
	propagate_btn = gr.Button("Propagate across video", variant="primary")
	propagate_status = gr.Markdown(visible=True)
	with gr.Row():
	obj_id_inp = gr.Number(value=1, precision=0, label="Object ID", scale=0)
	label_radio = gr.Radio(choices=["positive", "negative"], value="positive", label="Point label")
	clear_old_chk = gr.Checkbox(value=False, label="Clear old inputs for this object")
	prompt_type = gr.Radio(choices=["Points", "Boxes"], value="Points", label="Prompt type")

	# Wire events
	def _on_video_change(GLOBAL_STATE: gr.State, video):
	    """Start a session for ``video`` and build the UI updates that go with it.

	    Delegates to ``init_video_session`` and returns, in output order: the
	    session state, a slider update spanning the reported frame range (value
	    reset to the minimum), the first frame for the preview, the load status
	    text, and an update that hides and clears the ball-detection status.
	    """
	    session = init_video_session(GLOBAL_STATE, video)
	    new_state, lo_idx, hi_idx, first_frame, status = session
	    slider_update = gr.update(minimum=lo_idx, maximum=hi_idx, value=lo_idx, interactive=True)
	    hide_ball_status = gr.update(visible=False, value="")
	    return new_state, slider_update, first_frame, status, hide_ball_status

	# Re-initialise the session whenever the video component's value changes.
	video_in.change(
	    _on_video_change,
	    inputs=[GLOBAL_STATE, video_in],
	    outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status],
	    show_progress=True,
	)

	# Example gallery, rendered above the render controls.
	# Each example row must match the number of inputs (GLOBAL_STATE, video_in)
	# NOTE: ensure_example_video() runs while the module is being loaded. It raises
	# FileNotFoundError when neither the decoded MP4 nor the .b64 asset exists,
	# which would abort app start-up.
	example_video_path = ensure_example_video()
	examples_list = [
	[None, example_video_path],
	]
	with gr.Row():
	gr.Examples(
	examples=examples_list,
	inputs=[GLOBAL_STATE, video_in],
	fn=_on_video_change,
	outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status],
	label="Examples",
	cache_examples=False,
	examples_per_page=5,
	)
	# Render controls: background toggle, render button and the playback widget.

	with gr.Row():
	remove_bg_checkbox = gr.Checkbox(
	label="Remove Background",
	value=False,
	info="If checked, shows only tracked objects on black background. If unchecked, overlays colored masks on original video."
	)
	with gr.Row():
	render_btn = gr.Button("Render MP4 for smooth playback", variant="primary")
	# Non-interactive: only ever filled with the file produced by the render handler.
	playback_video = gr.Video(label="Rendered Playback", interactive=False)

	def _on_ckpt_change(s: AppState, key: str):
	    """Switch the session to another checkpoint, streaming progress text.

	    Generator handler: the first yield shows a "Loading checkpoint" message,
	    the last yield hides it again once ``ensure_session_for_current_model``
	    has returned.

	    Args:
	        s: Session state; may be None.
	        key: Checkpoint key selected in the radio group.

	    Yields:
	        Updates for the checkpoint progress Markdown component.
	    """
	    if s is not None and key:
	        key = str(key)
	        if key != s.model_repo_key:
	            # Update and drop current model to reload lazily next time
	            s.is_switching_model = True
	            s.model_repo_key = key
	            s.model_repo_id = None
	            s.model = None
	            s.processor = None
	    try:
	        # Stream progress text while loading (first yield shows text)
	        yield gr.update(visible=True, value=f"Loading checkpoint: {key}...")
	        ensure_session_for_current_model(s)
	    finally:
	        # Always clear the flag, even when loading raises or the generator is
	        # closed early; otherwise the session would stay marked as switching.
	        if s is not None:
	            s.is_switching_model = False
	    # Final yield hides the text
	    yield gr.update(visible=False, value="")

	ckpt_radio.change(_on_ckpt_change, inputs=[GLOBAL_STATE, ckpt_radio], outputs=[ckpt_progress])

	def _sync_frame_idx(state_in: AppState, idx: int):
	    """Record the slider position on the session and return that frame's preview.

	    The index is coerced to ``int``. The state is only written when it is not
	    None, but ``update_frame_display`` is called either way.
	    """
	    frame_no = int(idx)
	    if state_in is not None:
	        state_in.current_frame_idx = frame_no
	    return update_frame_display(state_in, frame_no)

	# Moving the slider refreshes the preview image.
	frame_slider.change(
	    _sync_frame_idx,
	    inputs=[GLOBAL_STATE, frame_slider],
	    outputs=preview,
	)

	def _sync_obj_id(s: AppState, oid):
	    """Copy the Object ID input onto the session state (as ``int``).

	    Nothing is written when either the state or the value is None. A no-op
	    ``gr.update()`` is returned in every case.
	    """
	    if s is None or oid is None:
	        return gr.update()
	    s.current_obj_id = int(oid)
	    return gr.update()

	obj_id_inp.change(_sync_obj_id, inputs=[GLOBAL_STATE, obj_id_inp], outputs=[])

	def _sync_label(s: AppState, lab: str):
	    """Copy the selected point label onto the session state (as ``str``).

	    Skipped when the state or the label is None; always returns a no-op
	    ``gr.update()``.
	    """
	    can_store = s is not None and lab is not None
	    if can_store:
	        s.current_label = str(lab)
	    return gr.update()

	label_radio.change(_sync_label, inputs=[GLOBAL_STATE, label_radio], outputs=[])

	def _sync_prompt_type(s: AppState, val: str):
	    """Store the prompt type and adapt the label / clear-old controls to it.

	    When both the state and the value are present, the prompt type is saved
	    and any pending box start is discarded. Returns two updates, in order:
	    the point-label radio (visible only in points mode) and the clear-old
	    checkbox (editable in points mode; forced on and locked otherwise).
	    """
	    if not (s is None or val is None):
	        s.current_prompt_type = str(val)
	        s.pending_box_start = None

	    is_points = str(val).lower() == "points"
	    label_update = gr.update(visible=is_points)
	    if is_points:
	        clear_old_update = gr.update(interactive=True)
	    else:
	        clear_old_update = gr.update(value=True, interactive=False)
	    return [label_update, clear_old_update]

	prompt_type.change(
	    _sync_prompt_type,
	    inputs=[GLOBAL_STATE, prompt_type],
	    outputs=[label_radio, clear_old_chk],
	)

	def _auto_detect_ball(
	    state_in: AppState,
	    obj_id,
	    label_value: str,
	    clear_old_value: bool,
	):
	    """Detect the ball on frame 0 and add it as a point prompt.

	    Runs ``detect_ball_center`` on the first video frame. On success the
	    detected centre is clamped to the frame bounds and passed to
	    ``on_image_click`` through a synthetic event, so it goes through the same
	    handler as a manual click.

	    Returns:
	        A 3-tuple for (preview, ball status, frame slider): the preview image,
	        a visible status message, and a slider update back to frame 0.

	    Raises:
	        gr.Error: If no video has been loaded.
	    """
	    if state_in is None or state_in.num_frames == 0:
	        raise gr.Error("Load a video first, then try auto-detect.")

	    frame_idx = 0
	    frame = state_in.video_frames[frame_idx]
	    detection = detect_ball_center(frame)
	    if detection is None:
	        # Nothing found: show the unmodified frame and ask for a manual point.
	        plain_preview = update_frame_display(state_in, frame_idx)
	        failure_note = gr.update(
	            value="❌ Unable to auto-detect the ball. Please add a point manually.",
	            visible=True,
	        )
	        return plain_preview, failure_note, gr.update(value=frame_idx)

	    def _clamp(coord, size):
	        # Truncate to int and keep the coordinate inside [0, size - 1].
	        return max(0, min(size - 1, int(coord)))

	    raw_x, raw_y, _, _, conf = detection
	    frame_width, frame_height = frame.size
	    x_center = _clamp(raw_x, frame_width)
	    y_center = _clamp(raw_y, frame_height)

	    # Fall back to the values stored on the session when the inputs are empty.
	    if obj_id is not None:
	        obj_id_int = int(obj_id)
	    else:
	        obj_id_int = state_in.current_obj_id
	    label_str = label_value or state_in.current_label
	    clear_old_flag = bool(clear_old_value)

	    # Build a synthetic click event to reuse existing handler
	    synthetic_evt = SimpleNamespace(
	        index=(x_center, y_center),
	        value={"x": x_center, "y": y_center},
	    )

	    state_in.current_frame_idx = frame_idx
	    base_img = update_frame_display(state_in, frame_idx)
	    preview_img = on_image_click(
	        base_img,
	        state_in,
	        frame_idx,
	        obj_id_int,
	        label_str,
	        clear_old_flag,
	        synthetic_evt,
	    )

	    status_text = f"✅ Auto-detected ball at ({x_center}, {y_center}) (conf={conf:.2f})"
	    status_update = gr.update(value=status_text, visible=True)
	    return preview_img, status_update, gr.update(value=frame_idx)

	detect_ball_btn.click(
	    _auto_detect_ball,
	    inputs=[GLOBAL_STATE, obj_id_inp, label_radio, clear_old_chk],
	    outputs=[preview, ball_status, frame_slider],
	)

	# Image click to add a point and run forward on that frame
	preview.select(
	    on_image_click,
	    [preview, GLOBAL_STATE, frame_slider, obj_id_inp, label_radio, clear_old_chk],
	    preview,
	)

	# Playback via MP4 rendering only

	# Render an MP4 with OpenCV's VideoWriter ("mp4v" codec)
	def _render_video(s: AppState, remove_bg: bool = False):
	    """Compose every frame of the session and encode the result as an MP4.

	    Args:
	        s: Session state; ``num_frames``, ``video_fps`` and
	            ``composited_frames`` are read.
	        remove_bg: Forwarded to ``compose_frame``. When true, cached
	            composites are bypassed and every frame is composed afresh.

	    Returns:
	        Path of the written file, always ``/tmp/sam2_playback.mp4``.

	    Raises:
	        gr.Error: If no video is loaded, or if the writer cannot be opened
	            or fails while encoding.
	    """
	    if s is None or s.num_frames == 0:
	        raise gr.Error("Load a video first.")
	    # Fall back to 12 fps when the session carries no usable frame rate.
	    fps = s.video_fps if s.video_fps and s.video_fps > 0 else 12
	    # Compose all frames (cache will help if already prepared)
	    frames_np = []
	    # The first composed frame fixes the output size; frames are not resized below.
	    first = compose_frame(s, 0, remove_bg=remove_bg)
	    h, w = first.size[1], first.size[0]
	    for idx in range(s.num_frames):
	        # Don't use cache when remove_bg changes behavior
	        if remove_bg:
	            img = compose_frame(s, idx, remove_bg=True)
	        else:
	            img = s.composited_frames.get(idx)
	            if img is None:
	                img = compose_frame(s, idx, remove_bg=False)
	        frames_np.append(np.array(img)[:, :, ::-1])  # BGR for cv2
	        # Periodically release CPU mem to reduce pressure
	        if (idx + 1) % 60 == 0:
	            gc.collect()
	    # NOTE(review): fixed path, so a later render overwrites an earlier one.
	    out_path = "/tmp/sam2_playback.mp4"
	    writer = None
	    try:
	        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
	        writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
	        # VideoWriter does not raise when it cannot open its target, so check
	        # explicitly instead of returning a path to a missing or empty file.
	        if not writer.isOpened():
	            raise RuntimeError(f"could not open video writer for {out_path}")
	        for fr_bgr in frames_np:
	            writer.write(fr_bgr)
	    except Exception as e:
	        print(f"Failed to render video with cv2: {e}")
	        raise gr.Error(f"Failed to render video: {e}") from e
	    finally:
	        # Release the writer on success and on failure alike.
	        if writer is not None:
	            writer.release()
	    return out_path

	render_btn.click(_render_video, inputs=[GLOBAL_STATE, remove_bg_checkbox], outputs=[playback_video])

	# propagate_masks is a generator (process_video_api iterates over it); each item
	# is a (state, status text, slider update) triple matching the three outputs
	# below, so status and slider position are streamed while propagation runs.
	propagate_btn.click(
	propagate_masks,
	inputs=[GLOBAL_STATE],
	outputs=[GLOBAL_STATE, propagate_status, frame_slider],
	)

	# NOTE(review): frame_slider is listed twice in the outputs; presumably
	# reset_session returns two separate slider updates — verify against its
	# return value, which is defined outside this section.
	reset_btn.click(
	reset_session,
	inputs=GLOBAL_STATE,
	outputs=[GLOBAL_STATE, preview, frame_slider, frame_slider, load_status, ball_status],
	)

	# ============================================================================
	# COMBINED INTERFACE WITH EXPLICIT API ENDPOINT
	# ============================================================================
	# Form-style front end for process_video_api. The four inputs (video,
	# annotations JSON, checkpoint, remove-background flag) are passed to it
	# positionally, and its (preview image, video path) result fills the two outputs.
	# NOTE(review): the input order presumably mirrors process_video_api's
	# parameter order — its signature is defined earlier in the file; confirm there.
	api_interface = gr.Interface(
	fn=process_video_api,
	inputs=[
	gr.Video(label="Video File"),
	gr.Textbox(
	label="Annotations JSON",
	placeholder='{"annotations": [{"object_id": 1, "frame": 139, "x": 369, "y": 652, "label": "positive"}]}',
	lines=5
	),
	gr.Radio(
	choices=["tiny", "small", "base_plus", "large"],
	value="base_plus",
	label="SAM2 Checkpoint"
	),
	gr.Checkbox(label="Remove Background", value=True)
	],
	outputs=[
	gr.Image(label="Annotation Preview (shows where points are placed)"),
	gr.Video(label="Processed Video")
	],
	title="SAM2 API",
	description="""
	## Programmatic API for Video Background Removal

	The preview image shows where your annotation points are placed on the video frames.

	Annotations JSON Format:
	```json
	{
	"annotations": [
	{"object_id": 1, "frame": 0, "x": 363, "y": 631, "label": "positive"},
	{"object_id": 1, "frame": 187, "x": 296, "y": 485, "label": "positive"},
	{"object_id": 2, "frame": 187, "x": 296, "y": 412, "label": "positive"}
	]
	}
	```

	- Object 1 (Ball): Frame 0 + Impact frame
	- Object 2 (Player): Impact frame
	- Colors represent different objects
	"""
	)

	# Top-level app: renders the interactive demo and the API form as two tabs,
	# and registers process_video_api a second time under a fixed api_name.
	with gr.Blocks(title="SAM2 Video Tracking") as combined_demo:
	gr.Markdown("# SAM2 Video Tracking")

	with gr.Tabs():
	with gr.TabItem("Interactive UI"):
	# Re-render the Blocks built above inside this tab.
	demo.render()

	with gr.TabItem("API"):
	api_interface.render()

	# Explicitly expose the API function for external API calls.
	# The hidden components below exist only to give the event its
	# input/output signature; they are never shown in the page.
	api_video_input_hidden = gr.Video(visible=False)
	api_annotations_input_hidden = gr.Textbox(visible=False)
	api_checkpoint_input_hidden = gr.Radio(choices=["tiny", "small", "base_plus", "large"], visible=False)
	api_remove_bg_input_hidden = gr.Checkbox(visible=False)
	api_preview_output_hidden = gr.Image(visible=False)
	api_video_output_hidden = gr.Video(visible=False)

	# Invisible button whose click event carries the named endpoint.
	# NOTE(review): the exact HTTP route for api_name="predict" depends on the
	# installed Gradio version — confirm against the deployed version.
	api_dummy_btn = gr.Button("API", visible=False)
	api_dummy_btn.click(
	fn=process_video_api,
	inputs=[api_video_input_hidden, api_annotations_input_hidden, api_checkpoint_input_hidden, api_remove_bg_input_hidden],
	outputs=[api_preview_output_hidden, api_video_output_hidden],
	api_name="predict" # Named endpoint "predict" for external calls
	)

	# Launch with the queue enabled and its API open (api_open=True)
	if __name__ == "__main__":
	combined_demo.queue(api_open=True).launch()