Spaces:

yonigozlan
/

Segment-Anything-2-video-tracking

Running on Zero

App Files Files Community

yonigozlan HF Staff committed on Aug 12

Commit

885c3cb

1 Parent(s): bdce3a4

initial app

Browse files

Files changed (2) hide show

app.py +607 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,607 @@

+import colorsys
+import gc
+from typing import Optional
+import gradio as gr
+import numpy as np
+import spaces
+import torch
+from PIL import Image, ImageDraw
+from transformers import Sam2VideoModel, Sam2VideoProcessor
+def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
+    golden_ratio_conjugate = 0.61803398875
+    hue = (obj_id * golden_ratio_conjugate) % 1.0
+    saturation = 0.45
+    value = 1.0
+    r_f, g_f, b_f = colorsys.hsv_to_rgb(hue, saturation, value)
+    return int(r_f * 255), int(g_f * 255), int(b_f * 255)
+def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], dict]:
+    try:
+        from transformers.video_utils import load_video  # type: ignore
+        frames, info = load_video(video_path_or_url)
+        pil_frames = []
+        for fr in frames:
+            if isinstance(fr, Image.Image):
+                pil_frames.append(fr.convert("RGB"))
+            else:
+                pil_frames.append(Image.fromarray(fr).convert("RGB"))
+        info = info if info is not None else {}
+        if "fps" not in info or not info.get("fps"):
+            try:
+                import cv2  # type: ignore
+                cap = cv2.VideoCapture(video_path_or_url)
+                fps_val = cap.get(cv2.CAP_PROP_FPS)
+                cap.release()
+                if fps_val and fps_val > 0:
+                    info["fps"] = float(fps_val)
+            except Exception:
+                pass
+        return pil_frames, info
+    except Exception:
+        try:
+            import cv2  # type: ignore
+            cap = cv2.VideoCapture(video_path_or_url)
+            frames = []
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                frames.append(Image.fromarray(frame_rgb))
+            fps_val = cap.get(cv2.CAP_PROP_FPS)
+            cap.release()
+            info = {
+                "num_frames": len(frames),
+                "fps": float(fps_val) if fps_val and fps_val > 0 else None,
+            }
+            return frames, info
+        except Exception as e:
+            raise RuntimeError(f"Failed to load video: {e}")
+def overlay_masks_on_frame(
+    frame: Image.Image,
+    masks_per_object: dict[int, np.ndarray],
+    color_by_obj: dict[int, tuple[int, int, int]],
+    alpha: float = 0.65,
+) -> Image.Image:
+    base = np.array(frame).astype(np.float32) / 255.0
+    overlay = base.copy()
+    for obj_id, mask in masks_per_object.items():
+        if mask is None:
+            continue
+        if mask.dtype != np.float32:
+            mask = mask.astype(np.float32)
+        if mask.ndim == 3:
+            mask = mask.squeeze()
+        mask = np.clip(mask, 0.0, 1.0)
+        color = np.array(color_by_obj.get(obj_id, (255, 0, 0)), dtype=np.float32) / 255.0
+        m = mask[..., None]
+        overlay = (1.0 - alpha * m) * overlay + (alpha * m) * color
+    out = np.clip(overlay * 255.0, 0, 255).astype(np.uint8)
+    return Image.fromarray(out)
+def get_device_and_dtype() -> tuple[str, torch.dtype]:
+    # Force CPU-only on Spaces with zero GPU
+    return "cpu", torch.float32
+class AppState:
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.video_frames: list[Image.Image] = []
+        self.inference_session = None
+        self.model: Optional[Sam2VideoModel] = None
+        self.processor: Optional[Sam2VideoProcessor] = None
+        self.device: str = "cpu"
+        self.dtype: torch.dtype = torch.float32
+        self.video_fps: float | None = None
+        self.masks_by_frame: dict[int, dict[int, np.ndarray]] = {}
+        self.color_by_obj: dict[int, tuple[int, int, int]] = {}
+        self.clicks_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int]]]] = {}
+        self.boxes_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int, int]]]] = {}
+        self.composited_frames: dict[int, Image.Image] = {}
+        self.current_frame_idx: int = 0
+        self.current_obj_id: int = 1
+        self.current_label: str = "positive"
+        self.current_clear_old: bool = True
+        self.current_prompt_type: str = "Points"
+        self.pending_box_start: tuple[int, int] | None = None
+        self.pending_box_start_frame_idx: int | None = None
+        self.pending_box_start_obj_id: int | None = None
+        self.is_switching_model: bool = False
+        self.model_repo_key: str = "tiny"
+        self.model_repo_id: str | None = None
+        self.session_repo_id: str | None = None
+    @property
+    def num_frames(self) -> int:
+        return len(self.video_frames)
+GLOBAL_STATE = AppState()
+def _model_repo_from_key(key: str) -> str:
+    mapping = {
+        "tiny": "yonigozlan/sam2.1_hiera_tiny_hf",
+        "small": "yonigozlan/sam2.1_hiera_small_hf",
+        "base_plus": "yonigozlan/sam2.1_hiera_base_plus_hf",
+        "large": "yonigozlan/sam2.1_hiera_large_hf",
+    }
+    return mapping.get(key, mapping["base_plus"])
+@spaces.GPU()
+def load_model_if_needed() -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, torch.dtype]:
+    desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
+    if GLOBAL_STATE.model is not None and GLOBAL_STATE.processor is not None:
+        if GLOBAL_STATE.model_repo_id == desired_repo:
+            return GLOBAL_STATE.model, GLOBAL_STATE.processor, GLOBAL_STATE.device, GLOBAL_STATE.dtype
+        try:
+            del GLOBAL_STATE.model
+        except Exception:
+            pass
+        try:
+            del GLOBAL_STATE.processor
+        except Exception:
+            pass
+        GLOBAL_STATE.model = None
+        GLOBAL_STATE.processor = None
+    device, dtype = get_device_and_dtype()
+    model = Sam2VideoModel.from_pretrained(desired_repo, torch_dtype=dtype)
+    processor = Sam2VideoProcessor.from_pretrained(desired_repo)
+    model.to(device)
+    GLOBAL_STATE.model = model
+    GLOBAL_STATE.processor = processor
+    GLOBAL_STATE.device = device
+    GLOBAL_STATE.dtype = dtype
+    GLOBAL_STATE.model_repo_id = desired_repo
+    return model, processor, device, dtype
+def ensure_session_for_current_model() -> None:
+    model, processor, device, dtype = load_model_if_needed()
+    desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
+    if GLOBAL_STATE.inference_session is None or GLOBAL_STATE.session_repo_id != desired_repo:
+        if GLOBAL_STATE.video_frames:
+            GLOBAL_STATE.masks_by_frame.clear()
+            GLOBAL_STATE.clicks_by_frame_obj.clear()
+            GLOBAL_STATE.boxes_by_frame_obj.clear()
+            GLOBAL_STATE.composited_frames.clear()
+            try:
+                if GLOBAL_STATE.inference_session is not None:
+                    GLOBAL_STATE.inference_session.reset_inference_session()
+            except Exception:
+                pass
+            GLOBAL_STATE.inference_session = None
+            gc.collect()
+            GLOBAL_STATE.inference_session = processor.init_video_session(
+                video=GLOBAL_STATE.video_frames,
+                inference_device=device,
+                video_storage_device="cpu",
+            )
+            GLOBAL_STATE.session_repo_id = desired_repo
+def init_video_session(video: str | dict):
+    GLOBAL_STATE.video_frames = []
+    GLOBAL_STATE.inference_session = None
+    GLOBAL_STATE.masks_by_frame = {}
+    GLOBAL_STATE.color_by_obj = {}
+    load_model_if_needed()
+    video_path: Optional[str] = None
+    if isinstance(video, dict):
+        video_path = video.get("name") or video.get("path") or video.get("data")
+    elif isinstance(video, str):
+        video_path = video
+    else:
+        video_path = None
+    if not video_path:
+        raise gr.Error("Invalid video input.")
+    frames, info = try_load_video_frames(video_path)
+    if len(frames) == 0:
+        raise gr.Error("No frames could be loaded from the video.")
+    GLOBAL_STATE.video_frames = frames
+    GLOBAL_STATE.video_fps = None
+    if isinstance(info, dict) and info.get("fps"):
+        try:
+            GLOBAL_STATE.video_fps = float(info["fps"]) or None
+        except Exception:
+            GLOBAL_STATE.video_fps = None
+    processor = GLOBAL_STATE.processor
+    device = GLOBAL_STATE.device
+    inference_session = processor.init_video_session(
+        video=frames,
+        inference_device=device,
+        video_storage_device="cpu",
+    )
+    GLOBAL_STATE.inference_session = inference_session
+    first_frame = frames[0]
+    max_idx = len(frames) - 1
+    status = f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps. Device: {device}, dtype: {GLOBAL_STATE.dtype}"
+    return GLOBAL_STATE, 0, max_idx, first_frame, status
+def compose_frame(state: AppState, frame_idx: int) -> Image.Image:
+    if state is None or state.video_frames is None or len(state.video_frames) == 0:
+        return None
+    frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
+    frame = state.video_frames[frame_idx]
+    masks = state.masks_by_frame.get(frame_idx, {})
+    out_img = frame
+    if len(masks) != 0:
+        out_img = overlay_masks_on_frame(out_img, masks, state.color_by_obj, alpha=0.65)
+    clicks_map = state.clicks_by_frame_obj.get(frame_idx)
+    if clicks_map:
+        draw = ImageDraw.Draw(out_img)
+        cross_half = 6
+        for obj_id, pts in clicks_map.items():
+            for x, y, lbl in pts:
+                color = (0, 255, 0) if int(lbl) == 1 else (255, 0, 0)
+                draw.line([(x - cross_half, y), (x + cross_half, y)], fill=color, width=2)
+                draw.line([(x, y - cross_half), (x, y + cross_half)], fill=color, width=2)
+    box_map = state.boxes_by_frame_obj.get(frame_idx)
+    if box_map:
+        draw = ImageDraw.Draw(out_img)
+        for obj_id, boxes in box_map.items():
+            color = state.color_by_obj.get(obj_id, (255, 255, 255))
+            for x1, y1, x2, y2 in boxes:
+                draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=2)
+    if (
+        state.pending_box_start is not None
+        and state.pending_box_start_frame_idx == frame_idx
+        and state.pending_box_start_obj_id is not None
+    ):
+        draw = ImageDraw.Draw(out_img)
+        x, y = state.pending_box_start
+        cross_half = 6
+        color = state.color_by_obj.get(state.pending_box_start_obj_id, (255, 255, 255))
+        draw.line([(x - cross_half, y), (x + cross_half, y)], fill=color, width=2)
+        draw.line([(x, y - cross_half), (x, y + cross_half)], fill=color, width=2)
+    state.composited_frames[frame_idx] = out_img
+    return out_img
+def update_frame_display(state: AppState, frame_idx: int) -> Image.Image:
+    if state is None or state.video_frames is None or len(state.video_frames) == 0:
+        return None
+    frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
+    cached = state.composited_frames.get(frame_idx)
+    if cached is not None:
+        return cached
+    return compose_frame(state, frame_idx)
+def _ensure_color_for_obj(obj_id: int):
+    if obj_id not in GLOBAL_STATE.color_by_obj:
+        GLOBAL_STATE.color_by_obj[obj_id] = pastel_color_for_object(obj_id)
+@spaces.GPU()
+def on_image_click(
+    img: Image.Image | np.ndarray,
+    state: AppState,
+    frame_idx: int,
+    obj_id: int,
+    label: str,
+    clear_old: bool,
+    evt: gr.SelectData,
+):
+    """Handle a click on the preview image: add a point or box prompt and re-segment the frame.
+    In "Boxes" mode the first click only stores a pending corner and returns; the
+    second click completes the box. In any other mode each click is a point whose
+    label is 1 when ``label`` starts with "pos" (case-insensitive) and 0 otherwise.
+    After a prompt is added the model is run on ``frame_idx`` and the resulting
+    masks are stored per object id in ``GLOBAL_STATE.masks_by_frame``.
+    Returns:
+        ``img`` unchanged when there is no session; otherwise the (re)composited
+        preview for ``frame_idx``.
+    """
+    if state is None or state.inference_session is None:
+        return img
+    # Ignore clicks while a checkpoint switch is in progress.
+    if state.is_switching_model:
+        return update_frame_display(state, int(frame_idx))
+    # Extract click coordinates from either evt.index (a 2-sequence) or evt.value ({"x", "y"}).
+    x = y = None
+    if evt is not None:
+        try:
+            if hasattr(evt, "index") and isinstance(evt.index, (list, tuple)) and len(evt.index) == 2:
+                x, y = int(evt.index[0]), int(evt.index[1])
+            elif hasattr(evt, "value") and isinstance(evt.value, dict) and "x" in evt.value and "y" in evt.value:
+                x, y = int(evt.value["x"]), int(evt.value["y"])
+        except Exception:
+            x = y = None
+    if x is None or y is None:
+        return update_frame_display(state, int(frame_idx))
+    _ensure_color_for_obj(int(obj_id))
+    # NOTE(review): model/processor/session are read from GLOBAL_STATE while prompt
+    # bookkeeping below uses `state`; presumably both are the same object — confirm.
+    processor = GLOBAL_STATE.processor
+    model = GLOBAL_STATE.model
+    inference_session = GLOBAL_STATE.inference_session
+    if state.current_prompt_type == "Boxes":
+        if state.pending_box_start is None:
+            # First click of a box: remember the corner and wait for the second click.
+            if bool(clear_old):
+                # Clearing old inputs also drops this object's recorded point markers on the frame.
+                frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
+                frame_clicks[int(obj_id)] = []
+                state.composited_frames.pop(int(frame_idx), None)
+            state.pending_box_start = (int(x), int(y))
+            state.pending_box_start_frame_idx = int(frame_idx)
+            state.pending_box_start_obj_id = int(obj_id)
+            state.composited_frames.pop(int(frame_idx), None)
+            return update_frame_display(state, int(frame_idx))
+        else:
+            # Second click: combine with the stored corner into a normalised (min, max) box.
+            x1, y1 = state.pending_box_start
+            x2, y2 = int(x), int(y)
+            state.pending_box_start = None
+            state.pending_box_start_frame_idx = None
+            state.pending_box_start_obj_id = None
+            state.composited_frames.pop(int(frame_idx), None)
+            x_min, y_min = min(x1, x2), min(y1, y2)
+            x_max, y_max = max(x1, x2), max(y1, y2)
+            processor.add_inputs_to_inference_session(
+                inference_session=inference_session,
+                frame_idx=int(frame_idx),
+                obj_ids=int(obj_id),
+                input_boxes=[[[x_min, y_min, x_max, y_max]]],
+                clear_old_inputs=bool(clear_old),
+            )
+            # Mirror the prompt in our own bookkeeping so compose_frame can draw it.
+            frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
+            obj_boxes = frame_boxes.setdefault(int(obj_id), [])
+            if bool(clear_old):
+                obj_boxes.clear()
+            obj_boxes.append((x_min, y_min, x_max, y_max))
+            state.composited_frames.pop(int(frame_idx), None)
+    else:
+        # Point prompt: 1 = positive, 0 = negative.
+        label_int = 1 if str(label).lower().startswith("pos") else 0
+        if bool(clear_old):
+            # Clearing old inputs also drops this object's recorded boxes on the frame.
+            frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
+            frame_boxes[int(obj_id)] = []
+            state.composited_frames.pop(int(frame_idx), None)
+        processor.add_inputs_to_inference_session(
+            inference_session=inference_session,
+            frame_idx=int(frame_idx),
+            obj_ids=int(obj_id),
+            input_points=[[[[int(x), int(y)]]]],
+            input_labels=[[[int(label_int)]]],
+            clear_old_inputs=bool(clear_old),
+        )
+        frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
+        obj_clicks = frame_clicks.setdefault(int(obj_id), [])
+        if bool(clear_old):
+            obj_clicks.clear()
+        obj_clicks.append((int(x), int(y), int(label_int)))
+        state.composited_frames.pop(int(frame_idx), None)
+    # Re-run segmentation on this frame with the updated prompts.
+    with torch.inference_mode():
+        outputs = model(inference_session=inference_session, frame_idx=int(frame_idx))
+    H = inference_session.video_height
+    W = inference_session.video_width
+    pred_masks = outputs.pred_masks.detach().cpu()
+    # Post-process the predicted masks, passing the session's video size as the target size.
+    video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
+    masks_for_frame: dict[int, np.ndarray] = {}
+    # assumes mask i corresponds to the i-th entry of inference_session.obj_ids — TODO confirm
+    obj_ids_order = list(inference_session.obj_ids)
+    for i, oid in enumerate(obj_ids_order):
+        mask_i = video_res_masks[i]
+        mask_2d = mask_i.cpu().numpy().squeeze()
+        masks_for_frame[int(oid)] = mask_2d
+    GLOBAL_STATE.masks_by_frame[int(frame_idx)] = masks_for_frame
+    # Invalidate the cached composite so the new masks are rendered.
+    GLOBAL_STATE.composited_frames.pop(int(frame_idx), None)
+    return update_frame_display(GLOBAL_STATE, int(frame_idx))
+@spaces.GPU()
+def propagate_masks(state: AppState, progress=gr.Progress()):
+    if state is None or state.inference_session is None:
+        yield "Load a video first."
+        return
+    processor = GLOBAL_STATE.processor
+    model = GLOBAL_STATE.model
+    inference_session = GLOBAL_STATE.inference_session
+    total = max(1, GLOBAL_STATE.num_frames)
+    processed = 0
+    yield f"Propagating masks: {processed}/{total}"
+    with torch.inference_mode():
+        for sam2_video_output in model.propagate_in_video_iterator(inference_session):
+            H = inference_session.video_height
+            W = inference_session.video_width
+            pred_masks = sam2_video_output.pred_masks.detach().cpu()
+            video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
+            frame_idx = int(sam2_video_output.frame_idx)
+            masks_for_frame: dict[int, np.ndarray] = {}
+            obj_ids_order = list(inference_session.obj_ids)
+            for i, oid in enumerate(obj_ids_order):
+                mask_2d = video_res_masks[i].cpu().numpy().squeeze()
+                masks_for_frame[int(oid)] = mask_2d
+            GLOBAL_STATE.masks_by_frame[frame_idx] = masks_for_frame
+            GLOBAL_STATE.composited_frames.pop(frame_idx, None)
+            processed += 1
+            progress((processed, total), f"Propagating masks: {processed}/{total}")
+            yield f"Propagating masks: {processed}/{total}"
+    yield f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."
+def reset_session():
+    if not GLOBAL_STATE.video_frames:
+        return GLOBAL_STATE, None, 0, 0, "Session reset. Load a new video."
+    GLOBAL_STATE.masks_by_frame.clear()
+    GLOBAL_STATE.clicks_by_frame_obj.clear()
+    GLOBAL_STATE.boxes_by_frame_obj.clear()
+    GLOBAL_STATE.composited_frames.clear()
+    GLOBAL_STATE.pending_box_start = None
+    GLOBAL_STATE.pending_box_start_frame_idx = None
+    GLOBAL_STATE.pending_box_start_obj_id = None
+    try:
+        if GLOBAL_STATE.inference_session is not None:
+            GLOBAL_STATE.inference_session.reset_inference_session()
+    except Exception:
+        pass
+    GLOBAL_STATE.inference_session = None
+    gc.collect()
+    ensure_session_for_current_model()
+    current_idx = int(getattr(GLOBAL_STATE, "current_frame_idx", 0))
+    current_idx = max(0, min(current_idx, GLOBAL_STATE.num_frames - 1))
+    preview_img = update_frame_display(GLOBAL_STATE, current_idx)
+    slider_minmax = gr.update(minimum=0, maximum=max(GLOBAL_STATE.num_frames - 1, 0), interactive=True)
+    slider_value = gr.update(value=current_idx)
+    status = "Session reset. Prompts cleared; video preserved."
+    return GLOBAL_STATE, preview_img, slider_minmax, slider_value, status
+# Build the Gradio UI. Component creation order inside the Row/Column contexts defines the layout.
+with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation (CPU)") as demo:
+    # Session state is seeded with the module-level GLOBAL_STATE instance.
+    state = gr.State(GLOBAL_STATE)
+    gr.Markdown(
+        """
+    **SAM2 Video (Transformers)** — CPU-only Space. Upload a video, click to add positive/negative points per object or draw two-click boxes, preview masks, then propagate across the video. Use the slider to scrub frames.
+    """
+    )
+    # Top row: inputs (video, checkpoint, status, reset) on the left; preview and frame slider on the right.
+    with gr.Row():
+        with gr.Column(scale=1):
+            video_in = gr.Video(label="Upload video", sources=["upload", "webcam"], interactive=True)
+            ckpt_radio = gr.Radio(
+                choices=["tiny", "small", "base_plus", "large"],
+                value="tiny",
+                label="SAM2 checkpoint",
+            )
+            # Hidden by default; made visible by the checkpoint-change handler while loading.
+            ckpt_progress = gr.Markdown(visible=False)
+            load_status = gr.Markdown(visible=True)
+            reset_btn = gr.Button("Reset Session", variant="secondary")
+        with gr.Column(scale=2):
+            preview = gr.Image(label="Preview", interactive=True)
+            # Range starts as 0..0; it is widened once a video is loaded.
+            frame_slider = gr.Slider(label="Frame", minimum=0, maximum=0, step=1, value=0, interactive=True)
+    # Prompt controls row: object id, point label, clear-old flag, prompt type, propagate button.
+    with gr.Row():
+        obj_id_inp = gr.Number(value=1, precision=0, label="Object ID")
+        label_radio = gr.Radio(choices=["positive", "negative"], value="positive", label="Point label")
+        clear_old_chk = gr.Checkbox(value=True, label="Clear old inputs for this object")
+        prompt_type = gr.Radio(choices=["Points", "Boxes"], value="Points", label="Prompt type")
+        with gr.Column():
+            propagate_btn = gr.Button("Propagate across video", variant="primary")
+            propagate_status = gr.Markdown(visible=True)
+    # Rendering row and the (read-only) video player for the rendered result.
+    with gr.Row():
+        render_btn = gr.Button("Render MP4 for smooth playback")
+    playback_video = gr.Video(label="Rendered Playback", interactive=False)
+    def _on_video_change(video):
+        s, min_idx, max_idx, first_frame, status = init_video_session(video)
+        return s, gr.update(minimum=min_idx, maximum=max_idx, value=min_idx, interactive=True), first_frame, status
+    video_in.change(
+        _on_video_change, inputs=[video_in], outputs=[state, frame_slider, preview, load_status], show_progress=True
+    )
+    def _on_ckpt_change(s: AppState, key: str):
+        if s is not None and key:
+            key = str(key)
+            if key != s.model_repo_key:
+                s.is_switching_model = True
+                s.model_repo_key = key
+                s.model_repo_id = None
+                s.model = None
+                s.processor = None
+        yield gr.update(visible=True, value=f"Loading checkpoint: {key}...")
+        ensure_session_for_current_model()
+        if s is not None:
+            s.is_switching_model = False
+        yield gr.update(visible=False, value="")
+    ckpt_radio.change(_on_ckpt_change, inputs=[state, ckpt_radio], outputs=[ckpt_progress])
+    def _rebind_session_after_ckpt(s: AppState):
+        ensure_session_for_current_model()
+        if s is not None:
+            s.pending_box_start = None
+        return gr.update()
+    ckpt_radio.change(_rebind_session_after_ckpt, inputs=[state], outputs=[])
+    def _sync_frame_idx(state_in: AppState, idx: int):
+        if state_in is not None:
+            state_in.current_frame_idx = int(idx)
+        return update_frame_display(state_in, int(idx))
+    frame_slider.change(_sync_frame_idx, inputs=[state, frame_slider], outputs=preview)
+    def _sync_obj_id(s: AppState, oid):
+        if s is not None and oid is not None:
+            s.current_obj_id = int(oid)
+        return gr.update()
+    obj_id_inp.change(_sync_obj_id, inputs=[state, obj_id_inp], outputs=[])
+    def _sync_label(s: AppState, lab: str):
+        if s is not None and lab is not None:
+            s.current_label = str(lab)
+        return gr.update()
+    label_radio.change(_sync_label, inputs=[state, label_radio], outputs=[])
+    def _sync_prompt_type(s: AppState, val: str):
+        if s is not None and val is not None:
+            s.current_prompt_type = str(val)
+            s.pending_box_start = None
+        show_labels = str(val).lower() == "points"
+        return gr.update(visible=show_labels)
+    prompt_type.change(_sync_prompt_type, inputs=[state, prompt_type], outputs=[label_radio])
+    preview.select(on_image_click, [preview, state, frame_slider, obj_id_inp, label_radio, clear_old_chk], preview)
+    def _render_video(s: AppState):
+        if s is None or s.num_frames == 0:
+            raise gr.Error("Load a video first.")
+        fps = s.video_fps if s.video_fps and s.video_fps > 0 else 12
+        frames_np = []
+        for idx in range(s.num_frames):
+            img = s.composited_frames.get(idx)
+            if img is None:
+                img = compose_frame(s, idx)
+            frames_np.append(np.array(img)[:, :, ::-1])
+            if (idx + 1) % 60 == 0:
+                gc.collect()
+        out_path = "/tmp/sam2_playback.mp4"
+        try:
+            import imageio.v3 as iio  # type: ignore
+            iio.imwrite(out_path, [fr[:, :, ::-1] for fr in frames_np], plugin="pyav", fps=fps)
+            return out_path
+        except Exception:
+            try:
+                import imageio.v2 as imageio  # type: ignore
+                imageio.mimsave(out_path, [fr[:, :, ::-1] for fr in frames_np], fps=fps)
+                return out_path
+            except Exception as e:
+                raise gr.Error(f"Failed to render video: {e}")
+    render_btn.click(_render_video, inputs=[state], outputs=[playback_video])
+    # propagate_masks is a generator; each yielded string updates the status markdown.
+    propagate_btn.click(propagate_masks, inputs=[state], outputs=[propagate_status], show_progress=True)
+    # frame_slider appears twice because reset_session returns two slider values:
+    # one for the range and one for the current value.
+    reset_btn.click(
+        reset_session,
+        inputs=None,
+        outputs=[state, preview, frame_slider, frame_slider, load_status],
+    )
+# Enable the request queue (with api_open=False) and start the app.
+demo.queue(api_open=False).launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio
+git+https://github.com/SangbumChoi/transformers.git@sam2
+torch
+pillow
+opencv-python
+imageio[pyav]
+spaces