Anudeep Tippabathuni committed on
Commit 47b7a70
1 Parent(s): c4a7cf5

Add model weights via Git LFS

Files changed (3)
  1. app.py +84 -0
  2. requirements.txt +5 -0
  3. videomae_best.pth +3 -0
app.py ADDED
@@ -0,0 +1,84 @@
+ import torch
+ from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
+ from decord import VideoReader, cpu
+ import gradio as gr
+
+ # -------------------------------
+ # Device
+ # -------------------------------
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # -------------------------------
+ # Load processor and model
+ # -------------------------------
+ processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-small-finetuned-ssv2")
+ model = VideoMAEForVideoClassification.from_pretrained(
+     "MCG-NJU/videomae-small-finetuned-ssv2",
+     num_labels=14,
+     ignore_mismatched_sizes=True  # swap the SSv2 head for a fresh 14-class head
+ )
+ # Restore the fine-tuned weights from the checkpoint added in this commit
+ checkpoint = torch.load("videomae_best.pth", map_location=device)
+ model.load_state_dict(checkpoint["model_state_dict"])
+ model.to(device)
+ model.eval()
+
+ # -------------------------------
+ # Class mapping
+ # -------------------------------
+ id2class = {
+     0: "AFGHANISTAN",
+     1: "AFRICA",
+     2: "ANDHRA_PRADESH",
+     3: "ARGENTINA",
+     4: "DELHI",
+     5: "DENMARK",
+     6: "ENGLAND",
+     7: "GANGTOK",
+     8: "GOA",
+     9: "GUJARAT",
+     10: "HARYANA",
+     11: "HIMACHAL_PRADESH",
+     12: "JAIPUR",
+     13: "JAMMU_AND_KASHMIR"
+ }
+
+ # -------------------------------
+ # Video preprocessing
+ # -------------------------------
+ def preprocess_video(video_path, processor, num_frames=16):
+     vr = VideoReader(video_path, ctx=cpu(0))
+     total_frames = len(vr)
+     if total_frames < num_frames:
+         # Short clip: cycle through the available frames until we have enough
+         indices = [i % total_frames for i in range(num_frames)]
+     else:
+         # Sample num_frames frames evenly across the clip
+         indices = torch.linspace(0, total_frames - 1, num_frames).long().tolist()
+     video = vr.get_batch(indices).asnumpy()
+     inputs = processor(list(video), return_tensors="pt")
+     return inputs["pixel_values"][0]
+
+ # -------------------------------
+ # Prediction function
+ # -------------------------------
+ def predict_video(video_path):
+     # gr.Video hands the function the uploaded clip's filepath as a string
+     pixel_values = preprocess_video(video_path, processor)
+     pixel_values = pixel_values.unsqueeze(0).to(device)  # add batch dimension
+     with torch.no_grad():
+         logits = model(pixel_values=pixel_values).logits
+     pred_index = torch.argmax(logits, dim=1).item()
+     return id2class[pred_index]
+
+ # -------------------------------
+ # Gradio Interface
+ # -------------------------------
+ iface = gr.Interface(
+     fn=predict_video,
+     inputs=gr.Video(),  # gr.Video has no `type` kwarg; it passes a filepath
+     outputs="text",
+     title="VideoMAE Classification API",
+     description="Upload a video and get the predicted class."
+ )
+
+ # Expose API (share=True is unnecessary when running inside a Space)
+ iface.launch(server_name="0.0.0.0", server_port=7860)
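
Since the Interface is exposed as an API, the /predict endpoint can also be exercised programmatically. Below is a minimal client-side sketch using the gradio_client package (an extra dependency not in requirements.txt); the localhost URL, the sample.mp4 path, and the /predict endpoint name are assumptions based on gr.Interface defaults, not part of this commit:

# Hypothetical client-side check; assumes `pip install gradio_client`
# and a local test clip sample.mp4.
from gradio_client import Client, handle_file

client = Client("http://localhost:7860")  # or the public Space URL
label = client.predict(
    handle_file("sample.mp4"),  # sent as the gr.Video input
    api_name="/predict",        # gr.Interface's default endpoint name
)
print(label)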
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ transformers
+ decord
+ gradio
+ numpy
videomae_best.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:abe3dfa9deb07a43cf2dfb246a5b33fe11f5b1223237b210abee2aed11844d92
+ size 144487411
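
The pointer stores only the blob's SHA-256 digest and byte size; the actual ~144 MB weight file lives in LFS storage. A minimal sketch, using only the digest and size recorded above, can confirm that a downloaded copy is the real file rather than the pointer:

# Verify a downloaded videomae_best.pth against the LFS pointer above.
import hashlib
import os

EXPECTED_OID = "abe3dfa9deb07a43cf2dfb246a5b33fe11f5b1223237b210abee2aed11844d92"
EXPECTED_SIZE = 144487411

assert os.path.getsize("videomae_best.pth") == EXPECTED_SIZE, "size mismatch: got the pointer instead of the weights?"
h = hashlib.sha256()
with open("videomae_best.pth", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)
assert h.hexdigest() == EXPECTED_OID, "checksum mismatch"
print("videomae_best.pth matches the LFS pointer")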