"""
Sample inference script for ASL Sign Language Detection
"""
import numpy as np
import cv2
import mediapipe as mp
from tensorflow.keras.models import load_model
# Load model and class names
model = load_model('asl_lstm_model.h5')
class_names = np.load('class_names.npy')

# Initialize MediaPipe Hands: static-image mode, detect at most one hand
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=True, max_num_hands=1, min_detection_confidence=0.5)

def extract_landmarks(image_path):
    """Extract hand landmarks from an image as a flat (x, y) vector."""
    image = cv2.imread(image_path)
    if image is None:
        return None
    # MediaPipe expects RGB input; OpenCV loads images as BGR
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    if results.multi_hand_landmarks:
        # Flatten the 21 landmarks of the first detected hand into
        # [x0, y0, x1, y1, ..., x20, y20]
        coords = []
        for point in results.multi_hand_landmarks[0].landmark:
            coords.extend([point.x, point.y])
        return np.array(coords)
    return None
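
# extract_landmarks returns a flat vector of 42 values (21 landmarks x 2
# coordinates) on success, e.g. with a hypothetical path:
#   landmarks = extract_landmarks("sample.jpg")  # shape (42,), or None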

def distance_between(p1_idx, p2_idx, landmarks):
    """Euclidean distance between two points in the flat landmark vector."""
    p1 = np.array([landmarks[p1_idx * 2], landmarks[p1_idx * 2 + 1]])
    p2 = np.array([landmarks[p2_idx * 2], landmarks[p2_idx * 2 + 1]])
    return np.linalg.norm(p1 - p2)
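
# MediaPipe Hands landmark indices used in the feature set below:
# 0 = wrist; 4/8/12/16/20 = thumb/index/middle/ring/pinky tips;
# 1 and 2 = thumb base joints; 5/9/13/17 = finger knuckles (MCP joints)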

def landmark_to_dist_emb(landmarks):
    """Convert landmarks to a 20-dimensional vector of pairwise distances."""
    emb = np.array([
        # thumb tip to the other fingertips
        distance_between(4, 8, landmarks), distance_between(4, 12, landmarks),
        distance_between(4, 16, landmarks), distance_between(4, 20, landmarks),
        # wrist to each fingertip
        distance_between(0, 4, landmarks), distance_between(0, 8, landmarks),
        distance_between(0, 12, landmarks), distance_between(0, 16, landmarks),
        distance_between(0, 20, landmarks),
        # adjacent fingertips
        distance_between(8, 12, landmarks), distance_between(12, 16, landmarks),
        # each fingertip to its base joint
        distance_between(1, 4, landmarks), distance_between(5, 8, landmarks),
        distance_between(9, 12, landmarks), distance_between(13, 16, landmarks),
        distance_between(17, 20, landmarks),
        # thumb MCP to each fingertip
        distance_between(2, 8, landmarks), distance_between(2, 12, landmarks),
        distance_between(2, 16, landmarks), distance_between(2, 20, landmarks)
    ])
    # L2-normalize so the features are invariant to hand scale
    return emb / np.linalg.norm(emb)
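
# The embedding is unit-norm and 20-dimensional, matching the
# per-timestep feature size the model expects (see the (1, 50, 20)
# reshape in predict_sign below)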

def predict_sign(image_path):
    """Predict the sign language gesture shown in a single image."""
    landmarks = extract_landmarks(image_path)
    if landmarks is None:
        return None, None
    features = landmark_to_dist_emb(landmarks)
    # The LSTM expects a sequence, so repeat the single frame's features
    # for 50 timesteps to form a (1, 50, 20) input
    sequence = np.repeat([features], 50, axis=0).reshape(1, 50, 20)
    prediction = model.predict(sequence, verbose=0)
    class_idx = int(np.argmax(prediction))
    confidence = float(prediction[0][class_idx])
    return class_names[class_idx], confidence
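
# Illustrative sketch (not part of the original script): the model is an
# LSTM over 50 timesteps, and predict_sign fakes a sequence by repeating
# one frame. For video input one could instead gather real per-frame
# features; predict_sign_from_video below is a hypothetical helper
# written under that assumption, reusing the module-level hands, model,
# and class_names.
def predict_sign_from_video(video_path, seq_len=50):
    """Predict a sign from the first seq_len hand-bearing frames (sketch)."""
    cap = cv2.VideoCapture(video_path)
    features = []
    while len(features) < seq_len:
        ok, frame = cap.read()
        if not ok:
            break  # ran out of frames
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(rgb)
        if not results.multi_hand_landmarks:
            continue  # skip frames with no detected hand
        coords = []
        for point in results.multi_hand_landmarks[0].landmark:
            coords.extend([point.x, point.y])
        features.append(landmark_to_dist_emb(np.array(coords)))
    cap.release()
    if len(features) < seq_len:
        return None, None  # not enough usable frames for a full sequence
    sequence = np.array(features).reshape(1, seq_len, 20)
    prediction = model.predict(sequence, verbose=0)
    class_idx = int(np.argmax(prediction))
    return class_names[class_idx], float(prediction[0][class_idx])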

# Example usage
if __name__ == "__main__":
    result, confidence = predict_sign("path/to/your/image.jpg")
    if result is not None:
        print(f"Predicted sign: {result} (confidence: {confidence:.2%})")
    else:
        print("No hand detected in image")