|
|
"""
|
|
|
Sample inference script for American Sign Language (ASL) detection.
|
|
|
"""
|
|
|
import numpy as np
|
|
|
import cv2
|
|
|
import mediapipe as mp
|
|
|
from tensorflow.keras.models import load_model
|
|
|
|
|
|
|
|
|
# --- Trained artifacts ------------------------------------------------------
# The Keras model and the label array are loaded once, at import time, from
# paths relative to the current working directory.
model = load_model('asl_lstm_model.h5')
class_names = np.load('class_names.npy')

# --- MediaPipe hand detector ------------------------------------------------
# One shared detector instance, configured for still images and a single hand.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=1,
    min_detection_confidence=0.5,
)
|
|
|
|
|
|
def extract_landmarks(image_path):
    """Extract the 2-D hand landmark coordinates from an image file.

    Args:
        image_path: Path of the image, read with ``cv2.imread``.

    Returns:
        A flat ``np.ndarray`` laid out as ``[x0, y0, x1, y1, ...]`` with the
        x and y coordinates of every landmark of the first detected hand
        (the z coordinate is not used), or ``None`` when the image cannot be
        read or no hand is detected.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing or unreadable file by returning None.
        return None

    # OpenCV decodes to BGR channel order; convert to RGB for the detector.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = hands.process(image_rgb)
    if not results.multi_hand_landmarks:
        return None

    # Read the coordinates straight off the landmark messages rather than
    # serialising the whole hand to JSON and parsing it back: no extra
    # imports, no string round trip, and no KeyError if the serialiser
    # leaves a field out.
    first_hand = results.multi_hand_landmarks[0]
    coords = []
    for point in first_hand.landmark:
        coords.extend((point.x, point.y))
    return np.array(coords)
|
|
|
|
|
|
def distance_between(p1_idx, p2_idx, landmarks):
    """Return the Euclidean distance between two landmarks.

    Args:
        p1_idx: Index of the first landmark.
        p2_idx: Index of the second landmark.
        landmarks: Flat, indexable sequence laid out as
            ``[x0, y0, x1, y1, ...]``; landmark ``i`` occupies positions
            ``2 * i`` and ``2 * i + 1``.

    Returns:
        The straight-line distance between the two (x, y) points.
    """
    first_x, first_y = landmarks[p1_idx * 2], landmarks[p1_idx * 2 + 1]
    second_x, second_y = landmarks[p2_idx * 2], landmarks[p2_idx * 2 + 1]

    # Per-axis offsets, squared and summed, then square-rooted.
    offset = np.array([first_x - second_x, first_y - second_y])
    return np.sqrt(np.square(offset).sum())
|
|
|
|
|
|
def landmark_to_dist_emb(landmarks):
    """Turn flat landmark coordinates into a normalised distance embedding.

    Twenty pairwise landmark distances are computed with
    ``distance_between`` and the resulting vector is divided by its
    Euclidean norm.

    Args:
        landmarks: Flat ``[x0, y0, x1, y1, ...]`` coordinate sequence; the
            highest landmark index used is 20, so at least 42 values are
            needed.

    Returns:
        A 20-element ``np.ndarray`` of unit length. There is no guard for a
        zero norm: if every distance is 0 the division produces NaNs.
    """
    # (first, second) landmark indices, in the exact order of the features.
    # NOTE(review): presumably MediaPipe hand-landmark indices — confirm.
    index_pairs = (
        (4, 8), (4, 12), (4, 16), (4, 20),
        (0, 4), (0, 8), (0, 12), (0, 16), (0, 20),
        (8, 12), (12, 16),
        (1, 4), (5, 8), (9, 12), (13, 16), (17, 20),
        (2, 8), (2, 12), (2, 16), (2, 20),
    )
    distances = np.array(
        [distance_between(first, second, landmarks) for first, second in index_pairs]
    )
    return distances / np.linalg.norm(distances)
|
|
|
|
|
|
def predict_sign(image_path):
    """Classify the hand sign shown in a single image.

    Args:
        image_path: Path of the image to classify.

    Returns:
        ``(label, confidence)`` where ``label`` is the ``class_names`` entry
        of the highest-scoring class and ``confidence`` is that class's
        score from the model, or ``(None, None)`` when
        ``extract_landmarks`` yields no landmarks.
    """
    coords = extract_landmarks(image_path)
    if coords is None:
        return None, None

    embedding = landmark_to_dist_emb(coords)

    # The model is fed a batch of one sequence of 50 steps x 20 features,
    # so the single-image embedding is repeated for every step.
    batch = np.tile(embedding, (50, 1)).reshape(1, 50, 20)

    scores = model.predict(batch, verbose=0)
    best = np.argmax(scores)
    return class_names[best], scores[0][best]
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Use the image path given on the command line when there is one;
    # otherwise fall back to the placeholder path.
    image_path = sys.argv[1] if len(sys.argv) > 1 else "path/to/your/image.jpg"
    result, confidence = predict_sign(image_path)

    # predict_sign reports failure with None. Compare against None explicitly:
    # a truthiness test would also reject valid but falsy labels (0 or "").
    if result is not None:
        print(f"Predicted sign: {result} (confidence: {confidence:.2%})")
    else:
        print("No hand detected in image")
|
|
|
|