"""
app.py -- Gradio web app for ASL Recognition.
Fingerspell: real-time letter-by-letter recognition (A-Z)
Runs on Hugging Face Spaces or locally with: python app.py
"""
import urllib.request
from collections import deque
from pathlib import Path
import cv2
import gradio as gr
import mediapipe as mp_lib
import numpy as np
import torch
import torch.nn as nn
from mediapipe.tasks.python import BaseOptions
from mediapipe.tasks.python.vision import (
HandLandmarker, HandLandmarkerOptions,
RunningMode,
)
from spellchecker import SpellChecker
# ── Paths ─────────────────────────────────────────────────────────────────────
# Every model artefact is looked up in a "models" directory next to this file.
ROOT = Path(__file__).resolve().parent
FS_MODEL_PATH = ROOT.joinpath("models", "asl_model.pt")          # classifier checkpoint
FS_CLASSES_PATH = ROOT.joinpath("models", "label_classes.npy")   # class-label array
HAND_MODEL_PATH = ROOT.joinpath("models", "hand_landmarker.task")
# Source used to fetch HAND_MODEL_PATH when the file is not present locally.
HAND_MODEL_URL = (
    "https://storage.googleapis.com/mediapipe-models/hand_landmarker/"
    "hand_landmarker/float16/latest/hand_landmarker.task"
)

# ── Constants ──────────────────────────────────────────────────────────────────
# Landmark layout expected by normalise_landmarks: 21 points with 3 coordinates each.
NUM_LANDMARKS = 21
COORDS_PER_LM = 3
# LetterSmoother tuning (all values are counted in frames).
WINDOW_SIZE = 15        # length of the sliding vote window
STABLE_COUNT = 10       # consecutive identical votes needed before a letter is emitted
COOLDOWN_FRAMES = 8     # frames during which nothing is emitted after an emission
# Predictions whose top probability is below this value are treated as "no letter".
FS_CONFIDENCE_THRESHOLD = 0.50
# Landmark index pairs joined by a line when the hand skeleton is drawn.
_HAND_CONNECTIONS = [
    # five chains that each start at landmark 0
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9, 10), (10, 11), (11, 12),
    (0, 13), (13, 14), (14, 15), (15, 16),
    (0, 17), (17, 18), (18, 19), (19, 20),
    # cross links between landmarks 5, 9, 13 and 17
    (5, 9), (9, 13), (13, 17),
]
# ── Model definition ───────────────────────────────────────────────────────────
class ASLClassifier(nn.Module):
    """Fully connected classifier: input_dim -> 512 -> 256 -> 128 -> num_classes.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout. The last
    Linear layer returns one raw score per class; no softmax is applied here.
    """

    def __init__(self, input_dim: int, num_classes: int):
        super().__init__()
        # (output width, dropout probability) of every hidden stage, in order.
        hidden_stages = ((512, 0.3), (256, 0.3), (128, 0.2))
        layers = []
        width_in = input_dim
        for width_out, drop_p in hidden_stages:
            layers += [
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.ReLU(),
                nn.Dropout(drop_p),
            ]
            width_in = width_out
        layers.append(nn.Linear(width_in, num_classes))
        # Positional Sequential: sub-modules are named "0".."12", and the
        # state-dict keys ("net.0.weight", ...) follow from those names.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class scores produced by the layer stack for ``x``."""
        return self.net(x)
# ── Landmark helpers ───────────────────────────────────────────────────────────
def normalise_landmarks(landmarks_list):
    """Return a flat float32 vector of landmarks, re-centred and re-scaled.

    ``landmarks_list`` must hold NUM_LANDMARKS * COORDS_PER_LM numbers. The
    values are viewed as one row per landmark, the first landmark is moved to
    the origin, and all rows are divided by the largest distance from that
    origin. The division is skipped when that distance is not above 1e-6.
    """
    points = np.asarray(landmarks_list, dtype=np.float32)
    points = points.reshape(NUM_LANDMARKS, COORDS_PER_LM)
    centred = points - points[0]
    extent = np.linalg.norm(centred, axis=1).max()
    if extent > 1e-6:
        centred = centred / extent
    return centred.ravel()
# ── Drawing helpers ────────────────────────────────────────────────────────────
def draw_hand(frame_bgr: np.ndarray, landmarks, color=(0, 255, 255)):
    """Draw a hand skeleton onto ``frame_bgr`` in place.

    Each landmark's ``x``/``y`` is multiplied by the frame width/height to get
    a pixel position. Connection lines are drawn first, then one filled dot
    with a white outline per landmark.
    """
    height, width = frame_bgr.shape[:2]
    pixel_pts = [(int(lm.x * width), int(lm.y * height)) for lm in landmarks]

    for start, end in _HAND_CONNECTIONS:
        cv2.line(frame_bgr, pixel_pts[start], pixel_pts[end], color, 2, cv2.LINE_AA)

    for index, centre in enumerate(pixel_pts):
        # Landmark 0 gets a larger dot than the others.
        radius = 7 if index else 10
        # White ring first, then the coloured fill on top of it.
        cv2.circle(frame_bgr, centre, radius + 2, (255, 255, 255), 2, cv2.LINE_AA)
        cv2.circle(frame_bgr, centre, radius, color, -1, cv2.LINE_AA)
def _annotate(frame_rgb: np.ndarray, hand_landmarks_list=None) -> np.ndarray:
    """Return an RGB copy of ``frame_rgb`` with every given hand drawn on it.

    The frame is converted to BGR for drawing and converted back before it is
    returned. Hands alternate between two colours.
    """
    canvas = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
    # BGR tuples: yellow for even-indexed hands, magenta for odd-indexed ones.
    palette = ((0, 255, 255), (255, 0, 255))
    for idx, hand in enumerate(hand_landmarks_list or ()):
        draw_hand(canvas, hand, palette[idx % len(palette)])
    return cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)
def _make_placeholder_image(line1: str, line2: str = "",
                            w: int = 640, h: int = 480) -> np.ndarray:
    """Build a dark grey ``h`` x ``w`` image with one or two centred text lines.

    ``line1`` is the larger, brighter headline. ``line2`` (optional) is drawn
    smaller and dimmer beneath it; when it is present the headline is moved up
    by 18 pixels.
    """
    canvas = np.full((h, w, 3), 30, dtype=np.uint8)
    face = cv2.FONT_HERSHEY_SIMPLEX

    (headline_w, _), _ = cv2.getTextSize(line1, face, 0.9, 2)
    headline_y = h // 2 - 18 if line2 else h // 2
    cv2.putText(canvas, line1, ((w - headline_w) // 2, headline_y),
                face, 0.9, (200, 200, 200), 2, cv2.LINE_AA)

    if line2:
        (sub_w, _), _ = cv2.getTextSize(line2, face, 0.6, 1)
        cv2.putText(canvas, line2, ((w - sub_w) // 2, h // 2 + 30),
                    face, 0.6, (120, 120, 120), 1, cv2.LINE_AA)
    return canvas
# ── Multi-strategy hand detection (mirrors demo.py) ───────────────────────────
def _detect_hand_live(frame_rgb: np.ndarray, detector):
    """Run hand detection, retrying with two fallbacks when no hand is found.

    Attempts, in order:
      1. the frame as given;
      2. the frame surrounded by a black border (15% of its longer side);
      3. the frame with CLAHE applied to the L channel in LAB space.

    The first attempt that yields hand landmarks is returned; otherwise the
    result of the last attempt is returned, whatever it contains.

    Landmarks found on the bordered image are mapped back to the coordinate
    system of ``frame_rgb`` before returning, so callers can draw and
    classify them exactly like landmarks from the other attempts.
    """

    def _run(image_rgb):
        mp_img = mp_lib.Image(image_format=mp_lib.ImageFormat.SRGB, data=image_rgb)
        return detector.detect(mp_img)

    # Attempt 1: unmodified frame.
    result = _run(frame_rgb)
    if result.hand_landmarks:
        return result

    # Attempt 2: add a black border around the frame.
    h, w = frame_rgb.shape[:2]
    pad = int(max(h, w) * 0.15)
    padded = cv2.copyMakeBorder(frame_rgb, pad, pad, pad, pad,
                                cv2.BORDER_CONSTANT, value=(0, 0, 0))
    result = _run(padded)
    if result.hand_landmarks:
        # The detector reports positions relative to the padded image.
        # Convert them to pixels, remove the border offset, and divide by the
        # original size so they are relative to frame_rgb again. Points that
        # fell inside the border end up slightly outside [0, 1].
        padded_w, padded_h = w + 2 * pad, h + 2 * pad
        for hand in result.hand_landmarks:
            for lm in hand:
                lm.x = (lm.x * padded_w - pad) / w
                lm.y = (lm.y * padded_h - pad) / h
                # NOTE(review): MediaPipe documents z as using roughly the
                # same scale as x, so it is rescaled by the same width ratio.
                lm.z = lm.z * padded_w / w
        return result

    # Attempt 3: local contrast enhancement on the lightness channel only.
    lab = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2LAB)
    clahe_obj = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    lab[:, :, 0] = clahe_obj.apply(lab[:, :, 0])
    enhanced = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
    return _run(enhanced)
# ── Caption HTML helpers ───────────────────────────────────────────────────────
_CAPTION_BASE = (
"background:#000;border-radius:10px;padding:28px 36px;"
"min-height:180px;display:flex;flex-direction:column;"
"align-items:center;justify-content:center;gap:14px;width:100%;"
)
def _fs_caption_html(current_letter: str | None, confidence: float,
word_buffer: list, suggestion: str) -> str:
spelled = "".join(word_buffer)
letter_html = (current_letter or "").strip() or " "
conf_html = f"{confidence*100:.0f}%" if (current_letter or "").strip() else ""
sugg_html = (f'Suggestion: '
f'{suggestion}') if suggestion else " "
return (
f'
'
f'
{letter_html}
'
f'
{conf_html}
'
f'
{spelled}_
'
f'
{sugg_html}
'
f'
'
)
# ── LetterSmoother ─────────────────────────────────────────────────────────────
class LetterSmoother:
def __init__(self):
self.window: deque[str | None] = deque(maxlen=WINDOW_SIZE)
self._prev_smoothed: str | None = None
self._same_count = 0
self._cooldown_remaining = 0
def update(self, letter: str | None) -> str | None:
self.window.append(letter)
counts: dict[str, int] = {}
for ltr in self.window:
if ltr is not None:
counts[ltr] = counts.get(ltr, 0) + 1
if not counts:
self._prev_smoothed = None
self._same_count = 0
return None
smoothed = max(counts, key=lambda k: counts[k])
if smoothed == self._prev_smoothed:
self._same_count += 1
else:
self._prev_smoothed = smoothed
self._same_count = 1
if self._cooldown_remaining > 0:
self._cooldown_remaining -= 1
return None
if self._same_count >= STABLE_COUNT:
self._same_count = 0
self._cooldown_remaining = COOLDOWN_FRAMES
return smoothed
return None
# ── Model loading (cached) ─────────────────────────────────────────────────────
def _ensure_model(path: Path, url: str, name: str):
    """Download ``url`` to ``path`` unless a file already exists there.

    Args:
        path: Destination file; its parent directories are created as needed.
        url: Location to fetch the file from.
        name: Human-readable model name used in the progress message.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the download
        fails. In that case nothing is left at ``path``.
    """
    if path.exists():
        return
    print(f"Downloading {name} model ...")
    path.parent.mkdir(parents=True, exist_ok=True)
    # Download next to the target and rename only after the transfer has
    # finished. Writing straight to ``path`` would leave a truncated file
    # behind on an interrupted download, and the exists() check above would
    # then accept that broken file on every later run.
    partial = path.with_name(path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)
def load_fingerspell():
    """Load everything needed for fingerspelling recognition.

    Returns:
        ``None`` when the classifier checkpoint or the class-label file is
        missing; otherwise the tuple ``(model, classes, device, detector)``.
    """
    if not (FS_MODEL_PATH.exists() and FS_CLASSES_PATH.exists()):
        return None

    # The hand-landmark model is fetched on demand if it is not on disk.
    _ensure_model(HAND_MODEL_PATH, HAND_MODEL_URL, "hand")

    # NOTE(review): weights_only=False and allow_pickle=True both unpickle
    # arbitrary objects; only load files from a trusted source.
    ckpt = torch.load(FS_MODEL_PATH, map_location="cpu", weights_only=False)
    classes = np.load(FS_CLASSES_PATH, allow_pickle=True)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    model = ASLClassifier(ckpt["input_dim"], ckpt["num_classes"]).to(device)
    model.load_state_dict(ckpt["model_state_dict"])
    model.eval()

    landmarker_options = HandLandmarkerOptions(
        base_options=BaseOptions(model_asset_path=str(HAND_MODEL_PATH)),
        running_mode=RunningMode.IMAGE,
        num_hands=1,
        min_hand_detection_confidence=0.1,
        min_hand_presence_confidence=0.1,
        min_tracking_confidence=0.1,
    )
    detector = HandLandmarker.create_from_options(landmarker_options)

    print(f"Fingerspell model loaded ({len(classes)} classes, device={device})")
    return model, classes, device, detector
def _warmup_fingerspell(assets):
    """Push one dummy input through the detector and the classifier.

    Intended to move any first-call start-up cost away from the first real
    frame. Does nothing when ``assets`` is ``None``.
    """
    if assets is None:
        return

    model, _classes, device, detector = assets

    # All-black 480x640 RGB frame for the hand detector.
    blank_frame = np.zeros((480, 640, 3), dtype=np.uint8)
    detector.detect(
        mp_lib.Image(image_format=mp_lib.ImageFormat.SRGB, data=blank_frame)
    )

    # All-zero feature row sized to the classifier's first layer.
    input_width = model.net[0].in_features
    with torch.no_grad():
        model(torch.zeros((1, input_width), device=device))

    print("Fingerspell warmup complete")
# Module-level initialisation: these statements run once, when the file is imported.
# load_fingerspell() returns None when the classifier files are missing, so
# fs_assets is either None or the tuple (model, classes, device, detector).
fs_assets = load_fingerspell()
# Spell checker instance shared by _suggest_word.
spell = SpellChecker()
# Run one dummy inference up front so start-up cost is not paid on the first
# real frame; this call does nothing when fs_assets is None.
_warmup_fingerspell(fs_assets)
# ── Fingerspell processing ─────────────────────────────────────────────────────
def _suggest_word(word_buffer: list[str]) -> str:
    """Return an upper-case spelling suggestion for the buffered text.

    The buffer is joined, lower-cased and split on whitespace. Each word the
    spell checker does not know is replaced by its best correction, or kept
    unchanged when the checker has none. Known words are kept as they are.

    Checking word by word matters once the buffer contains a space: treating
    the whole phrase as a single token would make it "unknown" as a whole and
    leave every word in it uncorrected.

    Returns:
        The suggestion, or an empty string when the buffer holds no letters.
    """
    words = "".join(word_buffer).lower().split()
    if not words:
        return ""
    unknown = spell.unknown(words)
    suggested = [
        (spell.correction(word) or word) if word in unknown else word
        for word in words
    ]
    return " ".join(suggested).upper()
# Image returned by process_fingerspell when there is no frame to process or
# the fingerspelling assets are unavailable. Built once at import time.
_FS_PLACEHOLDER = _make_placeholder_image(
    "Enable camera to begin", "Show ASL letters to the camera")
def process_fingerspell(frame, state):
    """Process one streamed webcam frame.

    Args:
        frame: RGB image array from the webcam component, or ``None``.
        state: Per-session dict with the keys ``"smoother"`` (a LetterSmoother)
            and ``"word_buffer"`` (list of accepted characters).

    Returns:
        ``(image, caption_html, state)``. When ``frame`` is ``None`` or the
        fingerspelling assets are unavailable, the placeholder image and an
        empty caption are returned and ``state`` is passed through untouched.
    """
    if frame is None or fs_assets is None:
        return _FS_PLACEHOLDER, _fs_caption_html(None, 0.0, [], ""), state

    model, classes, device, detector = fs_assets
    smoother, word_buffer = state["smoother"], state["word_buffer"]

    detection = _detect_hand_live(frame, detector)
    hands = detection.hand_landmarks

    current_letter, confidence = None, 0.0
    if hands:
        # Flatten the first hand into [x0, y0, z0, x1, y1, z1, ...].
        coords = [value for lm in hands[0] for value in (lm.x, lm.y, lm.z)]
        batch = torch.from_numpy(normalise_landmarks(coords)).unsqueeze(0).to(device)
        with torch.no_grad():
            probs = torch.softmax(model(batch), dim=1)[0].cpu().numpy()
        best = int(np.argmax(probs))
        label = str(classes[best])
        confidence = float(probs[best])
        # Low-confidence predictions count as "no letter" for this frame.
        current_letter = None if confidence < FS_CONFIDENCE_THRESHOLD else label

    # Special classes: "del" and "nothing" are ignored, "space" becomes " ".
    special = {"del": None, "nothing": None, "space": " "}
    current_letter = special.get(current_letter, current_letter)

    accepted = smoother.update(current_letter)
    if accepted is not None:
        previous = word_buffer[-1] if word_buffer else None
        repeats_last = previous == accepted
        leading_space = accepted == " " and previous is None
        # Never append the same character twice in a row, and never start
        # the buffer with a space.
        if not (repeats_last or leading_space):
            word_buffer.append(accepted)

    state["smoother"] = smoother
    state["word_buffer"] = word_buffer

    annotated = _annotate(frame, hand_landmarks_list=hands or None)
    suggestion = _suggest_word(word_buffer)
    caption = _fs_caption_html(current_letter, confidence, word_buffer, suggestion)
    return annotated, caption, state
def clear_fingerspell(state):
    """Reset the session: empty word buffer, fresh smoother, blank caption.

    Returns:
        ``(caption_html, state)`` where ``state`` is the same dict, updated
        in place.
    """
    fresh = {"word_buffer": [], "smoother": LetterSmoother()}
    state.update(fresh)
    blank_caption = _fs_caption_html(None, 0.0, [], "")
    return blank_caption, state
# ── Gradio state factory ───────────────────────────────────────────────────────
def make_fs_state():
    """Create the initial per-session state: a new smoother and an empty buffer."""
    return dict(smoother=LetterSmoother(), word_buffer=[])
# ── Gradio UI ──────────────────────────────────────────────────────────────────
# Workaround for a stale webcam button in the Gradio frontend.
# When the webcam is closed and re-opened, Gradio's internal recording state (I)
# is not reset, so the Record button shows "Stop" even though nothing is streaming.
# This JS auto-clicks any stale "Stop" button when a video element starts playing
# (i.e., when the webcam is re-enabled), driving Gradio's he() handler to reset I.
# The listener is registered in the capture phase (third argument `true`), and the
# click is delayed by 150 ms after the 'play' event.
# NOTE(review): `I` and `he()` look like minified identifiers from Gradio's
# frontend bundle, and the selector depends on the '[title="stop recording"]'
# attribute - re-check both whenever Gradio is upgraded.
_WEBCAM_SYNC_JS = """
function() {
document.addEventListener('play', function(e) {
if (e.target.tagName !== 'VIDEO') return;
setTimeout(function() {
document.querySelectorAll('[title="stop recording"]').forEach(function(icon) {
var btn = icon.closest('button');
if (btn) btn.click();
});
}, 150);
}, true);
}
"""
# NOTE(review): the nesting of the layout below was reconstructed. In
# particular, confirm whether the caption and the Clear button belong below
# the row (as written here) or inside the wide right-hand column.
with gr.Blocks(title="ASL Recognition", js=_WEBCAM_SYNC_JS) as demo:
    gr.Markdown("# ASL Recognition\nReal-time American Sign Language recognition using your camera.")
    gr.Markdown(
        "**Fingerspelling mode**: Show ASL letters to the camera one at a time. "
        "Hold each letter steady — the model spells out words in the caption below."
    )
    # The factory function itself is passed (not its result).
    # NOTE(review): presumably Gradio calls it to build a fresh state per
    # session - confirm against the Gradio State documentation.
    fs_state = gr.State(value=make_fs_state)
    with gr.Row():
        # Narrow column: the webcam input that drives the stream.
        with gr.Column(scale=1, min_width=180):
            fs_webcam = gr.Image(sources=["webcam"], streaming=True,
                                 label="Enable Camera", mirror_webcam=False,
                                 height=180)
        # Wide column: the annotated frame returned by process_fingerspell.
        with gr.Column(scale=3):
            fs_output = gr.Image(label="Live Feed", mirror_webcam=False)
    # Starts out as the blank caption (no letter, empty buffer, no suggestion).
    fs_caption = gr.HTML(value=_fs_caption_html(None, 0.0, [], ""))
    fs_clear_btn = gr.Button("Clear")
    # Each streamed frame goes through process_fingerspell together with the
    # session state; stream_every=0.1 is the interval passed to Gradio.
    fs_webcam.stream(
        fn=process_fingerspell,
        inputs=[fs_webcam, fs_state],
        outputs=[fs_output, fs_caption, fs_state],
        stream_every=0.1,
    )
    # "Clear" resets the word buffer and the smoother held in the state.
    fs_clear_btn.click(
        fn=clear_fingerspell,
        inputs=[fs_state],
        outputs=[fs_caption, fs_state],
    )

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()