omi-stt-v1-demo / app.py
Farhang87's picture
Add Omi STT v1 Gradio demo scaffold
be9bd17 verified
Raw
History Blame Contribute Delete
2.43 kB
from __future__ import annotations
import tempfile
import time
from pathlib import Path
import gradio as gr
import numpy as np
import soundfile as sf
from huggingface_hub import hf_hub_download
from nemo.collections.asr.models import ASRModel
# Hugging Face Hub repository the checkpoint is downloaded from (see load_model).
MODEL_REPO = "omi-health/omi-stt-v1"
# Name of the checkpoint file inside MODEL_REPO.
MODEL_FILE = "omimedstt-v1.nemo"
# Longest input, in seconds, that normalize() accepts before raising gr.Error.
MAX_SECONDS = 180
# Process-wide model cache; stays None until load_model() populates it.
_MODEL = None
def load_model():
    """Return the shared ASR model, downloading and restoring it on first use.

    The checkpoint ``MODEL_FILE`` is fetched from ``MODEL_REPO`` with
    ``hf_hub_download`` and restored with ``ASRModel.restore_from``. The
    result is cached in the module-level ``_MODEL`` so later calls return
    the same object without touching the Hub again.

    Returns:
        The restored NeMo ASR model instance.
    """
    global _MODEL
    if _MODEL is None:
        # Imported here because this file has no top-level torch import;
        # torch is presumably already installed as a NeMo dependency.
        import torch

        # Fall back to CPU so the demo still starts on hosts without a GPU
        # instead of failing inside restore_from with a hard-coded "cuda".
        device = "cuda" if torch.cuda.is_available() else "cpu"
        checkpoint = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        _MODEL = ASRModel.restore_from(checkpoint, map_location=device)
    return _MODEL
def normalize(audio_path: str) -> tuple[str, float]:
    """Convert an audio file to a 16 kHz mono WAV and report its duration.

    The input is decoded with ``soundfile``, down-mixed to mono by averaging
    channels, resampled to 16 kHz when needed, and written to
    ``audio.16k.wav`` inside a freshly created temporary directory.

    Args:
        audio_path: Path to the audio file to convert.

    Returns:
        ``(wav_path, duration)`` where ``wav_path`` is the path of the new
        16 kHz mono WAV and ``duration`` is the input length in seconds.
        The temporary directory is not removed here; the caller owns it.

    Raises:
        gr.Error: If the file cannot be decoded, contains no samples, or is
            longer than ``MAX_SECONDS``.
    """
    target_sr = 16000

    try:
        audio, sr = sf.read(audio_path, dtype="float32", always_2d=True)
    except RuntimeError as err:
        # soundfile signals decode failures with RuntimeError (or a subclass);
        # turn that into a message the Gradio UI can show to the user.
        raise gr.Error(
            "Could not read the audio file. Try a WAV or FLAC upload."
        ) from err

    # always_2d=True gives shape (frames, channels); average channels to mono.
    mono = audio.mean(axis=1)
    if mono.size == 0:
        # np.interp below rejects an empty sample-point array, so fail early
        # with a readable message instead of a raw ValueError.
        raise gr.Error("Audio file contains no samples.")

    duration = len(mono) / float(sr)
    if duration > MAX_SECONDS:
        raise gr.Error(f"Audio is {duration:.1f}s. Demo limit is {MAX_SECONDS}s.")

    if sr != target_sr:
        # Linear-interpolation resample onto a 16 kHz time grid.
        # NOTE(review): np.interp applies no low-pass filter, so downsampling
        # from higher rates may alias; consider a band-limited resampler if
        # transcription accuracy on such inputs matters.
        x_old = np.linspace(0.0, duration, num=len(mono), endpoint=False)
        n = max(1, int(round(duration * target_sr)))
        x_new = np.linspace(0.0, duration, num=n, endpoint=False)
        mono = np.interp(x_new, x_old, mono).astype("float32")

    out = Path(tempfile.mkdtemp(prefix="omi_space_")) / "audio.16k.wav"
    # Pass a plain str: path-like support in soundfile depends on its version.
    sf.write(str(out), mono, target_sr)
    return str(out), duration
def transcribe(audio_path: str) -> tuple[str, str]:
    """Transcribe an uploaded audio file and describe the run.

    Args:
        audio_path: Path of the uploaded audio file; falsy when nothing was
            uploaded.

    Returns:
        ``(text, meta)`` where ``text`` is the stripped transcript and
        ``meta`` is a one-line summary with the audio duration, elapsed wall
        time, and model repository.

    Raises:
        gr.Error: If no file was provided, or if ``normalize`` rejects it.
    """
    if not audio_path:
        raise gr.Error("Upload an audio file first.")

    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump if the system clock is adjusted mid-request.
    start = time.perf_counter()
    wav, duration = normalize(audio_path)
    try:
        model = load_model()
        output = model.transcribe([wav])[0]
    finally:
        # normalize() writes the WAV into its own temporary directory; remove
        # both so each request does not leave files behind. Never touch the
        # caller's original upload.
        if wav != audio_path:
            tmp_wav = Path(wav)
            try:
                tmp_wav.unlink(missing_ok=True)
                # rmdir only succeeds on an empty directory, so this cannot
                # delete anything other than the now-empty temp folder.
                tmp_wav.parent.rmdir()
            except OSError:
                # Best-effort cleanup: a failure here must not mask the
                # transcription result or the original exception.
                pass

    # The result may be an object exposing ``.text`` or something else that
    # is only meaningful via str(); handle both.
    text = output.text if hasattr(output, "text") else str(output)
    elapsed = time.perf_counter() - start
    meta = f"Duration: {duration:.1f}s | Wall time: {elapsed:.1f}s | Model: {MODEL_REPO}"
    return text.strip(), meta
# Gradio UI: a single upload -> transcribe flow. Components are declared in
# the order they appear on the page, so keep this ordering when editing.
with gr.Blocks(title="Omi STT v1 Demo") as demo:
    gr.Markdown(
        "# Omi STT v1\n"
        "English medical speech-to-text adapted from NVIDIA Parakeet v2. "
        "**Not for clinical decision-making. Review transcripts before use.**"
    )
    # Derive the advertised limit from MAX_SECONDS so the label cannot drift
    # from the limit that normalize() actually enforces.
    audio = gr.Audio(
        type="filepath",
        label=f"Upload audio, max {MAX_SECONDS} seconds",
    )
    btn = gr.Button("Transcribe")
    transcript = gr.Textbox(label="Raw transcript", lines=10)
    meta = gr.Textbox(label="Run info", interactive=False)
    # transcribe() returns (text, meta), matching the two output components.
    btn.click(transcribe, inputs=audio, outputs=[transcript, meta])
    gr.Markdown("[Model card](https://huggingface.co/omi-health/omi-stt-v1)")

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()