"""Gradio demo for the Omi STT v1 speech-to-text model.

The app accepts one uploaded audio file, converts it to mono 16 kHz WAV,
runs it through a NeMo ASR checkpoint downloaded from the Hugging Face Hub,
and shows the raw transcript together with basic run information.
"""

from __future__ import annotations

import shutil
import tempfile
import time
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
from huggingface_hub import hf_hub_download
from nemo.collections.asr.models import ASRModel

MODEL_REPO = "omi-health/omi-stt-v1"
MODEL_FILE = "omimedstt-v1.nemo"
MAX_SECONDS = 180
# Sample rate every input is converted to before transcription.
# NOTE(review): presumably the rate the checkpoint expects -- confirm
# against the model card.
TARGET_SR = 16000

# Lazily populated by load_model(); None until the first request.
_MODEL = None


def load_model():
    """Return the shared ASR model, downloading and restoring it on first use.

    The checkpoint is fetched from ``MODEL_REPO`` via the Hugging Face Hub and
    restored with ``map_location="cuda"``, so a CUDA device is required.
    The restored model is cached in the module-level ``_MODEL``.
    """
    global _MODEL
    if _MODEL is None:
        checkpoint = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        _MODEL = ASRModel.restore_from(checkpoint, map_location="cuda")
    return _MODEL


def normalize(audio_path: str) -> tuple[str, float]:
    """Convert an audio file to a mono ``TARGET_SR`` WAV in a new temp dir.

    Args:
        audio_path: Path to an audio file readable by ``soundfile``.

    Returns:
        ``(wav_path, duration)`` where ``wav_path`` is the converted file,
        located inside a freshly created temporary directory that contains
        nothing else, and ``duration`` is the input length in seconds.
        The caller owns that directory and is responsible for deleting it.

    Raises:
        gr.Error: If the audio contains no samples or is longer than
            ``MAX_SECONDS``.
    """
    audio, sr = sf.read(audio_path, dtype="float32", always_2d=True)
    # always_2d gives (frames, channels); averaging the channels downmixes.
    mono = audio.mean(axis=1)

    # Zero-length input would otherwise reach np.interp with an empty
    # sample-point array and fail with an opaque ValueError.
    if len(mono) == 0:
        raise gr.Error("Audio file is empty.")

    duration = len(mono) / float(sr)
    if duration > MAX_SECONDS:
        raise gr.Error(f"Audio is {duration:.1f}s. Demo limit is {MAX_SECONDS}s.")

    if sr != TARGET_SR:
        # Plain linear interpolation onto the new time grid. There is no
        # anti-aliasing filter here, so downsampling can alias.
        x_old = np.linspace(0.0, duration, num=len(mono), endpoint=False)
        n = max(1, int(round(duration * TARGET_SR)))
        x_new = np.linspace(0.0, duration, num=n, endpoint=False)
        mono = np.interp(x_new, x_old, mono).astype("float32")

    out_dir = Path(tempfile.mkdtemp(prefix="omi_space_"))
    out = out_dir / "audio.16k.wav"
    try:
        sf.write(out, mono, TARGET_SR)
    except Exception:
        # Do not leave an orphaned directory behind when writing fails.
        shutil.rmtree(out_dir, ignore_errors=True)
        raise
    return str(out), duration


def transcribe(audio_path: str) -> tuple[str, str]:
    """Transcribe an uploaded audio file.

    Args:
        audio_path: Path supplied by the ``gr.Audio`` component; falsy when
            nothing has been uploaded.

    Returns:
        ``(transcript, run_info)`` where ``run_info`` reports the audio
        duration, the wall-clock time of the request, and the model repo.

    Raises:
        gr.Error: If no file was provided, or if ``normalize`` rejects it.
    """
    if not audio_path:
        raise gr.Error("Upload an audio file first.")

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    wav, duration = normalize(audio_path)
    try:
        model = load_model()
        output = model.transcribe([wav])[0]
    finally:
        # normalize() creates a dedicated temp directory per call; remove it
        # so repeated requests do not accumulate files on disk.
        shutil.rmtree(Path(wav).parent, ignore_errors=True)

    # The result may be an object exposing ``.text`` or something that is
    # only meaningful as a string; handle both.
    text = output.text if hasattr(output, "text") else str(output)
    elapsed = time.perf_counter() - start
    meta = f"Duration: {duration:.1f}s | Wall time: {elapsed:.1f}s | Model: {MODEL_REPO}"
    return text.strip(), meta


with gr.Blocks(title="Omi STT v1 Demo") as demo:
    gr.Markdown(
        "# Omi STT v1\n"
        "English medical speech-to-text adapted from NVIDIA Parakeet v2. "
        "**Not for clinical decision-making. Review transcripts before use.**"
    )
    # The label is derived from MAX_SECONDS so it cannot drift from the
    # limit enforced in normalize().
    audio = gr.Audio(type="filepath", label=f"Upload audio, max {MAX_SECONDS} seconds")
    btn = gr.Button("Transcribe")
    transcript = gr.Textbox(label="Raw transcript", lines=10)
    meta = gr.Textbox(label="Run info", interactive=False)
    btn.click(transcribe, inputs=audio, outputs=[transcript, meta])
    gr.Markdown("[Model card](https://huggingface.co/omi-health/omi-stt-v1)")


if __name__ == "__main__":
    demo.launch()