Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import tempfile | |
| import time | |
| from pathlib import Path | |
| import gradio as gr | |
| import numpy as np | |
| import soundfile as sf | |
| from huggingface_hub import hf_hub_download | |
| from nemo.collections.asr.models import ASRModel | |
# Hugging Face Hub repository that hosts the model checkpoint.
MODEL_REPO = "omi-health/omi-stt-v1"
# Checkpoint file inside MODEL_REPO that load_model() downloads and restores.
MODEL_FILE = "omimedstt-v1.nemo"
# Longest clip, in seconds, that normalize() accepts before raising gr.Error.
MAX_SECONDS = 180
# Process-wide cache for the restored model; filled on the first load_model() call.
_MODEL = None
def load_model():
    """Return the shared ASR model, restoring it on first use.

    On the first call the checkpoint ``MODEL_FILE`` is fetched from
    ``MODEL_REPO`` with ``hf_hub_download`` and restored with
    ``ASRModel.restore_from``. The result is cached in the module-level
    ``_MODEL``; later calls return that same object.

    Returns:
        The restored model instance.
    """
    global _MODEL
    if _MODEL is None:
        # Local import so this function does not depend on a file-level
        # ``import torch``.
        import torch

        # Prefer the GPU, but fall back to CPU so the app can still start on
        # hardware without CUDA instead of requesting an unavailable device.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        checkpoint = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        _MODEL = ASRModel.restore_from(checkpoint, map_location=device)
    return _MODEL
def normalize(audio_path: str) -> tuple[str, float]:
    """Convert an audio file to a mono 16 kHz WAV and report its duration.

    The file is read with ``soundfile``, downmixed to mono by averaging the
    channels, resampled to 16 kHz with linear interpolation when its sample
    rate differs, and written to ``audio.16k.wav`` inside a fresh temporary
    directory. The caller owns that directory and is responsible for
    removing it.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        A ``(wav_path, duration)`` tuple: the path of the written WAV file and
        the duration of the input in seconds.

    Raises:
        gr.Error: If the file contains no samples, or if it is longer than
            ``MAX_SECONDS``.
    """
    target_sr = 16000

    audio, sr = sf.read(audio_path, dtype="float32", always_2d=True)
    # always_2d=True yields (frames, channels); averaging axis 1 downmixes.
    mono = audio.mean(axis=1)

    # An empty signal would otherwise reach np.interp with no sample points
    # (an opaque ValueError) or be written out as a zero-length WAV.
    if mono.size == 0:
        raise gr.Error("Audio file contains no samples.")

    duration = len(mono) / float(sr)
    if duration > MAX_SECONDS:
        raise gr.Error(f"Audio is {duration:.1f}s. Demo limit is {MAX_SECONDS}s.")

    if sr != target_sr:
        # Sample timestamps (seconds) of the input and of the 16 kHz output.
        x_old = np.linspace(0.0, duration, num=len(mono), endpoint=False)
        n = max(1, int(round(duration * target_sr)))
        x_new = np.linspace(0.0, duration, num=n, endpoint=False)
        # NOTE(review): linear interpolation applies no low-pass filter, so
        # downsampling from higher rates can alias; consider a band-limited
        # resampler if transcript quality on 44.1/48 kHz uploads matters.
        mono = np.interp(x_new, x_old, mono).astype("float32")

    out = Path(tempfile.mkdtemp(prefix="omi_space_")) / "audio.16k.wav"
    sf.write(out, mono, target_sr)
    return str(out), duration
def transcribe(audio_path: str) -> tuple[str, str]:
    """Transcribe an uploaded audio file and summarize the run.

    The upload is passed through ``normalize`` and the resulting WAV is fed
    to the model returned by ``load_model``. The temporary WAV produced by
    ``normalize`` is removed afterwards, whether or not transcription
    succeeded.

    Args:
        audio_path: Path of the uploaded audio file. A falsy value (empty
            string or ``None``) is rejected.

    Returns:
        A ``(text, meta)`` tuple: the stripped transcript and a one-line
        summary with the audio duration, elapsed wall time and model id.

    Raises:
        gr.Error: If no file was provided, or propagated from ``normalize``.
    """
    if not audio_path:
        raise gr.Error("Upload an audio file first.")

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    wav, duration = normalize(audio_path)
    try:
        output = load_model().transcribe([wav])[0]
    finally:
        # normalize() writes into a fresh temp directory on every call; remove
        # it so a long-running app does not accumulate WAV copies on disk.
        # The guard ensures the user's original upload is never deleted.
        if wav != audio_path:
            tmp_wav = Path(wav)
            try:
                tmp_wav.unlink()
                tmp_wav.parent.rmdir()  # only succeeds if the directory is empty
            except OSError:
                pass  # best-effort cleanup; never mask the real result/error

    # The result may be an object exposing ``.text`` or something string-like.
    text = output.text if hasattr(output, "text") else str(output)
    elapsed = time.perf_counter() - started
    meta = f"Duration: {duration:.1f}s | Wall time: {elapsed:.1f}s | Model: {MODEL_REPO}"
    return text.strip(), meta
# UI layout: an audio upload, a button, and two text outputs wired to transcribe().
with gr.Blocks(title="Omi STT v1 Demo") as demo:
    gr.Markdown(
        "# Omi STT v1\n"
        "English medical speech-to-text adapted from NVIDIA Parakeet v2. "
        "**Not for clinical decision-making. Review transcripts before use.**"
    )
    # The limit shown here comes from MAX_SECONDS so the label cannot drift
    # from the check enforced in normalize().
    audio = gr.Audio(
        type="filepath",
        label=f"Upload audio, max {MAX_SECONDS} seconds",
    )
    btn = gr.Button("Transcribe")
    transcript = gr.Textbox(label="Raw transcript", lines=10)
    meta = gr.Textbox(label="Run info", interactive=False)
    # transcribe() returns (text, meta), matching the two outputs in order.
    btn.click(transcribe, inputs=audio, outputs=[transcript, meta])
    # Link is built from MODEL_REPO so it always points at the loaded model.
    gr.Markdown(f"[Model card](https://huggingface.co/{MODEL_REPO})")

if __name__ == "__main__":
    demo.launch()