Spaces:

omi-health
/

omi-stt-v1-demo

Sleeping

App Files Files Community

Farhang87 committed 22 days ago

Commit

be9bd17

verified ·

1 Parent(s): 72e4ebf

Add Omi STT v1 Gradio demo scaffold

Browse files

Files changed (3) hide show

README.md +11 -7
app.py +71 -0
requirements.txt +5 -0

README.md CHANGED Viewed

@@ -1,13 +1,17 @@
 ---
-title: Omi Stt V1 Demo
-emoji: 👁
-colorFrom: yellow
-colorTo: purple
 sdk: gradio
-sdk_version: 6.16.0
-python_version: '3.13'
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Omi STT v1 Demo
+emoji: 🎙️
+colorFrom: blue
+colorTo: green
 sdk: gradio
+sdk_version: 5.0.0
 app_file: app.py
 pinned: false
+license: cc-by-4.0
 ---
+# Omi STT v1 Demo
+Upload English clinical-style audio and transcribe it with Omi STT v1.
+This demo is for research and product exploration. It is not for clinical decision-making.

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+from __future__ import annotations
+import tempfile
+import time
+from pathlib import Path
+import gradio as gr
+import numpy as np
+import soundfile as sf
+from huggingface_hub import hf_hub_download
+from nemo.collections.asr.models import ASRModel
+# Hugging Face Hub repo and checkpoint file name that load_model() downloads.
+MODEL_REPO = "omi-health/omi-stt-v1"
+MODEL_FILE = "omimedstt-v1.nemo"
+# Longest audio, in seconds, that normalize() accepts before raising gr.Error.
+MAX_SECONDS = 180
+# Cache for the restored model; stays None until load_model() fills it.
+_MODEL = None
+def load_model():
+    global _MODEL
+    if _MODEL is None:
+        checkpoint = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
+        _MODEL = ASRModel.restore_from(checkpoint, map_location="cuda")
+    return _MODEL
+def normalize(audio_path: str) -> tuple[str, float]:
+    audio, sr = sf.read(audio_path, dtype="float32", always_2d=True)
+    mono = audio.mean(axis=1)
+    duration = len(mono) / float(sr)
+    if duration > MAX_SECONDS:
+        raise gr.Error(f"Audio is {duration:.1f}s. Demo limit is {MAX_SECONDS}s.")
+    if sr != 16000:
+        x_old = np.linspace(0.0, duration, num=len(mono), endpoint=False)
+        n = max(1, int(round(duration * 16000)))
+        x_new = np.linspace(0.0, duration, num=n, endpoint=False)
+        mono = np.interp(x_new, x_old, mono).astype("float32")
+    out = Path(tempfile.mkdtemp(prefix="omi_space_")) / "audio.16k.wav"
+    sf.write(out, mono, 16000)
+    return str(out), duration
+def transcribe(audio_path: str) -> tuple[str, str]:
+    if not audio_path:
+        raise gr.Error("Upload an audio file first.")
+    start = time.time()
+    wav, duration = normalize(audio_path)
+    model = load_model()
+    output = model.transcribe([wav])[0]
+    text = output.text if hasattr(output, "text") else str(output)
+    meta = f"Duration: {duration:.1f}s | Wall time: {time.time() - start:.1f}s | Model: {MODEL_REPO}"
+    return text.strip(), meta
+with gr.Blocks(title="Omi STT v1 Demo") as demo:
+    gr.Markdown(
+        "# Omi STT v1\n"
+        "English medical speech-to-text adapted from NVIDIA Parakeet v2. "
+        "**Not for clinical decision-making. Review transcripts before use.**"
+    )
+    audio = gr.Audio(type="filepath", label="Upload audio, max 180 seconds")
+    btn = gr.Button("Transcribe")
+    transcript = gr.Textbox(label="Raw transcript", lines=10)
+    meta = gr.Textbox(label="Run info", interactive=False)
+    btn.click(transcribe, inputs=audio, outputs=[transcript, meta])
+    gr.Markdown("[Model card](https://huggingface.co/omi-health/omi-stt-v1)")
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio>=5.0
+huggingface_hub>=0.23
+numpy>=1.24
+soundfile>=0.12
+nemo_toolkit[asr]