Farhang87 committed on
Commit
be9bd17
·
verified ·
1 Parent(s): 72e4ebf

Add Omi STT v1 Gradio demo scaffold

Browse files
Files changed (3) hide show
  1. README.md +11 -7
  2. app.py +71 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,13 +1,17 @@
1
  ---
2
- title: Omi Stt V1 Demo
3
- emoji: 👁
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.16.0
8
- python_version: '3.13'
9
  app_file: app.py
10
  pinned: false
 
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
+ title: Omi STT v1 Demo
3
+ emoji: 🎙️
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 5.0.0
 
8
  app_file: app.py
9
  pinned: false
10
+ license: cc-by-4.0
11
  ---
12
 
13
+ # Omi STT v1 Demo
14
+
15
+ Upload English clinical-style audio and transcribe it with Omi STT v1.
16
+
17
+ This demo is for research and product exploration. It is not for clinical decision-making.
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import tempfile
4
+ import time
5
+ from pathlib import Path
6
+
7
+ import gradio as gr
8
+ import numpy as np
9
+ import soundfile as sf
10
+ from huggingface_hub import hf_hub_download
11
+ from nemo.collections.asr.models import ASRModel
12
+
13
+
14
+ MODEL_REPO = "omi-health/omi-stt-v1"  # Hub repo id passed to hf_hub_download and shown in the run info
15
+ MODEL_FILE = "omimedstt-v1.nemo"  # checkpoint filename downloaded from MODEL_REPO
16
+ MAX_SECONDS = 180  # longest audio duration, in seconds, that normalize() accepts
17
+ _MODEL = None  # cache for the restored model; filled by load_model() on first call
18
+
19
+
20
def load_model():
    """Return the shared ASR model, loading it on first use.

    On the first call the checkpoint ``MODEL_FILE`` is downloaded from
    ``MODEL_REPO`` with ``hf_hub_download`` and restored with
    ``ASRModel.restore_from``. The restored model is stored in the
    module-level ``_MODEL`` so later calls return the same object.

    The model is mapped to CUDA when a GPU is available and to CPU
    otherwise, so the demo still starts on CPU-only hardware instead of
    failing while restoring the checkpoint.

    Returns:
        The restored model instance.
    """
    global _MODEL
    if _MODEL is None:
        # Local import: torch is only needed here, to choose the device.
        import torch

        device = "cuda" if torch.cuda.is_available() else "cpu"
        checkpoint = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        _MODEL = ASRModel.restore_from(checkpoint, map_location=device)
    return _MODEL
26
+
27
+
28
def normalize(audio_path: str) -> tuple[str, float]:
    """Convert an audio file to a 16 kHz mono WAV in a fresh temp directory.

    Channels are averaged to mono. If the source sample rate is not 16000,
    the signal is resampled with linear interpolation (``np.interp``); no
    low-pass filter is applied before downsampling.

    Args:
        audio_path: Path of the audio file to convert.

    Returns:
        A ``(wav_path, duration)`` tuple: the path of the written
        ``audio.16k.wav`` file and the duration of the input in seconds.

    Raises:
        gr.Error: If the file cannot be decoded, contains no samples, or is
            longer than ``MAX_SECONDS``.
    """
    try:
        with sf.SoundFile(audio_path) as handle:
            sr = handle.samplerate
            # Use the frame count from the header to reject over-long files
            # before decoding them into memory.
            header_seconds = handle.frames / float(sr)
            if header_seconds <= MAX_SECONDS:
                audio = handle.read(dtype="float32", always_2d=True)
            else:
                audio = None
    except RuntimeError as err:
        # soundfile reports open/decode failures as RuntimeError subclasses.
        raise gr.Error("Could not read the audio file.") from err

    if audio is None:
        raise gr.Error(
            f"Audio is {header_seconds:.1f}s. Demo limit is {MAX_SECONDS}s."
        )

    mono = audio.mean(axis=1)
    if len(mono) == 0:
        # Interpolating an empty signal would raise a ValueError below.
        raise gr.Error("Audio file contains no samples.")

    # Re-check against the decoded length in case the header was inaccurate.
    duration = len(mono) / float(sr)
    if duration > MAX_SECONDS:
        raise gr.Error(f"Audio is {duration:.1f}s. Demo limit is {MAX_SECONDS}s.")

    if sr != 16000:
        x_old = np.linspace(0.0, duration, num=len(mono), endpoint=False)
        n = max(1, int(round(duration * 16000)))
        x_new = np.linspace(0.0, duration, num=n, endpoint=False)
        mono = np.interp(x_new, x_old, mono).astype("float32")

    # The caller owns the returned file and its "omi_space_" directory.
    out = Path(tempfile.mkdtemp(prefix="omi_space_")) / "audio.16k.wav"
    sf.write(out, mono, 16000)
    return str(out), duration
42
+
43
+
44
def transcribe(audio_path: str) -> tuple[str, str]:
    """Transcribe an uploaded audio file and report run information.

    The input is converted with ``normalize``, passed to the model returned
    by ``load_model``, and the temporary directory created for the converted
    file is removed afterwards, whether or not transcription succeeded.

    Args:
        audio_path: Path of the uploaded audio file; empty or ``None`` when
            nothing was uploaded.

    Returns:
        A ``(text, meta)`` tuple: the stripped transcript and a one-line
        summary with audio duration, elapsed time and model repo.

    Raises:
        gr.Error: If no file was provided, or if ``normalize`` rejects it.
    """
    import shutil

    if not audio_path:
        raise gr.Error("Upload an audio file first.")

    # perf_counter is monotonic, so the interval is unaffected by clock changes.
    start = time.perf_counter()
    wav, duration = normalize(audio_path)
    try:
        model = load_model()
        output = model.transcribe([wav])[0]
    finally:
        # normalize() writes into a dedicated "omi_space_*" temp directory.
        # Remove it so directories do not pile up across requests. The prefix
        # check ensures no other directory is ever deleted.
        scratch = Path(wav).parent
        if scratch.name.startswith("omi_space_"):
            shutil.rmtree(scratch, ignore_errors=True)

    # The result may be an object exposing ``.text`` or something string-like.
    text = output.text if hasattr(output, "text") else str(output)
    elapsed = time.perf_counter() - start
    meta = f"Duration: {duration:.1f}s | Wall time: {elapsed:.1f}s | Model: {MODEL_REPO}"
    return text.strip(), meta
54
+
55
+
56
# Gradio UI: one audio upload, a button that runs transcribe(), and two text
# outputs (transcript and run info).
with gr.Blocks(title="Omi STT v1 Demo") as demo:
    gr.Markdown(
        "# Omi STT v1\n"
        "English medical speech-to-text adapted from NVIDIA Parakeet v2. "
        "**Not for clinical decision-making. Review transcripts before use.**"
    )
    # The label is built from MAX_SECONDS so it cannot drift from the limit
    # that normalize() actually enforces.
    audio = gr.Audio(
        type="filepath",
        label=f"Upload audio, max {MAX_SECONDS} seconds",
    )
    btn = gr.Button("Transcribe")
    transcript = gr.Textbox(label="Raw transcript", lines=10)
    meta = gr.Textbox(label="Run info", interactive=False)
    btn.click(transcribe, inputs=audio, outputs=[transcript, meta])
    gr.Markdown("[Model card](https://huggingface.co/omi-health/omi-stt-v1)")


# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=5.0
2
+ huggingface_hub>=0.23
3
+ numpy>=1.24
4
+ soundfile>=0.12
5
+ nemo_toolkit[asr]