# # app.py # import os # import io # import time # import numpy as np # import torch # import soundfile as sf # import gradio as gr # from transformers import WhisperProcessor, WhisperForConditionalGeneration # # ---------------- SETTINGS ---------------- # DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # MODEL_ID = "heyIamUmair/whisper-base-sindhi2" # change to your HF repo or local path # # If model is private, provide HF token via the "HF_TOKEN" secret in the Space settings. # HF_TOKEN = os.environ.get("HF_TOKEN", None) # use_auth_token = HF_TOKEN if HF_TOKEN is not None else None # # ---------------- LOAD MODEL ---------------- # # We load processor + model once at startup # processor = WhisperProcessor.from_pretrained(MODEL_ID, use_auth_token=use_auth_token) # model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, use_auth_token=use_auth_token).to(DEVICE).eval() # # NOTE: set config-level forced_decoder_ids later per request (works for transformers modern versions) # # ------------------------------------------------- # # Utility: resample (tries torchaudio then scipy fallback) # def resample_to_16k(audio: np.ndarray, sr: int): # if sr == 16000: # return audio, sr # try: # import torchaudio # wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0) # (1, T) # wav = torchaudio.functional.resample(wav, sr, 16000) # return wav.squeeze(0).cpu().numpy(), 16000 # except Exception: # # fallback: scipy.signal.resample_poly # from math import gcd # g = gcd(sr, 16000) # up, down = 16000 // g, sr // g # try: # from scipy.signal import resample_poly # return resample_poly(audio, up, down), 16000 # except Exception: # raise RuntimeError("Install torchaudio or scipy to resample audio to 16kHz.") # # ---------------- TRANSCRIPTION (single clip) ---------------- # def transcribe_ndarray(audio: np.ndarray, sr: int, language: str = "Sindhi"): # """ # audio: numpy array (mono or stereo) # sr: original sampling rate # language: "Sindhi" or "English" 
(case-insensitive) # returns: transcription string # """ # # Ensure mono (average channels if needed) # if audio.ndim > 1: # audio = np.mean(audio, axis=1) # # Ensure float32 [-1, 1]. If integer type, scale to [-1,1] # if audio.dtype != np.float32: # if np.issubdtype(audio.dtype, np.integer): # maxv = np.iinfo(audio.dtype).max # audio = (audio.astype(np.float32) / maxv).astype(np.float32) # else: # audio = audio.astype(np.float32) # # Resample to 16 kHz if needed # audio, sr = resample_to_16k(audio, sr) # # Prepare model input # inputs = processor(audio, sampling_rate=sr, return_tensors="pt") # input_features = inputs.input_features.to(DEVICE) # # Set forced language tokens on the model config (works across transformers versions) # # Accept both "Sindhi"/"sindhi" or "English"/"english" # lang = language.strip().lower() # if lang.startswith("s"): # lang_code = "Sindhi" # else: # lang_code = "English" # try: # forced = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe") # model.config.forced_decoder_ids = forced # except Exception: # # If get_decoder_prompt_ids isn't available, ignore (older/newer versions) # pass # # Generate transcription (do NOT pass forced_decoder_ids here to be compatible with newer transformers) # with torch.no_grad(): # pred_ids = model.generate(input_features, max_length=225) # # Decode # text = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0].strip() # return text # # ---------------- GRADIO INTERFACE HANDLERS ---------------- # def transcribe_gradio(audio, language): # """ # audio: numpy array (from gradio), or filepath (when using "filepath" type) # language: dropdown string # """ # # Gradio can give: None, a dict, a numpy array, or a filepath depending on configuration. # if audio is None: # return "No audio provided." # # If Gradio gives a tuple/filepath: it often returns (sr, data) for type="numpy" # # If type="filepath", we will read file path. # # We'll accept both types. 
# # If audio is a filepath (string): # if isinstance(audio, str): # # load file # wave, sr = sf.read(audio, always_2d=False) # # convert channel->mono later in transcribe_ndarray # wave = wave.astype(np.float32) # else: # # If it's numpy array or tuple (sr, np.ndarray) # # Some Gradio configs pass a tuple (sr, np.ndarray) — detect: # if isinstance(audio, tuple) and len(audio) == 2: # sr, wave = audio # else: # # Often when using type="numpy" we get a 2D array (samples, channels) # wave = np.array(audio) # sr = 16000 # Gradio microphone usually records at 16000; if not, resampling will handle it # wave = wave.astype(np.float32) # # Call transcription # try: # txt = transcribe_ndarray(wave, sr, language=language) # except Exception as e: # return f"Transcription failed: {e}" # return txt # # ---------------- BUILD GRADIO UI ---------------- # title = "Sindhi / English Live Speech-to-Text" # desc = "Record using the microphone or upload audio. Select language and press Submit. Works with a fine-tuned Whisper model." 
# with gr.Blocks() as demo: # gr.Markdown(f"## {title}") # gr.Markdown(desc) # with gr.Row(): # with gr.Column(scale=2): # # mic = gr.Audio(source="microphone", type="filepath", label="Record voice (or upload file)") # mic = gr.Audio( # sources=["microphone", "upload"], # 👈 use list instead of "source" # type="filepath", # label="🎙️ Record or Upload Audio" # ) # lang = gr.Dropdown(choices=["Sindhi", "English"], value="Sindhi", label="Language") # btn = gr.Button("Transcribe") # with gr.Column(scale=3): # out = gr.Textbox(label="Transcription", lines=10, interactive=True) # def on_click_transcribe(audio, language): # # audio will be a filepath if type="filepath" # return transcribe_gradio(audio, language) # btn.click(on_click_transcribe, inputs=[mic, lang], outputs=out) # demo.launch(server_name="0.0.0.0", server_port=7860) # 2nd file # import os # import time # import numpy as np # import torch # import soundfile as sf # import gradio as gr # from transformers import WhisperProcessor, WhisperForConditionalGeneration # # ---------------- SETTINGS ---------------- # DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # MODEL_ID = "heyIamUmair/whisper-base-sindhi2" # your HF repo path # HF_TOKEN = os.environ.get("HF_TOKEN", None) # use_auth_token = HF_TOKEN if HF_TOKEN is not None else None # # ---------------- LOAD MODEL ---------------- # processor = WhisperProcessor.from_pretrained(MODEL_ID, use_auth_token=use_auth_token) # model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, use_auth_token=use_auth_token).to(DEVICE).eval() # # ---------------- RESAMPLER ---------------- # def resample_to_16k(audio: np.ndarray, sr: int): # if sr == 16000: # return audio, sr # try: # import torchaudio # wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0) # (1, T) # wav = torchaudio.functional.resample(wav, sr, 16000) # return wav.squeeze(0).cpu().numpy(), 16000 # except Exception: # from scipy.signal import resample_poly # from math import gcd # g = gcd(sr, 
16000) # up, down = 16000 // g, sr // g # return resample_poly(audio, up, down), 16000 # # ---------------- SINGLE TRANSCRIPTION ---------------- # def transcribe_ndarray(audio: np.ndarray, sr: int, language: str = "Sindhi"): # if audio.ndim > 1: # audio = np.mean(audio, axis=1) # mono # if audio.dtype != np.float32: # if np.issubdtype(audio.dtype, np.integer): # maxv = np.iinfo(audio.dtype).max # audio = (audio.astype(np.float32) / maxv) # else: # audio = audio.astype(np.float32) # audio, sr = resample_to_16k(audio, sr) # inputs = processor(audio, sampling_rate=sr, return_tensors="pt") # input_features = inputs.input_features.to(DEVICE) # lang = language.strip().lower() # if lang.startswith("s"): # lang_code = "Sindhi" # else: # lang_code = "English" # try: # forced = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe") # model.config.forced_decoder_ids = forced # except Exception: # pass # with torch.no_grad(): # pred_ids = model.generate(input_features, max_length=225) # text = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0].strip() # return text # # ---------------- STREAMING TRANSCRIPTION ---------------- # def stream_transcription(audio, language): # if audio is None: # yield "No audio provided." # return # # If audio is filepath (since type="filepath") # if isinstance(audio, str): # wav, sr = sf.read(audio) # else: # yield "Invalid audio input." 
#         return
#     if wav.ndim > 1:
#         wav = wav.mean(axis=1)
#     wav = wav.astype(np.float32)
#     chunk_size = sr * 2  # 2 sec chunks
#     buffer = np.zeros(0, dtype=np.float32)
#     output_text = ""
#     for start in range(0, len(wav), chunk_size):
#         chunk = wav[start:start+chunk_size]
#         buffer = np.concatenate([buffer, chunk])
#         # Keep last 5s
#         max_len = sr * 5
#         if len(buffer) > max_len:
#             buffer = buffer[-max_len:]
#         # Transcribe current buffer
#         pred = transcribe_ndarray(buffer, sr, language)
#         output_text = pred
#         yield output_text  # 👈 live update in Gradio
#         time.sleep(0.5)  # simulate delay
# # ---------------- BUILD GRADIO UI ----------------
# title = "Sindhi / English Real-time Speech-to-Text"
# desc = "🎙️ Speak or upload audio. Select language. Transcription updates as audio is processed."
# with gr.Blocks() as demo:
#     gr.Markdown(f"## {title}")
#     gr.Markdown(desc)
#     with gr.Row():
#         with gr.Column(scale=2):
#             mic = gr.Audio(
#                 sources=["microphone", "upload"],
#                 type="filepath",
#                 label="🎤 Record or Upload Audio"
#             )
#             lang = gr.Dropdown(choices=["Sindhi", "English"], value="Sindhi", label="Language")
#             btn = gr.Button("Start Transcription")
#         with gr.Column(scale=3):
#             out = gr.Textbox(label="Live Transcription", lines=10, interactive=False)
#     btn.click(stream_transcription, inputs=[mic, lang], outputs=out)
# demo.launch(server_name="0.0.0.0", server_port=7860)

# app.py
"""Gradio app that transcribes Sindhi / English speech with a fine-tuned Whisper model."""
import io
import logging
import os
import time

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

logger = logging.getLogger(__name__)

# ---------------- SETTINGS ----------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "heyIamUmair/whisper-base-sindhi2"  # your HF repo path
HF_TOKEN = os.environ.get("HF_TOKEN", None)
use_auth_token = HF_TOKEN  # None when the HF_TOKEN environment variable is unset

TARGET_SR = 16000           # sampling rate handed to the Whisper processor
CHUNK_SECONDS = 2           # audio is fed to the model in steps of this length
WINDOW_SECONDS = 5          # only the most recent audio of this length is transcribed
STREAM_DELAY_SECONDS = 0.5  # pause between successive UI updates

# ---------------- LOAD MODEL ----------------
# Processor and model are loaded once at import time and shared by every request.
processor = WhisperProcessor.from_pretrained(MODEL_ID, use_auth_token=use_auth_token)
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_ID, use_auth_token=use_auth_token
).to(DEVICE).eval()


# ---------------- RESAMPLER ----------------
def resample_to_16k(audio: np.ndarray, sr: int):
    """Resample ``audio`` from ``sr`` Hz to 16 kHz.

    Args:
        audio: 1-D array of samples.
        sr: Sampling rate of ``audio`` in Hz.

    Returns:
        ``(samples, 16000)``. Audio that is already at 16 kHz is returned
        unchanged together with its rate.

    torchaudio is tried first; if it cannot be imported or fails for any
    reason, ``scipy.signal.resample_poly`` is used instead.
    """
    if sr == TARGET_SR:
        return audio, sr
    try:
        import torchaudio

        wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)  # (1, T)
        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
        return wav.squeeze(0).cpu().numpy(), TARGET_SR
    except Exception:
        # Deliberate best-effort: any torchaudio problem falls back to scipy.
        logger.debug("torchaudio resampling unavailable; using scipy", exc_info=True)
        from math import gcd

        from scipy.signal import resample_poly

        g = gcd(int(sr), TARGET_SR)
        up, down = TARGET_SR // g, int(sr) // g
        return resample_poly(audio, up, down), TARGET_SR


def _to_mono_float32(audio) -> np.ndarray:
    """Return ``audio`` as a 1-D float32 array, scaling integer PCM to [-1, 1]."""
    audio = np.asarray(audio)
    if np.issubdtype(audio.dtype, np.integer):
        # Scale while the dtype is still integer: averaging channels first
        # would promote to float and lose the information needed to normalise.
        scale = float(np.iinfo(audio.dtype).max)
        audio = audio.astype(np.float32) / scale
    else:
        audio = audio.astype(np.float32)
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # assumes (samples, channels) layout
    return audio


# ---------------- SINGLE TRANSCRIPTION ----------------
def transcribe_ndarray(audio: np.ndarray, sr: int, language: str = "Sindhi"):
    """Transcribe one audio clip and return the decoded text.

    Args:
        audio: Samples, mono or ``(samples, channels)``; integer or float dtype.
        sr: Sampling rate of ``audio`` in Hz.
        language: Anything starting with "s" (case-insensitive) selects
            Sindhi; every other value selects English. ``None`` or an empty
            string falls back to Sindhi, the default.

    Returns:
        The stripped transcription string.
    """
    audio = _to_mono_float32(audio)
    audio, sr = resample_to_16k(audio, sr)

    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)

    lang = (language or "Sindhi").strip().lower()
    lang_code = "Sindhi" if lang.startswith("s") else "English"

    try:
        # NOTE: this mutates the shared model config, so the language choice
        # is global state rather than per-request.
        forced = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
        model.config.forced_decoder_ids = forced
    except Exception:
        # Best-effort: keep transcribing with the model's current settings,
        # but leave a trace instead of hiding the failure.
        logger.warning(
            "Could not set forced decoder ids for language %r", lang_code, exc_info=True
        )

    with torch.no_grad():
        pred_ids = model.generate(input_features, max_length=225)

    return processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0].strip()


# ---------------- STREAMING TRANSCRIPTION ----------------
def stream_transcription(audio, language):
    """Yield progressively updated transcriptions of ``audio``.

    Args:
        audio: ``None``, a path to an audio file, or a ``(sample_rate, samples)``
            pair as delivered by a ``gr.Audio`` component in numpy mode.
        language: Language label forwarded to ``transcribe_ndarray``.

    Yields:
        The transcription of the most recent ``WINDOW_SECONDS`` of audio after
        each ``CHUNK_SECONDS`` step, or a short message when there is nothing
        usable to transcribe. Each yield replaces the previous text; results
        are not accumulated across windows.
    """
    if audio is None:
        yield "No audio provided."
        return

    if isinstance(audio, str):
        wav, sr = sf.read(audio)
    elif isinstance(audio, (tuple, list)) and len(audio) == 2:
        # The live gr.Audio component has no type="filepath", so it hands
        # over (sample_rate, ndarray) rather than a path.
        sr, wav = audio
    else:
        yield "Invalid audio input."
        return

    if wav is None:
        yield "No audio provided."
        return

    wav = _to_mono_float32(wav)
    sr = int(sr)
    if wav.size == 0 or sr <= 0:
        yield "No audio provided."
        return

    chunk_size = sr * CHUNK_SECONDS
    max_len = sr * WINDOW_SECONDS
    buffer = np.zeros(0, dtype=np.float32)

    for start in range(0, len(wav), chunk_size):
        chunk = wav[start:start + chunk_size]
        # Append the new chunk, then keep only the trailing window.
        buffer = np.concatenate([buffer, chunk])[-max_len:]
        yield transcribe_ndarray(buffer, sr, language)  # live update in Gradio
        time.sleep(STREAM_DELAY_SECONDS)  # simulate delay


# ---------------- BUILD GRADIO UI ----------------
title = "Sindhi / English Real-time Speech-to-Text"
desc = "🎙️ Speak or upload audio. Select language. Transcription updates as audio is processed."

with gr.Blocks() as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown(desc)
    with gr.Row():
        with gr.Column(scale=2):
            # Alternative input that also allows uploads and passes a file path:
            # mic = gr.Audio(
            #     sources=["microphone", "upload"],
            #     type="filepath",
            #     label="🎤 Record or Upload Audio"
            # )
            mic = gr.Audio(sources="microphone", streaming=True)
            lang = gr.Dropdown(choices=["Sindhi", "English"], value="Sindhi", label="Language")
            btn = gr.Button("Start Transcription")
        with gr.Column(scale=3):
            out = gr.Textbox(label="Live Transcription", lines=10, interactive=False)
    btn.click(stream_transcription, inputs=[mic, lang], outputs=out)

if __name__ == "__main__":
    # Only start the server when run as a script, so importing this module
    # does not block on launch().
    demo.launch(server_name="0.0.0.0", server_port=7860)