"""Gradio demo: multilingual speech-to-text with a fine-tuned Whisper model.

The script loads a Whisper processor/model from the Hugging Face Hub,
exposes a single ``transcribe`` function through a Gradio interface, and
launches the app.
"""

import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# ---- Settings ----
auth_token = os.environ.get("HF_TOKEN")  # secret token; None when the variable is unset
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_DIR = "heyIamUmair/whisper-base-sindhi1"  # HF repo name
TARGET_SR = 16000  # sampling rate the audio is converted to before feature extraction
MAX_LENGTH = 225  # upper bound passed to ``model.generate``
LANGUAGES = ["Sindhi", "Urdu", "English"]  # choices offered in the UI dropdown

# ---- Load model & processor (with token if private) ----
# ``token`` replaces the deprecated ``use_auth_token`` keyword.
processor = WhisperProcessor.from_pretrained(MODEL_DIR, token=auth_token)
model = (
    WhisperForConditionalGeneration.from_pretrained(MODEL_DIR, token=auth_token)
    .to(DEVICE)
    .eval()
)


# ---- Resample function ----
def resample_to_16k(audio, sr):
    """Return ``audio`` resampled to 16 kHz together with the new rate.

    Args:
        audio: One-dimensional waveform (array-like of samples).
        sr: Sampling rate of ``audio`` in Hz.

    Returns:
        ``(waveform, 16000)``. When ``sr`` is already 16000 the input is
        returned untouched; otherwise a float32 NumPy array is returned.
    """
    if sr == TARGET_SR:
        return audio, sr

    # Imported lazily so torchaudio is only needed when resampling happens.
    import torchaudio

    wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)  # 1 x T
    wav = torchaudio.functional.resample(wav, sr, TARGET_SR)
    return wav.squeeze(0).cpu().numpy(), TARGET_SR


# ---- Transcription function ----
def transcribe(path, language="Sindhi"):
    """Transcribe the audio file at ``path`` in the requested language.

    Args:
        path: Filesystem path of the audio file, as supplied by the Gradio
            ``Audio`` component (``type="filepath"``). ``None``/empty when
            the user submitted without providing audio.
        language: Language name forwarded to
            ``processor.get_decoder_prompt_ids``. Defaults to ``"Sindhi"``.

    Returns:
        The decoded transcription with surrounding whitespace removed, or an
        empty string when the file contains no samples.

    Raises:
        gr.Error: If no audio was provided.
    """
    if not path:
        # Show a readable message in the UI instead of an opaque read failure.
        raise gr.Error("Please upload or record audio before transcribing.")

    # Request float32 directly from soundfile. The previous code cast to
    # float32 and then tested ``dtype != float32`` again, so its integer
    # normalisation branch could never run.
    audio, sr = sf.read(path, dtype="float32", always_2d=False)
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)  # mono
    if audio.size == 0:
        return ""

    audio, sr = resample_to_16k(audio, sr)

    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)

    # Force selected language
    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
    with torch.no_grad():
        pred_ids = model.generate(
            input_features,
            max_length=MAX_LENGTH,
            forced_decoder_ids=forced_ids,
        )

    text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    return text.strip()


# ---- Gradio Interface ----
demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath"),
        gr.Dropdown(LANGUAGES, value="Sindhi", label="Select Language"),
    ],
    outputs=gr.Textbox(lines=8, max_lines=40, interactive=True, label="Transcription"),
    title="Multilingual Speech-to-Text",
    description="Upload or record speech and transcribe into Sindhi, Urdu, or English using a fine-tuned Whisper model.",
)

demo.launch()