Spaces:
Sleeping
Sleeping
| # import gradio as gr | |
| # import torch, numpy as np, soundfile as sf | |
| # from transformers import WhisperProcessor, WhisperForConditionalGeneration | |
| # import os | |
| # auth_token = os.environ.get("HF_TOKEN") # gets secret token | |
| # # ---- Settings ---- | |
| # DEVICE = "cuda" if torch.cuda.is_available() else "cpu" | |
| # MODEL_DIR = "heyIamUmair/whisper-base-sindhi1" # HF repo name | |
| # # ---- Load model & processor (with token if private) ---- | |
| # processor = WhisperProcessor.from_pretrained(MODEL_DIR, use_auth_token=auth_token) | |
| # model = WhisperForConditionalGeneration.from_pretrained(MODEL_DIR, use_auth_token=auth_token).to(DEVICE).eval() | |
| # model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="Sindhi", task="transcribe") | |
| # # ---- Resample function ---- | |
| # def resample_to_16k(audio, sr): | |
| # if sr == 16000: | |
| # return audio, sr | |
| # import torchaudio | |
| # wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0) # 1 x T | |
| # wav = torchaudio.functional.resample(wav, sr, 16000) | |
| # return wav.squeeze(0).cpu().numpy(), 16000 | |
| # # ---- Transcription function ---- | |
| # def transcribe(path): | |
| # audio, sr = sf.read(path, always_2d=False) | |
| # audio = audio.astype(np.float32) if audio.dtype != np.float32 else audio | |
| # if audio.ndim > 1: | |
| # audio = np.mean(audio, axis=1) # mono | |
| # if audio.dtype != np.float32: | |
| # maxv = np.iinfo(audio.dtype).max | |
| # audio = (audio / maxv).astype(np.float32) | |
| # audio, sr = resample_to_16k(audio, sr) | |
| # inputs = processor(audio, sampling_rate=sr, return_tensors="pt") | |
| # input_features = inputs.input_features.to(DEVICE) | |
| # with torch.no_grad(): | |
| # pred_ids = model.generate(input_features, max_length=225) | |
| # text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0] | |
| # return text.strip() | |
| # # ---- Gradio Interface ---- | |
| # demo = gr.Interface( | |
| # fn=transcribe, | |
| # inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"), | |
| # outputs=gr.Textbox(lines=5, max_lines=20, interactive=True, label="Transcription"), | |
| # title="Sindhi Speech-to-Text", | |
| # description="Upload or record speech and get Sindhi transcription using Whisper fine-tuned model." | |
| # ) | |
| # demo.launch() | |
| import gradio as gr | |
| import torch, numpy as np, soundfile as sf | |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration | |
| import os | |
# Hugging Face access token read from the HF_TOKEN environment variable;
# None when the variable is unset.
auth_token = os.environ.get("HF_TOKEN")

# ---- Settings ----
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_DIR = "heyIamUmair/whisper-base-sindhi1"  # HF repo name

# ---- Load model & processor (with token if private) ----
# `token` is the current keyword; `use_auth_token` is deprecated in transformers.
processor = WhisperProcessor.from_pretrained(MODEL_DIR, token=auth_token)
# Move the weights to DEVICE and switch to evaluation mode for inference.
model = (
    WhisperForConditionalGeneration.from_pretrained(MODEL_DIR, token=auth_token)
    .to(DEVICE)
    .eval()
)
| # ---- Resample function ---- | |
def resample_to_16k(audio, sr):
    """Return ``(audio, sample_rate)`` with the audio at 16 kHz.

    Audio whose rate is already 16000 is handed back untouched together with
    the rate it came with. Otherwise the samples are converted to a float32
    tensor, resampled with torchaudio, and returned as a NumPy array along
    with the new rate of 16000.
    """
    target_sr = 16000
    if sr != target_sr:
        # Imported lazily so torchaudio is only needed when resampling happens.
        import torchaudio

        # Add a leading axis so the tensor is 1 x T before resampling.
        batched = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
        resampled = torchaudio.functional.resample(batched, sr, target_sr)
        return resampled.squeeze(0).cpu().numpy(), target_sr
    return audio, sr
| # ---- Transcription function ---- | |
def transcribe(path, language="Sindhi"):
    """Transcribe an audio file with the loaded Whisper model.

    Args:
        path: Filesystem path of the audio file to read. May be empty/None
            when the form is submitted without any audio.
        language: Language name used to build the forced decoder prompt.
            Defaults to "Sindhi".

    Returns:
        The decoded transcription with surrounding whitespace stripped.

    Raises:
        gr.Error: If no audio path was supplied.
    """
    # Fail with a readable message instead of an opaque error from sf.read.
    if not path:
        raise gr.Error("Please upload or record an audio clip first.")

    audio, sr = sf.read(path, always_2d=False)
    # Cast once to float32 (no copy when it already is). After this cast the
    # dtype is always float32, so no separate integer-rescaling step is needed.
    audio = audio.astype(np.float32, copy=False)
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)  # average the channels down to mono

    audio, sr = resample_to_16k(audio, sr)

    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)

    # Force the selected language for the transcription task.
    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")
    with torch.no_grad():
        pred_ids = model.generate(
            input_features,
            max_length=225,
            forced_decoder_ids=forced_ids,
        )

    # A single clip is processed, so take the first decoded sequence.
    text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
    return text.strip()
| # ---- Gradio Interface ---- | |
# Input widgets: an audio source (file upload or microphone) configured to
# hand the callback a file path, and a dropdown for the language.
audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath")
language_input = gr.Dropdown(
    ["Sindhi", "Urdu", "English"],
    value="Sindhi",
    label="Select Language",
)

# Editable text area that receives the transcription.
transcript_output = gr.Textbox(
    lines=8,
    max_lines=40,
    interactive=True,
    label="Transcription",
)

# Wire the widgets to `transcribe` and start the app.
demo = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, language_input],
    outputs=transcript_output,
    title="Multilingual Speech-to-Text",
    description="Upload or record speech and transcribe into Sindhi, Urdu, or English using a fine-tuned Whisper model.",
)
demo.launch()