Spaces:

heyIamUmair
/

Sindhi-Speech-to-Text

Sleeping

App Files Files Community

Sindhi-Speech-to-Text / app.py

heyIamUmair

multilingual

d821926 verified 9 months ago

Raw

History Blame Contribute Delete

4.55 kB

	# import gradio as gr
	# import torch, numpy as np, soundfile as sf
	# from transformers import WhisperProcessor, WhisperForConditionalGeneration
	# import os

	# auth_token = os.environ.get("HF_TOKEN") # gets secret token

	# # ---- Settings ----
	# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
	# MODEL_DIR = "heyIamUmair/whisper-base-sindhi1" # HF repo name

	# # ---- Load model & processor (with token if private) ----
	# processor = WhisperProcessor.from_pretrained(MODEL_DIR, use_auth_token=auth_token)
	# model = WhisperForConditionalGeneration.from_pretrained(MODEL_DIR, use_auth_token=auth_token).to(DEVICE).eval()
	# model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="Sindhi", task="transcribe")

	# # ---- Resample function ----
	# def resample_to_16k(audio, sr):
	# if sr == 16000:
	# return audio, sr
	# import torchaudio
	# wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0) # 1 x T
	# wav = torchaudio.functional.resample(wav, sr, 16000)
	# return wav.squeeze(0).cpu().numpy(), 16000

	# # ---- Transcription function ----
	# def transcribe(path):
	# audio, sr = sf.read(path, always_2d=False)
	# audio = audio.astype(np.float32) if audio.dtype != np.float32 else audio
	# if audio.ndim > 1:
	# audio = np.mean(audio, axis=1) # mono
	# if audio.dtype != np.float32:
	# maxv = np.iinfo(audio.dtype).max
	# audio = (audio / maxv).astype(np.float32)

	# audio, sr = resample_to_16k(audio, sr)
	# inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
	# input_features = inputs.input_features.to(DEVICE)

	# with torch.no_grad():
	# pred_ids = model.generate(input_features, max_length=225)

	# text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
	# return text.strip()

	# # ---- Gradio Interface ----
	# demo = gr.Interface(
	# fn=transcribe,
	# inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"),
	# outputs=gr.Textbox(lines=5, max_lines=20, interactive=True, label="Transcription"),
	# title="Sindhi Speech-to-Text",
	# description="Upload or record speech and get Sindhi transcription using Whisper fine-tuned model."
	# )

	# demo.launch()
	import gradio as gr
	import torch, numpy as np, soundfile as sf
	from transformers import WhisperProcessor, WhisperForConditionalGeneration
	import os

	# Hugging Face access token read from the HF_TOKEN environment variable;
	# this is None when the variable is not set.
	auth_token = os.environ.get("HF_TOKEN")

	# ---- Settings ----
	# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
	DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
	MODEL_DIR = "heyIamUmair/whisper-base-sindhi1"  # HF repo name

	# ---- Load model & processor (with token if private) ----
	# `token` is the current keyword for passing hub credentials to
	# from_pretrained; it replaces the deprecated `use_auth_token`.
	processor = WhisperProcessor.from_pretrained(MODEL_DIR, token=auth_token)
	model = WhisperForConditionalGeneration.from_pretrained(MODEL_DIR, token=auth_token)
	# Move the weights to the selected device and switch to inference mode.
	model = model.to(DEVICE).eval()

	# ---- Resample function ----
	def resample_to_16k(audio, sr):
	    """Return ``(samples, sample_rate)`` with the audio at 16000 Hz.

	    When ``sr`` is already 16000 the input is handed back unchanged.
	    Otherwise the samples are converted to a float32 tensor, resampled
	    with ``torchaudio.functional.resample`` and returned as a NumPy array
	    together with the new rate.
	    """
	    target_rate = 16000
	    if sr != target_rate:
	        # Imported lazily so torchaudio is only needed when resampling.
	        import torchaudio

	        # Add a leading dimension for the resampler (1 x T) ...
	        batched = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
	        resampled = torchaudio.functional.resample(batched, sr, target_rate)
	        # ... and drop it again before handing back a NumPy array.
	        return resampled.squeeze(0).cpu().numpy(), target_rate
	    return audio, sr

	# ---- Transcription function ----
	def transcribe(path, language):
	    """Transcribe an audio file with the loaded Whisper model.

	    Args:
	        path: Path of the audio file to read. A falsy value (``None`` or
	            ``""``) produces an empty transcription instead of an error
	            from the audio reader.
	        language: Language name passed to
	            ``processor.get_decoder_prompt_ids`` to force the decoder
	            prompt for the "transcribe" task.

	    Returns:
	        The decoded text with surrounding whitespace stripped, or ``""``
	        when no audio path was supplied.
	    """
	    # No recording was provided, so there is nothing to read.
	    if not path:
	        return ""

	    audio, sr = sf.read(path, always_2d=False)
	    # Work in float32 from here on. A former integer-PCM rescaling branch
	    # that followed this cast was removed: it tested for a non-float32
	    # dtype after the data had already been cast, so it could never run.
	    audio = audio.astype(np.float32, copy=False)
	    if audio.ndim > 1:
	        audio = np.mean(audio, axis=1)  # mono

	    audio, sr = resample_to_16k(audio, sr)
	    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
	    input_features = inputs.input_features.to(DEVICE)

	    # Force selected language
	    forced_ids = processor.get_decoder_prompt_ids(language=language, task="transcribe")

	    with torch.no_grad():
	        pred_ids = model.generate(
	            input_features,
	            max_length=225,
	            forced_decoder_ids=forced_ids,
	        )

	    # A single clip was submitted, so take the first decoded sequence.
	    text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
	    return text.strip()

	# ---- Gradio Interface ----
	# Input widgets: the recording (uploaded or from the microphone, handed to
	# the callback as a file path) and the language to transcribe into.
	audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath")
	language_input = gr.Dropdown(
	    ["Sindhi", "Urdu", "English"],
	    value="Sindhi",
	    label="Select Language",
	)

	# Output widget: an editable text box holding the transcription.
	transcript_output = gr.Textbox(
	    lines=8,
	    max_lines=40,
	    interactive=True,
	    label="Transcription",
	)

	# Wire the widgets to the transcription callback.
	demo = gr.Interface(
	    fn=transcribe,
	    inputs=[audio_input, language_input],
	    outputs=transcript_output,
	    title="Multilingual Speech-to-Text",
	    description="Upload or record speech and transcribe into Sindhi, Urdu, or English using a fine-tuned Whisper model.",
	)

	demo.launch()