# # app.py
# import os
# import io
# import time
# import numpy as np
# import torch
# import soundfile as sf
# import gradio as gr

# from transformers import WhisperProcessor, WhisperForConditionalGeneration

# # ---------------- SETTINGS ----------------
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# MODEL_ID = "heyIamUmair/whisper-base-sindhi2"  # change to your HF repo or local path

# # If model is private, provide HF token via the "HF_TOKEN" secret in the Space settings.
# HF_TOKEN = os.environ.get("HF_TOKEN", None)
# use_auth_token = HF_TOKEN if HF_TOKEN is not None else None

# # ---------------- LOAD MODEL ----------------
# # We load processor + model once at startup
# processor = WhisperProcessor.from_pretrained(MODEL_ID, use_auth_token=use_auth_token)
# model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, use_auth_token=use_auth_token).to(DEVICE).eval()

# # NOTE: set config-level forced_decoder_ids later per request (works for transformers modern versions)
# # -------------------------------------------------
# # Utility: resample (tries torchaudio then scipy fallback)
# def resample_to_16k(audio: np.ndarray, sr: int):
#     if sr == 16000:
#         return audio, sr
#     try:
#         import torchaudio
#         wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)  # (1, T)
#         wav = torchaudio.functional.resample(wav, sr, 16000)
#         return wav.squeeze(0).cpu().numpy(), 16000
#     except Exception:
#         # fallback: scipy.signal.resample_poly
#         from math import gcd
#         g = gcd(sr, 16000)
#         up, down = 16000 // g, sr // g
#         try:
#             from scipy.signal import resample_poly
#             return resample_poly(audio, up, down), 16000
#         except Exception:
#             raise RuntimeError("Install torchaudio or scipy to resample audio to 16kHz.")

# # ---------------- TRANSCRIPTION (single clip) ----------------
# def transcribe_ndarray(audio: np.ndarray, sr: int, language: str = "Sindhi"):
#     """
#     audio: numpy array (mono or stereo)
#     sr: original sampling rate
#     language: "Sindhi" or "English" (case-insensitive)
#     returns: transcription string
#     """
#     # Ensure mono (average channels if needed)
#     if audio.ndim > 1:
#         audio = np.mean(audio, axis=1)

#     # Ensure float32 [-1, 1]. If integer type, scale to [-1,1]
#     if audio.dtype != np.float32:
#         if np.issubdtype(audio.dtype, np.integer):
#             maxv = np.iinfo(audio.dtype).max
#             audio = (audio.astype(np.float32) / maxv).astype(np.float32)
#         else:
#             audio = audio.astype(np.float32)

#     # Resample to 16 kHz if needed
#     audio, sr = resample_to_16k(audio, sr)

#     # Prepare model input
#     inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
#     input_features = inputs.input_features.to(DEVICE)

#     # Set forced language tokens on the model config (works across transformers versions)
#     # Accept both "Sindhi"/"sindhi" or "English"/"english"
#     lang = language.strip().lower()
#     if lang.startswith("s"):
#         lang_code = "Sindhi"
#     else:
#         lang_code = "English"

#     try:
#         forced = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
#         model.config.forced_decoder_ids = forced
#     except Exception:
#         # If get_decoder_prompt_ids isn't available, ignore (older/newer versions)
#         pass

#     # Generate transcription (do NOT pass forced_decoder_ids here to be compatible with newer transformers)
#     with torch.no_grad():
#         pred_ids = model.generate(input_features, max_length=225)

#     # Decode
#     text = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0].strip()
#     return text

# # ---------------- GRADIO INTERFACE HANDLERS ----------------
# def transcribe_gradio(audio, language):
#     """
#     audio: numpy array (from gradio), or filepath (when using "filepath" type)
#     language: dropdown string
#     """
#     # Gradio can give: None, a dict, a numpy array, or a filepath depending on configuration.
#     if audio is None:
#         return "No audio provided."

#     # If Gradio gives a tuple/filepath: it often returns (sr, data) for type="numpy"
#     # If type="filepath", we will read file path.
#     # We'll accept both types.

#     # If audio is a filepath (string):
#     if isinstance(audio, str):
#         # load file
#         wave, sr = sf.read(audio, always_2d=False)
#         # convert channel->mono later in transcribe_ndarray
#         wave = wave.astype(np.float32)
#     else:
#         # If it's numpy array or tuple (sr, np.ndarray)
#         # Some Gradio configs pass a tuple (sr, np.ndarray) — detect:
#         if isinstance(audio, tuple) and len(audio) == 2:
#             sr, wave = audio
#         else:
#             # Often when using type="numpy" we get a 2D array (samples, channels)
#             wave = np.array(audio)
#             sr = 16000  # Gradio microphone usually records at 16000; if not, resampling will handle it
#         wave = wave.astype(np.float32)

#     # Call transcription
#     try:
#         txt = transcribe_ndarray(wave, sr, language=language)
#     except Exception as e:
#         return f"Transcription failed: {e}"

#     return txt

# # ---------------- BUILD GRADIO UI ----------------
# title = "Sindhi / English Live Speech-to-Text"
# desc = "Record using the microphone or upload audio. Select language and press Submit. Works with a fine-tuned Whisper model."

# with gr.Blocks() as demo:
#     gr.Markdown(f"## {title}")
#     gr.Markdown(desc)

#     with gr.Row():
#         with gr.Column(scale=2):
#             # mic = gr.Audio(source="microphone", type="filepath", label="Record voice (or upload file)")
#             mic = gr.Audio(
#             sources=["microphone", "upload"],  # 👈 use list instead of "source"
#             type="filepath", 
#             label="🎙️ Record or Upload Audio"
#             )
#             lang = gr.Dropdown(choices=["Sindhi", "English"], value="Sindhi", label="Language")
#             btn = gr.Button("Transcribe")
#         with gr.Column(scale=3):
#             out = gr.Textbox(label="Transcription", lines=10, interactive=True)

#     def on_click_transcribe(audio, language):
#         # audio will be a filepath if type="filepath"
#         return transcribe_gradio(audio, language)

#     btn.click(on_click_transcribe, inputs=[mic, lang], outputs=out)

# demo.launch(server_name="0.0.0.0", server_port=7860)

# 2nd file
# import os
# import time
# import numpy as np
# import torch
# import soundfile as sf
# import gradio as gr

# from transformers import WhisperProcessor, WhisperForConditionalGeneration

# # ---------------- SETTINGS ----------------
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# MODEL_ID = "heyIamUmair/whisper-base-sindhi2"  # your HF repo path

# HF_TOKEN = os.environ.get("HF_TOKEN", None)
# use_auth_token = HF_TOKEN if HF_TOKEN is not None else None

# # ---------------- LOAD MODEL ----------------
# processor = WhisperProcessor.from_pretrained(MODEL_ID, use_auth_token=use_auth_token)
# model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID, use_auth_token=use_auth_token).to(DEVICE).eval()

# # ---------------- RESAMPLER ----------------
# def resample_to_16k(audio: np.ndarray, sr: int):
#     if sr == 16000:
#         return audio, sr
#     try:
#         import torchaudio
#         wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)  # (1, T)
#         wav = torchaudio.functional.resample(wav, sr, 16000)
#         return wav.squeeze(0).cpu().numpy(), 16000
#     except Exception:
#         from scipy.signal import resample_poly
#         from math import gcd
#         g = gcd(sr, 16000)
#         up, down = 16000 // g, sr // g
#         return resample_poly(audio, up, down), 16000

# # ---------------- SINGLE TRANSCRIPTION ----------------
# def transcribe_ndarray(audio: np.ndarray, sr: int, language: str = "Sindhi"):
#     if audio.ndim > 1:
#         audio = np.mean(audio, axis=1)  # mono
#     if audio.dtype != np.float32:
#         if np.issubdtype(audio.dtype, np.integer):
#             maxv = np.iinfo(audio.dtype).max
#             audio = (audio.astype(np.float32) / maxv)
#         else:
#             audio = audio.astype(np.float32)

#     audio, sr = resample_to_16k(audio, sr)
#     inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
#     input_features = inputs.input_features.to(DEVICE)

#     lang = language.strip().lower()
#     if lang.startswith("s"):
#         lang_code = "Sindhi"
#     else:
#         lang_code = "English"

#     try:
#         forced = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
#         model.config.forced_decoder_ids = forced
#     except Exception:
#         pass

#     with torch.no_grad():
#         pred_ids = model.generate(input_features, max_length=225)

#     text = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0].strip()
#     return text

# # ---------------- STREAMING TRANSCRIPTION ----------------
# def stream_transcription(audio, language):
#     if audio is None:
#         yield "No audio provided."
#         return

#     # If audio is filepath (since type="filepath")
#     if isinstance(audio, str):
#         wav, sr = sf.read(audio)
#     else:
#         yield "Invalid audio input."
#         return

#     if wav.ndim > 1:
#         wav = wav.mean(axis=1)
#     wav = wav.astype(np.float32)

#     chunk_size = sr * 2  # 2 sec chunks
#     buffer = np.zeros(0, dtype=np.float32)
#     output_text = ""

#     for start in range(0, len(wav), chunk_size):
#         chunk = wav[start:start+chunk_size]
#         buffer = np.concatenate([buffer, chunk])

#         # Keep last 5s
#         max_len = sr * 5
#         if len(buffer) > max_len:
#             buffer = buffer[-max_len:]

#         # Transcribe current buffer
#         pred = transcribe_ndarray(buffer, sr, language)
#         output_text = pred

#         yield output_text   # 👈 live update in Gradio
#         time.sleep(0.5)     # simulate delay

# # ---------------- BUILD GRADIO UI ----------------
# title = "Sindhi / English Real-time Speech-to-Text"
# desc = "🎙️ Speak or upload audio. Select language. Transcription updates as audio is processed."

# with gr.Blocks() as demo:
#     gr.Markdown(f"## {title}")
#     gr.Markdown(desc)

#     with gr.Row():
#         with gr.Column(scale=2):
#             mic = gr.Audio(
#                 sources=["microphone", "upload"], 
#                 type="filepath", 
#                 label="🎤 Record or Upload Audio"
#             )
#             lang = gr.Dropdown(choices=["Sindhi", "English"], value="Sindhi", label="Language")
#             btn = gr.Button("Start Transcription")
#         with gr.Column(scale=3):
#             out = gr.Textbox(label="Live Transcription", lines=10, interactive=False)

#     btn.click(stream_transcription, inputs=[mic, lang], outputs=out)

# demo.launch(server_name="0.0.0.0", server_port=7860)
# app.py
import os
import io
import gradio as gr
import time
import numpy as np
import torch
import soundfile as sf
import gradio as gr

from transformers import WhisperProcessor, WhisperForConditionalGeneration

# ---------------- SETTINGS ----------------
# Run inference on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
MODEL_ID = "heyIamUmair/whisper-base-sindhi2"  # your HF repo path

# Optional Hugging Face access token taken from the environment (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
use_auth_token = HF_TOKEN
# NOTE(review): newer transformers releases reportedly prefer `token=` over
# `use_auth_token=` -- confirm against the pinned transformers version.

# ---------------- LOAD MODEL ----------------
# The processor and model are loaded once at import time; the transcription
# functions below read them as module-level globals.
processor = WhisperProcessor.from_pretrained(
    MODEL_ID,
    use_auth_token=use_auth_token,
)
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_ID,
    use_auth_token=use_auth_token,
)
model = model.to(DEVICE)
model.eval()  # switch the module to evaluation mode

# ---------------- RESAMPLER ----------------
def resample_to_16k(audio: np.ndarray, sr: int):
    """Resample a waveform to 16 kHz.

    torchaudio is tried first; if it is missing or fails for any reason the
    function falls back to ``scipy.signal.resample_poly``.

    Args:
        audio: Waveform samples. The callers in this file pass a 1-D (mono)
            array.
        sr: Sampling rate of ``audio`` in Hz.

    Returns:
        A ``(samples, 16000)`` tuple. When ``sr`` is already 16000 the input
        array is returned untouched; otherwise ``samples`` is a float32 array.

    Raises:
        ValueError: If ``sr`` is not positive.
    """
    target_sr = 16000
    if sr == target_sr:
        return audio, sr
    if sr <= 0:
        # Fail early with a clear message instead of an opaque backend error.
        raise ValueError(f"Sampling rate must be positive, got {sr}.")

    samples = np.asarray(audio)
    if samples.size == 0:
        # Nothing to resample; skip the backends entirely.
        return samples.astype(np.float32), target_sr

    try:
        import torchaudio

        wav = torch.tensor(samples, dtype=torch.float32).unsqueeze(0)  # (1, T)
        wav = torchaudio.functional.resample(wav, sr, target_sr)
        return wav.squeeze(0).cpu().numpy(), target_sr
    except Exception:
        # Deliberate best-effort: any torchaudio problem (not installed,
        # runtime failure) routes to the SciPy polyphase resampler instead.
        from math import gcd

        from scipy.signal import resample_poly

        common = gcd(sr, target_sr)
        up, down = target_sr // common, sr // common
        resampled = resample_poly(samples, up, down)
        # Pin the dtype so both backends hand back float32.
        return resampled.astype(np.float32, copy=False), target_sr

# ---------------- SINGLE TRANSCRIPTION ----------------
def _to_mono_float32(audio: np.ndarray) -> np.ndarray:
    """Return ``audio`` as a mono float32 waveform, scaling integer PCM by its dtype maximum."""
    if np.issubdtype(audio.dtype, np.integer):
        # Scale integer samples BEFORE averaging channels: np.mean promotes
        # integer input to float64, which would hide the integer dtype and
        # leave multi-channel PCM unscaled.
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    if audio.ndim > 1:
        audio = np.mean(audio, axis=1)  # average over axis 1 (channels) -> mono
    if audio.dtype != np.float32:
        audio = audio.astype(np.float32)
    return audio


def transcribe_ndarray(audio: np.ndarray, sr: int, language: str = "Sindhi"):
    """Transcribe one audio clip with the module-level Whisper model.

    Args:
        audio: Waveform samples, 1-D or 2-D. A 2-D array is averaged over
            axis 1, i.e. it is treated as ``(samples, channels)``. Integer
            dtypes are divided by the dtype's maximum value.
        sr: Sampling rate of ``audio`` in Hz; the clip is resampled to 16 kHz.
        language: Language name. Values starting with "s" (case-insensitive)
            select "Sindhi"; anything else selects "English". ``None`` or an
            empty string falls back to "Sindhi", the parameter default.

    Returns:
        The decoded text of the first generated sequence, stripped of
        surrounding whitespace and special tokens.
    """
    import warnings

    audio = _to_mono_float32(audio)
    audio, sr = resample_to_16k(audio, sr)

    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)

    lang = (language or "Sindhi").strip().lower()
    lang_code = "Sindhi" if lang.startswith("s") else "English"

    try:
        forced = processor.get_decoder_prompt_ids(language=lang_code, task="transcribe")
        # The prompt is stored on the shared model config, not passed to generate().
        model.config.forced_decoder_ids = forced
    except Exception as exc:
        # Still best-effort (generation proceeds), but no longer silent:
        # without the prompt the requested language is not forced.
        warnings.warn(
            f"Could not set the decoder prompt for {lang_code!r}: {exc}",
            RuntimeWarning,
        )

    with torch.no_grad():
        pred_ids = model.generate(input_features, max_length=225)

    text = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)[0].strip()
    return text

# ---------------- STREAMING TRANSCRIPTION ----------------
def stream_transcription(audio, language):
    """Transcribe a recording in 2-second steps, yielding text after each step.

    Args:
        audio: ``None``, a path to an audio file (Gradio ``type="filepath"``),
            or a ``(sample_rate, samples)`` tuple (Gradio ``type="numpy"``).
        language: Language name forwarded to ``transcribe_ndarray``.

    Yields:
        "No audio provided." when ``audio`` is ``None``, "Invalid audio input."
        for unsupported input types, otherwise the transcription of the most
        recent window (at most 5 seconds) after each 2-second step. Empty
        audio yields nothing.
    """
    if audio is None:
        yield "No audio provided."
        return

    if isinstance(audio, str):
        wav, sr = sf.read(audio)
    elif isinstance(audio, tuple) and len(audio) == 2:
        # A gr.Audio component without type="filepath" delivers (sr, samples).
        sr, wav = audio
        wav = np.asarray(wav)
    else:
        yield "Invalid audio input."
        return

    if np.issubdtype(wav.dtype, np.integer):
        # Integer PCM must be scaled before the channel average below,
        # because mean() promotes integers to float64 and loses the dtype.
        wav = wav.astype(np.float32) / np.iinfo(wav.dtype).max
    if wav.ndim > 1:
        wav = wav.mean(axis=1)
    wav = wav.astype(np.float32)

    chunk_size = sr * 2  # advance 2 seconds per step
    max_len = sr * 5     # transcribe at most the trailing 5 seconds
    total = len(wav)

    # NOTE(review): every yield replaces the previous text and only the
    # trailing window is transcribed, so for recordings longer than 5 seconds
    # the final output covers just the last window -- confirm this is intended.
    for start in range(0, total, chunk_size):
        end = min(start + chunk_size, total)
        # Equivalent to appending the chunk and trimming to the last max_len
        # samples, without re-copying the buffer on every step.
        window = wav[max(0, end - max_len):end]

        yield transcribe_ndarray(window, sr, language)  # live update in Gradio
        time.sleep(0.5)  # simulate delay

# ---------------- BUILD GRADIO UI ----------------
title = "Sindhi / English Real-time Speech-to-Text"
desc = "🎙️ Speak or upload audio. Select language. Transcription updates as audio is processed."

# Layout: audio input, language selector and button on the left; the
# transcription textbox on the right.
with gr.Blocks() as demo:
    gr.Markdown(f"## {title}")
    gr.Markdown(desc)

    with gr.Row():
        with gr.Column(scale=2):
            # type="filepath" is required: stream_transcription only accepts a
            # file path, and gr.Audio would otherwise hand it a
            # (sample_rate, samples) tuple.
            # NOTE(review): with streaming=True Gradio normally delivers audio
            # through the component's stream event -- confirm that a button
            # click receives the finished recording.
            mic = gr.Audio(
                sources=["microphone"],
                type="filepath",
                streaming=True,
            )
            lang = gr.Dropdown(choices=["Sindhi", "English"], value="Sindhi", label="Language")
            btn = gr.Button("Start Transcription")
        with gr.Column(scale=3):
            out = gr.Textbox(label="Live Transcription", lines=10, interactive=False)

    # stream_transcription is a generator; each yielded string replaces the
    # textbox content.
    btn.click(stream_transcription, inputs=[mic, lang], outputs=out)

demo.launch(server_name="0.0.0.0", server_port=7860)