import gradio as gr
from transformers import pipeline
import torch
import spaces 

# --- 1. Setup and Global Definitions ---
# Prefer the first CUDA device when PyTorch reports one; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(f"Using device: {device}")

# UI display name -> model identifier handed to `pipeline(model=...)`.
# Insertion order matters: it is the order in which the choices are listed.
MODELS = {
    "Whisper Small": "AmirMohseni/whisper-small-persian-bf16",
    "Whisper Large v3": "AmirMohseni/whisper-large-v3-persian-bf16",
}

# Cache of already-built pipelines, keyed by model identifier (not display name).
model_pipelines = {}

# --- 2. Model Loading Function ---
def load_model(model_name):
    """Return the speech-recognition pipeline for ``model_name``.

    The pipeline is built on first request and stored in ``model_pipelines``
    under its model identifier, so later calls reuse the same object.

    Args:
        model_name: Display name of the model; must be a key of ``MODELS``.

    Returns:
        The cached ``automatic-speech-recognition`` pipeline.

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS``.
    """
    repo_id = MODELS[model_name]

    # Fast path: this model has already been built.
    if repo_id in model_pipelines:
        return model_pipelines[repo_id]

    print(f"Loading model: {model_name}...")
    model_pipelines[repo_id] = pipeline(
        "automatic-speech-recognition",
        model=repo_id,
        torch_dtype="auto",
        device=device,
    )
    print(f"Model '{model_name}' loaded successfully.")
    return model_pipelines[repo_id]

# --- 3. Main Transcription Function ---
@spaces.GPU(duration=90)  # NOTE(review): presumably a ZeroGPU allocation of up to 90 s - confirm
def transcribe(audio, model_name):
    """Transcribe a recorded audio clip to Persian text.

    Args:
        audio: Path to the recorded audio file, or ``None`` when nothing was
            recorded.
        model_name: Display name of the model to use; must be a key of
            ``MODELS``.

    Returns:
        The transcribed text, or an empty string when ``audio`` is ``None``
        (a warning is shown to the user in that case).

    Raises:
        KeyError: If ``model_name`` is not a key of ``MODELS`` (raised by
            ``load_model``).
    """
    if audio is None:
        gr.Warning("No audio recorded. Please record your voice first.")
        return ""

    selected_pipe = load_model(model_name)
    print(f"Transcribing with '{model_name}'...")

    # Whisper consumes at most 30 s of audio per forward pass. Without
    # chunking, a longer recording is either cut off or rejected (depending
    # on the transformers version). Processing in 30 s windows lets the
    # pipeline transcribe the whole recording; short clips are unaffected.
    result = selected_pipe(
        audio,
        chunk_length_s=30,
        generate_kwargs={"language": "persian", "task": "transcribe"},
    )

    transcription = result["text"]
    print(f"Transcription result: {transcription}")
    return transcription

# --- 4. Pre-load the Default Model ---
# Build the default model at start-up so the first request does not pay the
# loading cost; the returned pipeline stays in the cache for later calls.
print("Pre-loading the default model ('Whisper Large v3')...")
load_model(model_name="Whisper Large v3")
print("Default model pre-loaded. The interface is ready.")

# --- 5. Gradio Interface Definition ---
# Microphone recorder; hands the recording to `transcribe` as a file path.
audio_input = gr.Audio(
    sources=["microphone"],
    type="filepath",
    label="Record Audio 🎤",
)

# Model picker; the choices are the display names defined in MODELS.
model_selector = gr.Radio(
    choices=list(MODELS.keys()),
    value="Whisper Large v3",
    label="Choose Model",
    info="The 'Large' model is more accurate but slower. The 'Small' model is faster.",
)

# Text area that shows the transcription returned by `transcribe`.
transcription_output = gr.Textbox(label="Transcription", lines=5)

iface = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, model_selector],
    outputs=transcription_output,
    title="Whisper Farsi 🎙️",
    description="Real-time Persian speech recognition. Choose a model, press 'Record Audio', and start speaking.",
    allow_flagging="never",
)

# --- 6. Launch the Application ---
iface.launch()