"""Gradio demo app for the TorongoXetu Assamese speech-recognition model.

At import time the module downloads the model checkpoint from the Hugging
Face Hub and loads it. Any failure during that step is stored in
``init_error`` so the web UI can still start and report the problem through
``transcribe`` instead of crashing.
"""

import os

import gradio as gr
import torch  # not referenced directly below; kept from the original file
from huggingface_hub import hf_hub_download
from torongoxetu import TorongoModel

HF_MODEL_REPO = "ananddey/torongoXetu-asr"
MODEL_FILENAME = "torongoXetu-asr.nemo"
CACHE_DIR = "/app/model_cache"

# Populated by the bootstrap below: exactly one of these ends up non-None.
model = None
init_error = None

try:
    # Creating the cache directory is part of initialization: if the path is
    # not writable, record the error like any other init failure rather than
    # letting the import crash before the UI can come up.
    os.makedirs(CACHE_DIR, exist_ok=True)

    print("⬇️ Downloading model from Hugging Face...")
    model_path = hf_hub_download(
        repo_id=HF_MODEL_REPO,
        filename=MODEL_FILENAME,
        cache_dir=CACHE_DIR,
    )
    print(f"✅ Model downloaded to: {model_path}")

    model = TorongoModel(model_path)
    print("✅ TorongoXetu model loaded successfully")
except Exception as e:
    # Top-level boundary: keep the app alive and surface the message in the UI.
    init_error = str(e)
    print(f"❌ Model initialization failed: {init_error}")


def transcribe(audio):
    """Transcribe one audio input and return the text shown in the UI.

    Args:
        audio: Value delivered by the ``gr.Audio(type="filepath")`` input,
            or ``None`` when nothing was uploaded or recorded.

    Returns:
        The transcription produced by ``model.transcribe``; or a
        human-readable message when the model failed to load, no audio was
        given, the result was empty, or transcription raised an exception.
    """
    if model is None:
        return f"Model not loaded: {init_error}"

    if audio is None:
        return "Please upload or record an audio file."

    try:
        text = model.transcribe(audio)
        return text if text else "No transcription generated."
    except Exception as e:
        # Errors are returned as text so the output box always shows something.
        return f"Error during transcription: {e}"


# Sample clips offered in the UI; paths are relative to the working directory.
EXAMPLES = [
    [os.path.join("test-audio", "test.wav")],
    [os.path.join("test-audio", "test-2.wav")],
    [os.path.join("test-audio", "test-3.wav")],
    [os.path.join("test-audio", "test-4.wav")],
    [os.path.join("test-audio", "test-5.wav")],
]

demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Assamese Audio"),
    outputs=gr.Textbox(label="Transcription (Assamese)", lines=4),
    title="🎙️ TorongoXetu – Assamese ASR",
    description=(
        "Automatic Speech Recognition for Assamese using the "
        "TorongoXetu model built with NVIDIA NeMo.\n\n"
        "Upload a WAV file or record audio to get instant transcription."
    ),
    examples=EXAMPLES,
    allow_flagging="never",
    api_name=False,
)

if __name__ == "__main__":
    # Bind to all interfaces on port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )