Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Running

App Files Files Community

Multimodal_Language_Learning_Aid / app.py

Yilin0601

Update app.py

178dac1 verified over 1 year ago

Raw

History Blame

8.33 kB

	import gradio as gr
	import torch
	import numpy as np
	import librosa
	import soundfile as sf
	import tempfile
	import os

	from transformers import (
	pipeline,
	VitsModel,
	AutoTokenizer
	)

	# For Coqui TTS
	try:
	from TTS.api import TTS as CoquiTTS
	except ImportError:
	raise ImportError("Please install Coqui TTS via `pip install TTS`.")

	# ------------------------------------------------------
	# 1. ASR Pipeline (English) using Wav2Vec2
	# ------------------------------------------------------
	# Speech-to-text stage: the recognizer is constructed once, at import time,
	# and is called by predict() whenever the user supplies audio instead of text.
	asr = pipeline(
	    task="automatic-speech-recognition",
	    model="facebook/wav2vec2-base-960h",
	)

	# ------------------------------------------------------
	# 2. Translation Models (3 languages)
	# ------------------------------------------------------
	# One row per target language: (translation checkpoint, pipeline task name).
	# Both public lookup tables below are derived from this single table so the
	# two can never drift apart.
	# NOTE(review): confirm that "Helsinki-NLP/opus-mt-en-ja" is actually
	# published on the Hub -- get_translator() only tries to load it on the first
	# Japanese request, so a wrong id would not show up at start-up.
	_TRANSLATION_SETUP = {
	    "Spanish": ("Helsinki-NLP/opus-mt-en-es", "translation_en_to_es"),
	    "Chinese": ("Helsinki-NLP/opus-mt-en-zh", "translation_en_to_zh"),
	    "Japanese": ("Helsinki-NLP/opus-mt-en-ja", "translation_en_to_ja"),
	}

	# Target language name -> checkpoint id.
	translation_models = {
	    language: checkpoint
	    for language, (checkpoint, _task) in _TRANSLATION_SETUP.items()
	}

	# Target language name -> pipeline task string.
	translation_tasks = {
	    language: task
	    for language, (_checkpoint, task) in _TRANSLATION_SETUP.items()
	}

	# ------------------------------------------------------
	# 3. TTS Config:
	# - Spanish: MMS TTS (facebook/mms-tts-spa)
	# - Chinese, Japanese: Coqui XTTS-v2 (tts_models/multilingual/multi-dataset/xtts_v2)
	# ------------------------------------------------------
	# Display names of the supported target languages. They double as the
	# dropdown choices in the UI and as keys into the lookup tables.
	SPANISH, CHINESE, JAPANESE = "Spanish", "Chinese", "Japanese"

	# Spanish speech is synthesized with the MMS checkpoint named here.
	mms_spanish_config = dict(
	    model_id="facebook/mms-tts-spa",
	    architecture="vits",
	)

	# Language codes handed to Coqui for the two languages it synthesizes.
	coqui_lang_map = {CHINESE: "zh", JAPANESE: "ja"}

	# ------------------------------------------------------
	# 4. Global Caches
	# ------------------------------------------------------
	# Lazily populated model caches; each loader fills its slot on first use.
	translator_cache = {}  # target language name -> translation pipeline
	spanish_vits_cache = coqui_tts_cache = None  # None until the model is loaded

	def get_translator(lang):
	    """
	    Fetch the translation pipeline for ``lang``, building it on first use.

	    ``lang`` must be a key of ``translation_models`` / ``translation_tasks``;
	    an unknown name raises ``KeyError``. Built pipelines are kept in
	    ``translator_cache`` so each language is loaded at most once.
	    """
	    try:
	        return translator_cache[lang]
	    except KeyError:
	        # Not built yet -- fall through and construct it.
	        pass

	    checkpoint = translation_models[lang]
	    task = translation_tasks[lang]
	    translator_cache[lang] = pipeline(task, model=checkpoint)
	    return translator_cache[lang]

	# ------------------------------------------------------
	# 5. Spanish TTS: MMS (VITS)
	# ------------------------------------------------------
	def load_spanish_vits():
	    """
	    Return the ``(model, tokenizer)`` pair for Spanish MMS TTS.

	    The pair is loaded from ``mms_spanish_config["model_id"]`` on the first
	    call and stored in ``spanish_vits_cache``; later calls return the stored
	    pair. Any loading failure is re-raised as ``RuntimeError``.
	    """
	    global spanish_vits_cache

	    if spanish_vits_cache is None:
	        try:
	            # Model first, then tokenizer; the cache is only assigned once
	            # both have loaded successfully.
	            spanish_vits_cache = (
	                VitsModel.from_pretrained(mms_spanish_config["model_id"]),
	                AutoTokenizer.from_pretrained(mms_spanish_config["model_id"]),
	            )
	        except Exception as e:
	            raise RuntimeError(f"Failed to load Spanish TTS model {mms_spanish_config['model_id']}: {e}")

	    return spanish_vits_cache

	def run_spanish_tts(text):
	    """
	    Synthesize Spanish speech for ``text`` with the MMS (VITS) model.

	    Returns:
	        ``(sample_rate, waveform)`` where ``waveform`` is a NumPy array
	        taken from the model's ``waveform`` output.

	    Raises:
	        RuntimeError: if the model cannot be loaded or its output has no
	            ``waveform`` attribute.
	    """
	    model, tokenizer = load_spanish_vits()
	    inputs = tokenizer(text, return_tensors="pt")

	    # Inference only: no gradients needed.
	    with torch.no_grad():
	        output = model(**inputs)

	    if not hasattr(output, "waveform"):
	        raise RuntimeError("Spanish TTS model output does not contain 'waveform'.")

	    waveform = output.waveform.squeeze().cpu().numpy()

	    # Report the rate the loaded checkpoint declares instead of a hard-coded
	    # constant, so playback stays correct if the checkpoint is ever swapped.
	    # Fall back to the previous value (16 kHz) when the config does not
	    # expose a sampling rate.
	    model_config = getattr(model, "config", None)
	    sample_rate = getattr(model_config, "sampling_rate", None) or 16000
	    return sample_rate, waveform

	# ------------------------------------------------------
	# 6. Chinese/Japanese TTS: Coqui XTTS-v2
	# ------------------------------------------------------
	def load_coqui_tts():
	    """
	    Return the shared Coqui XTTS-v2 instance, creating it on first use.

	    The instance is stored in ``coqui_tts_cache``. Any failure while
	    constructing it is re-raised as ``RuntimeError``.
	    """
	    global coqui_tts_cache

	    if coqui_tts_cache is None:
	        model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
	        try:
	            # gpu=False keeps the model on CPU (slower); switch to gpu=True
	            # when the Space has a GPU available.
	            coqui_tts_cache = CoquiTTS(model_name, gpu=False)
	        except Exception as e:
	            raise RuntimeError("Failed to load Coqui XTTS-v2 TTS: %s" % e)

	    return coqui_tts_cache

	def run_coqui_tts(text, lang, speaker="Ana Florence"):
	    """
	    Synthesize Chinese or Japanese speech for ``text`` with Coqui XTTS-v2.

	    Args:
	        text: Text to speak, already in the target language.
	        lang: Target language name; must be a key of ``coqui_lang_map``.
	        speaker: Name of the XTTS voice to use. The previous code passed
	            neither ``speaker`` nor ``speaker_wav``, which the Coqui API
	            rejects for a multi-speaker model, so a built-in voice is now
	            supplied by default.
	            NOTE(review): confirm this voice name exists in the installed
	            TTS release; pass another name to override.

	    Returns:
	        ``(sample_rate, waveform)`` as read back from the generated WAV file.

	    Raises:
	        KeyError: if ``lang`` is not in ``coqui_lang_map``.
	        RuntimeError: if the Coqui model cannot be loaded.
	    """
	    coqui_tts = load_coqui_tts()
	    lang_code = coqui_lang_map[lang]  # "zh" or "ja"

	    # tts_to_file() writes to a path, so reserve a temporary file name and
	    # close the handle before Coqui opens the path itself.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        tmp_name = tmp.name

	    try:
	        coqui_tts.tts_to_file(
	            text=text,
	            file_path=tmp_name,
	            speaker=speaker,
	            language=lang_code,
	        )
	        data, sr = sf.read(tmp_name)
	    finally:
	        # Always remove the temporary file, whether synthesis succeeded or not.
	        if os.path.exists(tmp_name):
	            os.remove(tmp_name)

	    return sr, data

	# ------------------------------------------------------
	# 7. Main Prediction Function
	# ------------------------------------------------------
	# Sampling rate the audio is converted to before it is passed to `asr`.
	_ASR_SAMPLE_RATE = 16000


	def _prepare_asr_audio(sample_rate, audio_data):
	    """Convert a recording to a mono float32 waveform at ``_ASR_SAMPLE_RATE``."""
	    samples = np.asarray(audio_data)

	    if np.issubdtype(samples.dtype, np.integer):
	        # Integer PCM: rescale into roughly [-1.0, 1.0] rather than just
	        # casting, so the values are a conventional float waveform.
	        full_scale = float(np.iinfo(samples.dtype).max)
	        samples = samples.astype(np.float32) / full_scale
	    else:
	        samples = samples.astype(np.float32)

	    # Collapse any (samples, channels) layout to mono, including the
	    # single-channel (N, 1) case that would otherwise stay 2-D.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=1)

	    if sample_rate != _ASR_SAMPLE_RATE and samples.size:
	        samples = librosa.resample(
	            samples, orig_sr=sample_rate, target_sr=_ASR_SAMPLE_RATE
	        )
	    return samples


	def predict(audio, text, target_language):
	    """
	    Run the full pipeline: English input -> translation -> synthesized speech.

	    1. Get English text: typed ``text`` wins; otherwise ``audio`` is
	       transcribed with the ASR pipeline.
	    2. Translate the English text into ``target_language``.
	    3. Synthesize the translation:
	       - Spanish -> MMS TTS (VITS)
	       - Chinese/Japanese -> Coqui XTTS-v2

	    Args:
	        audio: ``(sample_rate, numpy_array)`` from the audio input, or None.
	        text: English text from the textbox; may be empty or None.
	        target_language: One of the target language names.

	    Returns:
	        ``(english_text, translated_text, audio_out)`` where ``audio_out`` is
	        ``(sample_rate, waveform)`` on success and ``None`` otherwise.
	        Failures are reported through the second (text) slot, because the
	        third slot feeds an audio component and cannot display a message.
	    """
	    # Step 1: English text. Treat a missing textbox value like an empty one.
	    typed_text = (text or "").strip()
	    if typed_text:
	        english_text = typed_text
	    elif audio is not None:
	        sample_rate, audio_data = audio
	        samples = _prepare_asr_audio(sample_rate, audio_data)
	        if samples.size == 0:
	            # A zero-length recording carries nothing to transcribe.
	            return "No input provided.", "", None
	        asr_result = asr({"array": samples, "sampling_rate": _ASR_SAMPLE_RATE})
	        english_text = asr_result["text"]
	    else:
	        return "No input provided.", "", None

	    # Step 2: Translate. Loading the translator is inside the try block so a
	    # model-loading failure is reported the same way as a translation failure.
	    try:
	        translator = get_translator(target_language)
	        translation_result = translator(english_text)
	        translated_text = translation_result[0]["translation_text"]
	    except Exception as e:
	        return english_text, f"Translation error: {e}", None

	    # Step 3: TTS
	    try:
	        if target_language == SPANISH:
	            sr, waveform = run_spanish_tts(translated_text)
	        else:
	            # Chinese or Japanese
	            sr, waveform = run_coqui_tts(translated_text, target_language)
	    except Exception as e:
	        # Keep the translation visible and attach the error to it; the audio
	        # slot gets None rather than a message string.
	        return english_text, f"{translated_text}\n\n[TTS error: {e}]", None

	    return english_text, translated_text, (sr, waveform)

	# ------------------------------------------------------
	# 8. Gradio Interface
	# ------------------------------------------------------
	# Input widgets, in the order predict() receives them.
	_audio_input = gr.Audio(type="numpy", label="Record/Upload English Audio (optional)")
	_text_input = gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)")
	_language_input = gr.Dropdown(choices=[SPANISH, CHINESE, JAPANESE], value=SPANISH, label="Target Language")

	# Output widgets, in the order predict() returns its values.
	_transcript_output = gr.Textbox(label="English Transcription")
	_translation_output = gr.Textbox(label="Translation (Target Language)")
	_speech_output = gr.Audio(label="Synthesized Speech")

	# Help text shown under the title.
	_DESCRIPTION = (
	    "1. Transcribes English speech using Wav2Vec2 (or takes English text).\n"
	    "2. Translates to Spanish, Chinese, or Japanese (via Helsinki-NLP).\n"
	    "3. Synthesizes speech:\n"
	    " - Spanish -> facebook/mms-tts-spa (VITS)\n"
	    " - Chinese & Japanese -> Coqui XTTS-v2 (multilingual TTS)\n\n"
	    "Note: The Coqui model is 'tts_models/multilingual/multi-dataset/xtts_v2' and expects language codes.\n"
	    "If you need voice cloning, set `speaker_wav` in `tts_to_file()`. By default, it uses a single generic voice."
	)

	iface = gr.Interface(
	    fn=predict,
	    inputs=[_audio_input, _text_input, _language_input],
	    outputs=[_transcript_output, _translation_output, _speech_output],
	    title="Multimodal Language Learning Aid",
	    description=_DESCRIPTION,
	    allow_flagging="never",
	)

	if __name__ == "__main__":
	    # Listen on all interfaces at port 7860.
	    _launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    iface.launch(**_launch_options)