import asyncio
import os
import shutil
import subprocess
import tempfile
import uuid

import edge_tts
import gradio as gr
import nest_asyncio
from deep_translator import GoogleTranslator
from faster_whisper import WhisperModel

# Patch asyncio at import time so an event loop can be driven re-entrantly.
# run_tts() below calls loop.run_until_complete() while serving a Gradio
# callback and presumably depends on this patch having been applied first.
nest_asyncio.apply()

# Load the Whisper model once at import time; transcribe() reuses this single
# module-level instance for every request instead of reloading it per call.
# "small"        = good balance between speed and accuracy on CPU
# device="cpu"   = run inference on the CPU
# "int8"         = quantized for lower memory usage
model = WhisperModel("small", device="cpu", compute_type="int8")

# Target languages offered in the UI dropdown.
# Each row is (display name, translation code, Edge TTS voice name); the
# resulting dict maps "Display Name" -> ("translation_code", "voice_name")
# and preserves the row order below.
languages = {
    display_name: (translation_code, voice_name)
    for display_name, translation_code, voice_name in (
        ("English", "en", "en-US-EricNeural"),
        ("Spanish", "es", "es-ES-AlvaroNeural"),
        ("French", "fr", "fr-FR-HenriNeural"),
        ("German", "de", "de-DE-ConradNeural"),
        ("Italian", "it", "it-IT-DiegoNeural"),
        ("Russian", "ru", "ru-RU-DmitryNeural"),
    )
}


def transcribe(audio):
    """
    Transcribe an audio file to plain text using the module-level
    faster-whisper model.

    Args:
        audio: the audio input handed to ``model.transcribe`` (process()
            passes the path of a WAV file).

    Returns:
        A single string with the text of every segment joined by one space.
        Returns "" when no segment contains any text.
    """
    segments, _ = model.transcribe(audio)
    # Segment text may carry its own leading/trailing whitespace, so strip
    # each piece before joining; otherwise segment boundaries end up with
    # doubled spaces in the text sent on for translation.
    pieces = (segment.text.strip() for segment in segments)
    return " ".join(piece for piece in pieces if piece)


async def tts_async(text, voice, out):
    """
    Coroutine that synthesizes `text` with the Microsoft Edge TTS voice
    `voice` and writes the resulting audio to the file path `out`.
    """
    await edge_tts.Communicate(text, voice).save(out)


def run_tts(text, voice, out):
    """
    Blocking front-end for tts_async().

    Fetches the event loop via asyncio.get_event_loop() and drives the TTS
    coroutine on it until the audio has been saved to `out`, so synchronous
    callers (such as the Gradio callback) can use Edge TTS without awaiting.
    """
    asyncio.get_event_loop().run_until_complete(tts_async(text, voice, out))


def _replace_audio(video_path, audio_path, output_path):
    """
    Mux `audio_path` over `video_path` into `output_path` using ffmpeg.

    -c:v copy   = keep the video stream unchanged
    -c:a aac    = encode the new audio as AAC
    -map 0:v:0  = take video from the first input
    -map 1:a:0  = take audio from the second input

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    # Argument list (no shell) so paths are never re-parsed by a shell.
    subprocess.run(
        [
            "ffmpeg", "-y",
            "-i", video_path,
            "-i", audio_path,
            "-c:v", "copy",
            "-c:a", "aac",
            "-map", "0:v:0",
            "-map", "1:a:0",
            output_path,
        ],
        check=True,
    )


def process(video, language, use_lipsync):
    """
    Main video dubbing pipeline:
    Step 1 - Resize: scale video to 480p for faster processing
    Step 2 - Extract audio: pull mono 16kHz WAV from video (Whisper format)
    Step 3 - Transcribe: convert audio to text using Whisper
    Step 4 - Translate: translate text to target language using Google Translate
    Step 5 - TTS: generate new speech audio using Edge TTS, then convert it
             to a real WAV file
    Step 6 - Combine:
        - If lip sync enabled: run Wav2Lip to animate mouth movements
          - If Wav2Lip fails: fallback to simple audio replacement
        - If lip sync disabled: directly replace audio track with TTS audio

    Args:
        video: path of the uploaded video file (gr.Video passes the path as a
            string; it is None/empty when nothing was uploaded).
        language: display-name key into the module-level `languages` table.
        use_lipsync: truthy to attempt Wav2Lip, falsy for audio replacement.

    Returns: (output_video_path, status_message). output_video_path is None
    whenever the job did not produce a video. Failures inside the pipeline
    are caught and reported through status_message rather than raised.
    """
    # Fail fast on bad input before creating files or running any tool.
    if not video:
        return None, "❌ Please upload a video first."
    if language not in languages:
        return None, f"❌ Unsupported language: {language}"
    lang, voice = languages[language]

    work_dir = None
    keep_output = False
    try:
        # Isolated, guaranteed-unique work directory for this job, so
        # concurrent jobs can never share or overwrite each other's files.
        work_dir = tempfile.mkdtemp(prefix="dub_")

        # Copy uploaded video into our work directory
        input_video = os.path.join(work_dir, "input.mp4")
        shutil.copy(video, input_video)

        # -------------------------------------------------------------------
        # Step 1: Resize video to 480p
        # -vf scale=-2:480 keeps aspect ratio, height = 480px
        # Smaller resolution = faster Wav2Lip processing
        # -------------------------------------------------------------------
        resized = os.path.join(work_dir, "video.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_video, "-vf", "scale=-2:480", resized],
            check=True,
        )

        # -------------------------------------------------------------------
        # Step 2: Extract audio track from resized video
        # -vn = no video, -ac 1 = mono, -ar 16000 = 16kHz sample rate
        # -------------------------------------------------------------------
        audio = os.path.join(work_dir, "audio.wav")
        subprocess.run(
            ["ffmpeg", "-y", "-i", resized, "-vn", "-ac", "1", "-ar", "16000", audio],
            check=True,
        )

        # -------------------------------------------------------------------
        # Step 3: Transcribe audio to text using Whisper
        # -------------------------------------------------------------------
        text = transcribe(audio)
        if not text:
            return None, "❌ Transcription failed or audio is silent."

        # -------------------------------------------------------------------
        # Step 4: Translate transcribed text to the target language
        # source="auto" = the translator auto-detects the original language
        # -------------------------------------------------------------------
        translated = GoogleTranslator(source="auto", target=lang).translate(text)
        if not translated:
            return None, "❌ Translation failed."

        # -------------------------------------------------------------------
        # Step 5: Generate TTS speech from translated text
        # Edge TTS writes MP3 data, so save it under an honest .mp3 name and
        # convert it to a real WAV; tools that trust the .wav extension
        # (Wav2Lip) then receive the format they expect.
        # -------------------------------------------------------------------
        tts_raw = os.path.join(work_dir, "tts.mp3")
        run_tts(translated, voice, tts_raw)
        speech = os.path.join(work_dir, "tts.wav")
        subprocess.run(["ffmpeg", "-y", "-i", tts_raw, speech], check=True)

        # Output file path for final video
        output = os.path.join(work_dir, "lipsync.mp4")

        if use_lipsync:
            # ---------------------------------------------------------------
            # Step 6a: Lip sync mode — run Wav2Lip to animate mouth movements
            # No check=True here: a failure is handled by the fallback below.
            # ---------------------------------------------------------------
            result = subprocess.run(
                [
                    "python", "Wav2Lip/inference.py",
                    "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
                    "--face", resized,       # input face video
                    "--audio", speech,       # new TTS audio
                    "--outfile", output,     # output lip-synced video
                ],
                capture_output=True,
                text=True,
            )
            if result.returncode == 0:
                status_message = "✅ Done with lip sync!"
            else:
                # Wav2Lip failed for any reason: log it and fall back to a
                # simple audio swap so the user still gets a dubbed video.
                print(f"WAV2LIP STDERR: {result.stderr}")
                print(f"WAV2LIP STDOUT: {result.stdout}")
                _replace_audio(resized, speech, output)
                status_message = (
                    f"⚠️ Wav2Lip failed, used audio replacement instead.\n{result.stderr}"
                )
        else:
            # ---------------------------------------------------------------
            # Step 6b: No lip sync — just replace the audio track
            # ---------------------------------------------------------------
            _replace_audio(resized, speech, output)
            status_message = "✅ Done! (audio replacement, no lip sync)"

        keep_output = True
        return output, status_message

    except Exception as e:
        # Top-level boundary for the UI callback: report any failure as the
        # status message instead of raising.
        return None, f"❌ Error: {str(e)}"

    finally:
        if work_dir is not None:
            if keep_output:
                # The caller still needs the final video; drop only the
                # intermediates so finished jobs do not pile up on disk.
                for leftover in ("input.mp4", "video.mp4", "audio.wav", "tts.mp3", "tts.wav"):
                    try:
                        os.remove(os.path.join(work_dir, leftover))
                    except OSError:
                        # Best-effort cleanup; a missing file is not an error.
                        pass
            else:
                # Nothing usable was produced: remove the whole job directory.
                shutil.rmtree(work_dir, ignore_errors=True)


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
# Two-column layout: inputs (video, language, lip-sync toggle, button) in the
# first column, results (video player, status box) in the second. Components
# are placed in the order they are created inside each context manager.
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 AI Video Dubbing + Lip Sync")
    with gr.Row():
        with gr.Column():
            # Video upload widget; its value is passed to process() as `video`
            video = gr.Video(label="Upload Video")

            # Target language selector — choices are the display names
            # (keys) of the `languages` table, defaulting to Spanish
            lang = gr.Dropdown(
                list(languages.keys()),
                value="Spanish",
                label="Target Language",
            )

            # Toggle to enable/disable Wav2Lip lip sync
            # Disabled by default (value=False) — faster, works on all videos
            # Enable only if video has close-up face shots
            use_lipsync = gr.Checkbox(
                label="Enable Lip Sync (Wav2Lip)",
                value=False,
                info="Enable if video has close-up face. Slower processing.",
            )

            # Submit button that triggers the dubbing pipeline
            run = gr.Button("▶ Process", variant="primary")

        with gr.Column():
            # Output video player — receives the first element returned by
            # process() (the output video path, or None on failure)
            out = gr.Video(label="Result")

            # Status/error message box — receives the second element
            # returned by process()
            status = gr.Textbox(label="Status", lines=3)

    # Wire up the button click to the process function. The input order
    # matches process(video, language, use_lipsync) and the output order
    # matches its (output_video_path, status_message) return value.
    run.click(process, inputs=[video, lang, use_lipsync], outputs=[out, status])

# Set up the queue, then start the server. Both calls run at module top
# level, so the app launches as soon as this file is executed or imported.
demo.queue()
demo.launch()