import gradio as gr
import torch
from transformers import pipeline, VitsModel, VitsTokenizer
from PIL import Image
import numpy as np
import time

# Object detection pipeline backed by the facebook/detr-resnet-50 checkpoint.
obj_detector = pipeline(task="object-detection", model="facebook/detr-resnet-50")

# Translation pipeline (NLLB-200 distilled 600M), loaded with bfloat16 weights.
translator = pipeline(
    task="translation",
    model="facebook/nllb-200-distilled-600M",
    torch_dtype=torch.bfloat16,
)

# Text-to-speech: the model and its tokenizer are loaded once at import time
# from the same checkpoint so each request only pays for inference.
_TTS_CHECKPOINT = "facebook/mms-tts-ben"
tts_model = VitsModel.from_pretrained(_TTS_CHECKPOINT)
tts_tokenizer = VitsTokenizer.from_pretrained(_TTS_CHECKPOINT)

def translate_label(label_en):
    """Return the Bengali translation of an English label.

    The text is sent through the module-level ``translator`` pipeline with
    ``eng_Latn`` as source and ``ben_Beng`` as target language.

    Args:
        label_en: English text to translate.

    Returns:
        The ``translation_text`` of the first pipeline result, or
        ``label_en`` unchanged when anything goes wrong (the error is
        printed, never raised).
    """
    try:
        outputs = translator(
            label_en,
            src_lang="eng_Latn",
            tgt_lang="ben_Beng",
        )
        translated = outputs[0]["translation_text"]
    except Exception as err:
        # Best-effort: showing the English label beats failing the request.
        print(f"Translation error: {err}")
        translated = label_en
    return translated

def format_bangla_list(items):
    """Join items into a natural-sounding Bangla enumeration.

    Args:
        items: Sequence of already-translated strings (may be empty or None).

    Returns:
        ``""`` for no items, the single item itself for one item, and
        otherwise the items separated by commas with 'এবং' placed before
        the final one.
    """
    if not items:
        return ""

    # Split off the last element; everything before it is comma-joined.
    *leading, final = items
    if not leading:
        return final
    if len(leading) == 1:
        return f"{leading[0]} এবং {final}"
    return ", ".join(leading) + " এবং " + final

def recognize_object(image: "Image.Image | None"):
    """Detect all objects, translate labels, and generate combined speech.

    Args:
        image: PIL image coming from the Gradio image input, or ``None``
            when the button is pressed before an image was provided.

    Returns:
        A ``(text, audio)`` tuple for the textbox and audio outputs:

        * no image: a Bengali prompt asking for an image, and ``None``;
        * no detections: a Bengali "nothing detected" message, and ``None``;
        * otherwise: one ``english → bangla`` line per detection under a
          count header, and ``(sampling_rate, float32 waveform)`` with the
          spoken Bengali summary.
    """
    # Without this guard a click on an empty input would hand None to the
    # detector and surface as an error in the UI.
    if image is None:
        return "অনুগ্রহ করে একটি ছবি দিন", None

    start_time = time.time()

    # Detect objects
    detections = obj_detector(image)
    print(f"Detection time: {time.time() - start_time:.2f}s")

    if not detections:
        return "কোনো বস্তু শনাক্ত হয়নি", None

    # One English label per detection (duplicates are kept on purpose so the
    # reported count matches the number of detections).
    labels_en = [d['label'] for d in detections]
    print("Detected objects:", labels_en)

    # Translate each distinct label only once: the same class is frequently
    # detected several times and every translation is a full model call.
    # dict.fromkeys keeps first-seen order.
    translations = {
        label: translate_label(label) for label in dict.fromkeys(labels_en)
    }
    bangla_labels = [translations[label] for label in labels_en]
    print("Translated labels:", bangla_labels)

    # Format the list naturally in Bangla
    formatted_text = format_bangla_list(bangla_labels)

    # Add prefix based on number of objects
    # NOTE(review): the count is inserted as ASCII digits; confirm the TTS
    # tokenizer's vocabulary covers them, otherwise the number may be
    # dropped from the spoken output.
    num_objects = len(bangla_labels)
    if num_objects == 1:
        prefix = "এখানে একটি বস্তু আছে: "
    else:
        prefix = f"এখানে {num_objects}টি বস্তু আছে: "

    full_text = prefix + formatted_text
    print("Full Bangla text:", full_text)

    # Generate speech (inference only, so gradients are disabled)
    tts_start = time.time()
    inputs = tts_tokenizer(text=full_text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    # The audio output receives a (sample_rate, samples) tuple.
    audio_data = (tts_model.config.sampling_rate,
                  output.squeeze().numpy().astype(np.float32))
    print(f"TTS time: {time.time() - tts_start:.2f}s")

    # Create detailed output text: one "english → bangla" line per detection
    details = "\n".join(f"{en} → {bn}" for en, bn in zip(labels_en, bangla_labels))
    details = f"শনাক্তকৃত বস্তু ({num_objects}টি):\n" + details

    print(f"Total processing time: {time.time() - start_time:.2f}s")
    return details, audio_data

with gr.Blocks(title="এটা কী?") as demo:
    # Page heading followed by a one-line usage hint.
    gr.Markdown("## 🧠 এটা কী? (Bangla Object Identifier)")
    gr.Markdown("ছবি দিন বা তুলুন—সমস্ত বস্তুর বাংলা নাম ও অডিও শুনুন।")

    # Input image (upload or webcam), delivered to the callback as a PIL image.
    img_input = gr.Image(
        label="ছবি দিন বা তুলুন",
        type="pil",
        sources=["upload", "webcam"],
    )

    # Outputs: per-object text listing and the synthesized audio.
    label_out = gr.Textbox(
        label="শনাক্তকৃত বস্তু",
        lines=5,
    )
    audio_out = gr.Audio(
        label="বাংলা অডিও",
        interactive=False,
    )

    # The button runs the whole detect → translate → speak flow.
    btn = gr.Button(value="বস্তু চিনুন")
    btn.click(
        fn=recognize_object,
        inputs=img_input,
        outputs=[label_out, audio_out],
    )

demo.launch()