Spaces:

kj03
/

whats-this-object-detector

Sleeping

File size: 3,923 Bytes

5ffa482
64a7f54
 
826741b
94bed02
cf95b7a
826741b
94bed02
826741b
94bed02
64a7f54
 
 
 
94bed02
cf95b7a
 
 
826741b
 
64a7f54
cf95b7a
 
 
 
 
 
826741b
cf95b7a
 
 
 
 
 
 
 
64a7f54
cf95b7a
 
 
 
 
 
64a7f54
 
826741b
cf95b7a
 
826741b
 
cf95b7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94bed02
64a7f54
cf95b7a
 
64a7f54
 
 
 
cf95b7a
 
 
 
 
94bed02
cf95b7a
 
826741b
 
 
cf95b7a
826741b
 
cf95b7a
826741b
 
cf95b7a
2faa641
826741b

import gradio as gr
import torch
from transformers import pipeline, VitsModel, VitsTokenizer
from PIL import Image
import numpy as np
import time

# Load the object-detection pipeline from the "facebook/detr-resnet-50"
# checkpoint. Calling it on an image yields a list of detections; the rest of
# this file only reads each detection's 'label' entry.
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Initialize the translation pipeline from the NLLB-200 distilled 600M
# checkpoint, loaded in bfloat16. Source and target languages are not fixed
# here; they are supplied on every call.
translator = pipeline(task="translation",
                      model="facebook/nllb-200-distilled-600M",
                      torch_dtype=torch.bfloat16)

# Preload the text-to-speech model and its matching tokenizer once, at import
# time, so every request reuses the same loaded objects.
tts_model = VitsModel.from_pretrained("facebook/mms-tts-ben")
tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-ben")

def translate_label(label_en):
    """Return the Bengali translation of an English label.

    The module-level NLLB ``translator`` pipeline is asked to translate from
    ``eng_Latn`` to ``ben_Beng``. This is best-effort: if anything goes wrong
    the error is printed and the untranslated English label is returned.
    """
    try:
        outputs = translator(label_en, src_lang="eng_Latn", tgt_lang="ben_Beng")
        bangla = outputs[0]['translation_text']
    except Exception as err:
        print(f"Translation error: {err}")
        # Keep the caller working by handing back the English text unchanged.
        return label_en
    else:
        return bangla

def format_bangla_list(items):
    """Join items into one natural-sounding Bangla enumeration.

    An empty input gives ``""``; a single item is returned as-is; two items
    are joined with 'এবং'; three or more are comma-separated with 'এবং'
    placed before the final item.
    """
    if not items:
        return ""

    joiner = " এবং "
    count = len(items)

    if count == 1:
        return items[0]

    if count == 2:
        first, second = items
        return f"{first}{joiner}{second}"

    # Three or more: commas between everything except the last element.
    leading = ", ".join(items[:-1])
    return leading + joiner + items[-1]

def recognize_object(image: "Image.Image"):
    """Detect all objects in an image, translate their labels, and speak them.

    Args:
        image: PIL image coming from the Gradio image input, or ``None`` when
            the button was pressed without supplying a picture.

    Returns:
        A ``(details, audio)`` tuple. ``details`` is the text shown in the
        output textbox. ``audio`` is ``(sampling_rate, waveform)`` with a
        float32 NumPy waveform, or ``None`` when there is nothing to speak
        (no image supplied, or no objects detected).
    """
    # The image input yields None when it is empty; the detector cannot
    # process that, so answer with a prompt instead of raising.
    if image is None:
        return "অনুগ্রহ করে একটি ছবি দিন", None

    start_time = time.time()

    # Detect objects
    detections = obj_detector(image)
    print(f"Detection time: {time.time() - start_time:.2f}s")

    if not detections:
        return "কোনো বস্তু শনাক্ত হয়নি", None

    # Extract labels (one per detection, duplicates included)
    labels_en = [d['label'] for d in detections]
    print("Detected objects:", labels_en)

    # Translate each distinct label only once: the same class is often
    # detected several times and every translation is a model call.
    translations = {}
    for label in labels_en:
        if label not in translations:
            translations[label] = translate_label(label)
    bangla_labels = [translations[label] for label in labels_en]
    print("Translated labels:", bangla_labels)

    # Format the list naturally in Bangla
    formatted_text = format_bangla_list(bangla_labels)

    # Add prefix based on number of objects
    num_objects = len(bangla_labels)
    if num_objects == 1:
        prefix = "এখানে একটি বস্তু আছে: "
    else:
        prefix = f"এখানে {num_objects}টি বস্তু আছে: "

    full_text = prefix + formatted_text
    print("Full Bangla text:", full_text)

    # Generate speech; no_grad because this is inference only.
    tts_start = time.time()
    inputs = tts_tokenizer(text=full_text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    # Drop singleton dimensions and convert to a float32 NumPy array, paired
    # with the model's sampling rate for the audio output component.
    audio_data = (tts_model.config.sampling_rate,
                  output.squeeze().numpy().astype(np.float32))
    print(f"TTS time: {time.time() - tts_start:.2f}s")

    # Create detailed output text: one "english → bangla" line per detection
    details = "\n".join(f"{en} → {bn}" for en, bn in zip(labels_en, bangla_labels))
    details = f"শনাক্তকৃত বস্তু ({num_objects}টি):\n" + details

    print(f"Total processing time: {time.time() - start_time:.2f}s")
    return details, audio_data

with gr.Blocks(title="এটা কী?") as demo:
    gr.Markdown("## 🧠 এটা কী? (Bangla Object Identifier)")
    gr.Markdown("ছবি দিন বা তুলুন—সমস্ত বস্তুর বাংলা নাম ও অডিও শুনুন।")

    img_input = gr.Image(label="ছবি দিন বা তুলুন", type="pil", sources=["upload","webcam"])
    label_out = gr.Textbox(label="শনাক্তকৃত বস্তু", lines=5)
    audio_out = gr.Audio(label="বাংলা অডিও", interactive=False)

    btn = gr.Button("বস্তু চিনুন")
    btn.click(recognize_object, inputs=img_input, outputs=[label_out, audio_out])

demo.launch()