Spaces:

kj03
/

whats-this-object-detector

Sleeping

App Files Files Community

whats-this-object-detector / app.py

kj03

Update app.py

cf95b7a verified about 1 year ago

Raw

History Blame Contribute Delete

3.92 kB

	import gradio as gr
	import torch
	from transformers import pipeline, VitsModel, VitsTokenizer
	from PIL import Image
	import numpy as np
	import time

	# Load object detection pipeline from the "facebook/detr-resnet-50" checkpoint.
	# Created once at module level; recognize_object() reuses it for every request.
	obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

	# Initialize translation pipeline from the "facebook/nllb-200-distilled-600M"
	# checkpoint, requesting bfloat16 weights. No language pair is fixed here:
	# translate_label() passes src_lang/tgt_lang on each call.
	translator = pipeline(task="translation",
	model="facebook/nllb-200-distilled-600M",
	torch_dtype=torch.bfloat16)

	# Preload TTS components: the VITS model and its tokenizer are loaded from the
	# same "facebook/mms-tts-ben" checkpoint so they stay matched.
	# NOTE(review): presumably the Bengali MMS voice — confirm against the model card.
	tts_model = VitsModel.from_pretrained("facebook/mms-tts-ben")
	tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-ben")

	def translate_label(label_en):
	    """Return the Bengali rendering of an English label.

	    The text is passed to the module-level ``translator`` pipeline with
	    ``eng_Latn`` as the source language and ``ben_Beng`` as the target.
	    If anything goes wrong, the error is printed and the English input is
	    returned unchanged.
	    """
	    try:
	        outputs = translator(label_en, src_lang="eng_Latn", tgt_lang="ben_Beng")
	        bangla = outputs[0]['translation_text']
	    except Exception as err:
	        print(f"Translation error: {err}")
	        # Best-effort fallback: showing the English label beats showing nothing.
	        bangla = label_en
	    return bangla

	def format_bangla_list(items):
	    """Join items into a natural-sounding Bangla enumeration.

	    An empty input gives an empty string, a single item is returned as is,
	    two items are joined with 'এবং', and longer lists are comma separated
	    with 'এবং' placed before the final item.
	    """
	    if not items:
	        return ""

	    # Separate the final item from everything that precedes it.
	    *leading, final = items

	    if not leading:
	        return final
	    if len(leading) == 1:
	        return f"{leading[0]} এবং {final}"

	    # Three or more items: commas between the leading ones, 'এবং' before the last.
	    return ", ".join(leading) + " এবং " + final

	def recognize_object(image: "Image.Image"):
	    """Detect objects in an image, name them in Bangla, and speak the result.

	    Args:
	        image: PIL image coming from the Gradio image input, or ``None`` when
	            the button is pressed before any picture has been supplied.

	    Returns:
	        A ``(details, audio)`` tuple. ``details`` is the text for the output
	        box: a count header followed by one ``English → Bangla`` line per
	        detection. ``audio`` is ``(sampling_rate, float32 waveform)`` built
	        from the TTS model output. When there is no image, or the detector
	        finds nothing, the tuple is the "no object detected" message and
	        ``None`` for the audio.
	    """
	    no_object_result = ("কোনো বস্তু শনাক্ত হয়নি", None)

	    # Gradio hands over None when the image component is empty; passing that
	    # to the detector would raise instead of showing a message to the user.
	    if image is None:
	        return no_object_result

	    start_time = time.time()

	    # Detect objects
	    detections = obj_detector(image)
	    print(f"Detection time: {time.time() - start_time:.2f}s")

	    if not detections:
	        return no_object_result

	    # Extract labels, one per detection (duplicates are kept on purpose so
	    # the spoken count matches the number of detected objects).
	    labels_en = [d['label'] for d in detections]
	    print("Detected objects:", labels_en)

	    # Translate each distinct label only once: a scene with several objects
	    # of the same class would otherwise run the translation model repeatedly
	    # for identical input. dict.fromkeys keeps first-seen order.
	    translations = {
	        label: translate_label(label) for label in dict.fromkeys(labels_en)
	    }
	    bangla_labels = [translations[label] for label in labels_en]
	    print("Translated labels:", bangla_labels)

	    # Format the list naturally in Bangla
	    formatted_text = format_bangla_list(bangla_labels)

	    # Add prefix based on number of objects
	    num_objects = len(bangla_labels)
	    if num_objects == 1:
	        prefix = "এখানে একটি বস্তু আছে: "
	    else:
	        prefix = f"এখানে {num_objects}টি বস্তু আছে: "

	    full_text = prefix + formatted_text
	    print("Full Bangla text:", full_text)

	    # Generate speech; no_grad avoids tracking gradients during inference.
	    tts_start = time.time()
	    inputs = tts_tokenizer(text=full_text, return_tensors="pt")
	    with torch.no_grad():
	        output = tts_model(**inputs).waveform
	    audio_data = (
	        tts_model.config.sampling_rate,
	        output.squeeze().numpy().astype(np.float32),
	    )
	    print(f"TTS time: {time.time() - tts_start:.2f}s")

	    # Create detailed output text
	    details = "\n".join(f"{en} → {bn}" for en, bn in zip(labels_en, bangla_labels))
	    details = f"শনাক্তকৃত বস্তু ({num_objects}টি):\n" + details

	    print(f"Total processing time: {time.time() - start_time:.2f}s")
	    return details, audio_data

	# Assemble the Gradio page: an image input, a text box and an audio player
	# for the results, and a button that runs recognize_object on the image.
	with gr.Blocks(title="এটা কী?") as demo:
	    # Page heading followed by a one-line usage hint.
	    gr.Markdown(value="## 🧠 এটা কী? (Bangla Object Identifier)")
	    gr.Markdown(value="ছবি দিন বা তুলুন—সমস্ত বস্তুর বাংলা নাম ও অডিও শুনুন।")

	    # Input: the picture is delivered to the callback as a PIL image.
	    img_input = gr.Image(
	        label="ছবি দিন বা তুলুন",
	        type="pil",
	        sources=["upload", "webcam"],
	    )

	    # Outputs: label listing and the synthesized audio (playback only).
	    label_out = gr.Textbox(label="শনাক্তকৃত বস্তু", lines=5)
	    audio_out = gr.Audio(label="বাংলা অডিও", interactive=False)

	    # The callback's two return values fill the text box and the audio player.
	    btn = gr.Button(value="বস্তু চিনুন")
	    btn.click(
	        fn=recognize_object,
	        inputs=img_input,
	        outputs=[label_out, audio_out],
	    )

	demo.launch()