"""Gradio demo: detect objects in an image with DETR and speak the result.

The script loads the ``facebook/detr-resnet-50`` checkpoint once at import
time, exposes ``detect_objects`` as the Gradio callback, and launches the UI.
"""

import os
import tempfile

import gradio as gr
import pyttsx3
import torch
from PIL import Image, ImageDraw
from transformers import DetrForObjectDetection, DetrImageProcessor

MODEL_NAME = "facebook/detr-resnet-50"
# Detections scoring below this value are dropped during post-processing.
CONFIDENCE_THRESHOLD = 0.9

# Load model and processor
model = DetrForObjectDetection.from_pretrained(MODEL_NAME)
processor = DetrImageProcessor.from_pretrained(MODEL_NAME)
labels = model.config.id2label


def speak_text_to_file(text):
    """Synthesize *text* to a new temporary audio file and return its path.

    Args:
        text: The sentence to synthesize.

    Returns:
        Path of the file that pyttsx3 was asked to write. The file is created
        with ``delete=False`` and is not removed by this function.
    """
    # Reserve a unique path and close our own handle before pyttsx3 writes to
    # it, so the engine never competes with an open descriptor.
    # NOTE(review): the suffix is ".mp3", but the container pyttsx3 actually
    # writes may depend on the platform driver -- confirm before relying on it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        path = f.name

    engine = pyttsx3.init()
    engine.save_to_file(text, path)
    engine.runAndWait()
    return path


def detect_objects(image):
    """Run DETR on *image*, draw the detections, and produce a spoken summary.

    Args:
        image: A PIL image supplied by Gradio, or ``None`` when the input
            component is empty.

    Returns:
        A ``(annotated_image, summary_text, audio_path)`` tuple. When *image*
        is ``None`` the result is ``(None, "No image provided.", None)`` and
        no audio is generated.
    """
    # With live=True the callback can fire while the input is empty.
    if image is None:
        return None, "No image provided.", None

    # Work on an RGB copy: the caller's image is left untouched, and
    # grayscale / RGBA inputs are normalised before preprocessing and drawing.
    image = image.convert("RGB")

    inputs = processor(images=image, return_tensors="pt")
    # Inference only -- skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); post-processing expects (height, width).
    target_size = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_size, threshold=CONFIDENCE_THRESHOLD
    )[0]

    draw = ImageDraw.Draw(image)
    object_details = []
    for score, label, box in zip(
        results["scores"], results["labels"], results["boxes"]
    ):
        box = [round(coord, 2) for coord in box.tolist()]
        label_name = labels[label.item()]
        confidence = round(score.item(), 2)

        draw.rectangle(box, outline="red", width=2)
        # Clamp so the caption of a box touching the top edge stays visible.
        draw.text(
            (box[0], max(box[1] - 10, 0)),
            f"{label_name} ({confidence})",
            fill="red",
        )
        object_details.append(f"{label_name} with confidence {confidence}")

    if object_details:
        summary_text = "Detected: " + ", ".join(object_details)
    else:
        summary_text = "No objects detected with high confidence."

    audio_path = speak_text_to_file(summary_text)
    return image, summary_text, audio_path


# NOTE(review): `source="webcam"` is the Gradio 3.x spelling; newer releases
# appear to expect `sources=[...]` -- confirm against the installed version.
demo = gr.Interface(
    fn=detect_objects,
    inputs=gr.Image(type="pil", source="webcam", label="Capture or Upload Image"),
    outputs=[
        gr.Image(type="pil", label="Detected Image"),
        gr.Textbox(label="Detected Objects"),
        gr.Audio(label="Spoken Summary"),
    ],
    title="What’s This? – Real-Time Object Detector",
    description="Take a picture or upload one to detect and hear object names.",
    live=True,
)
demo.launch()