Spaces:
Sleeping
Sleeping
| from transformers import DetrImageProcessor, DetrForObjectDetection | |
| from PIL import Image, ImageDraw | |
| import torch | |
| import gradio as gr | |
| import pyttsx3 | |
| import tempfile | |
| import os | |
# Load the pretrained DETR model and its matching image processor once, at
# import time, so every detection request reuses the same objects.
# The checkpoint id is kept in one place so the model and the processor can
# never drift apart.
MODEL_CHECKPOINT = "facebook/detr-resnet-50"
model = DetrForObjectDetection.from_pretrained(MODEL_CHECKPOINT)
processor = DetrImageProcessor.from_pretrained(MODEL_CHECKPOINT)
# Mapping from the integer class id predicted by the model to its label name.
labels = model.config.id2label
def speak_text_to_file(text):
    """Synthesize *text* to speech and return the path of the audio file.

    Args:
        text: The sentence to speak.

    Returns:
        Path of a newly created temporary file containing the spoken text.
        The file is created with ``delete=False``, so it is NOT removed
        automatically; the caller owns it.
    """
    # Reserve a unique file name, then leave the ``with`` block so the handle
    # is closed *before* the speech engine writes to the same path. Writing
    # while our own handle is still open can fail on platforms that lock
    # open files.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        path = tmp.name

    # NOTE(review): the container format actually written depends on the
    # pyttsx3 driver in use; the ".mp3" suffix is kept from the original --
    # confirm it matches what the deployed driver produces.
    engine = pyttsx3.init()
    engine.save_to_file(text, path)
    # save_to_file only queues the command; runAndWait processes the queue.
    engine.runAndWait()
    return path
def detect_objects(image):
    """Detect objects in *image*, draw labelled boxes, and narrate the result.

    Args:
        image: A PIL image (the Gradio input is declared with ``type="pil"``),
            or ``None`` when no image is available.

    Returns:
        A ``(annotated_image, summary_text, audio_path)`` tuple:
        a copy of the input with red boxes and labels drawn on it, a
        human-readable summary of the detections, and the path of an audio
        file speaking that summary. When *image* is ``None`` the result is
        ``(None, "No image provided.", None)`` and no inference or speech
        synthesis is performed.
    """
    # Guard: with a live interface the callback may presumably fire with no
    # image (e.g. after the input is cleared); bail out instead of crashing
    # inside the processor.
    if image is None:
        return None, "No image provided.", None

    # convert() always returns a new image, so the caller's image is never
    # drawn on, and non-RGB inputs (RGBA, greyscale, ...) are normalised.
    annotated = image.convert("RGB")

    inputs = processor(images=annotated, return_tensors="pt")
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL's size is (width, height); reversed here to (height, width) for
    # rescaling the predicted boxes to the image's pixel coordinates.
    target_size = torch.tensor([annotated.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_size, threshold=0.9
    )[0]

    draw = ImageDraw.Draw(annotated)
    object_details = []
    for score, label, box in zip(
        results["scores"], results["labels"], results["boxes"]
    ):
        box = [round(coord, 2) for coord in box.tolist()]
        label_name = labels[label.item()]
        confidence = round(score.item(), 2)

        draw.rectangle(box, outline="red", width=2)
        # Put the caption just above the box, but never above the top edge
        # of the image where it would be invisible.
        caption_y = max(box[1] - 10, 0)
        draw.text(
            (box[0], caption_y), f"{label_name} ({confidence})", fill="red"
        )
        object_details.append(f"{label_name} with confidence {confidence}")

    if object_details:
        summary_text = "Detected: " + ", ".join(object_details)
    else:
        summary_text = "No objects detected with high confidence."

    # The summary is spoken in both cases, so synthesize once here.
    audio_path = speak_text_to_file(summary_text)
    return annotated, summary_text, audio_path
# Build the UI: one image in; annotated image, text summary and audio out.
# Bound to a module-level name so the interface can be imported or reused
# without starting a server.
demo = gr.Interface(
    fn=detect_objects,
    # NOTE(review): source="webcam" appears to restrict input to the webcam,
    # while the label and description also promise uploads; the keyword is
    # also version-dependent (newer Gradio releases use `sources=[...]`).
    # Confirm against the pinned Gradio version before changing it.
    inputs=gr.Image(type="pil", source="webcam", label="Capture or Upload Image"),
    outputs=[
        gr.Image(type="pil", label="Detected Image"),
        gr.Textbox(label="Detected Objects"),
        gr.Audio(label="Spoken Summary"),
    ],
    title="What’s This? – Real-Time Object Detector",
    description="Take a picture or upload one to detect and hear object names.",
    # live=True re-runs detect_objects whenever the input changes, without a
    # submit button.
    live=True,
)

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()