kj03's picture
Update app.py
ad03eea verified
Raw
History Blame
2.17 kB
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image, ImageDraw
import torch
import gradio as gr
import pyttsx3
import tempfile
import os
# Load the pretrained DETR detector and its matching image processor once, at
# import time, so every request reuses the same objects.
_DETR_CHECKPOINT = "facebook/detr-resnet-50"
model = DetrForObjectDetection.from_pretrained(_DETR_CHECKPOINT)
processor = DetrImageProcessor.from_pretrained(_DETR_CHECKPOINT)
# Lookup from predicted class id to label name, taken from the model config.
labels = model.config.id2label
def speak_text_to_file(text):
    """Synthesize ``text`` into a new temporary audio file and return its path.

    The file is created with an ``.mp3`` suffix and is not removed by this
    function; cleaning it up is left to the caller.
    """
    tts = pyttsx3.init()

    # Reserve a unique path on disk, then release our handle immediately so
    # the speech engine is the only writer of that file.
    handle, out_path = tempfile.mkstemp(suffix=".mp3")
    os.close(handle)

    tts.save_to_file(text, out_path)
    tts.runAndWait()
    return out_path
def detect_objects(image):
    """Detect objects in an image, draw them, and build a spoken summary.

    Args:
        image: A PIL image, or ``None`` when no image has been supplied.

    Returns:
        A ``(annotated_image, summary_text, audio_path)`` tuple. The annotated
        image is an RGB copy of the input with a red box and caption per
        detection. When ``image`` is ``None`` the result is
        ``(None, "No image provided.", None)`` and neither the model nor the
        speech engine is invoked.
    """
    # The UI may invoke this callback without an image (for example after the
    # input is cleared); return a placeholder instead of failing downstream.
    if image is None:
        return None, "No image provided.", None

    # Normalize to RGB so non-RGB inputs (RGBA, grayscale, palette) are
    # handled uniformly; convert() also returns a copy, so the caller's image
    # is not drawn on.
    image = image.convert("RGB")

    inputs = processor(images=image, return_tensors="pt")
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # image.size is (width, height); it is reversed to (height, width) here.
    target_sizes = torch.tensor([image.size[::-1]])
    results = processor.post_process_object_detection(
        outputs, target_sizes=target_sizes, threshold=0.9
    )[0]

    draw = ImageDraw.Draw(image)
    object_details = []
    for score, label, box in zip(
        results["scores"], results["labels"], results["boxes"]
    ):
        box = [round(coord, 2) for coord in box.tolist()]
        label_name = labels[label.item()]
        confidence = round(score.item(), 2)

        draw.rectangle(box, outline="red", width=2)
        # Place the caption just above the box, clamped so it stays on the
        # canvas when the box touches the top edge.
        caption_y = max(box[1] - 10, 0)
        draw.text(
            (box[0], caption_y), f"{label_name} ({confidence})", fill="red"
        )
        object_details.append(f"{label_name} with confidence {confidence}")

    if object_details:
        summary_text = "Detected: " + ", ".join(object_details)
    else:
        summary_text = "No objects detected with high confidence."

    # The summary is spoken in both cases, so synthesize it once here.
    audio_path = speak_text_to_file(summary_text)
    return image, summary_text, audio_path
# Build the UI: one image input wired to detect_objects, whose three return
# values map onto the three output components in order.
# NOTE(review): `source="webcam"` is version-dependent Gradio API -- confirm
# it matches the Gradio version this app is pinned to.
_image_input = gr.Image(type="pil", source="webcam", label="Capture or Upload Image")
_result_outputs = [
    gr.Image(type="pil", label="Detected Image"),
    gr.Textbox(label="Detected Objects"),
    gr.Audio(label="Spoken Summary"),
]

demo = gr.Interface(
    fn=detect_objects,
    inputs=_image_input,
    outputs=_result_outputs,
    title="What’s This? – Real-Time Object Detector",
    description="Take a picture or upload one to detect and hear object names.",
    live=True,
)
demo.launch()