# Hugging Face Space: djhui5710/reachy_mini_home_assistant (status: Running)
# File size: 6,726 Bytes

"""Lightweight head tracker using YOLO for face detection.

Ported from reachy_mini_conversation_app for voice assistant integration.
Model is loaded at initialization time (not lazy) to ensure face tracking
is ready immediately when the camera server starts.
"""

from __future__ import annotations
import logging
from typing import Tuple, Optional

import numpy as np
from numpy.typing import NDArray


# Module-level logger, named after this module via `__name__`.
logger = logging.getLogger(__name__)


class HeadTracker:
    """Lightweight head tracker using YOLO for face detection.

    The model is loaded at construction time (not lazily) so that face
    tracking is ready immediately (matching conversation_app behavior).
    If loading fails for any reason the tracker stays usable as an object
    but reports ``is_available == False`` and every detection call returns
    ``(None, None)``.
    """

    # Retry policy for downloading the model file.
    _DOWNLOAD_MAX_RETRIES = 3
    _DOWNLOAD_RETRY_DELAY_S = 5

    # Relative weights used to rank candidate faces (confidence vs. box area).
    _CONFIDENCE_WEIGHT = 0.7
    _AREA_WEIGHT = 0.3

    def __init__(
        self,
        model_repo: str = "AdamCodd/YOLOv11n-face-detection",
        model_filename: str = "model.pt",
        confidence_threshold: float = 0.3,
        device: str = "cpu",
    ) -> None:
        """Initialize YOLO-based head tracker.

        Args:
            model_repo: HuggingFace model repository
            model_filename: Model file name
            confidence_threshold: Minimum confidence for face detection
            device: Device to run inference on ('cpu' or 'cuda')
        """
        self.confidence_threshold = confidence_threshold
        self.model = None
        self._model_repo = model_repo
        self._model_filename = model_filename
        self._device = device
        self._detections_class = None
        self._model_load_attempted = False
        self._model_load_error: Optional[str] = None

        # Load model immediately at init (not lazy)
        self._load_model()

    def _download_model(self, download_fn) -> str:
        """Fetch the model file, retrying on failure.

        Args:
            download_fn: Callable accepting ``repo_id`` and ``filename``
                keyword arguments and returning the local model path.

        Returns:
            Whatever ``download_fn`` returns on its first successful call.

        Raises:
            Exception: The error from the last failed attempt once all
                retries are exhausted.
        """
        import time

        max_retries = self._DOWNLOAD_MAX_RETRIES
        retry_delay = self._DOWNLOAD_RETRY_DELAY_S
        last_error: Optional[Exception] = None

        for attempt in range(1, max_retries + 1):
            try:
                return download_fn(
                    repo_id=self._model_repo,
                    filename=self._model_filename,
                )
            except Exception as e:
                last_error = e
                # Only wait when another attempt will actually follow.
                if attempt < max_retries:
                    logger.warning(
                        "Model download failed (attempt %d/%d): %s. Retrying in %ds...",
                        attempt, max_retries, e, retry_delay
                    )
                    time.sleep(retry_delay)

        if last_error is None:
            # Only reachable if the retry count was overridden to <= 0.
            raise RuntimeError("Model download was not attempted")
        raise last_error

    def _load_model(self) -> None:
        """Load the YOLO model, downloading it with retries if needed.

        Runs at most once per instance. Failures are not raised: they are
        logged, recorded in ``_model_load_error`` and leave ``model`` as
        ``None``.
        """
        if self._model_load_attempted:
            return

        self._model_load_attempted = True

        try:
            # Imported here rather than at module level so that a missing
            # dependency is caught below and only disables face tracking.
            from ultralytics import YOLO
            from supervision import Detections
            from huggingface_hub import hf_hub_download

            self._detections_class = Detections

            model_path = self._download_model(hf_hub_download)
            self.model = YOLO(model_path).to(self._device)
            logger.info("YOLO face detection model loaded")
        except ImportError as e:
            self._model_load_error = f"Missing dependencies: {e}"
            logger.warning("Face tracking disabled - missing dependencies: %s", e)
            self.model = None
        except Exception as e:
            self._model_load_error = str(e)
            logger.error("Failed to load YOLO model: %s", e)
            self.model = None

    @property
    def is_available(self) -> bool:
        """Check if the head tracker is available and ready."""
        return self.model is not None and self._detections_class is not None

    def _select_best_face(self, detections) -> Optional[int]:
        """Select the best face based on confidence and area.

        Detections below ``confidence_threshold`` are discarded. The rest
        are ranked by a weighted sum of confidence and box area, where the
        area is normalized by the largest valid box.

        Args:
            detections: Object exposing ``xyxy`` (N x 4 boxes) and
                ``confidence`` (N values or ``None``), e.g. a supervision
                ``Detections`` instance.

        Returns:
            Index of best face or None if no valid faces
        """
        if detections.xyxy.shape[0] == 0:
            return None

        if detections.confidence is None:
            return None

        # Filter by confidence threshold
        valid_indices = np.flatnonzero(
            detections.confidence >= self.confidence_threshold
        )
        if valid_indices.size == 0:
            return None

        # Calculate areas for valid detections
        boxes = detections.xyxy[valid_indices]
        areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

        # Normalize areas by the largest one. If every box is degenerate
        # (max area is zero or NaN), dividing would yield NaN scores and make
        # argmax pick the first box arbitrarily, so drop the area term and
        # rank by confidence alone.
        max_area = float(np.max(areas))
        if max_area > 0:
            area_term = areas / max_area
        else:
            area_term = np.zeros_like(areas)

        # Combine confidence and area (weighted towards larger faces)
        confidences = detections.confidence[valid_indices]
        scores = (
            confidences * self._CONFIDENCE_WEIGHT
            + area_term * self._AREA_WEIGHT
        )

        return int(valid_indices[np.argmax(scores)])

    def _bbox_to_normalized_coords(
        self, bbox: NDArray[np.float32], w: int, h: int
    ) -> NDArray[np.float32]:
        """Convert bounding box center to normalized coordinates [-1, 1].

        Args:
            bbox: Bounding box [x1, y1, x2, y2]
            w: Image width (must be non-zero)
            h: Image height (must be non-zero)

        Returns:
            Center point in [-1, 1] coordinates
        """
        center_x = (bbox[0] + bbox[2]) / 2.0
        center_y = (bbox[1] + bbox[3]) / 2.0

        # Normalize to [0, 1] then to [-1, 1]
        norm_x = (center_x / w) * 2.0 - 1.0
        norm_y = (center_y / h) * 2.0 - 1.0

        return np.array([norm_x, norm_y], dtype=np.float32)

    def get_head_position(
        self, img: NDArray[np.uint8]
    ) -> Tuple[Optional[NDArray[np.float32]], Optional[float]]:
        """Get head position from face detection.

        Args:
            img: Input image (BGR format)

        Returns:
            Tuple of (face_center [-1,1], confidence) or (None, None) if no
            face was found, the tracker is unavailable, the image is missing
            or empty, or inference failed.
        """
        if not self.is_available:
            return None, None

        # A missing or empty frame cannot contain a face; bail out before
        # touching ``img.shape`` or dividing by a zero width/height.
        if img is None or img.ndim < 2 or img.size == 0:
            return None, None

        h, w = img.shape[:2]

        try:
            # Run YOLO inference
            results = self.model(img, verbose=False)
            detections = self._detections_class.from_ultralytics(results[0])

            # Select best face
            face_idx = self._select_best_face(detections)
            if face_idx is None:
                return None, None

            # _select_best_face only returns an index when confidences exist.
            confidence = float(detections.confidence[face_idx])

            # Get face center in [-1, 1] coordinates
            face_center = self._bbox_to_normalized_coords(
                detections.xyxy[face_idx], w, h
            )

            return face_center, confidence

        except Exception as e:
            # Best-effort per-frame call: a failed inference is reported as
            # "no face" rather than propagated to the caller.
            logger.debug("Error in head position detection: %s", e)
            return None, None