| """
|
| Voice Assistant Service for Reachy Mini.
|
|
|
| This module provides the main voice assistant service that integrates
|
| with Home Assistant via ESPHome protocol.
|
| """
|
|
|
| import asyncio
|
| import json
|
| import logging
|
| import threading
|
| import time
|
| from dataclasses import dataclass, field
|
| from pathlib import Path
|
| from queue import Queue
|
| from typing import Dict, List, Optional, Set, Union
|
|
|
| import numpy as np
|
|
|
| from reachy_mini import ReachyMini
|
|
|
| from .models import AvailableWakeWord, Preferences, ServerState, WakeWordType
|
| from .audio_player import AudioPlayer
|
| from .satellite import VoiceSatelliteProtocol
|
| from .util import get_mac
|
| from .zeroconf import HomeAssistantZeroconf
|
| from .motion import ReachyMiniMotion
|
| from .camera_server import MJPEGCameraServer
|
|
|
# Module-level logger shared by everything in this file.
_LOGGER = logging.getLogger(__name__)

# Directory containing this module; bundled assets live next to it.
_MODULE_DIR = Path(__file__).parent

# Bundled wake word models/configs and notification sounds.
_WAKEWORDS_DIR = _MODULE_DIR.joinpath("wakewords")
_SOUNDS_DIR = _MODULE_DIR.joinpath("sounds")

# Writable directory one level above the package (preferences, downloads).
_LOCAL_DIR = _MODULE_DIR.parent.joinpath("local")
|
|
|
|
|
@dataclass
class AudioProcessingContext:
    """Mutable scratch state carried through the audio-processing loop.

    One instance is created per audio thread and handed to every helper
    that processes a chunk, so per-chunk buffers and detector state live
    in one place instead of in loop-local variables.
    """

    # Wake word models currently being evaluated against incoming audio.
    wake_words: List = field(default_factory=list)

    # Streaming feature extractor for "micro" wake word models.
    micro_features: Optional[object] = field(default=None)

    # Feature frames produced from the most recent chunk (micro models).
    micro_inputs: List = field(default_factory=list)

    # Streaming feature extractor for openWakeWord models (created on demand).
    oww_features: Optional[object] = field(default=None)

    # Feature frames produced from the most recent chunk (openWakeWord).
    oww_inputs: List = field(default_factory=list)

    # True when at least one entry in ``wake_words`` is an openWakeWord model.
    has_oww: bool = field(default=False)

    # ``time.monotonic()`` value of the last accepted wake-up, or None.
    last_active: Optional[float] = field(default=None)
|
|
|
|
|
class VoiceAssistantService:
    """Voice assistant service that runs ESPHome protocol server.

    The service loads wake word models and preferences, starts a background
    thread that reads microphone audio (from Reachy Mini when a robot handle
    is supplied, otherwise from the system microphone), serves the
    ``VoiceSatelliteProtocol`` over TCP, optionally starts an MJPEG camera
    server, and registers itself through ``HomeAssistantZeroconf``.
    """

    def __init__(
        self,
        reachy_mini: Optional[ReachyMini] = None,
        name: str = "Reachy Mini",
        host: str = "0.0.0.0",
        port: int = 6053,
        wake_model: str = "okay_nabu",
        camera_port: int = 8081,
        camera_enabled: bool = True,
    ):
        """Store configuration; nothing is started until ``start()`` is awaited.

        Args:
            reachy_mini: Robot handle. When None, audio is read from the
                system microphone and DOA lookups return None.
            name: Device name passed to the server state and to discovery.
            host: Interface the protocol server and camera server bind to.
            port: TCP port of the protocol server.
            wake_model: Wake word id loaded when preferences yield no model.
            camera_port: TCP port of the MJPEG camera server.
            camera_enabled: Whether ``start()`` launches the camera server.
        """
        self.reachy_mini = reachy_mini
        self.name = name
        self.host = host
        self.port = port
        self.wake_model = wake_model
        self.camera_port = camera_port
        self.camera_enabled = camera_enabled

        self._server = None
        self._discovery = None
        self._audio_thread = None
        self._running = False
        self._state: Optional[ServerState] = None
        self._motion = ReachyMiniMotion(reachy_mini)
        self._camera_server: Optional[MJPEGCameraServer] = None

    async def start(self) -> None:
        """Start the voice assistant service.

        Loads models and preferences, builds the shared ``ServerState``,
        starts media, motion, the audio thread, the optional camera server,
        the protocol server and finally registers discovery.
        """
        _LOGGER.info("Initializing voice assistant service...")

        # Make sure every directory we read from or write to exists.
        _WAKEWORDS_DIR.mkdir(parents=True, exist_ok=True)
        _SOUNDS_DIR.mkdir(parents=True, exist_ok=True)
        _LOCAL_DIR.mkdir(parents=True, exist_ok=True)

        await self._verify_required_files()

        available_wake_words = self._load_available_wake_words()
        _LOGGER.debug("Available wake words: %s", list(available_wake_words.keys()))

        preferences_path = _LOCAL_DIR / "preferences.json"
        preferences = self._load_preferences(preferences_path)

        wake_models, active_wake_words = self._load_wake_models(
            available_wake_words, preferences
        )

        stop_model = self._load_stop_model()

        # Separate players so music and TTS are handled independently.
        music_player = AudioPlayer(self.reachy_mini)
        tts_player = AudioPlayer(self.reachy_mini)

        self._state = ServerState(
            name=self.name,
            mac_address=get_mac(),
            audio_queue=Queue(),
            entities=[],
            available_wake_words=available_wake_words,
            wake_words=wake_models,
            active_wake_words=active_wake_words,
            stop_word=stop_model,
            music_player=music_player,
            tts_player=tts_player,
            wakeup_sound=str(_SOUNDS_DIR / "wake_word_triggered.flac"),
            timer_finished_sound=str(_SOUNDS_DIR / "timer_finished.flac"),
            preferences=preferences,
            preferences_path=preferences_path,
            refractory_seconds=2.0,
            download_dir=_LOCAL_DIR,
            reachy_mini=self.reachy_mini,
            motion_enabled=self.reachy_mini is not None,
        )

        self._state.motion = self._motion

        # Media start-up is best effort: a failure is logged, not fatal.
        if self.reachy_mini is not None:
            try:
                if self.reachy_mini.media.audio is not None:
                    self.reachy_mini.media.start_recording()
                    self.reachy_mini.media.start_playing()
                    _LOGGER.info("Reachy Mini media system initialized")
                else:
                    _LOGGER.warning("Reachy Mini audio system not available")
            except Exception as e:
                _LOGGER.warning("Failed to initialize Reachy Mini media: %s", e)

        if self._motion is not None:
            self._motion.start()

        # The audio loop polls ``_running``; set it before the thread starts.
        self._running = True
        self._audio_thread = threading.Thread(
            target=self._process_audio,
            daemon=True,
        )
        self._audio_thread.start()

        if self.camera_enabled:
            self._camera_server = MJPEGCameraServer(
                reachy_mini=self.reachy_mini,
                host=self.host,
                port=self.camera_port,
                fps=15,
                quality=80,
            )
            await self._camera_server.start()

        loop = asyncio.get_running_loop()
        # Bind to a local so the protocol factory captures a fixed reference.
        camera_server = self._camera_server
        self._server = await loop.create_server(
            lambda: VoiceSatelliteProtocol(self._state, camera_server=camera_server),
            host=self.host,
            port=self.port,
        )

        self._discovery = HomeAssistantZeroconf(port=self.port, name=self.name)
        await self._discovery.register_server()

        _LOGGER.info("Voice assistant service started on %s:%s", self.host, self.port)

    async def stop(self) -> None:
        """Stop the voice assistant service.

        Stops recording, signals and joins the audio thread, stops playback,
        closes the protocol server, unregisters discovery, stops the camera
        server and shuts the motion controller down, in that order.
        """
        _LOGGER.info("Stopping voice assistant service...")

        if self.reachy_mini is not None:
            try:
                self.reachy_mini.media.stop_recording()
                _LOGGER.debug("Reachy Mini recording stopped")
            except Exception as e:
                _LOGGER.warning("Error stopping Reachy Mini recording: %s", e)

        self._running = False

        if self._audio_thread:
            # Bounded wait: the thread is a daemon, so a stuck loop cannot
            # block interpreter exit, but we still report it.
            self._audio_thread.join(timeout=3.0)
            if self._audio_thread.is_alive():
                _LOGGER.warning("Audio thread did not stop in time")

        if self.reachy_mini is not None:
            try:
                self.reachy_mini.media.stop_playing()
                _LOGGER.debug("Reachy Mini playback stopped")
            except Exception as e:
                _LOGGER.warning("Error stopping Reachy Mini playback: %s", e)

        if self._server:
            self._server.close()
            await self._server.wait_closed()

        if self._discovery:
            await self._discovery.unregister_server()

        if self._camera_server:
            await self._camera_server.stop()
            self._camera_server = None

        if self._motion:
            self._motion.shutdown()

        _LOGGER.info("Voice assistant service stopped.")

    async def _verify_required_files(self) -> None:
        """Verify required model and sound files exist (bundled with package).

        Missing files are only reported with a warning; nothing is raised.
        """
        required_wakewords = [
            "okay_nabu.tflite",
            "okay_nabu.json",
            "hey_jarvis.tflite",
            "hey_jarvis.json",
            "stop.tflite",
            "stop.json",
        ]

        required_sounds = [
            "wake_word_triggered.flac",
            "timer_finished.flac",
        ]

        missing_wakewords = [
            filename
            for filename in required_wakewords
            if not (_WAKEWORDS_DIR / filename).exists()
        ]
        if missing_wakewords:
            _LOGGER.warning(
                "Missing wake word files: %s. These should be bundled with the package.",
                missing_wakewords,
            )

        missing_sounds = [
            filename
            for filename in required_sounds
            if not (_SOUNDS_DIR / filename).exists()
        ]
        if missing_sounds:
            _LOGGER.warning(
                "Missing sound files: %s. These should be bundled with the package.",
                missing_sounds,
            )

        if not missing_wakewords and not missing_sounds:
            _LOGGER.info("All required files verified successfully.")

    def _load_available_wake_words(self) -> Dict[str, AvailableWakeWord]:
        """Load available wake word configurations.

        Scans the bundled wake word directory and
        ``<local>/external_wake_words`` for ``*.json`` configs. The ``stop``
        config is skipped because it is loaded separately as the stop word.

        Returns:
            Mapping of model id (config file stem) to its description.
            Configs that fail to parse are logged and left out.
        """
        available_wake_words: Dict[str, AvailableWakeWord] = {}

        wake_word_dirs = [_WAKEWORDS_DIR, _LOCAL_DIR / "external_wake_words"]

        for wake_word_dir in wake_word_dirs:
            if not wake_word_dir.exists():
                continue

            for config_path in wake_word_dir.glob("*.json"):
                model_id = config_path.stem
                if model_id == "stop":
                    continue

                try:
                    with open(config_path, "r", encoding="utf-8") as f:
                        config = json.load(f)

                    model_type = WakeWordType(config.get("type", "micro"))

                    # openWakeWord configs point at a separate model file;
                    # other types are loaded from the config path itself.
                    if model_type == WakeWordType.OPEN_WAKE_WORD:
                        wake_word_path = config_path.parent / config["model"]
                    else:
                        wake_word_path = config_path

                    available_wake_words[model_id] = AvailableWakeWord(
                        id=model_id,
                        type=model_type,
                        wake_word=config.get("wake_word", model_id),
                        trained_languages=config.get("trained_languages", []),
                        wake_word_path=wake_word_path,
                    )
                except Exception as e:
                    _LOGGER.warning("Failed to load wake word %s: %s", config_path, e)

        return available_wake_words

    def _load_preferences(self, preferences_path: Path) -> Preferences:
        """Load user preferences.

        Args:
            preferences_path: JSON file holding the saved preferences.

        Returns:
            Preferences built from the file, or default ``Preferences()``
            when the file is missing or cannot be loaded.
        """
        if preferences_path.exists():
            try:
                with open(preferences_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                return Preferences(**data)
            except Exception as e:
                _LOGGER.warning("Failed to load preferences: %s", e)

        return Preferences()

    def _load_wake_models(
        self,
        available_wake_words: Dict[str, AvailableWakeWord],
        preferences: Preferences,
    ):
        """Load wake word models.

        Models listed in ``preferences.active_wake_words`` are loaded first;
        if none of them load, ``self.wake_model`` is tried as the default.

        Args:
            available_wake_words: Known wake words keyed by model id.
            preferences: User preferences naming the active wake words.

        Returns:
            Tuple ``(wake_models, active_wake_words)``: loaded models keyed
            by id, and the set of ids that loaded successfully.
        """
        from pymicro_wakeword import MicroWakeWord
        from pyopen_wakeword import OpenWakeWord

        wake_models: Dict[str, Union[MicroWakeWord, OpenWakeWord]] = {}
        active_wake_words: Set[str] = set()

        if preferences.active_wake_words:
            for wake_word_id in preferences.active_wake_words:
                wake_word = available_wake_words.get(wake_word_id)
                if wake_word is None:
                    _LOGGER.warning("Unknown wake word: %s", wake_word_id)
                    continue

                try:
                    _LOGGER.debug("Loading wake model: %s", wake_word_id)
                    wake_models[wake_word_id] = wake_word.load()
                    active_wake_words.add(wake_word_id)
                except Exception as e:
                    _LOGGER.warning("Failed to load wake model %s: %s", wake_word_id, e)

        # Nothing usable from preferences: fall back to the configured default.
        if not wake_models:
            wake_word = available_wake_words.get(self.wake_model)
            if wake_word:
                try:
                    _LOGGER.debug("Loading default wake model: %s", self.wake_model)
                    wake_models[self.wake_model] = wake_word.load()
                    active_wake_words.add(self.wake_model)
                except Exception as e:
                    _LOGGER.error("Failed to load default wake model: %s", e)

        return wake_models, active_wake_words

    def _load_stop_model(self):
        """Load the stop word model.

        Tries ``stop.json`` first and falls back to ``okay_nabu.json``.
        Both attempts are guarded so a broken model file cannot abort
        service start-up.

        Returns:
            The loaded model, or None when neither config could be loaded.
        """
        from pymicro_wakeword import MicroWakeWord

        stop_config = _WAKEWORDS_DIR / "stop.json"
        if stop_config.exists():
            try:
                return MicroWakeWord.from_config(stop_config)
            except Exception as e:
                _LOGGER.warning("Failed to load stop model: %s", e)

        _LOGGER.warning("Stop model not available, using fallback")
        okay_nabu_config = _WAKEWORDS_DIR / "okay_nabu.json"
        if okay_nabu_config.exists():
            # Guarded like the primary path: the fallback is best effort too.
            try:
                return MicroWakeWord.from_config(okay_nabu_config)
            except Exception as e:
                _LOGGER.warning("Failed to load fallback stop model: %s", e)

        return None

    def _process_audio(self) -> None:
        """Process audio from microphone (Reachy Mini or system fallback).

        Thread target started by ``start()``. Any exception escaping the
        selected audio loop is logged with its traceback and ends the thread.
        """
        from pymicro_wakeword import MicroWakeWordFeatures

        ctx = AudioProcessingContext()
        ctx.micro_features = MicroWakeWordFeatures()

        try:
            _LOGGER.info("Starting audio processing...")

            if self.reachy_mini is not None:
                _LOGGER.info("Using Reachy Mini's microphone")
                self._audio_loop_reachy(ctx)
            else:
                _LOGGER.info("Using system microphone (fallback)")
                self._audio_loop_fallback(ctx)

        except Exception:
            _LOGGER.exception("Error processing audio")

    def _audio_loop_reachy(self, ctx: AudioProcessingContext) -> None:
        """Audio loop using Reachy Mini's microphone.

        Runs until ``_running`` is cleared. Errors in a single iteration are
        logged and followed by a short pause so the loop keeps going.
        """
        while self._running:
            try:
                if not self._wait_for_satellite():
                    continue

                self._update_wake_words_list(ctx)

                audio_chunk = self._get_reachy_audio_chunk()
                if audio_chunk is None:
                    # No sample ready yet; back off briefly instead of spinning.
                    time.sleep(0.01)
                    continue

                self._process_audio_chunk(ctx, audio_chunk)

            except Exception as e:
                _LOGGER.error("Error in Reachy audio processing: %s", e)
                time.sleep(0.1)

    def _audio_loop_fallback(self, ctx: AudioProcessingContext) -> None:
        """Audio loop using system microphone (fallback).

        Reads 16 kHz mono float32 blocks from ``sounddevice``. Errors raised
        while processing one block are logged and the loop continues, which
        matches the Reachy loop. Errors from the input stream itself still
        propagate, because a failed stream cannot be recovered here.
        """
        import sounddevice as sd

        block_size = 1024

        with sd.InputStream(
            samplerate=16000,
            channels=1,
            blocksize=block_size,
            dtype="float32",
        ) as stream:
            while self._running:
                if not self._wait_for_satellite():
                    continue

                audio_chunk_array, overflowed = stream.read(block_size)
                if overflowed:
                    _LOGGER.warning("Audio buffer overflow")

                try:
                    self._update_wake_words_list(ctx)
                    audio_chunk = self._convert_to_pcm(audio_chunk_array.reshape(-1))
                    self._process_audio_chunk(ctx, audio_chunk)
                except Exception as e:
                    # No sleep needed: the blocking read paces the loop.
                    _LOGGER.error("Error in fallback audio processing: %s", e)

    def _wait_for_satellite(self) -> bool:
        """Wait for satellite connection. Returns True if connected.

        When no state or satellite is present this sleeps 0.1 s and returns
        False so callers can simply ``continue`` their loop.
        """
        if self._state is None or self._state.satellite is None:
            time.sleep(0.1)
            return False
        return True

    def _update_wake_words_list(self, ctx: AudioProcessingContext) -> None:
        """Update wake words list if changed.

        Rebuilds ``ctx.wake_words`` from the active models when the context
        list is empty or the shared state flags a change, and creates the
        openWakeWord feature extractor the first time it is needed.
        """
        from pyopen_wakeword import OpenWakeWord, OpenWakeWordFeatures

        if (not ctx.wake_words) or (self._state.wake_words_changed and self._state.wake_words):
            self._state.wake_words_changed = False
            ctx.wake_words.clear()
            ctx.wake_words.extend([
                ww for ww in self._state.wake_words.values()
                if ww.id in self._state.active_wake_words
            ])

            ctx.has_oww = any(isinstance(ww, OpenWakeWord) for ww in ctx.wake_words)
            if ctx.has_oww and ctx.oww_features is None:
                ctx.oww_features = OpenWakeWordFeatures.from_builtin()

            _LOGGER.debug("Wake words updated: %s", [ww.id for ww in ctx.wake_words])

    def _get_reachy_audio_chunk(self) -> Optional[bytes]:
        """Get audio chunk from Reachy Mini's microphone.

        Non-array, empty or non-numeric samples are rejected. Signed integer
        samples are scaled to [-1, 1) before conversion so they are not
        saturated by the clipping in ``_convert_to_pcm``. Two-channel data
        is averaged to mono; other 2-D data uses the first column.

        Returns:
            PCM audio bytes, or None if no valid audio available.
        """
        audio_data = self.reachy_mini.media.get_audio_sample()

        if audio_data is None:
            return None
        if not isinstance(audio_data, np.ndarray):
            return None
        if audio_data.size == 0:
            return None

        try:
            dtype = audio_data.dtype
            # Bytes, unicode, object, void and bool arrays are not audio.
            if dtype.kind in ('S', 'U', 'O', 'V', 'b'):
                return None
            if dtype.kind == 'i':
                # Integer PCM: map the full integer range onto [-1, 1).
                full_scale = float(np.iinfo(dtype).max) + 1.0
                audio_data = audio_data.astype(np.float32) / np.float32(full_scale)
            elif dtype != np.float32:
                audio_data = np.asarray(audio_data, dtype=np.float32)
        except (TypeError, ValueError):
            return None

        try:
            if audio_data.ndim == 2 and audio_data.shape[1] == 2:
                audio_chunk_array = audio_data.mean(axis=1)
            elif audio_data.ndim == 2:
                audio_chunk_array = audio_data[:, 0].copy()
            elif audio_data.ndim == 1:
                audio_chunk_array = audio_data
            else:
                return None
        except Exception:
            return None

        return self._convert_to_pcm(audio_chunk_array)

    def _convert_to_pcm(self, audio_chunk_array: np.ndarray) -> bytes:
        """Convert float32 audio array to 16-bit PCM bytes.

        Samples are clipped to [-1.0, 1.0] and scaled by 32767. NaN becomes
        silence and infinities become full scale, because casting NaN to an
        integer type gives an undefined sample value.

        Args:
            audio_chunk_array: Mono samples, nominally in [-1.0, 1.0].

        Returns:
            Little-endian signed 16-bit PCM bytes.
        """
        finite = np.nan_to_num(audio_chunk_array, nan=0.0, posinf=1.0, neginf=-1.0)
        return (
            (np.clip(finite, -1.0, 1.0) * 32767.0)
            .astype("<i2")
            .tobytes()
        )

    def _process_audio_chunk(self, ctx: AudioProcessingContext, audio_chunk: bytes) -> None:
        """Process an audio chunk for wake word detection.

        The chunk is forwarded to the connected satellite, turned into
        features, and then checked for wake words and the stop word. If the
        satellite disconnected since the caller's connection check, the
        chunk is dropped.

        Args:
            ctx: Audio processing context
            audio_chunk: PCM audio bytes
        """
        # Snapshot: another thread may clear the satellite at any time.
        satellite = self._state.satellite
        if satellite is None:
            return
        satellite.handle_audio(audio_chunk)

        self._process_features(ctx, audio_chunk)

        self._detect_wake_words(ctx)

        self._detect_stop_word(ctx)

    def _process_features(self, ctx: AudioProcessingContext, audio_chunk: bytes) -> None:
        """Process audio features for wake word detection.

        Replaces the per-chunk feature buffers in ``ctx``. openWakeWord
        features are only computed while such a model is active.
        """
        ctx.micro_inputs.clear()
        ctx.micro_inputs.extend(ctx.micro_features.process_streaming(audio_chunk))

        if ctx.has_oww and ctx.oww_features is not None:
            ctx.oww_inputs.clear()
            ctx.oww_inputs.extend(ctx.oww_features.process_streaming(audio_chunk))

    def _detect_wake_words(self, ctx: AudioProcessingContext) -> None:
        """Detect wake words in the processed audio features.

        Every feature frame is fed to every model, with no early exit. A
        detection is acted on only if more than
        ``self._state.refractory_seconds`` have passed since the last
        accepted one.
        """
        from pymicro_wakeword import MicroWakeWord
        from pyopen_wakeword import OpenWakeWord

        for wake_word in ctx.wake_words:
            activated = False

            if isinstance(wake_word, MicroWakeWord):
                for micro_input in ctx.micro_inputs:
                    if wake_word.process_streaming(micro_input):
                        activated = True
            elif isinstance(wake_word, OpenWakeWord):
                for oww_input in ctx.oww_inputs:
                    for prob in wake_word.process_streaming(oww_input):
                        if prob > 0.5:
                            activated = True

            if activated:
                now = time.monotonic()
                if (ctx.last_active is None) or ((now - ctx.last_active) > self._state.refractory_seconds):
                    _LOGGER.info("Wake word detected: %s", wake_word.id)
                    self._state.satellite.wakeup(wake_word)

                    # Pass the sound direction (may be None) to the motion controller.
                    doa_angle_deg = self._get_doa_angle_deg()
                    self._motion.on_wakeup(doa_angle_deg)
                    ctx.last_active = now

    def _detect_stop_word(self, ctx: AudioProcessingContext) -> None:
        """Detect stop word in the processed audio features.

        The satellite is only told to stop when the stop word's id is
        currently listed in ``self._state.active_wake_words``.
        """
        if not self._state.stop_word:
            return

        stopped = False
        for micro_input in ctx.micro_inputs:
            if self._state.stop_word.process_streaming(micro_input):
                stopped = True

        if stopped and (self._state.stop_word.id in self._state.active_wake_words):
            _LOGGER.info("Stop word detected")
            self._state.satellite.stop()

    def _get_doa_angle_deg(self) -> Optional[float]:
        """Get DOA angle in degrees from Reachy Mini's microphone array.

        The ReSpeaker DOA returns angle in radians where:
        - 0 radians = left
        - π/2 radians = front/back
        - π radians = right

        We convert this to head yaw degrees where:
        - 0 = front
        - positive = right
        - negative = left

        Returns:
            DOA angle in degrees suitable for head yaw, or None if unavailable.
        """
        if self.reachy_mini is None:
            return None

        import math

        try:
            doa_result = self.reachy_mini.media.get_DoA()
            if doa_result is None:
                _LOGGER.debug("DOA not available")
                return None

            doa_radians, speech_detected = doa_result

            # Shift by a quarter turn so that "front" (π/2) maps to yaw 0,
            # left (0) becomes negative and right (π) becomes positive.
            yaw_radians = doa_radians - (math.pi / 2)
            yaw_degrees = math.degrees(yaw_radians)

            _LOGGER.info("DOA detected: %.1f rad -> yaw %.1f deg (speech=%s)",
                         doa_radians, yaw_degrees, speech_detected)

            return yaw_degrees

        except Exception as e:
            _LOGGER.error("Error getting DOA angle: %s", e)
            return None
|
|
|