Desmond-Dong committed on
Commit
d29db4b
·
1 Parent(s): b299642

feat: 添加基于音频分析的说话动画

Browse files

- 新增 AudioAnalyzer 类,分析 TTS 音频生成响度曲线
- 新增 SpeechSwayPlayer 类,同步播放音频驱动的头部动画
- 在 RUN_START 时预下载并分析 TTS 音频
- 在 TTS_START 时开始播放音频驱动的头部摆动
- 参考 reachy_mini_conversation_app 的 SwayRollRT 算法

reachy_mini_ha_voice/audio_analyzer.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Audio analyzer for speech-driven head motion.
2
+
3
+ This module analyzes TTS audio to generate loudness curves that drive
4
+ natural head movements during speech playback.
5
+
6
+ Inspired by reachy_mini_conversation_app's SwayRollRT algorithm.
7
+ """
8
+
9
+ import io
10
+ import logging
11
+ import math
12
+ import threading
13
+ import time
14
+ from dataclasses import dataclass
15
+ from typing import Callable, List, Optional, Tuple
16
+ from urllib.request import urlopen
17
+
18
+ import numpy as np
19
+
20
# Module-level logger shared by the analyzer and the player.
_LOGGER = logging.getLogger(__name__)

# Analysis parameters (matching reachy_mini_conversation_app)
SAMPLE_RATE = 16000  # Hz; decoded audio is resampled to this rate before analysis
FRAME_MS = 20  # length of each RMS measurement window, in milliseconds
HOP_MS = 50  # Output rate: 20 Hz
# NOTE: the hop is longer than the window, so only the first 20 ms of every
# 50 ms step contributes to the loudness measurement.

# Loudness parameters
SWAY_DB_LOW = -46.0  # dBFS level (and anything quieter) that maps to a gain of 0
SWAY_DB_HIGH = -18.0  # dBFS level (and anything louder) that maps to a gain of 1
LOUDNESS_GAMMA = 0.9  # exponent applied to the normalized [0, 1] loudness

# Sway parameters (from reachy_mini_conversation_app)
# SWAY_MASTER multiplies the loudness gain for every axis.
# SWAY_F_* are sinusoid frequencies in Hz; SWAY_A_* are the amplitudes of those
# sinusoids before the loudness gain is applied (degrees for rotations,
# millimetres for translations; converted to radians/metres by the analyzer).
SWAY_MASTER = 1.5
SWAY_F_PITCH = 2.2
SWAY_A_PITCH_DEG = 4.5
SWAY_F_YAW = 0.6
SWAY_A_YAW_DEG = 7.5
SWAY_F_ROLL = 1.3
SWAY_A_ROLL_DEG = 2.25
SWAY_F_X = 0.35
SWAY_A_X_MM = 4.5
SWAY_F_Y = 0.45
SWAY_A_Y_MM = 3.75
SWAY_F_Z = 0.25
SWAY_A_Z_MM = 2.25
46
+
47
+
48
@dataclass
class SwayFrame:
    """Head-pose offsets for one instant of the sway timeline.

    Translations are stored in metres and rotations in radians, as the
    field suffixes indicate.
    """

    # Time of this frame, in seconds from the start of the audio.
    timestamp_s: float
    # Translational offsets (metres).
    x_m: float
    y_m: float
    z_m: float
    # Rotational offsets (radians).
    roll_rad: float
    pitch_rad: float
    yaw_rad: float
58
+
59
+
60
+ def _rms_dbfs(samples: np.ndarray) -> float:
61
+ """Calculate RMS in dBFS for float32 samples in [-1, 1]."""
62
+ rms = np.sqrt(np.mean(samples ** 2) + 1e-12)
63
+ return 20.0 * math.log10(rms + 1e-12)
64
+
65
+
66
def _loudness_gain(db: float) -> float:
    """Map a dB level onto a gamma-shaped gain in [0, 1].

    Levels at or below SWAY_DB_LOW give 0 and levels at or above
    SWAY_DB_HIGH give 1; values in between are scaled linearly and then
    raised to LOUDNESS_GAMMA.
    """
    db_span = SWAY_DB_HIGH - SWAY_DB_LOW
    normalized = (db - SWAY_DB_LOW) / db_span
    clamped = max(0.0, min(1.0, normalized))
    return clamped ** LOUDNESS_GAMMA
71
+
72
+
73
class AudioAnalyzer:
    """Analyzes audio files to generate sway curves for head motion.

    The audio is decoded, mixed down to mono, resampled to SAMPLE_RATE and
    measured in FRAME_MS windows every HOP_MS. Each measurement becomes one
    SwayFrame whose offsets are loudness-scaled sinusoids with a random
    phase per axis.

    The stored frames and duration are guarded by a lock so they can be read
    while another thread replaces them. When a download or decode fails, any
    previously stored frames are discarded so stale motion from an earlier
    utterance is never replayed.
    """

    def __init__(self):
        self._sway_frames: List[SwayFrame] = []
        self._duration_s: float = 0.0
        self._lock = threading.Lock()
        # Random phases for natural variation
        self._phase_pitch = 0.0
        self._phase_yaw = 0.0
        self._phase_roll = 0.0
        self._phase_x = 0.0
        self._phase_y = 0.0
        self._phase_z = 0.0

    def _randomize_phases(self) -> None:
        """Draw a fresh random phase in [0, 2*pi) for every axis."""
        # Local import: `random` is only needed here.
        import random

        two_pi = 2 * math.pi
        self._phase_pitch = random.random() * two_pi
        self._phase_yaw = random.random() * two_pi
        self._phase_roll = random.random() * two_pi
        self._phase_x = random.random() * two_pi
        self._phase_y = random.random() * two_pi
        self._phase_z = random.random() * two_pi

    def analyze_url(self, url: str) -> bool:
        """Download and analyze audio from URL.

        Args:
            url: URL to audio file (mp3, wav, etc.)

        Returns:
            True if analysis succeeded. On failure the error is logged,
            previously analyzed data is cleared, and False is returned.
        """
        try:
            _LOGGER.debug("Downloading audio from: %s", url)
            with urlopen(url, timeout=5) as response:
                audio_data = response.read()
        except Exception as e:
            _LOGGER.error("Failed to download audio: %s", e)
            # Drop frames from any earlier utterance so they cannot be
            # played back against the audio we failed to fetch.
            self.clear()
            return False

        return self.analyze_bytes(audio_data)

    def analyze_bytes(self, audio_data: bytes) -> bool:
        """Analyze audio data and generate sway frames.

        Args:
            audio_data: Raw audio file bytes (mp3, wav, etc.)

        Returns:
            True if analysis succeeded. On failure the error is logged,
            previously analyzed data is cleared, and False is returned.
        """
        try:
            # Imported lazily so a missing soundfile install is reported as
            # an analysis failure rather than breaking module import.
            import soundfile as sf

            samples, sr = sf.read(io.BytesIO(audio_data), dtype='float32')

            # Convert to mono if multi-channel
            if samples.ndim == 2:
                samples = samples.mean(axis=1)

            # Resample if needed
            if sr != SAMPLE_RATE:
                samples = self._resample(samples, sr, SAMPLE_RATE)

            return self._analyze_samples(samples)
        except Exception as e:
            _LOGGER.error("Failed to analyze audio: %s", e)
            # Same reasoning as in analyze_url: never keep stale frames.
            self.clear()
            return False

    def _resample(self, samples: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
        """Resample by linear interpolation (no anti-alias filtering).

        Args:
            samples: Mono input samples.
            sr_in: Sample rate of *samples*.
            sr_out: Desired sample rate.

        Returns:
            The input unchanged when the rates match, otherwise a float32
            array of ``int(len(samples) * sr_out / sr_in)`` samples. Empty
            input (or an output length of zero) yields an empty array.
        """
        if sr_in == sr_out:
            return samples
        n_out = int(len(samples) * sr_out / sr_in)
        if len(samples) == 0 or n_out <= 0:
            # np.interp rejects empty sample points; nothing to resample.
            return np.zeros(0, dtype=np.float32)
        t_in = np.linspace(0, 1, len(samples))
        t_out = np.linspace(0, 1, n_out)
        return np.interp(t_out, t_in, samples).astype(np.float32)

    def _analyze_samples(self, samples: np.ndarray) -> bool:
        """Analyze audio samples and generate sway frames.

        Args:
            samples: Mono samples at SAMPLE_RATE.

        Returns:
            Always True; audio shorter than one frame simply produces an
            empty frame list.
        """
        self._randomize_phases()

        hop_samples = int(SAMPLE_RATE * HOP_MS / 1000)
        frame_samples = int(SAMPLE_RATE * FRAME_MS / 1000)
        hop_dt = HOP_MS / 1000.0
        two_pi = 2 * math.pi

        frames: List[SwayFrame] = []
        # Only windows that fit entirely inside the signal are measured.
        last_start = len(samples) - frame_samples
        for index, pos in enumerate(range(0, last_start + 1, hop_samples)):
            # Derive the timestamp from the index instead of accumulating
            # hop_dt, so it matches the idx * hop_dt grid used for lookup.
            t = index * hop_dt
            db = _rms_dbfs(samples[pos:pos + frame_samples])
            loud = _loudness_gain(db) * SWAY_MASTER

            # Generate sway offsets (matching reachy_mini_conversation_app)
            pitch = (math.radians(SWAY_A_PITCH_DEG) * loud *
                     math.sin(two_pi * SWAY_F_PITCH * t + self._phase_pitch))
            yaw = (math.radians(SWAY_A_YAW_DEG) * loud *
                   math.sin(two_pi * SWAY_F_YAW * t + self._phase_yaw))
            roll = (math.radians(SWAY_A_ROLL_DEG) * loud *
                    math.sin(two_pi * SWAY_F_ROLL * t + self._phase_roll))
            x_mm = SWAY_A_X_MM * loud * math.sin(two_pi * SWAY_F_X * t + self._phase_x)
            y_mm = SWAY_A_Y_MM * loud * math.sin(two_pi * SWAY_F_Y * t + self._phase_y)
            z_mm = SWAY_A_Z_MM * loud * math.sin(two_pi * SWAY_F_Z * t + self._phase_z)

            frames.append(SwayFrame(
                timestamp_s=t,
                x_m=x_mm / 1000.0,
                y_m=y_mm / 1000.0,
                z_m=z_mm / 1000.0,
                roll_rad=roll,
                pitch_rad=pitch,
                yaw_rad=yaw,
            ))

        # The timeline covers one hop per generated frame.
        duration = len(frames) * hop_dt

        with self._lock:
            self._sway_frames = frames
            self._duration_s = duration

        _LOGGER.info("Analyzed audio: %.2fs, %d frames", duration, len(frames))
        return True

    def get_frame_at(self, t: float) -> Optional[SwayFrame]:
        """Get sway frame at time t (seconds).

        Args:
            t: Time in seconds from start of audio

        Returns:
            The frame whose hop contains *t*; the first frame when the
            computed index is negative; None when no frames are stored or
            *t* is past the last frame.
        """
        with self._lock:
            if not self._sway_frames:
                return None

            # Find frame index
            hop_dt = HOP_MS / 1000.0
            idx = int(t / hop_dt)

            if idx < 0:
                return self._sway_frames[0]
            if idx >= len(self._sway_frames):
                return None

            return self._sway_frames[idx]

    def clear(self) -> None:
        """Clear analyzed data."""
        with self._lock:
            self._sway_frames = []
            self._duration_s = 0.0

    @property
    def duration(self) -> float:
        """Get duration of analyzed audio in seconds."""
        with self._lock:
            return self._duration_s

    @property
    def frame_count(self) -> int:
        """Get number of sway frames."""
        with self._lock:
            return len(self._sway_frames)
244
+
245
+
246
class SpeechSwayPlayer:
    """Plays pre-analyzed sway animation synchronized with TTS playback.

    Usage is prepare(url) -> start() -> stop(). Playback runs on a daemon
    thread that looks up the frame for the elapsed time once per hop and
    passes it to the offsets callback. The offsets are reset to zero when
    playback ends, whether it finishes, is stopped, or fails.
    """

    # Offsets sent to the callback to return the head to neutral.
    _ZERO_OFFSETS = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)

    def __init__(self, set_offsets_callback: Callable[[Tuple[float, ...]], None]):
        """Initialize player.

        Args:
            set_offsets_callback: Function to call with (x, y, z, roll, pitch, yaw) offsets
        """
        self._set_offsets = set_offsets_callback
        self._analyzer = AudioAnalyzer()
        self._playing = False
        self._start_time: float = 0.0
        self._thread: Optional[threading.Thread] = None
        self._stop_event = threading.Event()

    def prepare(self, url: str) -> bool:
        """Prepare sway animation by downloading and analyzing audio.

        Call this when TTS URL is received (e.g., at RUN_START). Any
        playback in progress is stopped first.

        Args:
            url: URL to TTS audio file

        Returns:
            True if preparation succeeded
        """
        self.stop()
        return self._analyzer.analyze_url(url)

    def start(self) -> None:
        """Start playing sway animation.

        Call this when TTS playback starts (e.g., at TTS_START). Does
        nothing if playback is already running or no frames are available.
        """
        if self._playing:
            return

        if self._analyzer.frame_count == 0:
            _LOGGER.warning("No sway data to play")
            return

        self._stop_event.clear()
        self._playing = True
        self._start_time = time.monotonic()

        self._thread = threading.Thread(target=self._playback_loop, daemon=True)
        self._thread.start()
        _LOGGER.debug("Started sway playback")

    def stop(self) -> None:
        """Stop sway animation playback and reset the offsets to zero.

        Does nothing if playback is not running.
        """
        if not self._playing:
            return

        self._stop_event.set()
        self._playing = False

        worker = self._thread
        self._thread = None
        # A thread cannot join itself; skip the join if stop() is invoked
        # from the playback thread (e.g. from inside the callback).
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=0.5)

        # Reset offsets to zero
        self._set_offsets(self._ZERO_OFFSETS)
        _LOGGER.debug("Stopped sway playback")

    def _playback_loop(self) -> None:
        """Playback loop that outputs sway frames at correct timing.

        Runs on the worker thread until the stop event is set or the
        analyzer has no frame for the elapsed time.
        """
        hop_dt = HOP_MS / 1000.0

        try:
            while not self._stop_event.is_set():
                elapsed = time.monotonic() - self._start_time
                frame = self._analyzer.get_frame_at(elapsed)

                if frame is None:
                    # End of animation
                    break

                # Output offsets
                self._set_offsets((
                    frame.x_m,
                    frame.y_m,
                    frame.z_m,
                    frame.roll_rad,
                    frame.pitch_rad,
                    frame.yaw_rad,
                ))

                # Wait until the next hop boundary. Waiting on the stop
                # event (rather than sleeping) lets stop() interrupt at once.
                next_time = self._start_time + (int(elapsed / hop_dt) + 1) * hop_dt
                sleep_time = next_time - time.monotonic()
                if sleep_time > 0:
                    self._stop_event.wait(sleep_time)
        except Exception:
            # A failing callback must not leave the player stuck "playing".
            _LOGGER.exception("Sway playback loop failed")
        finally:
            try:
                # Reset offsets when done
                self._set_offsets(self._ZERO_OFFSETS)
            except Exception:
                _LOGGER.exception("Failed to reset sway offsets")
            finally:
                self._playing = False

    @property
    def is_playing(self) -> bool:
        """Check if sway animation is currently playing."""
        return self._playing
reachy_mini_ha_voice/satellite.py CHANGED
@@ -5,6 +5,7 @@ import logging
5
  import math
6
  import posixpath
7
  import shutil
 
8
  import time
9
  from collections.abc import Iterable
10
  from typing import Dict, Optional, Set, Union, TYPE_CHECKING
@@ -50,6 +51,7 @@ from pymicro_wakeword import MicroWakeWord
50
  from pyopen_wakeword import OpenWakeWord
51
 
52
  from .api_server import APIServer
 
53
  from .entity import MediaPlayerEntity
54
  from .entity_registry import EntityRegistry, get_entity_key
55
  from .models import AvailableWakeWord, ServerState, WakeWordType
@@ -86,12 +88,19 @@ class VoiceSatelliteProtocol(APIServer):
86
  self._conversation_timeout = 300.0 # 5 minutes, same as ESPHome default
87
  self._last_conversation_time = 0.0
88
 
 
 
 
89
  # Initialize Reachy controller
90
  self.reachy_controller = ReachyController(state.reachy_mini)
91
 
92
  # Connect MovementManager to ReachyController for pose control from HA
93
  if state.motion is not None and state.motion.movement_manager is not None:
94
  self.reachy_controller.set_movement_manager(state.motion.movement_manager)
 
 
 
 
95
 
96
  # Initialize entity registry
97
  self._entity_registry = EntityRegistry(
@@ -139,6 +148,15 @@ class VoiceSatelliteProtocol(APIServer):
139
  # Reachy Mini: Start listening animation
140
  self._reachy_on_listening()
141
 
 
 
 
 
 
 
 
 
 
142
  elif event_type in (
143
  VoiceAssistantEventType.VOICE_ASSISTANT_STT_VAD_END,
144
  VoiceAssistantEventType.VOICE_ASSISTANT_STT_END,
@@ -157,19 +175,31 @@ class VoiceSatelliteProtocol(APIServer):
157
  self._continue_conversation = True
158
 
159
  elif event_type == VoiceAssistantEventType.VOICE_ASSISTANT_TTS_START:
160
- # Reachy Mini: Start speaking animation
161
  _LOGGER.info("TTS_START event received, triggering speaking animation")
162
  self._reachy_on_speaking()
163
 
 
 
 
 
164
  elif event_type == VoiceAssistantEventType.VOICE_ASSISTANT_TTS_END:
165
  self._tts_url = data.get("url")
166
  self.play_tts()
167
 
 
 
 
 
168
  elif event_type == VoiceAssistantEventType.VOICE_ASSISTANT_RUN_END:
169
  # Pipeline run ended
170
  self._tts_played = False
171
  self._is_streaming_audio = False
172
 
 
 
 
 
173
  # Check if should continue conversation
174
  self._handle_run_end()
175
 
@@ -723,6 +753,31 @@ class VoiceSatelliteProtocol(APIServer):
723
  except Exception as e:
724
  _LOGGER.error("Reachy Mini motion error: %s", e)
725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
  def _reachy_on_idle(self) -> None:
727
  """Called when returning to idle state (HA state: Idle)."""
728
  # Disable high-frequency face tracking, switch to adaptive mode
 
5
  import math
6
  import posixpath
7
  import shutil
8
+ import threading
9
  import time
10
  from collections.abc import Iterable
11
  from typing import Dict, Optional, Set, Union, TYPE_CHECKING
 
51
  from pyopen_wakeword import OpenWakeWord
52
 
53
  from .api_server import APIServer
54
+ from .audio_analyzer import SpeechSwayPlayer
55
  from .entity import MediaPlayerEntity
56
  from .entity_registry import EntityRegistry, get_entity_key
57
  from .models import AvailableWakeWord, ServerState, WakeWordType
 
88
  self._conversation_timeout = 300.0 # 5 minutes, same as ESPHome default
89
  self._last_conversation_time = 0.0
90
 
91
+ # Speech sway player for audio-driven head motion
92
+ self._speech_sway_player: Optional[SpeechSwayPlayer] = None
93
+
94
  # Initialize Reachy controller
95
  self.reachy_controller = ReachyController(state.reachy_mini)
96
 
97
  # Connect MovementManager to ReachyController for pose control from HA
98
  if state.motion is not None and state.motion.movement_manager is not None:
99
  self.reachy_controller.set_movement_manager(state.motion.movement_manager)
100
+ # Initialize speech sway player with MovementManager callback
101
+ self._speech_sway_player = SpeechSwayPlayer(
102
+ set_offsets_callback=self._set_speech_sway_offsets
103
+ )
104
 
105
  # Initialize entity registry
106
  self._entity_registry = EntityRegistry(
 
148
  # Reachy Mini: Start listening animation
149
  self._reachy_on_listening()
150
 
151
+ # Pre-analyze TTS audio for speech sway (if URL available)
152
+ if self._tts_url and self._speech_sway_player:
153
+ # Run in background thread to avoid blocking
154
+ threading.Thread(
155
+ target=self._speech_sway_player.prepare,
156
+ args=(self._tts_url,),
157
+ daemon=True
158
+ ).start()
159
+
160
  elif event_type in (
161
  VoiceAssistantEventType.VOICE_ASSISTANT_STT_VAD_END,
162
  VoiceAssistantEventType.VOICE_ASSISTANT_STT_END,
 
175
  self._continue_conversation = True
176
 
177
  elif event_type == VoiceAssistantEventType.VOICE_ASSISTANT_TTS_START:
178
+ # Reachy Mini: Start speaking animation with audio-driven sway
179
  _LOGGER.info("TTS_START event received, triggering speaking animation")
180
  self._reachy_on_speaking()
181
 
182
+ # Start audio-driven speech sway
183
+ if self._speech_sway_player:
184
+ self._speech_sway_player.start()
185
+
186
  elif event_type == VoiceAssistantEventType.VOICE_ASSISTANT_TTS_END:
187
  self._tts_url = data.get("url")
188
  self.play_tts()
189
 
190
+ # Stop speech sway
191
+ if self._speech_sway_player:
192
+ self._speech_sway_player.stop()
193
+
194
  elif event_type == VoiceAssistantEventType.VOICE_ASSISTANT_RUN_END:
195
  # Pipeline run ended
196
  self._tts_played = False
197
  self._is_streaming_audio = False
198
 
199
+ # Ensure speech sway is stopped
200
+ if self._speech_sway_player:
201
+ self._speech_sway_player.stop()
202
+
203
  # Check if should continue conversation
204
  self._handle_run_end()
205
 
 
753
  except Exception as e:
754
  _LOGGER.error("Reachy Mini motion error: %s", e)
755
 
756
+ def _set_speech_sway_offsets(self, offsets: tuple) -> None:
757
+ """Apply speech sway offsets to MovementManager.
758
+
759
+ Args:
760
+ offsets: Tuple of (x, y, z, roll, pitch, yaw) in meters/radians
761
+ """
762
+ if self.state.motion is None or self.state.motion.movement_manager is None:
763
+ return
764
+
765
+ try:
766
+ # Set speech offsets on MovementManager
767
+ # These are additive offsets applied on top of the current animation
768
+ mm = self.state.motion.movement_manager
769
+ x, y, z, roll, pitch, yaw = offsets
770
+
771
+ # Update animation offsets directly
772
+ mm.state.anim_x = x
773
+ mm.state.anim_y = y
774
+ mm.state.anim_z = z
775
+ mm.state.anim_roll = roll
776
+ mm.state.anim_pitch = pitch
777
+ mm.state.anim_yaw = yaw
778
+ except Exception as e:
779
+ _LOGGER.debug("Failed to set speech sway offsets: %s", e)
780
+
781
  def _reachy_on_idle(self) -> None:
782
  """Called when returning to idle state (HA state: Idle)."""
783
  # Disable high-frequency face tracking, switch to adaptive mode