Spaces:
Running on Zero
Running on Zero
GitHub Actions
Quality improvements: Unicode chars, Token class, imports, type hints, formatting
3f78ea8 | """Edge TTS backend (Microsoft Edge text-to-speech via edge-tts package).""" | |
| from __future__ import annotations | |
| import io | |
| import time | |
| from typing import Any | |
| from hearthnet.constants import STT_MAX_AUDIO_SECONDS | |
| class EdgeTtsBackend: | |
| name = "edge_tts" | |
| requires_internet = True | |
| def __init__(self) -> None: | |
| pass | |
| def health(self) -> dict: | |
| try: | |
| import edge_tts # noqa: F401 | |
| return {"backend": self.name, "status": "ok", "requires_internet": True} | |
| except ImportError: | |
| return { | |
| "backend": self.name, | |
| "status": "unavailable", | |
| "reason": "edge-tts not installed", | |
| } | |
| async def synthesize( | |
| self, | |
| text: str, | |
| voice: str | None = "de-DE-KatjaNeural", | |
| language: str = "de", | |
| audio_format: str = "ogg_vorbis", | |
| ) -> Any: | |
| from hearthnet.services.speech.backends.base import TtsResult | |
| try: | |
| import edge_tts # type: ignore[import] | |
| except ImportError: | |
| raise RuntimeError("edge-tts not installed") from None | |
| selected_voice = voice or _default_voice(language) | |
| t0 = time.monotonic() | |
| communicate = edge_tts.Communicate(text, selected_voice) | |
| buf = io.BytesIO() | |
| async for chunk in communicate.stream(): | |
| if chunk["type"] == "audio": | |
| buf.write(chunk["data"]) | |
| audio_bytes = buf.getvalue() | |
| ms = int((time.monotonic() - t0) * 1000) | |
| # Estimate duration from audio length (rough: ~32kbps ogg) | |
| duration_seconds = min( | |
| len(audio_bytes) / (32 * 1024 / 8), | |
| float(STT_MAX_AUDIO_SECONDS), | |
| ) | |
| # edge-tts natively outputs mp3; wrap in chosen format label | |
| return TtsResult( | |
| audio_bytes=audio_bytes, | |
| audio_format="mp3", # edge-tts always outputs mp3 | |
| duration_seconds=duration_seconds, | |
| backend=self.name, | |
| ms=ms, | |
| ) | |
| def _default_voice(language: str) -> str: | |
| _VOICES: dict[str, str] = { | |
| "de": "de-DE-KatjaNeural", | |
| "en": "en-US-JennyNeural", | |
| "fr": "fr-FR-DeniseNeural", | |
| "es": "es-ES-ElviraNeural", | |
| "it": "it-IT-ElsaNeural", | |
| "nl": "nl-NL-ColetteNeural", | |
| "pl": "pl-PL-ZofiaNeural", | |
| "ru": "ru-RU-SvetlanaNeural", | |
| "uk": "uk-UA-PolinaNeural", | |
| "ar": "ar-SA-ZariyahNeural", | |
| "tr": "tr-TR-EmelNeural", | |
| } | |
| return _VOICES.get(language, "en-US-JennyNeural") | |