Spaces:
Running on Zero
Running on Zero
File size: 2,560 Bytes
"""Edge TTS backend (Microsoft Edge text-to-speech via edge-tts package)."""
from __future__ import annotations
import io
import time
from typing import Any
from hearthnet.constants import STT_MAX_AUDIO_SECONDS
class EdgeTtsBackend:
name = "edge_tts"
requires_internet = True
def __init__(self) -> None:
pass
def health(self) -> dict:
try:
import edge_tts # noqa: F401
return {"backend": self.name, "status": "ok", "requires_internet": True}
except ImportError:
return {
"backend": self.name,
"status": "unavailable",
"reason": "edge-tts not installed",
}
async def synthesize(
self,
text: str,
voice: str | None = "de-DE-KatjaNeural",
language: str = "de",
audio_format: str = "ogg_vorbis",
) -> Any:
from hearthnet.services.speech.backends.base import TtsResult
try:
import edge_tts # type: ignore[import]
except ImportError:
raise RuntimeError("edge-tts not installed") from None
selected_voice = voice or _default_voice(language)
t0 = time.monotonic()
communicate = edge_tts.Communicate(text, selected_voice)
buf = io.BytesIO()
async for chunk in communicate.stream():
if chunk["type"] == "audio":
buf.write(chunk["data"])
audio_bytes = buf.getvalue()
ms = int((time.monotonic() - t0) * 1000)
# Estimate duration from audio length (rough: ~32kbps ogg)
duration_seconds = min(
len(audio_bytes) / (32 * 1024 / 8),
float(STT_MAX_AUDIO_SECONDS),
)
# edge-tts natively outputs mp3; wrap in chosen format label
return TtsResult(
audio_bytes=audio_bytes,
audio_format="mp3", # edge-tts always outputs mp3
duration_seconds=duration_seconds,
backend=self.name,
ms=ms,
)
def _default_voice(language: str) -> str:
    """Return the default Edge voice name for a language code.

    The lookup uses the lowercase primary subtag, so ``"de"``, ``"DE"``,
    ``"de-AT"`` and ``"de_DE"`` all select the German voice. Unknown, empty
    or non-string values fall back to the US English voice.

    Args:
        language: Language code or locale tag, e.g. ``"de"`` or ``"en-US"``.

    Returns:
        The voice name to pass to edge-tts.
    """
    fallback = "en-US-JennyNeural"
    voices: dict[str, str] = {
        "de": "de-DE-KatjaNeural",
        "en": "en-US-JennyNeural",
        "fr": "fr-FR-DeniseNeural",
        "es": "es-ES-ElviraNeural",
        "it": "it-IT-ElsaNeural",
        "nl": "nl-NL-ColetteNeural",
        "pl": "pl-PL-ZofiaNeural",
        "ru": "ru-RU-SvetlanaNeural",
        "uk": "uk-UA-PolinaNeural",
        "ar": "ar-SA-ZariyahNeural",
        "tr": "tr-TR-EmelNeural",
    }
    if not isinstance(language, str):
        return fallback
    # Accept both "-" and "_" separators and ignore region/script subtags.
    primary = language.strip().lower().replace("_", "-").partition("-")[0]
    return voices.get(primary, fallback)
|