File size: 2,560 Bytes
4cd8837
4aaae80
4cd8837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4aaae80
4cd8837
 
 
 
 
 
 
 
 
 
 
 
 
3f78ea8
4cd8837
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""Edge TTS backend (Microsoft Edge text-to-speech via edge-tts package)."""

from __future__ import annotations

import io
import time
from typing import Any

from hearthnet.constants import STT_MAX_AUDIO_SECONDS


class EdgeTtsBackend:
    """Text-to-speech backend that streams audio from the ``edge-tts`` package.

    The ``edge_tts`` module is imported lazily inside the methods, so this
    class can be imported and instantiated even when the package is missing;
    ``health()`` reports that condition and ``synthesize()`` raises for it.
    """

    name = "edge_tts"
    requires_internet = True

    # edge-tts requests 48 kbit/s mono MP3 from the service by default, i.e.
    # 6000 bytes of payload per second of speech. Used for duration estimates.
    _MP3_BYTES_PER_SECOND = 48_000 / 8

    def __init__(self) -> None:
        pass

    def health(self) -> dict:
        """Report whether the ``edge_tts`` package can be imported.

        Returns:
            A dict with ``backend`` and ``status`` keys. ``status`` is ``"ok"``
            (plus ``requires_internet``) when the import succeeds, otherwise
            ``"unavailable"`` together with a human-readable ``reason``.
        """
        try:
            import edge_tts  # noqa: F401
        except ImportError:
            return {
                "backend": self.name,
                "status": "unavailable",
                "reason": "edge-tts not installed",
            }
        return {"backend": self.name, "status": "ok", "requires_internet": True}

    @classmethod
    def _estimate_duration_seconds(cls, num_bytes: int, max_seconds: float) -> float:
        """Estimate playback length of ``num_bytes`` of MP3, capped at ``max_seconds``."""
        return min(num_bytes / cls._MP3_BYTES_PER_SECOND, float(max_seconds))

    async def synthesize(
        self,
        text: str,
        voice: str | None = "de-DE-KatjaNeural",
        language: str = "de",
        audio_format: str = "ogg_vorbis",
    ) -> Any:
        """Synthesize ``text`` to speech and return the collected audio.

        Args:
            text: Text to speak.
            voice: Edge voice name. When falsy (``None`` or ``""``), a voice is
                chosen from ``language`` instead.
            language: Language code used only to pick a voice when ``voice``
                is falsy.
            audio_format: Accepted for interface compatibility but not used;
                the result is always labelled ``"mp3"``.

        Returns:
            A ``TtsResult`` carrying the audio bytes, the ``"mp3"`` format
            label, an estimated duration in seconds, this backend's name and
            the elapsed synthesis time in milliseconds.

        Raises:
            RuntimeError: If the ``edge-tts`` package is not installed.
        """
        from hearthnet.services.speech.backends.base import TtsResult

        try:
            import edge_tts  # type: ignore[import]
        except ImportError:
            raise RuntimeError("edge-tts not installed") from None

        # NOTE(review): because ``voice`` defaults to a German voice, the
        # ``language`` fallback only applies when a caller passes voice=None
        # explicitly -- confirm this is intended.
        selected_voice = voice or _default_voice(language)
        started = time.monotonic()

        communicate = edge_tts.Communicate(text, selected_voice)
        audio_chunks: list[bytes] = []
        async for chunk in communicate.stream():
            # The stream also yields non-audio events; keep only audio payloads.
            if chunk["type"] == "audio":
                audio_chunks.append(chunk["data"])

        audio_bytes = b"".join(audio_chunks)
        ms = int((time.monotonic() - started) * 1000)

        # The payload carries no duration, so derive one from its size and
        # clamp it to the configured maximum audio length.
        duration_seconds = self._estimate_duration_seconds(
            len(audio_bytes), STT_MAX_AUDIO_SECONDS
        )

        return TtsResult(
            audio_bytes=audio_bytes,
            audio_format="mp3",  # edge-tts always outputs mp3
            duration_seconds=duration_seconds,
            backend=self.name,
            ms=ms,
        )


def _default_voice(language: str) -> str:
    _VOICES: dict[str, str] = {
        "de": "de-DE-KatjaNeural",
        "en": "en-US-JennyNeural",
        "fr": "fr-FR-DeniseNeural",
        "es": "es-ES-ElviraNeural",
        "it": "it-IT-ElsaNeural",
        "nl": "nl-NL-ColetteNeural",
        "pl": "pl-PL-ZofiaNeural",
        "ru": "ru-RU-SvetlanaNeural",
        "uk": "uk-UA-PolinaNeural",
        "ar": "ar-SA-ZariyahNeural",
        "tr": "tr-TR-EmelNeural",
    }
    return _VOICES.get(language, "en-US-JennyNeural")