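# Scraped page header (not part of the module): avatar caption "pandora-s's picture",
# commit "Update tts.py" (378b82f, verified), file size 2.12 kB.
# Page links: raw / history / blame / contribute / delete.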
from __future__ import annotations
import base64
import os
import pathlib
import sys
from typing import Optional
from mistralai.client import Mistral
# Environment variable consulted when no usable key is passed explicitly.
# It is also the placeholder used as the default value of ``api_key``.
_API_KEY_ENV_VAR = "MISTRAL_API_KEY"

# Moderation categories that cause the input text to be rejected, checked in
# this order.
_BLOCKED_CATEGORIES = (
    "sexual",
    "hate_and_discrimination",
    "violence_and_threats",
    "selfharm",
    "jailbreaking",
)


def synthesize_and_save_audio(
    input_text: str = "Hello!",
    voice_id: str = "",
    model: str = "voxtral-mini-tts-2603",
    api_key: str = "MISTRAL_API_KEY",
    output_path: str = "/tmp/voxtral.wav",
    response_format: str = "wav",
) -> int:
    """Moderate *input_text*, synthesize it to speech and write the audio file.

    Args:
        input_text: Text to moderate and then convert to speech.
        voice_id: Either a voice identifier passed to the speech endpoint, or,
            when the string contains a ``"."``, a path to a reference audio
            file that is sent base64-encoded as ``ref_audio``.
        model: Speech model name passed to the speech endpoint.
        api_key: API key for the client. When empty or left at the
            ``"MISTRAL_API_KEY"`` placeholder, the key is read from the
            ``MISTRAL_API_KEY`` environment variable instead.
        output_path: Destination file for the decoded audio; ``~`` is
            expanded and missing parent directories are created.
        response_format: Audio format requested from the speech endpoint.

    Returns:
        ``0`` on success, ``1`` when no API key could be resolved, ``2`` when
        the reference audio file does not exist, ``3`` when the input text is
        flagged by the moderation model.
    """
    # The default value is only a placeholder and can never authenticate, so
    # treat it (and an empty string) as "look the key up in the environment".
    if not api_key or api_key == _API_KEY_ENV_VAR:
        api_key = os.environ.get(_API_KEY_ENV_VAR, "")
    if not api_key:
        print(
            f"No API key provided and {_API_KEY_ENV_VAR} is not set.",
            file=sys.stderr,
        )
        return 1

    client = Mistral(api_key=api_key)

    moderation_response = client.classifiers.moderate(
        model="mistral-moderation-2603",
        inputs=[input_text],
    )
    # A single input was sent, so only the first result is inspected.
    # Subscripting (rather than ``.get``) is deliberate: a category missing
    # from the response raises instead of being silently treated as safe.
    categories = moderation_response.results[0].categories
    if any(categories[name] for name in _BLOCKED_CATEGORIES):
        print("Input text blocked by moderation layer.", file=sys.stderr)
        return 3

    request_kwargs = {
        "model": model,
        "input": input_text,
        "response_format": response_format,
    }
    # Heuristic: a "." is taken to mean a file name with an extension, i.e. a
    # reference recording for voice cloning rather than a voice identifier.
    if "." in voice_id:
        print("Cloning voice from reference audio...")
        reference_path = pathlib.Path(voice_id).expanduser().resolve()
        if not reference_path.is_file():
            print(f"Reference audio not found: {reference_path}", file=sys.stderr)
            return 2
        request_kwargs["ref_audio"] = base64.b64encode(
            reference_path.read_bytes()
        ).decode("ascii")
    else:
        request_kwargs["voice_id"] = voice_id

    response = client.audio.speech.complete(**request_kwargs)

    # The response carries the audio base64-encoded; decode before writing.
    audio_bytes = base64.b64decode(response.audio_data)
    output_path_obj = pathlib.Path(output_path).expanduser()
    # Ensure the destination directory exists so the write cannot fail on a
    # missing parent.
    output_path_obj.parent.mkdir(parents=True, exist_ok=True)
    output_path_obj.write_bytes(audio_bytes)
    print(f"Wrote {len(audio_bytes)} bytes to {output_path_obj}")
    return 0