from __future__ import annotations import base64 import os import pathlib import sys from typing import Optional from mistralai.client import Mistral def synthesize_and_save_audio( input_text: str = "Hello!", voice_id: str = "", model: str = "voxtral-mini-tts-2603", api_key: str = "MISTRAL_API_KEY", output_path: str = "/tmp/voxtral.wav", response_format: str = "wav", ) -> int: client = Mistral(api_key=api_key) moderation_response = client.classifiers.moderate( model="mistral-moderation-2603", inputs=[input_text] ) if moderation_response.results[0].categories["sexual"] or \ moderation_response.results[0].categories["hate_and_discrimination"] or \ moderation_response.results[0].categories["violence_and_threats"] or \ moderation_response.results[0].categories["selfharm"] or \ moderation_response.results[0].categories["jailbreaking"]: print("Input text blocked by moderation layer.", file=sys.stderr) return 3 if "." in voice_id: print("Cloning voice from reference audio...") reference_path = pathlib.Path(voice_id).expanduser().resolve() if not reference_path.is_file(): print(f"Reference audio not found: {reference_path}", file=sys.stderr) return 2 with open(reference_path, "rb") as f: reference_bytes = f.read() reference_b64 = base64.b64encode(reference_bytes).decode("ascii") response = client.audio.speech.complete( model=model, input=input_text, ref_audio=reference_b64, response_format=response_format, ) else: response = client.audio.speech.complete( model=model, input=input_text, voice_id=voice_id, response_format=response_format, ) audio_bytes = base64.b64decode(response.audio_data) output_path_obj = pathlib.Path(output_path).expanduser() output_path_obj.write_bytes(audio_bytes) print(f"Wrote {len(audio_bytes)} bytes to {output_path_obj}") return 0