#!/usr/bin/env python3
"""Sinhala TTS — FastAPI REST Inference Server

Endpoints:
    POST /tts       — Generate speech from text
    GET  /health    — Health check
    GET  /speakers  — List available speakers

Usage:
    uvicorn server:app --host 0.0.0.0 --port 8081
    python server.py
"""

import io
import json
import os
import re
import warnings
from pathlib import Path
from typing import Optional

import numpy as np
import torch
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel, Field

# Silences every warning process-wide (including those raised by the model
# libraries imported later).
warnings.filterwarnings("ignore")

# Sample rate written into the WAV header of every response.
# NOTE(review): this is hard-coded rather than read from config.json —
# confirm it matches the sample rate the model was trained with.
SAMPLE_RATE = 16000

# Model weights, config.json and speakers.json are looked up next to this file.
MODEL_DIR = Path(__file__).parent

app = FastAPI(
    title="Sinhala TTS API",
    description="Sinhala Text-to-Speech VITS model inference server",
    version="1.0.0",
)

# Speaker names accepted by /tts and listed by /speakers. Also used as the
# fallback speaker table when speakers.json is missing (see load_model).
SPEAKERS = [
    "mettananda",
    "oshadi",
    "pn_sin_01",
    "sin_01",
    "sin_2241",
    "sin_2282",
    "sin_3531",
    "sin_3688",
    "sin_3976",
    "sin_4191",
    "sin_4499",
    "sin_5681",
    "sin_6314",
    "sin_6897",
    "sin_7183",
    "sin_9228",
]

# Text prefix prepended to the input for each supported emotion. Emotions not
# listed here fall back to no prefix (same as "neutral").
_EMOTION_PREFIXES = {
    "neutral": "",
    "happy": "[laugh] ",
    "sad": "[sad] ",
    "angry": "[angry] ",
    "scared": "[breath] ",
    "surprised": "[excited] ",
    "whisper": "[whisper] ",
    "loud": "[loud] ",
}


# ──────────────────────────────────────────────────────────────────────
# Request / Response models
# ──────────────────────────────────────────────────────────────────────
class TTSRequest(BaseModel):
    """Request body for POST /tts."""

    text: str = Field(..., min_length=1, description="Sinhala text to synthesize")
    speaker: str = Field(default="mettananda", description="Speaker voice name")
    emotion: str = Field(default="neutral", description="Emotion style")


class HealthResponse(BaseModel):
    """Response body for GET /health."""

    status: str = "ok"
    model_loaded: bool = False


class SpeakerInfo(BaseModel):
    """One entry of the speaker list: position in SPEAKERS plus its name."""

    id: int
    name: str


class SpeakersResponse(BaseModel):
    """Response body for GET /speakers."""

    speakers: list[SpeakerInfo]


# ──────────────────────────────────────────────────────────────────────
# Model loading (singleton)
# ──────────────────────────────────────────────────────────────────────
# Module-level cache filled by the first successful load_model() call.
_model = None
_tokenizer = None
_ap = None
_speaker_manager = None
_config = None


def _patch_sinhala_cleaners():
    """Patch sinhala_cleaners into TTS before model init.

    Adds a ``sinhala_cleaners`` function to ``TTS.tts.utils.text.cleaners``
    unless the module already provides one.
    """
    import TTS.tts.utils.text.cleaners as cleaners_mod

    if hasattr(cleaners_mod, "sinhala_cleaners"):
        return

    def sinhala_cleaners(text: str) -> str:
        # Collapse whitespace runs, trim the ends, then drop the remaining
        # ASCII control characters.
        text = re.sub(r"\s+", " ", text)
        text = text.strip()
        text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text)
        return text

    cleaners_mod.sinhala_cleaners = sinhala_cleaners


def load_model():
    """Load the VITS model from safetensors (cached).

    The first successful call builds the model and stores it in the
    module-level cache; later calls return the cached objects. A failed call
    caches nothing, so the next call tries again.

    Returns:
        Tuple ``(model, tokenizer, audio_processor, speaker_manager, config)``.

    Raises:
        FileNotFoundError: if neither ``sinhala_tts_vits_model.safetensors``
            nor ``vits_quick_test.pth`` exists in ``MODEL_DIR``.
    """
    global _model, _tokenizer, _ap, _speaker_manager, _config

    if _model is not None:
        return _model, _tokenizer, _ap, _speaker_manager, _config

    _patch_sinhala_cleaners()

    # Imported lazily so the module can be imported (and /speakers served)
    # without pulling in the TTS package.
    from TTS.tts.configs.vits_config import VitsConfig
    from TTS.tts.models.vits import Vits
    from TTS.tts.utils.speakers import SpeakerManager
    from TTS.tts.utils.text import TTSTokenizer
    from TTS.utils.audio import AudioProcessor

    config_path = MODEL_DIR / "config.json"
    safetensors_path = MODEL_DIR / "sinhala_tts_vits_model.safetensors"
    speakers_json = MODEL_DIR / "speakers.json"

    config = VitsConfig()
    config.load_json(str(config_path))

    ap = AudioProcessor.init_from_config(config)
    tokenizer, new_config = TTSTokenizer.init_from_config(config)

    if speakers_json.exists():
        config.speakers_file = str(speakers_json)
        speaker_manager = SpeakerManager.init_from_config(config)
        if speaker_manager is None or speaker_manager.num_speakers == 0:
            # init_from_config produced nothing usable: read the ids directly.
            speaker_manager = SpeakerManager()
            speaker_manager.load_ids_from_file(str(speakers_json))
    else:
        # No speakers.json: fall back to the built-in SPEAKERS table.
        speaker_manager = SpeakerManager()
        default_speakers = {name: idx for idx, name in enumerate(SPEAKERS)}
        # NOTE(review): confirm that the installed TTS version accepts a
        # name->id dict as the only argument here; some releases declare
        # set_ids_from_data(items, parse_key).
        speaker_manager.set_ids_from_data(default_speakers)

    model = Vits(new_config, ap, tokenizer, speaker_manager)

    if safetensors_path.exists():
        from safetensors.torch import load_file

        state_dict = load_file(str(safetensors_path))
        # strict=False: missing or unexpected keys are ignored, not reported.
        model.load_state_dict(state_dict, strict=False)
    else:
        pth_path = MODEL_DIR / "vits_quick_test.pth"
        if not pth_path.exists():
            raise FileNotFoundError("No model file found (safetensors or pth)")
        # SECURITY: weights_only=False unpickles arbitrary objects. Only load
        # checkpoints from a trusted source.
        ckpt = torch.load(str(pth_path), map_location="cpu", weights_only=False)
        # Accept both a wrapped checkpoint and a bare state dict.
        sd = ckpt.get("model_state_dict", ckpt)
        model.load_state_dict(sd, strict=False)

    model.eval()

    _model = model
    _tokenizer = tokenizer
    _ap = ap
    _speaker_manager = speaker_manager
    _config = new_config
    return model, tokenizer, ap, speaker_manager, new_config


def normalize_audio(wav: np.ndarray, target_max: float = 0.95) -> np.ndarray:
    """Peak-normalize a waveform and return it as float32.

    Args:
        wav: Audio samples (any array-like convertible to a float array).
        target_max: Absolute peak value of the result.

    Returns:
        A float32 array scaled so its largest absolute sample equals
        ``target_max``. Empty input and (near-)silent input, whose peak is
        at most 1e-8, are returned unscaled.
    """
    samples = np.asarray(wav, dtype=np.float64)
    if samples.size == 0:
        # np.max raises ValueError on a zero-length array.
        return samples.astype(np.float32)

    peak = np.max(np.abs(samples))
    if peak > 1e-8:
        samples = samples * (target_max / peak)
    return samples.astype(np.float32)


# ──────────────────────────────────────────────────────────────────────
# API Endpoints
# ──────────────────────────────────────────────────────────────────────
# NOTE(review): recent FastAPI releases deprecate on_event in favor of
# lifespan handlers — consider migrating.
@app.on_event("startup")
async def startup():
    """Warm up the model on server start.

    A load failure is only logged, so the server still starts; /health and
    /tts will attempt the load again when called.
    """
    try:
        load_model()
        print("[INFO] Model loaded on startup")
    except Exception as e:
        print(f"[WARN] Model not loaded on startup: {e}")


@app.get("/health", response_model=HealthResponse)
async def health():
    """Health check endpoint.

    Calls load_model(), so when the model is not cached yet this request
    triggers a load attempt. Any failure is reported as ``model_not_loaded``.
    """
    try:
        load_model()
        return HealthResponse(status="ok", model_loaded=True)
    except Exception:
        return HealthResponse(status="model_not_loaded", model_loaded=False)


@app.get("/speakers", response_model=SpeakersResponse)
async def speakers():
    """List all available speaker voices.

    The list comes from the built-in SPEAKERS table (ids are list positions),
    not from the loaded model's speaker manager.
    """
    speakers_list = [SpeakerInfo(id=i, name=name) for i, name in enumerate(SPEAKERS)]
    return SpeakersResponse(speakers=speakers_list)


@app.post("/tts")
async def text_to_speech(request: TTSRequest):
    """Generate speech audio from Sinhala text.

    Returns:
        A WAV file response with ``X-Duration-Sec`` and ``X-Speaker`` headers.

    Raises:
        HTTPException: 400 for whitespace-only text or an unknown speaker;
            500 when the model cannot be loaded or synthesis fails.
    """
    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    if request.speaker not in SPEAKERS:
        raise HTTPException(
            status_code=400,
            detail=f"Unknown speaker '{request.speaker}'. Available: {', '.join(SPEAKERS)}",
        )

    try:
        model, tokenizer, ap, speaker_manager, config = load_model()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Model not loaded: {e}") from e

    try:
        # Unknown emotions get no prefix rather than being rejected.
        ptext = _EMOTION_PREFIXES.get(request.emotion, "") + request.text

        with torch.no_grad():
            outputs = model.synthesize(ptext, config=config, speaker=request.speaker)

        wav = outputs.get("wav")
        if wav is None:
            raise HTTPException(status_code=500, detail="Model produced no audio output")

        wav_norm = normalize_audio(wav)

        import soundfile as sf

        buf = io.BytesIO()
        sf.write(buf, wav_norm, SAMPLE_RATE, format="WAV")
        wav_bytes = buf.getvalue()

        return Response(
            content=wav_bytes,
            media_type="audio/wav",
            headers={
                "Content-Disposition": 'attachment; filename="sinhala_tts_output.wav"',
                "X-Duration-Sec": f"{len(wav_norm) / SAMPLE_RATE:.2f}",
                "X-Speaker": request.speaker,
            },
        )
    except HTTPException:
        # Already a well-formed API error (e.g. "no audio output").
        raise
    except Exception as e:
        import traceback

        # Keep the stack trace in the server log only: returning it in the
        # response body would expose file paths and internals to any client.
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Generation failed: {e}") from e


# ──────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8081)