"""Gradio voice-cloning demo.

Clones a voice from a reference clip with the module-level ``TTS_MODEL``.
When no reference transcript is supplied, the clip is transcribed (and its
language detected) with Whisper; Japanese transcripts are additionally
converted to a kana-only reading through the OpenAI Responses API.
"""

# NOTE: previously used to install flash-attn at start-up; kept for reference.
# import subprocess
# subprocess.run('pip install flash-attn==2.7.4.post1', shell=True)

# NOTE(review): the original import order is preserved on purpose; some of
# these packages may be import-order sensitive -- confirm before regrouping.
import io
import re
import os
import json
import logging
import hashlib
import threading
import time
import numpy as np
import torch
import spaces
import whisper
import gradio as gr
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes
from typing import Iterable
from dotenv import load_dotenv
from urllib.request import urlopen, Request
from scipy.signal import resample_poly
# from huggingface_hub import snapshot_download
# from qwen_tts import Qwen3TTSModel
from faster_qwen3_tts import FasterQwen3TTS

# Load .env before anything below reads os.environ.
load_dotenv(verbose=False)

logger = logging.getLogger(__name__)

# NOTE: alternative loader based on qwen_tts, kept for reference.
# TTS_MODEL = Qwen3TTSModel.from_pretrained(snapshot_download('Qwen/Qwen3-TTS-12Hz-1.7B-Base', token=os.environ['HF_TOKEN']), device_map=torch.device('cuda' if torch.cuda.is_available() else 'cpu'), dtype=torch.bfloat16, token=os.environ['HF_TOKEN'], attn_implementation='kernels-community/flash-attn3')
TTS_MODEL = FasterQwen3TTS.from_pretrained('Qwen/Qwen3-TTS-12Hz-1.7B-Base')
WHISPER_MODEL = whisper.load_model('turbo', device='cpu', download_root=os.environ.get('WHISPER_CACHE_DIR'))

# cache key -> (last access time, reference text, detected language code)
REFERENCE_AUDIO_TRANSCRIPTION_CACHE: dict[str, tuple[float, str, str]] = {}
REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LOCK = threading.Lock()
REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LIMIT = max(1, int(os.environ.get('REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LIMIT', 100)))

# Upper bound (seconds) for the kana-conversion HTTP call so a stalled
# connection cannot block a request indefinitely.
OPENAI_REQUEST_TIMEOUT_SECONDS = float(os.environ.get('OPENAI_REQUEST_TIMEOUT_SECONDS', 20))

# Language codes (as produced by the UI dropdown and by Whisper) mapped to the
# language names passed to the TTS model.
LANGUAGE_NAMES = {'en': 'English', 'ja': 'Japanese'}

# Developer prompt for the kana-conversion request. Runtime text: do not edit
# casually, the model's behaviour depends on it.
KANA_CONVERSION_SYSTEM_PROMPT = (
    'あなたは日本語テキストを「読み(かな)」だけに変換する変換器です。 '
    '出力に含めてよい文字は ひらがな・カタカナ・長音記号ー・空白 のみです。改行も禁止(1行で出力)。 '
    '入力に含まれる 漢字は必ずかなにする。 '
    '英数字・記号は、可能な範囲で日本語のカナ読みにする(例:AI→えーあい、LLM→えるえるえむ、2026→にせんにじゅうろく)。 '
    '出力は 変換後の本文のみ。説明、注釈、引用符、箇条書き、コードブロックは一切禁止。 '
    '最後に必ず自己検査を行う:出力が ^[ぁ-ゟ゠-ヿー ]+$ に一致しない場合、条件を満たすまで修正してから出力する。 '
    'それでも読めない文字がある場合は、意味を落としてよいので「最も近いかな」に置き換える(記号は省略よりも読みを優先。ただし許可文字以外は絶対に出さない)。'
)


class Theme(Base):
    """Gradio theme with neutral hues, a cyan accent and a pink loader."""

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.neutral,
        secondary_hue: colors.Color | str = colors.neutral,
        neutral_hue: colors.Color | str = colors.neutral,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_md,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (fonts.GoogleFont('Barlow'), 'ui-sans-serif', 'sans-serif'),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (fonts.GoogleFont('IBM Plex Mono'), 'ui-monospace', 'monospace'),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        # Override accent, slider, primary-button and loader colours for both
        # the light and the dark variants.
        super().set(
            color_accent='rgb(0 231 255 / 1)',
            slider_color='rgb(0 231 255 / 1)',
            slider_color_dark='rgb(0 231 255 / 1)',
            button_primary_background_fill='rgb(0 231 255 / 1)',
            button_primary_background_fill_hover='rgb(0 231 255 / .75)',
            button_primary_text_color='#ffffff',
            button_primary_background_fill_dark='rgb(0 231 255 / 1)',
            button_primary_background_fill_hover_dark='rgb(0 231 255 / .75)',
            button_primary_text_color_dark='#ffffff',
            loader_color='rgb(255 199 229 / 1)',
            loader_color_dark='rgb(255 199 229 / 1)'
        )


def _normalize_audio(wav, eps=1e-12, clip=True):
    """Normalize audio to float32 in [-1, 1] range.

    Signed integers are divided by the dtype's largest magnitude, unsigned
    integers are re-centred around the dtype midpoint, and floats are only
    rescaled when their peak exceeds 1. Arrays with more than one dimension
    are averaged over the last axis.

    Returns:
        The normalized float32 array, or ``None`` when the dtype is neither
        integer nor floating point.
    """
    x = np.asarray(wav)
    if np.issubdtype(x.dtype, np.integer):
        info = np.iinfo(x.dtype)
        if info.min < 0:
            y = x.astype(np.float32) / max(abs(info.min), info.max)
        else:
            mid = (info.max + 1) / 2.0
            y = (x.astype(np.float32) - mid) / mid
    elif np.issubdtype(x.dtype, np.floating):
        y = x.astype(np.float32)
        m = np.max(np.abs(y)) if y.size else 0.0
        if m > 1.0 + 1e-6:
            y = y / (m + eps)
    else:
        return None
    if clip:
        y = np.clip(y, -1.0, 1.0)
    if y.ndim > 1:
        y = np.mean(y, axis=-1).astype(np.float32)
    return y


def _resample(x: np.ndarray, original_sample_rate: int, target_sample_rate: int, axis: int = 0) -> np.ndarray:
    """Resample ``x`` along ``axis`` with a polyphase filter.

    The up/down factors are the two sample rates reduced by their GCD.
    """
    g = np.gcd(original_sample_rate, target_sample_rate)
    return resample_poly(x, up=target_sample_rate // g, down=original_sample_rate // g, axis=axis)


def _reference_audio_hash(reference_audio: tuple[np.ndarray, int]) -> str:
    """Return a SHA-256 hex digest identifying a ``(samples, sample_rate)`` pair.

    The sample rate, dtype and shape are hashed together with the raw sample
    bytes so that identical byte strings that represent different audio do
    not share a cache entry.
    """
    audio = np.ascontiguousarray(np.asarray(reference_audio[0]))
    digest = hashlib.sha256()
    digest.update(f'{reference_audio[1]}|{audio.dtype.str}|{audio.shape}|'.encode('utf-8'))
    digest.update(audio.tobytes())
    return digest.hexdigest()


def _detect_reference_text_and_language(reference_audio: tuple[np.ndarray, int], sample_rate: int) -> tuple[str, str]:
    """Transcribe the reference audio with Whisper and detect its language.

    Args:
        reference_audio: ``(samples, sample_rate)``; only the samples are read.
        sample_rate: Sample rate of the samples.

    Returns:
        ``(reference_text, detected_language)`` where the language is the
        most probable Whisper language code. For ``'ja'`` the text is replaced
        by its kana reading when the conversion succeeds.
    """
    audio = np.asarray(reference_audio[0])
    if audio.ndim == 2:
        audio = audio.mean(axis=1)
    if sample_rate != 16000:
        audio = _resample(audio, sample_rate, 16000)
    # Always hand Whisper float32, not only on the resampling path.
    audio = np.clip(audio, -1.0, 1.0).astype(np.float32)

    model = WHISPER_MODEL.to(device='cuda' if torch.cuda.is_available() else 'cpu')
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)
    _, probs = model.detect_language(mel)
    detected_language = max(probs, key=probs.get)
    result = whisper.decode(model, mel, whisper.DecodingOptions())
    reference_text = re.sub(r'\s*\n\s*', '', result.text)

    if detected_language == 'ja':
        # The kana conversion is an optional refinement (a None result already
        # falls back to the raw transcript), so a network or configuration
        # failure must not abort the whole request.
        try:
            converted_reference_text = generate_text(reference_text)
        except (OSError, KeyError, ValueError) as error:
            logger.warning('Kana conversion failed; using raw transcript: %s', error)
            converted_reference_text = None
        if converted_reference_text is not None:
            reference_text = converted_reference_text

    return reference_text, detected_language


def _get_reference_text_and_language(reference_audio: tuple[np.ndarray, int], sample_rate: int) -> tuple[str, str]:
    """Return the transcript and language for the audio, using the cache.

    A cache hit refreshes the entry's timestamp. On a miss the audio is
    transcribed outside the lock, stored, and the oldest entries beyond
    ``REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LIMIT`` are evicted.
    """
    cache_key = _reference_audio_hash(reference_audio)

    with REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LOCK:
        cached_result = REFERENCE_AUDIO_TRANSCRIPTION_CACHE.get(cache_key)
        if cached_result is not None:
            _, reference_text, detected_language = cached_result
            REFERENCE_AUDIO_TRANSCRIPTION_CACHE[cache_key] = (time.time(), reference_text, detected_language)
            return reference_text, detected_language

    reference_text, detected_language = _detect_reference_text_and_language(reference_audio, sample_rate)

    with REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LOCK:
        REFERENCE_AUDIO_TRANSCRIPTION_CACHE[cache_key] = (time.time(), reference_text, detected_language)
        if len(REFERENCE_AUDIO_TRANSCRIPTION_CACHE) > REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LIMIT:
            # Oldest first; everything except the newest LIMIT keys is dropped.
            expired_cache_keys = sorted(
                REFERENCE_AUDIO_TRANSCRIPTION_CACHE,
                key=lambda key: REFERENCE_AUDIO_TRANSCRIPTION_CACHE[key][0]
            )[:-REFERENCE_AUDIO_TRANSCRIPTION_CACHE_LIMIT]
            for expired_cache_key in expired_cache_keys:
                del REFERENCE_AUDIO_TRANSCRIPTION_CACHE[expired_cache_key]

    return reference_text, detected_language


def generate_text(prompt: str) -> str | None:
    """Send ``prompt`` to the OpenAI Responses API with the kana-conversion prompt.

    Args:
        prompt: Text to convert.

    Returns:
        The first ``output_text`` found in a ``message`` output item, or
        ``None`` when the response contains none.

    Raises:
        KeyError: ``OPENAI_API_KEY`` is not set.
        OSError: The HTTP request failed or timed out
            (``urllib.error.URLError`` is an ``OSError``).
        ValueError: The response body is not valid JSON.
    """
    payload = {
        'model': os.environ.get('OPENAI_MODEL', 'gpt-5.4-mini'),
        'input': [{
            'role': 'developer',
            'content': KANA_CONVERSION_SYSTEM_PROMPT
        }, {
            'role': 'user',
            'content': [
                {
                    'type': 'input_text',
                    'text': prompt
                }
            ]
        }],
        'temperature': 1,
        'reasoning': {'effort': 'none'},
    }
    request = Request(
        'https://api.openai.com/v1/responses',
        data=json.dumps(payload).encode('utf-8'),
        method='POST',
        headers={'Content-Type': 'application/json', 'Authorization': f'Bearer {os.environ["OPENAI_API_KEY"]}'},
    )

    # Bounded wait: this runs inside a time-limited GPU call.
    with urlopen(request, timeout=OPENAI_REQUEST_TIMEOUT_SECONDS) as response:
        result = json.loads(response.read().decode('utf-8'))

    for output in result.get('output', []):
        if output.get('type') != 'message':
            continue
        for content in output.get('content', []):
            if content.get('type') == 'output_text':
                return content['text']

    return None


def _prepare_reference_audio(reference_audio) -> tuple[np.ndarray, int]:
    """Convert the UI/API audio payload to ``(normalized_samples, sample_rate)``.

    Accepts a ``(sample_rate, samples)`` tuple or a dict with
    ``'sampling_rate'`` and ``'data'`` keys.

    Raises:
        gr.Error: The audio is missing, empty, or in an unsupported format.
    """
    if reference_audio is None:
        raise gr.Error('Reference audio is required.')

    if isinstance(reference_audio, dict) and 'sampling_rate' in reference_audio and 'data' in reference_audio:
        sample_rate, wav = reference_audio['sampling_rate'], reference_audio['data']
    elif isinstance(reference_audio, tuple) and len(reference_audio) == 2 and isinstance(reference_audio[0], (int, np.integer)):
        sample_rate, wav = reference_audio
    else:
        raise gr.Error('Unsupported reference audio format.')

    normalized = _normalize_audio(wav)
    if normalized is None:
        raise gr.Error('Unsupported reference audio sample type.')
    if normalized.size == 0:
        raise gr.Error('Reference audio is empty.')

    return normalized, int(sample_rate)


def _resolve_language(language: str | None, detected_language: str | None) -> str:
    """Map the requested/detected language code to the name given to the TTS model.

    ``None`` and ``'Auto'`` defer to the detected language when it is a known
    code and otherwise resolve to ``'Auto'``; a known code maps to its name;
    any other value is passed through unchanged.
    """
    if language is None or language == 'Auto':
        return LANGUAGE_NAMES.get(detected_language, 'Auto')
    return LANGUAGE_NAMES.get(language, language)


def _to_int16_pcm(wav) -> np.ndarray:
    """Convert samples in [-1, 1] to int16 PCM.

    The scaled value is clamped to the int16 range: a sample of exactly 1.0
    scales to 32768, which would otherwise wrap around when cast.
    """
    scaled = np.round(np.clip(wav, -1.0, 1.0) * 32768.0)
    return np.clip(scaled, -32768, 32767).astype(np.int16)


@spaces.GPU(duration=30)
def generate_voice_clone(
    input_text: str,
    language: str | None,
    reference_audio: tuple[int, np.ndarray] | dict | None,
    reference_text: str | None,
    temperature: float,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> tuple[tuple[int, np.ndarray], str | None, str | None]:
    """Synthesize ``input_text`` in the voice of the reference audio.

    Args:
        input_text: Text to speak.
        language: ``'Auto'``, a key of ``LANGUAGE_NAMES``, ``None``, or a value
            passed through to the model unchanged.
        reference_audio: ``(sample_rate, samples)`` tuple or a dict with
            ``'sampling_rate'`` and ``'data'``.
        reference_text: Transcript of the reference audio; when empty or
            ``None`` it is obtained by transcription.
        temperature: Sampling temperature forwarded to the TTS model.
        progress: Gradio progress tracker.

    Returns:
        ``((sample_rate, int16_samples), transcribed_text, detected_language)``;
        the last two are ``None`` when a reference transcript was supplied.

    Raises:
        gr.Error: The reference audio is missing, empty or unsupported.
    """
    transcribed_text = None
    detected_language = None

    samples, sample_rate = _prepare_reference_audio(reference_audio)
    reference_audio = (samples, sample_rate)

    if not reference_text:
        reference_text, detected_language = _get_reference_text_and_language(reference_audio, sample_rate)
        transcribed_text = reference_text

    language = _resolve_language(language, detected_language)

    if sample_rate != 48000:
        reference_audio = (_resample(reference_audio[0], sample_rate, 48000), 48000)

    wavs, sample_rate = TTS_MODEL.generate_voice_clone(text=input_text.strip(), language=language, ref_audio=reference_audio, ref_text=reference_text.strip(), temperature=temperature, append_silence=False)
    # wavs, sample_rate = TTS_MODEL.generate_voice_clone(text=input_text.strip(), language=language, ref_audio=reference_audio, ref_text=reference_text, max_new_tokens=2048, temperature=temperature)

    return (sample_rate, _to_int16_pcm(wavs[0])), transcribed_text, detected_language


# NOTE(review): the nesting of the widgets below was reconstructed from a
# whitespace-collapsed source; confirm the Group/Column membership.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column(scale=2):
            with gr.Group():
                tts_reference_audio = gr.Audio(label='Reference Audio', type='numpy', buttons=['download'], waveform_options={'waveform_color': 'rgb(0 231 255 / 1)', 'waveform_progress_color': 'rgb(255 199 229 / 1)'})
                tts_reference_text = gr.Textbox(label='Reference Text', value='', lines=1)
                tts_input_text = gr.Textbox(label='Input', lines=4)
                tts_language = gr.Dropdown(label='Language', choices=[('Automatic', 'Auto'), ('English', 'en'), ('Japanese', 'ja')], value='Auto', interactive=True)
                tts_temperature_slider = gr.Slider(minimum=0.1, maximum=2.0, value=0.9, step=0.1, label='Temperature')
            tts_generate_button = gr.Button('Generate', variant='primary')
        with gr.Column(scale=2):
            tts_audio_output = gr.Audio(label='Output', type='numpy', buttons=['download'], waveform_options={'waveform_color': 'rgb(0 231 255 / 1)', 'waveform_progress_color': 'rgb(255 199 229 / 1)'})
            tts_transcribed_text = gr.Label(label='Transcript', value='')
            tts_detected_language = gr.Label(label='Language', value='')

    tts_generate_button.click(fn=generate_voice_clone, inputs=[tts_input_text, tts_language, tts_reference_audio, tts_reference_text, tts_temperature_slider], outputs=[tts_audio_output, tts_transcribed_text, tts_detected_language], api_name='synthesize')


if __name__ == '__main__':
    demo.launch(
        server_name=os.environ.get('GRADIO_SERVER_NAME', '0.0.0.0'),
        server_port=int(os.environ.get('GRADIO_SERVER_PORT', os.environ.get('PORT', 7860))),
        theme=Theme(),
        css='.column>.row>.column:first-of-type .block { border-width: 0px !important; }'
    )