File size: 8,032 Bytes
49268bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ef9b95
49268bc
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import base64
import json
import gradio as gr
import numpy as np
import requests
import os
import io
import soundfile as sf

# Credential sent as the Bearer token to the TTS service. It is read once, at
# import time, from the API_KEY environment variable; when the variable is
# unset the placeholder string below is used instead.
API_KEY = os.getenv('API_KEY', 'your_api_key_here')

def voice_design(prompt, text):
    """Request a designed voice from DashScope and return its preview audio.

    Posts a ``create`` action for the ``qwen-voice-design`` model, using
    ``prompt`` as the voice description and ``text`` as the preview text, then
    decodes the base64-encoded WAV preview found in the JSON response.

    Args:
        prompt: Natural-language description of the desired voice.
        text: Text to be spoken in the preview audio.

    Returns:
        A ``(sample_rate, samples)`` tuple. On any failure (network error or
        timeout, non-200 status, malformed or incomplete response body,
        undecodable audio) the problem is printed and
        ``(24000, np.array([]))`` is returned instead of raising.
    """
    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }

    data = {
        "model": "qwen-voice-design",
        "input": {
            "action": "create",
            "voice_prompt": prompt,
            "preview_text": text,
            "target_model": "qwen3-tts-vd-realtime-2025-12-16",
            "preferred_name": "default"
        },
        "parameters": {
            "sample_rate": 24000,
            "response_format": "wav"
        }
    }
    sr = 24000
    # Returned on every failure path so callers always get a (rate, samples) pair.
    failure = (sr, np.array([]))

    try:
        # (connect, read) timeout: without one, a stalled server would block
        # this worker indefinitely.
        response = requests.post(
            url,
            headers=headers,
            data=json.dumps(data),
            timeout=(10, 120),
        )
    except requests.RequestException as exc:
        print(f"API Error: request failed - {exc}")
        return failure

    if response.status_code != 200:
        print(f"API Error: {response.status_code} - {response.text}")
        return failure

    try:
        res_json = json.loads(response.text)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        print(f"Error in response output: invalid JSON - {exc}")
        return failure

    # Walk output -> preview_audio -> data defensively; any missing level is
    # reported the same way as a missing 'output' section.
    output = res_json.get('output') if isinstance(res_json, dict) else None
    preview = output.get('preview_audio') if isinstance(output, dict) else None
    base64_audio = preview.get('data') if isinstance(preview, dict) else None
    if not base64_audio:
        print("Error in response output:", res_json)
        return failure

    try:
        audio_bytes = base64.b64decode(base64_audio)
        # The sample rate reported by the decoded file replaces the default.
        audio_np, sr = sf.read(io.BytesIO(audio_bytes))
    except (ValueError, TypeError, RuntimeError) as exc:
        print(f"Error decoding preview audio: {exc}")
        return failure

    return (sr, audio_np)

def tts_interface(text: str, instruct: str):
    """UI adapter: forward the inputs to ``voice_design``.

    The UI supplies ``(text, instruct)``, while ``voice_design`` expects the
    voice description first, so the two values are passed by keyword.
    Returns whatever ``voice_design`` returns.
    """
    result = voice_design(prompt=instruct, text=text)
    return result

def _launch_demo():
    """Build the Gradio Blocks UI for the voice-design demo and launch it.

    The page has two text boxes (input text and voice instruction), an audio
    output, a clear button, a generate button and a list of clickable
    examples. Generation is routed through ``tts_interface``. The app is
    queued with ``default_concurrency_limit=100`` and ``max_size=20`` and
    launched with ``max_threads=100`` and ``share=False``.
    """
    # English Example Data: each entry is [input text, voice instruction],
    # matching the order of the `inputs` list given to gr.Examples below.
    examples = [
        [
            "Welcome to the speech synthesis system. Please enter the text you would like to have read aloud.",
            "A clear and natural female voice, moderate speed, stable tone, suitable for news broadcasting or daily conversation."
        ],
        [
            "He's not going to make it... it's all my fault. He only trusts you, Professor Ye. Please, you have to save him!",
            "Standard pronunciation with a dramatic, sobbing quality. The voice is slightly raspy and tense, conveying deep sorrow and desperate pleading."
        ],
        [
            "Our entire fleet and army are overdue for an equipment upgrade. It is time to modernize.",
            "A loud, powerful male voice exhibiting resilience and authority. The pace is brisk and fluent, slowing down slightly at the end for emphasis and decisiveness."
        ],
        [
            "Becoming a scientist is the dream of countless children. We must make scientific work attractive and a profession that children aspire to. We want to give their dreams the wings of technology.",
            "Professional broadcasting style. The pace starts slow and accelerates, with a bright, solid timbre. The tone should be inspiring, passionate, and persuasive."
        ],
        [
            "I have a couple of rifles here, but we have never had the habit of surrendering our weapons.",
            "A calm and confident tone. The speed is steady, with very clear articulation. The voice should feel firm and certain, with a slight downward inflection at the end."
        ],
        [
            "They say a family stays together through the wind and rain. I've spent my life on the waves just for a net of fish, but I never imagined a storm could be for the people. Isn't this a great opportunity?",
            "A young female voice, brave and determined, filled with idealism and warmth. High-mid pitch range, distinct cadence, and transitioning from steady to passionate."
        ],
        [
            "In this village, there is a very traditional and unique delicacy called the Green Leaf Feast. I bet I'll get to taste it today! Look, those two girls are gathering leaves right now. Let’s go ask them about it.",
            "Cheerful and extroverted personality. Fast but fluent pace, with the pitch rising at key moments to emphasize excitement and curiosity about the food."
        ],
        [
            "I have a big brother named Wang. He’s got a huge appetite for soup! Even without a weapon in his hand, his words fire faster than a machine gun.",
            "A bright, high-pitched young girl's voice. Lively and animated tone that engages the listener, with a loud and clear volume reflecting an active personality."
        ]
    ]
    
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🎙️ Qwen3-TTS Voice Design Demo
            
            Effortlessly design custom voice profiles using natural language descriptions.
            
            **How to use:**
            1. Enter the content in the **Input Text** box.
            2. Describe the desired voice (gender, age, speed, emotion) in the **Voice Instruction** box.
            3. Click **Generate Voice** or try one of the **Quick Examples** below.
            """
        )
        
        with gr.Row(equal_height=True):
            with gr.Column(scale=3):
                text_input = gr.Textbox(
                    label="📝 Input Text",
                    placeholder="Enter the text you want to synthesize...",
                    value="Welcome to the speech synthesis system. Please enter the text you would like to have read aloud.",
                    lines=6,
                )
                instruct_input = gr.Textbox(
                    label="🎨 Voice Instruction",
                    placeholder="Describe the voice: gender, age, speed, tone, emotion, scenario...",
                    value="A clear and natural female voice, moderate speed, stable tone, suitable for news broadcasting or daily conversation.",
                    lines=6,
                )
                
                with gr.Row():
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary", scale=1)
                    generate_btn = gr.Button("🎵 Generate Voice", variant="primary", scale=3, size="lg")
            
            with gr.Column(scale=3):
                result_output = gr.Audio(
                    label="🔊 Generated Result",
                    type="numpy"
                )
                
                gr.Markdown(
                    """
                    ### 💡 Voice Prompting Tips
                    
                    Try describing these dimensions:
                    - **Persona**: Male/Female, Young/Middle-aged/Elderly.
                    - **Pace**: Fast, moderate, or slow.
                    - **Timbre**: Clear, deep, sweet, magnetic, or raspy.
                    - **Emotion**: Calm, enthusiastic, gentle, serious, or lively.
                    - **Scenario**: News, storytelling, sales, or classroom.
                    """
                )
        
        # Examples Section
        gr.Markdown("### 📚 Quick Examples")
        gr.Examples(
            examples=examples,
            inputs=[text_input, instruct_input],
            outputs=result_output,
            fn=tts_interface,
            cache_examples=False,
            label="Click an example to try it out"
        )
        
        # Event Bindings
        generate_btn.click(
            fn=tts_interface,
            inputs=[text_input, instruct_input],
            outputs=[result_output]
        )
        
        # Reset both text boxes to empty strings and clear the audio output.
        clear_btn.click(
            fn=lambda: ("", "", None),
            inputs=[],
            outputs=[text_input, instruct_input, result_output]
        )
        
        gr.Markdown(
            """
            ---
            **Note:** Generation time depends on text length and server response. Please wait a moment after clicking generate.
            """
        )
    
    # Launch Settings
    demo.queue(default_concurrency_limit=100, max_size=20).launch(
        max_threads=100,
        share=False
    )

# Start the demo only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    _launch_demo()