import base64
import json
import gradio as gr
import numpy as np
import requests
import os
import io
import soundfile as sf
# The DashScope credential is read from the API_KEY environment variable.
# When the variable is unset, a placeholder string is used instead, so export
# a real key before launching the demo.
API_KEY = os.getenv('API_KEY', 'your_api_key_here')
def voice_design(prompt, text, *, timeout=120):
    """Request a preview clip for a voice described in natural language.

    Posts a ``create`` action to the DashScope voice-design endpoint and
    decodes the base64-encoded WAV preview contained in the response.

    Args:
        prompt: Natural-language description of the desired voice.
        text: Text to be spoken in the preview clip.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(sample_rate, samples)`` tuple. On any failure (network error,
        non-200 status, unexpected payload, undecodable audio) the problem
        is printed and ``(24000, np.array([]))`` is returned instead of
        raising.
    """
    url = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/customization"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }
    data = {
        "model": "qwen-voice-design",
        "input": {
            "action": "create",
            "voice_prompt": prompt,
            "preview_text": text,
            "target_model": "qwen3-tts-vd-realtime-2025-12-16",
            "preferred_name": "default"
        },
        "parameters": {
            "sample_rate": 24000,
            "response_format": "wav"
        }
    }
    # Fallback sample rate; matches the rate requested in ``parameters``.
    sr = 24000
    empty_result = (sr, np.array([]))

    # requests has no default timeout: without one, a stalled connection
    # would block the calling worker indefinitely.
    try:
        response = requests.post(
            url, headers=headers, data=json.dumps(data), timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"API request failed: {exc}")
        return empty_result

    if response.status_code != 200:
        print(f"API Error: {response.status_code} - {response.text}")
        return empty_result

    try:
        res_json = json.loads(response.text)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        print(f"API returned a non-JSON body: {exc}")
        return empty_result

    # A single lookup also covers a missing 'data' key or a payload that is
    # not a nested mapping at all.
    try:
        base64_audio = res_json['output']['preview_audio']['data']
    except (KeyError, TypeError):
        print("Error in response output:", res_json)
        return empty_result

    # b64decode raises binascii.Error (a ValueError) or TypeError on bad
    # input; soundfile's read errors derive from RuntimeError.
    try:
        audio_bytes = base64.b64decode(base64_audio)
        audio_np, sr = sf.read(io.BytesIO(audio_bytes))
    except (ValueError, TypeError, RuntimeError) as exc:
        print(f"Failed to decode preview audio: {exc}")
        return empty_result

    return (sr, audio_np)
def tts_interface(text: str, instruct: str):
    """Gradio callback that adapts the UI argument order to ``voice_design``.

    The UI supplies the text first and the voice instruction second, while
    ``voice_design`` takes the voice description first.
    """
    synthesized = voice_design(prompt=instruct, text=text)
    return synthesized
def _launch_demo():
    """Build the Gradio Blocks UI for the voice-design demo and launch it.

    The page has a text box and a voice-instruction box, Clear and Generate
    buttons, an audio output, a tips panel and a list of clickable examples.
    Generation is delegated to ``tts_interface``.
    """
    # English example data: each entry is [input text, voice instruction].
    examples = [
        [
            "Welcome to the speech synthesis system. Please enter the text you would like to have read aloud.",
            "A clear and natural female voice, moderate speed, stable tone, suitable for news broadcasting or daily conversation."
        ],
        [
            "He's not going to make it... it's all my fault. He only trusts you, Professor Ye. Please, you have to save him!",
            "Standard pronunciation with a dramatic, sobbing quality. The voice is slightly raspy and tense, conveying deep sorrow and desperate pleading."
        ],
        [
            "Our entire fleet and army are overdue for an equipment upgrade. It is time to modernize.",
            "A loud, powerful male voice exhibiting resilience and authority. The pace is brisk and fluent, slowing down slightly at the end for emphasis and decisiveness."
        ],
        [
            "Becoming a scientist is the dream of countless children. We must make scientific work attractive and a profession that children aspire to. We want to give their dreams the wings of technology.",
            "Professional broadcasting style. The pace starts slow and accelerates, with a bright, solid timbre. The tone should be inspiring, passionate, and persuasive."
        ],
        [
            "I have a couple of rifles here, but we have never had the habit of surrendering our weapons.",
            "A calm and confident tone. The speed is steady, with very clear articulation. The voice should feel firm and certain, with a slight downward inflection at the end."
        ],
        [
            "They say a family stays together through the wind and rain. I've spent my life on the waves just for a net of fish, but I never imagined a storm could be for the people. Isn't this a great opportunity?",
            "A young female voice, brave and determined, filled with idealism and warmth. High-mid pitch range, distinct cadence, and transitioning from steady to passionate."
        ],
        [
            "In this village, there is a very traditional and unique delicacy called the Green Leaf Feast. I bet I'll get to taste it today! Look, those two girls are gathering leaves right now. Let's go ask them about it.",
            "Cheerful and extroverted personality. Fast but fluent pace, with the pitch rising at key moments to emphasize excitement and curiosity about the food."
        ],
        [
            "I have a big brother named Wang. He's got a huge appetite for soup! Even without a weapon in his hand, his words fire faster than a machine gun.",
            "A bright, high-pitched young girl's voice. Lively and animated tone that engages the listener, with a loud and clear volume reflecting an active personality."
        ],
    ]
    # The input boxes start out pre-filled with the first example.
    default_text, default_instruct = examples[0]

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🎙️ Qwen3-TTS Voice Design Demo

            Effortlessly design custom voice profiles using natural language descriptions.

            **How to use:**

            1. Enter the content in the **Input Text** box.
            2. Describe the desired voice (gender, age, speed, emotion) in the **Voice Instruction** box.
            3. Click **Generate Voice** or try one of the **Quick Examples** below.
            """
        )
        with gr.Row(equal_height=True):
            with gr.Column(scale=3):
                text_input = gr.Textbox(
                    label="📝 Input Text",
                    placeholder="Enter the text you want to synthesize...",
                    value=default_text,
                    lines=6,
                )
                instruct_input = gr.Textbox(
                    label="🎨 Voice Instruction",
                    placeholder="Describe the voice: gender, age, speed, tone, emotion, scenario...",
                    value=default_instruct,
                    lines=6,
                )
                with gr.Row():
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary", scale=1)
                    generate_btn = gr.Button("🎵 Generate Voice", variant="primary", scale=3, size="lg")
            with gr.Column(scale=3):
                result_output = gr.Audio(
                    label="🔊 Generated Result",
                    type="numpy"
                )
                gr.Markdown(
                    """
                    ### 💡 Voice Prompting Tips

                    Try describing these dimensions:

                    - **Persona**: Male/Female, Young/Middle-aged/Elderly.
                    - **Pace**: Fast, moderate, or slow.
                    - **Timbre**: Clear, deep, sweet, magnetic, or raspy.
                    - **Emotion**: Calm, enthusiastic, gentle, serious, or lively.
                    - **Scenario**: News, storytelling, sales, or classroom.
                    """
                )

        # Examples section
        gr.Markdown("### 🌟 Quick Examples")
        gr.Examples(
            examples=examples,
            inputs=[text_input, instruct_input],
            outputs=result_output,
            fn=tts_interface,
            cache_examples=False,
            label="Click an example to try it out"
        )

        # Event bindings
        generate_btn.click(
            fn=tts_interface,
            inputs=[text_input, instruct_input],
            outputs=[result_output]
        )
        # Clear resets both text boxes to empty and removes the audio result.
        clear_btn.click(
            fn=lambda: ("", "", None),
            inputs=[],
            outputs=[text_input, instruct_input, result_output]
        )

        gr.Markdown(
            """
            ---
            **Note:** Generation time depends on text length and server response. Please wait a moment after clicking generate.
            """
        )

    # Launch settings
    demo.queue(default_concurrency_limit=100, max_size=20).launch(
        max_threads=100,
        share=False
    )
# Start the demo only when this file is run as a script, not when imported.
if __name__ == "__main__":
    _launch_demo()