Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,6 +34,12 @@ EXAMPLE_SENTENCES = {
|
|
| 34 |
"Algerian Arabic": "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟"
|
| 35 |
}
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# ─── Model ───
|
| 38 |
print("Loading model...")
|
| 39 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
@@ -41,6 +47,14 @@ dtype = torch.float16 if device == "cuda" else torch.float32
|
|
| 41 |
model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
|
| 42 |
print(f"Model loaded ({device})")
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
def _build_instruct(gender, age, pitch, style):
|
| 45 |
parts = []
|
| 46 |
if gender and gender != "Auto":
|
|
@@ -105,6 +119,11 @@ def generate_design(text, mode, lang_choice, gender, age, pitch, style,
|
|
| 105 |
speed, duration, num_step, guidance_scale, denoise, postprocess):
|
| 106 |
if not text or not text.strip():
|
| 107 |
return None, "Please enter text."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
|
| 109 |
kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
|
| 110 |
kwargs["language"] = lang_code
|
|
@@ -128,12 +147,21 @@ def generate_design(text, mode, lang_choice, gender, age, pitch, style,
|
|
| 128 |
|
| 129 |
# ─── Voice Clone ───
|
| 130 |
@spaces.GPU
|
| 131 |
-
def generate_clone(text, ref_audio, ref_text, lang_choice, speed, duration,
|
| 132 |
num_step, guidance_scale, denoise, postprocess):
|
| 133 |
if not text or not text.strip():
|
| 134 |
return None, "Please enter text."
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# Ensure ref_audio is a valid file path
|
| 139 |
if isinstance(ref_audio, tuple):
|
|
@@ -161,11 +189,17 @@ def generate_clone(text, ref_audio, ref_text, lang_choice, speed, duration,
|
|
| 161 |
except Exception as e:
|
| 162 |
return None, f"Error: {e}"
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
# ─── UI ───
|
| 165 |
CSS = """
|
| 166 |
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
|
| 167 |
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
|
| 168 |
footer { display: none !important; }
|
|
|
|
|
|
|
| 169 |
"""
|
| 170 |
|
| 171 |
with gr.Blocks(title="OmniVoice") as app:
|
|
@@ -177,9 +211,14 @@ with gr.Blocks(title="OmniVoice") as app:
|
|
| 177 |
with gr.Tab("Voice Design"):
|
| 178 |
with gr.Row():
|
| 179 |
with gr.Column(scale=1):
|
| 180 |
-
d_text = gr.Textbox(
|
| 181 |
-
|
| 182 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
|
| 184 |
d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
|
| 185 |
label="Language", info="Select the language of the input text")
|
|
@@ -189,9 +228,9 @@ with gr.Blocks(title="OmniVoice") as app:
|
|
| 189 |
|
| 190 |
# ── Always visible gender buttons ──
|
| 191 |
with gr.Row():
|
| 192 |
-
male_btn = gr.Button("
|
| 193 |
-
female_btn = gr.Button("
|
| 194 |
-
gr.Markdown("
|
| 195 |
|
| 196 |
# Voice design attributes (visible only when mode == "Voice Design")
|
| 197 |
with gr.Group(visible=False) as d_voice_opts:
|
|
@@ -223,6 +262,14 @@ with gr.Blocks(title="OmniVoice") as app:
|
|
| 223 |
d_audio = gr.Audio(label="Generated Audio")
|
| 224 |
d_status = gr.Textbox(label="Status", interactive=False)
|
| 225 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
# Button events
|
| 227 |
male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
|
| 228 |
female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])
|
|
@@ -243,10 +290,30 @@ with gr.Blocks(title="OmniVoice") as app:
|
|
| 243 |
with gr.Tab("Voice Clone"):
|
| 244 |
with gr.Row():
|
| 245 |
with gr.Column(scale=1):
|
| 246 |
-
c_text = gr.Textbox(
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
|
| 251 |
placeholder="Leave empty for auto-transcription")
|
| 252 |
c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
|
|
@@ -270,9 +337,19 @@ with gr.Blocks(title="OmniVoice") as app:
|
|
| 270 |
c_audio = gr.Audio(label="Generated Audio")
|
| 271 |
c_status = gr.Textbox(label="Status", interactive=False)
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
c_btn.click(
|
| 274 |
fn=generate_clone,
|
| 275 |
-
inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
|
| 276 |
c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
|
| 277 |
outputs=[c_audio, c_status],
|
| 278 |
)
|
|
|
|
| 34 |
"Algerian Arabic": "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟"
|
| 35 |
}
|
| 36 |
|
| 37 |
+
# ─── Pre‑loaded cloned voices ───
|
| 38 |
+
PRELOADED_VOICES = {
|
| 39 |
+
"Upload my own": None,
|
| 40 |
+
"Muhya (pre‑loaded)": "assets/muhya.mp3",
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
# ─── Model ───
|
| 44 |
print("Loading model...")
|
| 45 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 47 |
model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
|
| 48 |
print(f"Model loaded ({device})")
|
| 49 |
|
| 50 |
+
MAX_WORDS = 50
|
| 51 |
+
|
| 52 |
+
def _count_words(text):
|
| 53 |
+
"""Count words in a string (splits on whitespace)."""
|
| 54 |
+
if not text:
|
| 55 |
+
return 0
|
| 56 |
+
return len(text.strip().split())
|
| 57 |
+
|
| 58 |
def _build_instruct(gender, age, pitch, style):
|
| 59 |
parts = []
|
| 60 |
if gender and gender != "Auto":
|
|
|
|
| 119 |
speed, duration, num_step, guidance_scale, denoise, postprocess):
|
| 120 |
if not text or not text.strip():
|
| 121 |
return None, "Please enter text."
|
| 122 |
+
|
| 123 |
+
word_count = _count_words(text)
|
| 124 |
+
if word_count > MAX_WORDS:
|
| 125 |
+
return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."
|
| 126 |
+
|
| 127 |
lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
|
| 128 |
kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
|
| 129 |
kwargs["language"] = lang_code
|
|
|
|
| 147 |
|
| 148 |
# ─── Voice Clone ───
|
| 149 |
@spaces.GPU
|
| 150 |
+
def generate_clone(text, voice_choice, ref_audio, ref_text, lang_choice, speed, duration,
|
| 151 |
num_step, guidance_scale, denoise, postprocess):
|
| 152 |
if not text or not text.strip():
|
| 153 |
return None, "Please enter text."
|
| 154 |
+
|
| 155 |
+
word_count = _count_words(text)
|
| 156 |
+
if word_count > MAX_WORDS:
|
| 157 |
+
return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."
|
| 158 |
+
|
| 159 |
+
# Determine the actual reference audio path
|
| 160 |
+
preloaded_path = PRELOADED_VOICES.get(voice_choice)
|
| 161 |
+
if preloaded_path:
|
| 162 |
+
ref_audio = preloaded_path
|
| 163 |
+
elif ref_audio is None:
|
| 164 |
+
return None, "Please upload reference audio or select a pre‑loaded voice."
|
| 165 |
|
| 166 |
# Ensure ref_audio is a valid file path
|
| 167 |
if isinstance(ref_audio, tuple):
|
|
|
|
| 189 |
except Exception as e:
|
| 190 |
return None, f"Error: {e}"
|
| 191 |
|
| 192 |
+
def toggle_ref_audio(voice_choice):
    """Build the update that shows or hides the manual upload field.

    The field is made visible only when ``voice_choice`` equals
    "Upload my own"; any other selection hides it.
    """
    show_upload = voice_choice == "Upload my own"
    return gr.update(visible=show_upload)
|
| 195 |
+
|
| 196 |
# ─── UI ───
|
| 197 |
CSS = """
|
| 198 |
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
|
| 199 |
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
|
| 200 |
footer { display: none !important; }
|
| 201 |
+
.word-counter { text-align: right; font-size: 0.85em; color: #666; margin-top: -0.5em; }
|
| 202 |
+
.word-counter.over-limit { color: #d32f2f; font-weight: bold; }
|
| 203 |
"""
|
| 204 |
|
| 205 |
with gr.Blocks(title="OmniVoice") as app:
|
|
|
|
| 211 |
with gr.Tab("Voice Design"):
|
| 212 |
with gr.Row():
|
| 213 |
with gr.Column(scale=1):
|
| 214 |
+
d_text = gr.Textbox(
|
| 215 |
+
label="Text to speak", lines=6,
|
| 216 |
+
placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
|
| 217 |
+
value=DEFAULT_TEXT
|
| 218 |
+
)
|
| 219 |
+
d_word_counter = gr.HTML(
|
| 220 |
+
value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
|
| 221 |
+
)
|
| 222 |
d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
|
| 223 |
d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
|
| 224 |
label="Language", info="Select the language of the input text")
|
|
|
|
| 228 |
|
| 229 |
# ── Always visible gender buttons ──
|
| 230 |
with gr.Row():
|
| 231 |
+
male_btn = gr.Button("Masculine Voice", variant="secondary")
|
| 232 |
+
female_btn = gr.Button("Feminine Voice", variant="secondary")
|
| 233 |
+
gr.Markdown("*These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.*")
|
| 234 |
|
| 235 |
# Voice design attributes (visible only when mode == "Voice Design")
|
| 236 |
with gr.Group(visible=False) as d_voice_opts:
|
|
|
|
| 262 |
d_audio = gr.Audio(label="Generated Audio")
|
| 263 |
d_status = gr.Textbox(label="Status", interactive=False)
|
| 264 |
|
| 265 |
+
# Live word counter update
|
| 266 |
+
def update_word_counter(text):
    """Render the word-counter HTML snippet for ``text``.

    The wrapping div always has the ``word-counter`` class and gains the
    ``over-limit`` class once the word count exceeds ``MAX_WORDS``.
    """
    n_words = _count_words(text)
    classes = ["word-counter"]
    if n_words > MAX_WORDS:
        classes.append("over-limit")
    class_attr = " ".join(classes)
    return f'<div class="{class_attr}">{n_words} / {MAX_WORDS} words</div>'
|
| 270 |
+
|
| 271 |
+
d_text.change(fn=update_word_counter, inputs=d_text, outputs=d_word_counter)
|
| 272 |
+
|
| 273 |
# Button events
|
| 274 |
male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
|
| 275 |
female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])
|
|
|
|
| 290 |
with gr.Tab("Voice Clone"):
|
| 291 |
with gr.Row():
|
| 292 |
with gr.Column(scale=1):
|
| 293 |
+
c_text = gr.Textbox(
|
| 294 |
+
label="Text to speak", lines=6,
|
| 295 |
+
placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
|
| 296 |
+
value=DEFAULT_TEXT
|
| 297 |
+
)
|
| 298 |
+
c_word_counter = gr.HTML(
|
| 299 |
+
value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
# Pre-loaded voice selector
|
| 303 |
+
c_voice_choice = gr.Dropdown(
|
| 304 |
+
choices=list(PRELOADED_VOICES.keys()),
|
| 305 |
+
value="Upload my own",
|
| 306 |
+
label="Voice Source",
|
| 307 |
+
info="Choose a pre‑loaded voice or upload your own"
|
| 308 |
+
)
|
| 309 |
+
|
| 310 |
+
# Manual upload (hidden when a pre-loaded voice is selected)
|
| 311 |
+
c_ref = gr.Audio(
|
| 312 |
+
label="Reference Audio (3–15 seconds)",
|
| 313 |
+
type="filepath",
|
| 314 |
+
visible=True
|
| 315 |
+
)
|
| 316 |
+
|
| 317 |
c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
|
| 318 |
placeholder="Leave empty for auto-transcription")
|
| 319 |
c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
|
|
|
|
| 337 |
c_audio = gr.Audio(label="Generated Audio")
|
| 338 |
c_status = gr.Textbox(label="Status", interactive=False)
|
| 339 |
|
| 340 |
+
# Live word counter update
|
| 341 |
+
c_text.change(fn=update_word_counter, inputs=c_text, outputs=c_word_counter)
|
| 342 |
+
|
| 343 |
+
# Toggle upload field visibility
|
| 344 |
+
c_voice_choice.change(
|
| 345 |
+
fn=toggle_ref_audio,
|
| 346 |
+
inputs=c_voice_choice,
|
| 347 |
+
outputs=c_ref
|
| 348 |
+
)
|
| 349 |
+
|
| 350 |
c_btn.click(
|
| 351 |
fn=generate_clone,
|
| 352 |
+
inputs=[c_text, c_voice_choice, c_ref, c_ref_text, c_lang, c_speed,
|
| 353 |
c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
|
| 354 |
outputs=[c_audio, c_status],
|
| 355 |
)
|