boffire committed on
Commit
64e2649
·
verified ·
1 Parent(s): d59d5db

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -14
app.py CHANGED
@@ -34,6 +34,12 @@ EXAMPLE_SENTENCES = {
34
  "Algerian Arabic": "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟"
35
  }
36
 
 
 
 
 
 
 
37
  # ─── Model ───
38
  print("Loading model...")
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -41,6 +47,14 @@ dtype = torch.float16 if device == "cuda" else torch.float32
41
  model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
42
  print(f"Model loaded ({device})")
43
 
 
 
 
 
 
 
 
 
44
  def _build_instruct(gender, age, pitch, style):
45
  parts = []
46
  if gender and gender != "Auto":
@@ -105,6 +119,11 @@ def generate_design(text, mode, lang_choice, gender, age, pitch, style,
105
  speed, duration, num_step, guidance_scale, denoise, postprocess):
106
  if not text or not text.strip():
107
  return None, "Please enter text."
 
 
 
 
 
108
  lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
109
  kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
110
  kwargs["language"] = lang_code
@@ -128,12 +147,21 @@ def generate_design(text, mode, lang_choice, gender, age, pitch, style,
128
 
129
  # ─── Voice Clone ───
130
  @spaces.GPU
131
- def generate_clone(text, ref_audio, ref_text, lang_choice, speed, duration,
132
  num_step, guidance_scale, denoise, postprocess):
133
  if not text or not text.strip():
134
  return None, "Please enter text."
135
- if ref_audio is None:
136
- return None, "Please upload reference audio."
 
 
 
 
 
 
 
 
 
137
 
138
  # Ensure ref_audio is a valid file path
139
  if isinstance(ref_audio, tuple):
@@ -161,11 +189,17 @@ def generate_clone(text, ref_audio, ref_text, lang_choice, speed, duration,
161
  except Exception as e:
162
  return None, f"Error: {e}"
163
 
 
 
 
 
164
  # ─── UI ───
165
  CSS = """
166
  .main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
167
  .subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
168
  footer { display: none !important; }
 
 
169
  """
170
 
171
  with gr.Blocks(title="OmniVoice") as app:
@@ -177,9 +211,14 @@ with gr.Blocks(title="OmniVoice") as app:
177
  with gr.Tab("Voice Design"):
178
  with gr.Row():
179
  with gr.Column(scale=1):
180
- d_text = gr.Textbox(label="Text to speak", lines=6,
181
- placeholder="Enter text in the selected language...",
182
- value=DEFAULT_TEXT)
 
 
 
 
 
183
  d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
184
  d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
185
  label="Language", info="Select the language of the input text")
@@ -189,9 +228,9 @@ with gr.Blocks(title="OmniVoice") as app:
189
 
190
  # ── Always visible gender buttons ──
191
  with gr.Row():
192
- male_btn = gr.Button("👨 Masculine Voice", variant="secondary")
193
- female_btn = gr.Button("👩 Feminine Voice", variant="secondary")
194
- gr.Markdown("📌 *These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.*")
195
 
196
  # Voice design attributes (visible only when mode == "Voice Design")
197
  with gr.Group(visible=False) as d_voice_opts:
@@ -223,6 +262,14 @@ with gr.Blocks(title="OmniVoice") as app:
223
  d_audio = gr.Audio(label="Generated Audio")
224
  d_status = gr.Textbox(label="Status", interactive=False)
225
 
 
 
 
 
 
 
 
 
226
  # Button events
227
  male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
228
  female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])
@@ -243,10 +290,30 @@ with gr.Blocks(title="OmniVoice") as app:
243
  with gr.Tab("Voice Clone"):
244
  with gr.Row():
245
  with gr.Column(scale=1):
246
- c_text = gr.Textbox(label="Text to speak", lines=6,
247
- placeholder="Enter text in the selected language...",
248
- value=DEFAULT_TEXT)
249
- c_ref = gr.Audio(label="Reference Audio (3–15 seconds)", type="filepath")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
251
  placeholder="Leave empty for auto-transcription")
252
  c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
@@ -270,9 +337,19 @@ with gr.Blocks(title="OmniVoice") as app:
270
  c_audio = gr.Audio(label="Generated Audio")
271
  c_status = gr.Textbox(label="Status", interactive=False)
272
 
 
 
 
 
 
 
 
 
 
 
273
  c_btn.click(
274
  fn=generate_clone,
275
- inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
276
  c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
277
  outputs=[c_audio, c_status],
278
  )
 
34
  "Algerian Arabic": "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟"
35
  }
36
 
37
+ # ─── Pre‑loaded cloned voices ───
38
+ PRELOADED_VOICES = {
39
+ "Upload my own": None,
40
+ "Muhya (pre‑loaded)": "assets/muhya.mp3",
41
+ }
42
+
43
  # ─── Model ───
44
  print("Loading model...")
45
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
47
  model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
48
  print(f"Model loaded ({device})")
49
 
50
+ MAX_WORDS = 50
51
+
52
+ def _count_words(text):
53
+ """Count words in a string (splits on whitespace)."""
54
+ if not text:
55
+ return 0
56
+ return len(text.strip().split())
57
+
58
  def _build_instruct(gender, age, pitch, style):
59
  parts = []
60
  if gender and gender != "Auto":
 
119
  speed, duration, num_step, guidance_scale, denoise, postprocess):
120
  if not text or not text.strip():
121
  return None, "Please enter text."
122
+
123
+ word_count = _count_words(text)
124
+ if word_count > MAX_WORDS:
125
+ return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."
126
+
127
  lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
128
  kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
129
  kwargs["language"] = lang_code
 
147
 
148
  # ─── Voice Clone ───
149
  @spaces.GPU
150
+ def generate_clone(text, voice_choice, ref_audio, ref_text, lang_choice, speed, duration,
151
  num_step, guidance_scale, denoise, postprocess):
152
  if not text or not text.strip():
153
  return None, "Please enter text."
154
+
155
+ word_count = _count_words(text)
156
+ if word_count > MAX_WORDS:
157
+ return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."
158
+
159
+ # Determine the actual reference audio path
160
+ preloaded_path = PRELOADED_VOICES.get(voice_choice)
161
+ if preloaded_path:
162
+ ref_audio = preloaded_path
163
+ elif ref_audio is None:
164
+ return None, "Please upload reference audio or select a pre‑loaded voice."
165
 
166
  # Ensure ref_audio is a valid file path
167
  if isinstance(ref_audio, tuple):
 
189
  except Exception as e:
190
  return None, f"Error: {e}"
191
 
192
+ def toggle_ref_audio(voice_choice):
193
+ """Show/hide the manual upload field based on voice selection."""
194
+ return gr.update(visible=(voice_choice == "Upload my own"))
195
+
196
  # ─── UI ───
197
  CSS = """
198
  .main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
199
  .subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
200
  footer { display: none !important; }
201
+ .word-counter { text-align: right; font-size: 0.85em; color: #666; margin-top: -0.5em; }
202
+ .word-counter.over-limit { color: #d32f2f; font-weight: bold; }
203
  """
204
 
205
  with gr.Blocks(title="OmniVoice") as app:
 
211
  with gr.Tab("Voice Design"):
212
  with gr.Row():
213
  with gr.Column(scale=1):
214
+ d_text = gr.Textbox(
215
+ label="Text to speak", lines=6,
216
+ placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
217
+ value=DEFAULT_TEXT
218
+ )
219
+ d_word_counter = gr.HTML(
220
+ value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
221
+ )
222
  d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
223
  d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
224
  label="Language", info="Select the language of the input text")
 
228
 
229
  # ── Always visible gender buttons ──
230
  with gr.Row():
231
+ male_btn = gr.Button("Masculine Voice", variant="secondary")
232
+ female_btn = gr.Button("Feminine Voice", variant="secondary")
233
+ gr.Markdown("*These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.*")
234
 
235
  # Voice design attributes (visible only when mode == "Voice Design")
236
  with gr.Group(visible=False) as d_voice_opts:
 
262
  d_audio = gr.Audio(label="Generated Audio")
263
  d_status = gr.Textbox(label="Status", interactive=False)
264
 
265
+ # Live word counter update
266
+ def update_word_counter(text):
267
+ count = _count_words(text)
268
+ css_class = "word-counter over-limit" if count > MAX_WORDS else "word-counter"
269
+ return f'<div class="{css_class}">{count} / {MAX_WORDS} words</div>'
270
+
271
+ d_text.change(fn=update_word_counter, inputs=d_text, outputs=d_word_counter)
272
+
273
  # Button events
274
  male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
275
  female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])
 
290
  with gr.Tab("Voice Clone"):
291
  with gr.Row():
292
  with gr.Column(scale=1):
293
+ c_text = gr.Textbox(
294
+ label="Text to speak", lines=6,
295
+ placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
296
+ value=DEFAULT_TEXT
297
+ )
298
+ c_word_counter = gr.HTML(
299
+ value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
300
+ )
301
+
302
+ # Pre-loaded voice selector
303
+ c_voice_choice = gr.Dropdown(
304
+ choices=list(PRELOADED_VOICES.keys()),
305
+ value="Upload my own",
306
+ label="Voice Source",
307
+ info="Choose a pre‑loaded voice or upload your own"
308
+ )
309
+
310
+ # Manual upload (hidden when a pre-loaded voice is selected)
311
+ c_ref = gr.Audio(
312
+ label="Reference Audio (3–15 seconds)",
313
+ type="filepath",
314
+ visible=True
315
+ )
316
+
317
  c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
318
  placeholder="Leave empty for auto-transcription")
319
  c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
 
337
  c_audio = gr.Audio(label="Generated Audio")
338
  c_status = gr.Textbox(label="Status", interactive=False)
339
 
340
+ # Live word counter update
341
+ c_text.change(fn=update_word_counter, inputs=c_text, outputs=c_word_counter)
342
+
343
+ # Toggle upload field visibility
344
+ c_voice_choice.change(
345
+ fn=toggle_ref_audio,
346
+ inputs=c_voice_choice,
347
+ outputs=c_ref
348
+ )
349
+
350
  c_btn.click(
351
  fn=generate_clone,
352
+ inputs=[c_text, c_voice_choice, c_ref, c_ref_text, c_lang, c_speed,
353
  c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
354
  outputs=[c_audio, c_status],
355
  )