Spaces:

boffire
/

OmniVoice-kabyle

Sleeping

App Files Files Community

boffire committed on May 4

Commit

64e2649

verified ·

1 Parent(s): d59d5db

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -14

app.py CHANGED Viewed

@@ -34,6 +34,12 @@ EXAMPLE_SENTENCES = {
     "Algerian Arabic": "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟"
 }
 # ─── Model ───
 print("Loading model...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -41,6 +47,14 @@ dtype = torch.float16 if device == "cuda" else torch.float32
 model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
 print(f"Model loaded ({device})")
 def _build_instruct(gender, age, pitch, style):
     parts = []
     if gender and gender != "Auto":
@@ -105,6 +119,11 @@ def generate_design(text, mode, lang_choice, gender, age, pitch, style,
                     speed, duration, num_step, guidance_scale, denoise, postprocess):
     if not text or not text.strip():
         return None, "Please enter text."
     lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
     kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
     kwargs["language"] = lang_code
@@ -128,12 +147,21 @@ def generate_design(text, mode, lang_choice, gender, age, pitch, style,
 # ─── Voice Clone ───
 @spaces.GPU
-def generate_clone(text, ref_audio, ref_text, lang_choice, speed, duration,
                    num_step, guidance_scale, denoise, postprocess):
     if not text or not text.strip():
         return None, "Please enter text."
-    if ref_audio is None:
-        return None, "Please upload reference audio."
     # Ensure ref_audio is a valid file path
     if isinstance(ref_audio, tuple):
@@ -161,11 +189,17 @@ def generate_clone(text, ref_audio, ref_text, lang_choice, speed, duration,
     except Exception as e:
         return None, f"Error: {e}"
 # ─── UI ───
 CSS = """
 .main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
 .subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
 footer { display: none !important; }
 """
 with gr.Blocks(title="OmniVoice") as app:
@@ -177,9 +211,14 @@ with gr.Blocks(title="OmniVoice") as app:
         with gr.Tab("Voice Design"):
             with gr.Row():
                 with gr.Column(scale=1):
-                    d_text = gr.Textbox(label="Text to speak", lines=6,
-                                        placeholder="Enter text in the selected language...",
-                                        value=DEFAULT_TEXT)
                     d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
                     d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
                                          label="Language", info="Select the language of the input text")
@@ -189,9 +228,9 @@ with gr.Blocks(title="OmniVoice") as app:
                     # ── Always visible gender buttons ──
                     with gr.Row():
-                        male_btn = gr.Button("👨 Masculine Voice", variant="secondary")
-                        female_btn = gr.Button("👩 Feminine Voice", variant="secondary")
-                    gr.Markdown("📌 *These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.*")
                     # Voice design attributes (visible only when mode == "Voice Design")
                     with gr.Group(visible=False) as d_voice_opts:
@@ -223,6 +262,14 @@ with gr.Blocks(title="OmniVoice") as app:
                     d_audio = gr.Audio(label="Generated Audio")
                     d_status = gr.Textbox(label="Status", interactive=False)
             # Button events
             male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
             female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])
@@ -243,10 +290,30 @@ with gr.Blocks(title="OmniVoice") as app:
         with gr.Tab("Voice Clone"):
             with gr.Row():
                 with gr.Column(scale=1):
-                    c_text = gr.Textbox(label="Text to speak", lines=6,
-                                        placeholder="Enter text in the selected language...",
-                                        value=DEFAULT_TEXT)
-                    c_ref = gr.Audio(label="Reference Audio (3–15 seconds)", type="filepath")
                     c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
                                              placeholder="Leave empty for auto-transcription")
                     c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
@@ -270,9 +337,19 @@ with gr.Blocks(title="OmniVoice") as app:
                     c_audio = gr.Audio(label="Generated Audio")
                     c_status = gr.Textbox(label="Status", interactive=False)
             c_btn.click(
                 fn=generate_clone,
-                inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
                         c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                 outputs=[c_audio, c_status],
             )

     "Algerian Arabic": "شحال شْبَابْ ليوم. ليوما رانا حابين نروحو للبحر. تحب تجي معانا ولٌا لا؟"
 }
+# ─── Pre‑loaded cloned voices ───
+# Maps the label shown in the "Voice Source" dropdown to a reference audio path.
+# A value of None means no bundled file: generate_clone then falls back to the
+# audio the user uploaded.
+PRELOADED_VOICES = {
+    "Upload my own": None,
+    "Muhya (pre‑loaded)": "assets/muhya.mp3",
+}
 # ─── Model ───
 print("Loading model...")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = OmniVoice.from_pretrained("k2-fsa/OmniVoice", device_map=device, dtype=dtype)
 print(f"Model loaded ({device})")
+MAX_WORDS = 50
+def _count_words(text):
+    """Return the number of whitespace-separated words in text (0 if empty or None)."""
+    # Falsy input (None or "") counts as zero words.
+    if not text:
+        return 0
+    # split() with no separator splits on runs of whitespace, so a
+    # whitespace-only string also yields 0.
+    return len(text.strip().split())
 def _build_instruct(gender, age, pitch, style):
     parts = []
     if gender and gender != "Auto":
                     speed, duration, num_step, guidance_scale, denoise, postprocess):
     if not text or not text.strip():
         return None, "Please enter text."
+    word_count = _count_words(text)
+    if word_count > MAX_WORDS:
+        return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."
     lang_code = LANG_CODE_MAP.get(lang_choice, "kab")
     kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
     kwargs["language"] = lang_code
 # ─── Voice Clone ───
 @spaces.GPU
+def generate_clone(text, voice_choice, ref_audio, ref_text, lang_choice, speed, duration,
                    num_step, guidance_scale, denoise, postprocess):
     if not text or not text.strip():
         return None, "Please enter text."
+    word_count = _count_words(text)
+    if word_count > MAX_WORDS:
+        return None, f"Text too long: {word_count} words (max {MAX_WORDS}). Please shorten your input."
+    # Determine the actual reference audio path
+    preloaded_path = PRELOADED_VOICES.get(voice_choice)
+    if preloaded_path:
+        ref_audio = preloaded_path
+    elif ref_audio is None:
+        return None, "Please upload reference audio or select a pre‑loaded voice."
     # Ensure ref_audio is a valid file path
     if isinstance(ref_audio, tuple):
     except Exception as e:
         return None, f"Error: {e}"
+def toggle_ref_audio(voice_choice):
+    """Show/hide the manual upload field based on voice selection."""
+    # The field is visible only for the "Upload my own" choice; any other
+    # selection hides it.
+    return gr.update(visible=(voice_choice == "Upload my own"))
 # ─── UI ───
 CSS = """
 .main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
 .subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
 footer { display: none !important; }
+.word-counter { text-align: right; font-size: 0.85em; color: #666; margin-top: -0.5em; }
+.word-counter.over-limit { color: #d32f2f; font-weight: bold; }
 """
 with gr.Blocks(title="OmniVoice") as app:
         with gr.Tab("Voice Design"):
             with gr.Row():
                 with gr.Column(scale=1):
+                    d_text = gr.Textbox(
+                        label="Text to speak", lines=6,
+                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
+                        value=DEFAULT_TEXT
+                    )
+                    d_word_counter = gr.HTML(
+                        value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
+                    )
                     d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="Mode")
                     d_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
                                          label="Language", info="Select the language of the input text")
                     # ── Always visible gender buttons ──
                     with gr.Row():
+                        male_btn = gr.Button("Masculine Voice", variant="secondary")
+                        female_btn = gr.Button("Feminine Voice", variant="secondary")
+                    gr.Markdown("*These buttons switch to Voice Design mode and set the gender. Fine‑tune other attributes below.*")
                     # Voice design attributes (visible only when mode == "Voice Design")
                     with gr.Group(visible=False) as d_voice_opts:
                     d_audio = gr.Audio(label="Generated Audio")
                     d_status = gr.Textbox(label="Status", interactive=False)
+            # Live word counter update
+            def update_word_counter(text):
+                """Return the word-counter HTML snippet for the given text."""
+                count = _count_words(text)
+                # Add the "over-limit" CSS class once the count exceeds MAX_WORDS.
+                css_class = "word-counter over-limit" if count > MAX_WORDS else "word-counter"
+                return f'<div class="{css_class}">{count} / {MAX_WORDS} words</div>'
+            d_text.change(fn=update_word_counter, inputs=d_text, outputs=d_word_counter)
             # Button events
             male_btn.click(fn=set_male, inputs=[], outputs=[d_gender, d_mode])
             female_btn.click(fn=set_female, inputs=[], outputs=[d_gender, d_mode])
         with gr.Tab("Voice Clone"):
             with gr.Row():
                 with gr.Column(scale=1):
+                    c_text = gr.Textbox(
+                        label="Text to speak", lines=6,
+                        placeholder=f"Enter text in the selected language... (max {MAX_WORDS} words)",
+                        value=DEFAULT_TEXT
+                    )
+                    c_word_counter = gr.HTML(
+                        value=f'<div class="word-counter">{_count_words(DEFAULT_TEXT)} / {MAX_WORDS} words</div>'
+                    )
+                    # Pre-loaded voice selector
+                    c_voice_choice = gr.Dropdown(
+                        choices=list(PRELOADED_VOICES.keys()),
+                        value="Upload my own",
+                        label="Voice Source",
+                        info="Choose a pre‑loaded voice or upload your own"
+                    )
+                    # Manual upload (hidden when a pre-loaded voice is selected)
+                    c_ref = gr.Audio(
+                        label="Reference Audio (3–15 seconds)",
+                        type="filepath",
+                        visible=True
+                    )
                     c_ref_text = gr.Textbox(label="Transcription (optional)", lines=2,
                                              placeholder="Leave empty for auto-transcription")
                     c_lang = gr.Dropdown(choices=LANGUAGE_CHOICES, value="Kabyle (default)",
                     c_audio = gr.Audio(label="Generated Audio")
                     c_status = gr.Textbox(label="Status", interactive=False)
+            # Live word counter update
+            c_text.change(fn=update_word_counter, inputs=c_text, outputs=c_word_counter)
+            # Toggle upload field visibility
+            c_voice_choice.change(
+                fn=toggle_ref_audio,
+                inputs=c_voice_choice,
+                outputs=c_ref
+            )
             c_btn.click(
                 fn=generate_clone,
+                inputs=[c_text, c_voice_choice, c_ref, c_ref_text, c_lang, c_speed,
                         c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                 outputs=[c_audio, c_status],
             )