"""Minimal baseline Gradio entry point for the Month 1-3 rebuild. Wires the simplest possible slice: Whisper (zero-shot) -> Aya-Expanse -> MMS-TTS. No LoRA adapters, no memory loop, no speaker ID, no voice cloning, no IoT, no phrase matcher. Used for field testing and building a real-user eval set. See docs/baseline_rebuild.md for the plan this fits into. Run locally: HF_TOKEN=hf_xxx python app_minimal.py Environment variables (all optional except HF_TOKEN, which is needed for the HF Serverless LLM call): HF_TOKEN — HuggingFace token with read access LLM_MODEL_ID — default "CohereLabs/aya-expanse-32b" (23-language multilingual, strong African-language coverage) DEVICE — "cuda" or "cpu" (auto if unset) LOG_LEVEL — default "INFO" """ from __future__ import annotations import logging import os from typing import Optional, Tuple import numpy as np # Load .env (HF_TOKEN etc.) before reading os.environ below. Silent no-op if # python-dotenv is not installed or no .env is present. try: from dotenv import load_dotenv load_dotenv() except ImportError: pass # Local imports — the four modules the baseline-rebuild plan authorizes. # Everything else in src/ is intentionally unused here. 
from src.data.bam_normalize import normalize as bam_normalize
from src.engine.turn_logger import TurnLogger
from src.engine.whisper_base import WhisperBackbone
from src.llm.minimal_client import MinimalClient
from src.llm.phrasebook import lookup as phrasebook_lookup, top_k as phrasebook_top_k
from src.tts.mms_tts import MMSTTSEngine

logging.basicConfig(
    # logging only knows upper-case level names; accept "info" / "debug" and
    # treat an empty LOG_LEVEL as unset instead of crashing at import time.
    level=os.getenv("LOG_LEVEL", "INFO").strip().upper() or "INFO",
    format="%(asctime)s %(name)-30s %(levelname)-7s %(message)s",
)
logger = logging.getLogger(__name__)


# ── Environment ──────────────────────────────────────────────────────────────

HF_TOKEN = os.environ.get("HF_TOKEN")
LLM_MODEL_ID = os.environ.get("LLM_MODEL_ID", "CohereLabs/aya-expanse-32b")
# Optional override; blank / whitespace-only values count as "unset".
_REQUESTED_DEVICE = (os.environ.get("DEVICE") or "").strip() or None

LANG_CHOICES = [("Bambara", "bam"), ("Fula", "ful"), ("French", "fr"), ("English", "en")]
LANG_NAMES = {"bam": "Bambara", "ful": "Fula", "fr": "French", "en": "English"}

LANG_TO_WHISPER_HINT = {
    # Whisper large-v3-turbo does not know Bambara/Fula as first-class
    # languages. We leave `language` unset for those so Whisper auto-detects;
    # fr/en are explicit hints for clean decoding.
    "bam": None,
    "ful": None,
    "fr": "french",
    "en": "english",
}

# Reply-language steering is handled inside MinimalClient via a dialect-anchored
# system prompt (see src/llm/minimal_client.py). No per-turn directive needed.

# (sample_rate, samples) pair as exchanged with gr.Audio(type="numpy").
_AudioTuple = Tuple[int, np.ndarray]


# ── Service singletons (lazy-loaded) ────────────────────────────────────────

_backbone: Optional[WhisperBackbone] = None
_llm: Optional[MinimalClient] = None
_tts: Optional[MMSTTSEngine] = None
_turn_logger: TurnLogger = TurnLogger()


def _resolve_device() -> str:
    """Pick the torch device string for STT/TTS.

    The DEVICE env override wins and is returned verbatim. Otherwise returns
    'cuda' only when torch reports CUDA available *and* at least one device;
    any exception while probing (including torch being unimportable) falls
    back to 'cpu' so the app stays usable on CPU-only laptops.
    """
    if _REQUESTED_DEVICE:
        return _REQUESTED_DEVICE
    try:
        import torch  # lazy — and inside the try so a broken install means "cpu"

        if torch.cuda.is_available() and torch.cuda.device_count() > 0:
            return "cuda"
    except Exception:
        logger.debug("CUDA probe failed — using cpu", exc_info=True)
    return "cpu"


def get_backbone() -> WhisperBackbone:
    """Load the Whisper backbone once and cache it. Zero-shot — no adapters.

    The module-level singleton is only published after `load()` succeeds, so a
    failed load is retried on the next call instead of caching a
    half-initialised object.
    """
    global _backbone
    if _backbone is None:
        backbone = WhisperBackbone(config_path="configs/base_config.yaml")
        backbone.load(device=_resolve_device(), hf_token=HF_TOKEN)
        logger.info("Whisper backbone ready: %s on %s", backbone.model_id, backbone.device)
        _backbone = backbone
    return _backbone


def get_llm() -> MinimalClient:
    """Return the cached LLM client, constructing it on first use."""
    global _llm
    if _llm is None:
        _llm = MinimalClient(model_id=LLM_MODEL_ID, hf_token=HF_TOKEN)
        logger.info("Minimal LLM client configured: %s", LLM_MODEL_ID)
    return _llm


def get_tts() -> MMSTTSEngine:
    """Return the cached MMS-TTS engine, constructing it on first use."""
    global _tts
    if _tts is None:
        _tts = MMSTTSEngine()
        logger.info("MMS-TTS engine ready (lazy per-language load)")
    return _tts


# ── Core pipeline ────────────────────────────────────────────────────────────

def _to_mono_float32(audio_np: np.ndarray) -> np.ndarray:
    """Collapse to mono and scale to float32 in roughly [-1, 1].

    2-D input is averaged over axis 1 (treated as the channel axis).
    Signed-integer PCM is divided by its dtype's full-scale value, so quiet
    int16 clips and int32 clips are both scaled correctly. For any other dtype
    the legacy heuristic applies: a peak above 1.5 is taken to mean
    "int16 values stored as floats" and divided by 32768.
    """
    source_dtype = audio_np.dtype
    if audio_np.ndim == 2:
        audio_np = audio_np.mean(axis=1)
    audio_np = audio_np.astype(np.float32)

    if np.issubdtype(source_dtype, np.signedinteger):
        full_scale = float(-np.iinfo(source_dtype).min)  # 32768 for int16
        return audio_np / full_scale

    peak = np.max(np.abs(audio_np)) if audio_np.size else 0.0
    if peak > 1.5:  # looks like raw int16 cast to float
        audio_np = audio_np / 32768.0
    return audio_np


def transcribe(audio_np: np.ndarray, sample_rate: int, input_lang: str) -> str:
    """Run zero-shot Whisper on a numpy audio array. Returns the transcript.

    `input_lang` drives two things only: the Whisper language hint (for fr/en)
    and whether bam_normalize is applied. It has no effect on the TTS voice
    or on the LLM reply language — those are driven by the separate
    output-language dropdown in the UI.

    Args:
        audio_np: 1-D samples, or 2-D with channels on axis 1.
        sample_rate: Sample rate of `audio_np` in Hz; resampled to 16 kHz if
            different.
        input_lang: Language code used to look up the Whisper hint.

    Returns:
        The stripped transcript (bam-normalized when `input_lang == "bam"`);
        may be an empty string.
    """
    import torch  # lazy

    backbone = get_backbone()
    target_sr = 16_000

    audio_np = _to_mono_float32(audio_np)
    if sample_rate != target_sr:
        import librosa  # lazy — only needed when the mic gave us non-16k audio

        audio_np = librosa.resample(audio_np, orig_sr=sample_rate, target_sr=target_sr)

    inputs = backbone.processor(
        audio_np, sampling_rate=target_sr, return_tensors="pt"
    )
    input_features = inputs.input_features.to(backbone.device)
    if backbone.device == "cuda":
        input_features = input_features.half()

    gen_kwargs: dict = {"max_new_tokens": 128}
    hint = LANG_TO_WHISPER_HINT.get(input_lang)
    if hint:
        # No hint (bam/ful/unknown code) → leave language/task unset.
        gen_kwargs["language"] = hint
        gen_kwargs["task"] = "transcribe"

    with torch.no_grad():
        output_ids = backbone.model.generate(input_features, **gen_kwargs)

    transcript = backbone.processor.batch_decode(
        output_ids, skip_special_tokens=True
    )[0].strip()

    if input_lang == "bam" and transcript:
        transcript = bam_normalize(transcript)
    return transcript


NO_TRANSLATION = "(no curated translation — try Generate reply)"


def _synthesize(
    text: str, output_lang: str
) -> Tuple[Optional[_AudioTuple], Optional[int], Optional[str]]:
    """Run TTS on `text` in `output_lang`.

    Returns (audio_or_None, tts_ms, error). Empty text yields
    (None, None, None). Never raises: failures are logged and reported through
    the `error` element as a "tts: ..." string.
    """
    import time

    if not text:
        return None, None, None

    started = time.perf_counter()
    device = _resolve_device()
    try:
        wav, sr = get_tts().synthesize(text, language=output_lang, device=device)
    except AssertionError as exc:
        # Most common: "Torch not compiled with CUDA enabled" on CPU-only boxes
        # where is_available() lied. Retry once on CPU.
        if device == "cpu":
            logger.exception("TTS failed")
            return None, None, f"tts: {exc}"
        logger.warning("TTS failed on %s (%s) — retrying on cpu", device, exc)
        try:
            wav, sr = get_tts().synthesize(text, language=output_lang, device="cpu")
        except Exception as exc2:  # pragma: no cover
            logger.exception("TTS failed on cpu fallback")
            return None, None, f"tts: {exc2}"
    except Exception as exc:  # pragma: no cover
        logger.exception("TTS failed")
        return None, None, f"tts: {exc}"

    # tts_ms includes the failed first attempt when the cpu retry was needed.
    return (sr, wav), int((time.perf_counter() - started) * 1000), None


def _translate_with_status(
    user_text: str, output_lang: str
) -> Tuple[str, Optional[_AudioTuple], Optional[dict], Optional[int], Optional[str]]:
    """Phrasebook-only translation that also reports TTS failures.

    Returns (translation_text, translation_audio, hit_or_None, tts_ms,
    tts_error). Never calls the LLM.

    On a phrasebook miss for en/fr targets the input itself is echoed and
    synthesized (the app doubles as a TTS box for those languages). On a miss
    for any other target, NO_TRANSLATION is returned with no audio so the user
    can fall back to "Generate reply".
    """
    text = (user_text or "").strip()
    if not text:
        return "", None, None, None, None

    hit = phrasebook_lookup(text, output_lang)
    if hit:
        logger.info(
            "Phrasebook hit (%s, score=%.2f): %r → %r [cat=%s]",
            hit["match"], hit["score"], text, hit["target"], hit["category"],
        )
        target = hit["target"] or ""
        audio, tts_ms, tts_error = _synthesize(target, output_lang)
        return target, audio, hit, tts_ms, tts_error

    if output_lang in ("en", "fr"):
        audio, tts_ms, tts_error = _synthesize(text, output_lang)
        return text, audio, None, tts_ms, tts_error

    return NO_TRANSLATION, None, None, None, None


def _translate_only(
    user_text: str, output_lang: str
) -> Tuple[str, Optional[_AudioTuple], Optional[dict], Optional[int]]:
    """Phrasebook-only translation — never calls the LLM.

    Returns (translation_text, translation_audio, hit_or_None, tts_ms).
    Thin wrapper around `_translate_with_status` that drops the TTS error,
    kept for callers that rely on the 4-tuple shape.
    """
    translation, audio, hit, tts_ms, _ = _translate_with_status(user_text, output_lang)
    return translation, audio, hit, tts_ms


def _generate_reply(
    user_text: str, output_lang: str
) -> Tuple[str, Optional[_AudioTuple], Optional[int], Optional[int], Optional[str]]:
    """Dialect-anchored LLM reply (with top-3 phrasebook few-shot) + TTS.

    Returns (reply_text, reply_audio, llm_ms, tts_ms, error). Always returns a
    non-empty text string — on blank input, LLM failure or an empty LLM
    answer it returns a short parenthetical so the UI never goes blank. Those
    parentheticals are never sent to TTS.
    """
    import time

    text = (user_text or "").strip()
    if not text:
        return "(nothing to reply to)", None, None, None, None

    extras = phrasebook_top_k(text, output_lang, k=3) or None
    if extras:
        logger.info(
            "RAG-injecting top-%d nearest phrasebook entries (top score=%.2f)",
            len(extras), extras[0]["score"],
        )

    t_llm = time.perf_counter()
    try:
        reply = get_llm().chat(
            text,
            target_lang=output_lang,
            extra_examples=extras,
        )
    except Exception as exc:  # pragma: no cover
        logger.exception("LLM call failed")
        llm_ms = int((time.perf_counter() - t_llm) * 1000)
        return f"(LLM error: {exc})", None, llm_ms, None, f"llm: {exc}"
    llm_ms = int((time.perf_counter() - t_llm) * 1000)

    reply = (reply or "").strip()
    if not reply:
        # Don't make the TTS voice read the placeholder aloud; flag the turn.
        return "(empty reply)", None, llm_ms, None, "llm: empty reply"

    audio, tts_ms, tts_error = _synthesize(reply, output_lang)
    return reply, audio, llm_ms, tts_ms, tts_error


def _log_turn(
    phase: str,
    tab: str,
    output_lang: str,
    started: float,
    *,
    input_lang: Optional[str] = None,
    user_text: Optional[str] = None,
    transcript: Optional[str] = None,
    transcribe_ms: Optional[int] = None,
    phrasebook: Optional[dict] = None,
    llm_model: Optional[str] = None,
    llm_ms: Optional[int] = None,
    reply_text: Optional[str] = None,
    tts_ms: Optional[int] = None,
    error: Optional[str] = None,
) -> None:
    """Write one turn record; fields the caller omits are logged as None.

    `started` is the handler's `time.perf_counter()` start; `total_ms` is
    computed from it at the moment of logging.
    """
    import time

    _turn_logger.log(
        phase=phase,
        tab=tab,
        input_lang=input_lang,
        output_lang=output_lang,
        user_text=user_text,
        transcript=transcript,
        transcribe_ms=transcribe_ms,
        phrasebook=phrasebook,
        llm_model=llm_model,
        llm_ms=llm_ms,
        reply_text=reply_text,
        tts_ms=tts_ms,
        total_ms=int((time.perf_counter() - started) * 1000),
        error=error,
    )


# ── Tab handlers ─────────────────────────────────────────────────────────────

def run_text_translate(
    text: str,
    output_lang: str,
) -> Tuple[str, Optional[_AudioTuple], str]:
    """Text tab → Send: phrasebook-only translation. Always-on, no LLM.

    Returns (translation_text, translation_audio, transcript_state).
    `transcript_state` is the stripped input, handed to the Generate-reply
    button so it doesn't need to re-read the textbox.
    """
    import time

    t0 = time.perf_counter()
    text = (text or "").strip()
    if not text:
        return "(no text entered)", None, ""

    translation, audio, hit, tts_ms, tts_error = _translate_with_status(text, output_lang)
    _log_turn(
        "translate", "text", output_lang, t0,
        user_text=text,
        phrasebook=hit,
        reply_text=translation,
        tts_ms=tts_ms,
        error=tts_error,
    )
    return translation, audio, text


def run_text_reply(
    transcript_state: str,
    output_lang: str,
) -> Tuple[str, Optional[_AudioTuple]]:
    """Text tab → Generate reply: dialect-anchored LLM + TTS.

    Returns (reply_text, reply_audio).
    """
    import time

    t0 = time.perf_counter()
    if not (transcript_state or "").strip():
        return "(send a message first)", None

    reply, audio, llm_ms, tts_ms, error = _generate_reply(transcript_state, output_lang)
    _log_turn(
        "reply", "text", output_lang, t0,
        user_text=transcript_state,
        llm_model=LLM_MODEL_ID,
        llm_ms=llm_ms,
        reply_text=reply,
        tts_ms=tts_ms,
        error=error,
    )
    return reply, audio


def run_voice_translate(
    audio: Optional[_AudioTuple],
    input_lang: str,
    output_lang: str,
) -> Tuple[str, str, Optional[_AudioTuple], str]:
    """Voice tab → Submit: Whisper transcribe + phrasebook-only translation.

    Returns (transcript, translation_text, translation_audio,
    transcript_state). On missing/empty audio, STT failure or an empty
    transcript, the transcript and state are "" and the translation slot
    carries a parenthetical status message.
    """
    import time

    t0 = time.perf_counter()
    if audio is None:
        return "", "(no audio received)", None, ""

    sample_rate, audio_np = audio
    if audio_np.size == 0:
        return "", "(empty audio)", None, ""

    t_stt = time.perf_counter()
    try:
        transcript = transcribe(audio_np, sample_rate, input_lang)
    except Exception as exc:  # pragma: no cover
        logger.exception("Transcription failed")
        _log_turn(
            "translate", "voice", output_lang, t0,
            input_lang=input_lang,
            error=f"stt: {exc}",
        )
        return "", f"(STT error: {exc})", None, ""
    transcribe_ms = int((time.perf_counter() - t_stt) * 1000)

    if not transcript:
        _log_turn(
            "translate", "voice", output_lang, t0,
            input_lang=input_lang,
            transcript="",
            transcribe_ms=transcribe_ms,
            error="no_speech",
        )
        return "", "(no speech detected)", None, ""

    translation, t_audio, hit, tts_ms, tts_error = _translate_with_status(
        transcript, output_lang
    )
    _log_turn(
        "translate", "voice", output_lang, t0,
        input_lang=input_lang,
        user_text=transcript,
        transcript=transcript,
        transcribe_ms=transcribe_ms,
        phrasebook=hit,
        reply_text=translation,
        tts_ms=tts_ms,
        error=tts_error,
    )
    return transcript, translation, t_audio, transcript


def run_voice_reply(
    transcript_state: str,
    output_lang: str,
) -> Tuple[str, Optional[_AudioTuple]]:
    """Voice tab → Generate reply: uses the stored transcript, no re-Whisper.

    Returns (reply_text, reply_audio).
    """
    import time

    t0 = time.perf_counter()
    if not (transcript_state or "").strip():
        return "(record audio and submit first)", None

    reply, audio, llm_ms, tts_ms, error = _generate_reply(transcript_state, output_lang)
    _log_turn(
        "reply", "voice", output_lang, t0,
        user_text=transcript_state,
        transcript=transcript_state,
        llm_model=LLM_MODEL_ID,
        llm_ms=llm_ms,
        reply_text=reply,
        tts_ms=tts_ms,
        error=error,
    )
    return reply, audio


# ── Gradio UI ────────────────────────────────────────────────────────────────

def build_ui():
    """Construct and return the Gradio Blocks app."""
    import gradio as gr  # lazy — keeps module importable without gradio installed

    with gr.Blocks(title="Sahel-Voice — Minimal Baseline") as demo:
        gr.Markdown(
            "# 🌾 Sahel-Voice — Minimal Baseline\n"
            f"Zero-shot Whisper → {LLM_MODEL_ID} → MMS-TTS, with a curated "
            "Bambara/Pular phrasebook short-circuit in front of the LLM. "
            "No adapters, no memory, no polish. This is the field-test "
            "baseline — see `docs/baseline_rebuild.md`."
        )

        # Shared across tabs. Split into two so input and output language
        # are never conflated — the Voice tab cares about both; the Text tab
        # only uses output_lang (it doesn't feed Whisper).
        with gr.Row():
            input_lang = gr.Dropdown(
                choices=LANG_CHOICES,
                value="bam",
                label="Input language",
                info="Language you're speaking/typing. Drives Whisper hint "
                     "(fr/en only) and bam_normalize (bam only).",
            )
            output_lang = gr.Dropdown(
                choices=LANG_CHOICES,
                value="bam",
                label="Output language",
                info="Language the LLM should reply in. Also picks the TTS voice.",
            )

        # Carries the canonical input (typed text, or Whisper transcript) from
        # Submit/Send into the Generate-reply button so we don't re-transcribe
        # or re-read the textbox.
        transcript_state = gr.State("")

        with gr.Tabs():
            # ── Voice tab — the actual baseline the field test measures ─────
            with gr.Tab("🎤 Voice (full STT → translation + optional reply)"):
                with gr.Row():
                    with gr.Column():
                        audio_in = gr.Audio(
                            sources=["microphone", "upload"],
                            type="numpy",
                            label="Speak (or upload a .wav)",
                        )
                        voice_submit = gr.Button(
                            "Transcribe + translate", variant="primary"
                        )
                        voice_transcript_out = gr.Textbox(
                            label="Transcript (zero-shot Whisper)",
                            lines=2,
                            interactive=False,
                        )
                    with gr.Column():
                        voice_translation_out = gr.Textbox(
                            label="Phrasebook translation",
                            lines=3,
                            interactive=False,
                        )
                        voice_translation_audio = gr.Audio(
                            label="Translation audio",
                            type="numpy",
                            autoplay=False,
                        )
                        voice_reply_btn = gr.Button(
                            "Generate reply (LLM)", variant="secondary"
                        )
                        voice_reply_out = gr.Textbox(
                            label="LLM reply",
                            lines=4,
                            interactive=False,
                        )
                        voice_reply_audio = gr.Audio(
                            label="Reply audio",
                            type="numpy",
                            autoplay=False,
                        )

                voice_submit.click(
                    fn=run_voice_translate,
                    inputs=[audio_in, input_lang, output_lang],
                    outputs=[
                        voice_transcript_out,
                        voice_translation_out,
                        voice_translation_audio,
                        transcript_state,
                    ],
                )
                voice_reply_btn.click(
                    fn=run_voice_reply,
                    inputs=[transcript_state, output_lang],
                    outputs=[voice_reply_out, voice_reply_audio],
                )

            # ── Text tab — dev loop, skips Whisper ──────────────────────────
            with gr.Tab("⌨️ Text (translation + optional reply, dev loop)"):
                with gr.Row():
                    with gr.Column():
                        text_in = gr.Textbox(
                            label="Type your message",
                            lines=3,
                            placeholder="e.g. Good morning, how are you?",
                        )
                        text_submit = gr.Button("Send", variant="primary")
                    with gr.Column():
                        text_translation_out = gr.Textbox(
                            label="Phrasebook translation",
                            lines=3,
                            interactive=False,
                        )
                        text_translation_audio = gr.Audio(
                            label="Translation audio",
                            type="numpy",
                            autoplay=False,
                        )
                        text_reply_btn = gr.Button(
                            "Generate reply (LLM)", variant="secondary"
                        )
                        text_reply_out = gr.Textbox(
                            label="LLM reply",
                            lines=4,
                            interactive=False,
                        )
                        text_reply_audio = gr.Audio(
                            label="Reply audio",
                            type="numpy",
                            autoplay=False,
                        )

                # Text tab only uses output_lang — input_lang is a no-op here.
                # The Send button and pressing Enter in the textbox share the
                # exact same wiring.
                for register in (text_submit.click, text_in.submit):
                    register(
                        fn=run_text_translate,
                        inputs=[text_in, output_lang],
                        outputs=[
                            text_translation_out,
                            text_translation_audio,
                            transcript_state,
                        ],
                    )
                text_reply_btn.click(
                    fn=run_text_reply,
                    inputs=[transcript_state, output_lang],
                    outputs=[text_reply_out, text_reply_audio],
                )

        gr.Markdown(
            "---\n"
            "**What's intentionally missing:** LoRA adapters, memory/vocabulary "
            "persistence, speaker ID, Waxal/F5 TTS, IoT sensor integration, "
            "phrase-matcher shortcuts. All of those live in `app.py` — this is the "
            "stripped-down baseline used to measure what Whisper zero-shot does on "
            "real Bambara/Fula recordings and to collect a real-user eval set.\n\n"
            "The **Text** tab skips Whisper — it's for fast iteration on the "
            "LLM + TTS path, not for field-test measurement.\n\n"
            "**How the two boxes differ:** the top pair is a phrasebook lookup "
            "(no LLM, instant, gold-curated translation). If your input isn't "
            "in the curated list you'll see *(no curated translation)* — click "
            "**Generate reply** to get a dialect-anchored LLM response in the "
            "bottom pair."
        )

    return demo


def main() -> None:
    """Warn if HF_TOKEN is missing, then build the UI and launch it with queueing."""
    if not HF_TOKEN:
        logger.warning(
            "HF_TOKEN is not set — the LLM call will fail. "
            "Export HF_TOKEN before launching for the pipeline to work end-to-end."
        )
    demo = build_ui()
    demo.queue().launch()


if __name__ == "__main__":
    main()