""" Space 3 — WSDC Speech Judge Assistant. The full two-factor pipeline. Audio goes in; three things come out: 1. Delivery score (derived from four prosodic features computed from Whisper-small word-level timestamps) 2. Content score (from SmolLM2-1.7B-Instruct rubric evaluation of the transcript on three dimensions: claim clarity, evidence quality, rebuttal strength) 3. Combined score (simple average of the two) Architecture is the same thin-client-over-API pattern as Space 2 — no local model weights, everything heavy happens on Hugging Face's Inference API servers. See research-journal.md, Weeks 9-10, for the design notes and the Spearman correlation analysis on 20 test clips. Three tabs: - Score: just the three numbers - Breakdown: prosodic features and the LLM's rubric output in detail - Coach: longest-pause timestamps and a one-paragraph coaching note """ import json import os import statistics from typing import Any import gradio as gr import requests HF_TOKEN = os.environ.get("HF_TOKEN", "") WHISPER_URL = "https://api-inference.huggingface.co/models/openai/whisper-small" LLM_URL = ( "https://api-inference.huggingface.co/models/HuggingFaceTB/SmolLM2-1.7B-Instruct" ) PAUSE_THRESHOLD_SECONDS = 0.4 MIN_WORDS_FOR_RELIABLE_FEATURES = 20 RUBRIC_PROMPT = """You are an experienced WSDC (World Schools Debate) judge giving short, constructive feedback on a short speech transcript. Score the speech on each of three dimensions, from 1 (weak) to 5 (strong), and write one sentence of feedback for each dimension. At the end, write one short paragraph of overall coaching advice. Respond ONLY in strict JSON with these exact keys: { "claim_clarity": {"score": , "comment": ""}, "evidence_quality": {"score": , "comment": ""}, "rebuttal_strength": {"score": , "comment": ""}, "coaching_note": "" } TRANSCRIPT: \"\"\" {transcript} \"\"\" """ def _auth_headers(content_type: str | None = None) -> dict[str, str]: if not HF_TOKEN: raise RuntimeError( "HF_TOKEN is not set. Add it as a Space secret " "(Settings -> Variables and secrets -> New secret)." ) headers = {"Authorization": f"Bearer {HF_TOKEN}"} if content_type: headers["Content-Type"] = content_type return headers # ---------- Whisper ---------- def transcribe_with_word_timestamps(audio_path: str) -> dict[str, Any]: with open(audio_path, "rb") as f: data = f.read() response = requests.post( WHISPER_URL, headers=_auth_headers("audio/wav"), params={"return_timestamps": "word"}, data=data, timeout=120, ) if response.status_code != 200: raise RuntimeError( f"Whisper API error {response.status_code}: {response.text[:400]}" ) return response.json() def extract_words_with_times(api_response: dict[str, Any]) -> list[dict[str, Any]]: chunks = api_response.get("chunks") or api_response.get("words") or [] words: list[dict[str, Any]] = [] for c in chunks: word = c.get("text") or c.get("word") or "" ts = c.get("timestamp") or (c.get("start"), c.get("end")) if not word or ts is None: continue start, end = ts if isinstance(ts, (list, tuple)) else (ts, None) if start is None or end is None: continue words.append({"word": word.strip(), "start": float(start), "end": float(end)}) return words # ---------- Prosodic features ---------- def compute_prosodic_features(words: list[dict[str, Any]]) -> dict[str, Any]: if len(words) < MIN_WORDS_FOR_RELIABLE_FEATURES: raise ValueError( f"Only {len(words)} words transcribed. " f"Need at least {MIN_WORDS_FOR_RELIABLE_FEATURES} for reliable features." ) total_duration = words[-1]["end"] - words[0]["start"] if total_duration <= 0: raise ValueError("Clip has zero or negative duration.") wpm_overall = len(words) / (total_duration / 60.0) # Pauses and which word-index gaps they live in (so we can point at them later). pauses = [] for i in range(len(words) - 1): gap = words[i + 1]["start"] - words[i]["end"] if gap > PAUSE_THRESHOLD_SECONDS: pauses.append( { "gap_seconds": gap, "after_word_index": i, "after_word": words[i]["word"], "start_time": words[i]["end"], "end_time": words[i + 1]["start"], } ) pause_durations = [p["gap_seconds"] for p in pauses] pause_variance = ( statistics.pvariance(pause_durations) if len(pause_durations) >= 2 else 0.0 ) # Speaking-rate variance across thirds. n = len(words) third = n // 3 rates = [] if third >= 2: for section in (words[0:third], words[third : 2 * third], words[2 * third :]): dur = section[-1]["end"] - section[0]["start"] if dur > 0: rates.append(len(section) / (dur / 60.0)) rate_variance = statistics.pvariance(rates) if len(rates) >= 2 else 0.0 # Find the three longest pauses for the Coach tab. top_pauses = sorted(pauses, key=lambda p: -p["gap_seconds"])[:3] return { "wpm_overall": round(wpm_overall, 1), "pause_count_over_400ms": len(pauses), "pause_duration_variance": round(pause_variance, 3), "speaking_rate_variance_across_thirds": round(rate_variance, 1), "num_words": len(words), "total_duration_seconds": round(total_duration, 1), "top_pauses": top_pauses, "rates_by_third": [round(r, 1) for r in rates], } def normalize_delivery_score(features: dict[str, Any]) -> float: """Map the four prosodic features onto a 0-100 delivery score. This is a simple hand-crafted normalization based on the reference ranges from Week 8 data. It is not learned, not validated, and should not be treated as ground truth. See research-journal.md, Week 10, for the honest limitations discussion. """ wpm = features["wpm_overall"] # Speaking rate: reward 155-190 wpm, penalize extremes. if 155 <= wpm <= 190: rate_score = 1.0 elif 140 <= wpm < 155 or 190 < wpm <= 210: rate_score = 0.7 else: rate_score = 0.4 # Pause count: reward 5-12 strategic pauses, penalize too few or far too many. pc = features["pause_count_over_400ms"] if 5 <= pc <= 12: pause_count_score = 1.0 elif 3 <= pc < 5 or 12 < pc <= 18: pause_count_score = 0.7 else: pause_count_score = 0.4 # Pause variance: higher is better (signals strategic emphasis). pv = features["pause_duration_variance"] pause_var_score = min(1.0, pv / 0.35) # Rate variance across thirds: higher is better (signals dynamic pacing). rv = features["speaking_rate_variance_across_thirds"] rate_var_score = min(1.0, rv / 20.0) combined = 0.30 * rate_score + 0.25 * pause_count_score + 0.20 * pause_var_score + 0.25 * rate_var_score return round(combined * 100, 1) # ---------- LLM content scoring ---------- def score_content_with_llm(transcript: str) -> dict[str, Any]: prompt = RUBRIC_PROMPT.replace("{transcript}", transcript) payload = { "inputs": prompt, "parameters": { "max_new_tokens": 350, "temperature": 0.2, "return_full_text": False, }, } response = requests.post( LLM_URL, headers=_auth_headers("application/json"), json=payload, timeout=120, ) if response.status_code != 200: raise RuntimeError(f"LLM API error {response.status_code}: {response.text[:400]}") data = response.json() raw = data[0].get("generated_text", "") if isinstance(data, list) else str(data) # Find the first JSON object in the response. try: start = raw.index("{") end = raw.rindex("}") + 1 parsed = json.loads(raw[start:end]) except (ValueError, json.JSONDecodeError): parsed = { "claim_clarity": {"score": 0, "comment": "LLM returned unparseable JSON."}, "evidence_quality": {"score": 0, "comment": "LLM returned unparseable JSON."}, "rebuttal_strength": {"score": 0, "comment": "LLM returned unparseable JSON."}, "coaching_note": f"Raw LLM output: {raw[:500]}", } return parsed def content_score_out_of_100(rubric: dict[str, Any]) -> float: keys = ("claim_clarity", "evidence_quality", "rebuttal_strength") scores = [int(rubric.get(k, {}).get("score", 0) or 0) for k in keys] if not any(scores): return 0.0 mean_out_of_5 = sum(scores) / len(scores) return round((mean_out_of_5 / 5.0) * 100, 1) # ---------- Gradio glue ---------- def format_seconds(s: float) -> str: minutes = int(s // 60) seconds = s - minutes * 60 return f"{minutes}:{seconds:05.2f}" def analyze(audio_path: str): if not audio_path: msg = "Please upload or record an audio clip." return msg, msg, msg, msg, msg, msg, msg try: api_response = transcribe_with_word_timestamps(audio_path) words = extract_words_with_times(api_response) if not words: raise RuntimeError("Whisper returned no word-level timestamps.") features = compute_prosodic_features(words) transcript = " ".join(w["word"] for w in words) rubric = score_content_with_llm(transcript) except ValueError as e: msg = f"Short-clip warning: {e}" return msg, msg, msg, msg, msg, msg, msg except Exception as e: msg = f"Error: {e}" return msg, msg, msg, msg, msg, msg, msg delivery_score = normalize_delivery_score(features) content_score = content_score_out_of_100(rubric) combined = round((delivery_score + content_score) / 2.0, 1) # ---- Score tab ---- score_summary = ( f"Delivery: {delivery_score} / 100\n" f"Content: {content_score} / 100\n" f"Combined: {combined} / 100" ) # ---- Breakdown tab ---- prosodic_block = ( f"Speaking rate (wpm): {features['wpm_overall']}\n" f"Pauses longer than 400 ms: {features['pause_count_over_400ms']}\n" f"Pause-duration variance: {features['pause_duration_variance']}\n" f"Speaking-rate variance (thirds): {features['speaking_rate_variance_across_thirds']}\n" f"Words transcribed: {features['num_words']}\n" f"Clip length (s): {features['total_duration_seconds']}" ) rubric_lines = [] for key, label in ( ("claim_clarity", "Claim clarity"), ("evidence_quality", "Evidence quality"), ("rebuttal_strength", "Rebuttal strength"), ): entry = rubric.get(key, {}) rubric_lines.append( f"{label}: {entry.get('score', 0)}/5 — {entry.get('comment', '')}" ) rubric_block = "\n".join(rubric_lines) # ---- Coach tab ---- top = features["top_pauses"] if top: pause_lines = [ f" {i+1}. {format_seconds(p['start_time'])}–{format_seconds(p['end_time'])} " f"({p['gap_seconds']:.2f}s) — after '{p['after_word']}'" for i, p in enumerate(top) ] pauses_text = "Three longest pauses — worth listening back to:\n" + "\n".join(pause_lines) else: pauses_text = "No pauses longer than 400 ms were detected." coaching_note = rubric.get("coaching_note", "") return ( score_summary, prosodic_block, rubric_block, pauses_text, coaching_note, transcript, json.dumps({"features": {k: v for k, v in features.items() if k != "top_pauses"}, "rubric": rubric}, indent=2), ) with gr.Blocks(theme=gr.themes.Soft(), title="WSDC Speech Judge Assistant") as demo: gr.Markdown( "# WSDC Speech Judge Assistant — Space 3\n" "Upload a short debate or speech clip. This Space transcribes it with Whisper-small " "(via the Hugging Face Inference API), computes four prosodic delivery features from " "the word-level timestamps, and sends the transcript to SmolLM2-1.7B-Instruct with a " "WSDC-style rubric prompt for content scoring. Tested on 20 clips; see " "[the research journal, Week 10](https://huggingface.co/spaces/profplate/space3-speech-judge-assistant/blob/main/research-journal.md) for the correlation analysis " "and limitations. — Prea Callahan, AI + Research Level 2, Spring 2026." ) audio_in = gr.Audio( sources=["upload", "microphone"], type="filepath", label="Speech clip (10 seconds to 4 minutes)", ) go = gr.Button("Score the speech", variant="primary") with gr.Tabs(): with gr.TabItem("Score"): score_out = gr.Textbox(label="Summary", lines=4) with gr.TabItem("Breakdown"): prosodic_out = gr.Textbox(label="Prosodic features", lines=8) rubric_out = gr.Textbox(label="Content rubric (SmolLM2)", lines=6) transcript_out = gr.Textbox(label="Transcript (Whisper-small)", lines=6) with gr.TabItem("Coach"): pauses_out = gr.Textbox(label="Moments worth listening back to", lines=6) coaching_out = gr.Textbox(label="Coaching note", lines=4) with gr.TabItem("Raw JSON"): raw_out = gr.Code(label="All features and rubric output", language="json") go.click( analyze, inputs=audio_in, outputs=[ score_out, prosodic_out, rubric_out, pauses_out, coaching_out, transcript_out, raw_out, ], ) if __name__ == "__main__": demo.launch()