| """ |
| Space 3 — WSDC Speech Judge Assistant. |
| |
| The full two-factor pipeline. Audio goes in; three things come out: |
| |
| 1. Delivery score (derived from four prosodic features computed from |
| Whisper-small word-level timestamps) |
| 2. Content score (from SmolLM2-1.7B-Instruct rubric evaluation of the |
| transcript on three dimensions: claim clarity, evidence quality, |
| rebuttal strength) |
| 3. Combined score (simple average of the two) |
| |
| Architecture is the same thin-client-over-API pattern as Space 2 — no local |
| model weights, everything heavy happens on Hugging Face's Inference API |
| servers. See research-journal.md, Weeks 9-10, for the design notes and the |
| Spearman correlation analysis on 20 test clips. |
| |
| Four tabs: |
| - Score: just the three numbers |
| - Breakdown: prosodic features, the LLM's rubric output in detail, and the transcript |
| - Coach: longest-pause timestamps and a one-paragraph coaching note |
| - Raw JSON: all features and the rubric output as JSON |
| """ |
|
|
| import json |
| import os |
| import statistics |
| from typing import Any |
|
|
| import gradio as gr |
| import requests |
|
|
# Hugging Face access token, read once at import time. Falls back to an empty
# string when the variable is unset; _auth_headers refuses to build headers
# in that case.
HF_TOKEN = os.getenv("HF_TOKEN", "")

# Inference API endpoints for the two hosted models this app calls.
WHISPER_URL = "https://api-inference.huggingface.co/models/" + "openai/whisper-small"
LLM_URL = (
    "https://api-inference.huggingface.co/models/"
    + "HuggingFaceTB/SmolLM2-1.7B-Instruct"
)

# An inter-word gap strictly longer than this many seconds counts as a pause.
PAUSE_THRESHOLD_SECONDS = 0.4
# compute_prosodic_features rejects clips with fewer transcribed words than this.
MIN_WORDS_FOR_RELIABLE_FEATURES = 20
|
|
# Prompt template for the content rubric. score_content_with_llm substitutes
# the ``{transcript}`` placeholder with str.replace rather than str.format, so
# the literal JSON braces in the template do not need to be escaped.
RUBRIC_PROMPT = """You are an experienced WSDC (World Schools Debate) judge giving short, constructive feedback on a short speech transcript. Score the speech on each of three dimensions, from 1 (weak) to 5 (strong), and write one sentence of feedback for each dimension. At the end, write one short paragraph of overall coaching advice. Respond ONLY in strict JSON with these exact keys:

{
"claim_clarity": {"score": <int 1-5>, "comment": "<one sentence>"},
"evidence_quality": {"score": <int 1-5>, "comment": "<one sentence>"},
"rebuttal_strength": {"score": <int 1-5>, "comment": "<one sentence>"},
"coaching_note": "<one short paragraph, 2-3 sentences>"
}

TRANSCRIPT:
\"\"\"
{transcript}
\"\"\"
"""
|
|
|
|
def _auth_headers(content_type: str | None = None) -> dict[str, str]:
    """Build the HTTP headers for a Hugging Face Inference API request.

    Args:
        content_type: Optional value for the ``Content-Type`` header. The
            header is omitted when this is ``None`` or empty.

    Returns:
        A dict holding a bearer ``Authorization`` header built from
        ``HF_TOKEN``, plus ``Content-Type`` when one was requested.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is empty.
    """
    if HF_TOKEN:
        optional = {"Content-Type": content_type} if content_type else {}
        return {"Authorization": f"Bearer {HF_TOKEN}", **optional}
    raise RuntimeError(
        "HF_TOKEN is not set. Add it as a Space secret "
        "(Settings -> Variables and secrets -> New secret)."
    )
|
|
|
|
| |
|
|
|
|
def transcribe_with_word_timestamps(audio_path: str) -> dict[str, Any]:
    """Send an audio file to the Whisper endpoint and return its JSON reply.

    The file is read fully into memory and posted as the raw request body
    with an ``audio/wav`` content type; word-level timestamps are requested
    through the ``return_timestamps`` query parameter.

    Args:
        audio_path: Path of the audio file on local disk.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        RuntimeError: If the API answers with any status other than 200
            (the message carries the first 400 characters of the body), or
            if ``HF_TOKEN`` is not configured.
    """
    with open(audio_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    reply = requests.post(
        WHISPER_URL,
        headers=_auth_headers("audio/wav"),
        params={"return_timestamps": "word"},
        data=audio_bytes,
        timeout=120,
    )
    if reply.status_code == 200:
        return reply.json()
    raise RuntimeError(f"Whisper API error {reply.status_code}: {reply.text[:400]}")
|
|
|
|
def extract_words_with_times(api_response: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten a word-timestamp API response into a uniform list of words.

    Two chunk layouts are understood: entries under a top-level ``"chunks"``
    key shaped like ``{"text": ..., "timestamp": [start, end]}``, and entries
    under ``"words"`` shaped like ``{"word": ..., "start": ..., "end": ...}``.

    Parsing is best-effort: an entry is skipped (never raised on) when its
    text is missing or whitespace-only, when its timestamp is not a
    two-element pair, when either end of the pair is ``None``, or when a
    bound cannot be converted to ``float``.

    Args:
        api_response: Decoded JSON reply from the transcription endpoint.

    Returns:
        A list of ``{"word": str, "start": float, "end": float}`` dicts in
        the order the entries appeared; the word text is stripped of
        surrounding whitespace.
    """
    chunks = api_response.get("chunks") or api_response.get("words") or []
    words: list[dict[str, Any]] = []
    for chunk in chunks:
        if not isinstance(chunk, dict):
            continue
        raw_text = chunk.get("text") or chunk.get("word")
        if not isinstance(raw_text, str):
            continue
        # Strip BEFORE the emptiness check so whitespace-only tokens are not
        # stored as empty words (they would inflate the word count).
        text = raw_text.strip()
        if not text:
            continue
        span = chunk.get("timestamp") or (chunk.get("start"), chunk.get("end"))
        # A malformed pair must be skipped rather than allowed to raise
        # ValueError on unpacking.
        if not isinstance(span, (list, tuple)) or len(span) != 2:
            continue
        start, end = span
        if start is None or end is None:
            continue
        try:
            words.append({"word": text, "start": float(start), "end": float(end)})
        except (TypeError, ValueError):
            continue
    return words
|
|
|
|
| |
|
|
|
|
def compute_prosodic_features(words: list[dict[str, Any]]) -> dict[str, Any]:
    """Derive delivery features from a list of timestamped words.

    Args:
        words: Dicts with ``"word"``, ``"start"`` and ``"end"`` keys, in
            spoken order.

    Returns:
        A dict with the overall words-per-minute rate, the number of
        inter-word gaps longer than ``PAUSE_THRESHOLD_SECONDS``, the
        population variance of those gap lengths, the population variance of
        the speaking rate across three consecutive sections of the clip, the
        word count, the clip duration, the three longest pauses, and the
        per-section rates.

    Raises:
        ValueError: If fewer than ``MIN_WORDS_FOR_RELIABLE_FEATURES`` words
            are supplied, or if the span from the first word's start to the
            last word's end is not positive.
    """
    word_count = len(words)
    if word_count < MIN_WORDS_FOR_RELIABLE_FEATURES:
        raise ValueError(
            f"Only {word_count} words transcribed. "
            f"Need at least {MIN_WORDS_FOR_RELIABLE_FEATURES} for reliable features."
        )
    total_duration = words[-1]["end"] - words[0]["start"]
    if total_duration <= 0:
        raise ValueError("Clip has zero or negative duration.")

    wpm_overall = word_count / (total_duration / 60.0)

    # A pause is the silence between one word's end and the next word's start.
    pauses = [
        {
            "gap_seconds": following["start"] - current["end"],
            "after_word_index": position,
            "after_word": current["word"],
            "start_time": current["end"],
            "end_time": following["start"],
        }
        for position, (current, following) in enumerate(zip(words, words[1:]))
        if following["start"] - current["end"] > PAUSE_THRESHOLD_SECONDS
    ]
    gap_lengths = [pause["gap_seconds"] for pause in pauses]
    pause_variance = 0.0
    if len(gap_lengths) >= 2:
        pause_variance = statistics.pvariance(gap_lengths)

    # Speaking rate per third; the final third absorbs any remainder words.
    rates = []
    third = word_count // 3
    if third >= 2:
        for lo, hi in ((0, third), (third, 2 * third), (2 * third, word_count)):
            section = words[lo:hi]
            section_span = section[-1]["end"] - section[0]["start"]
            if section_span > 0:
                rates.append(len(section) / (section_span / 60.0))
    rate_variance = 0.0
    if len(rates) >= 2:
        rate_variance = statistics.pvariance(rates)

    longest_first = sorted(pauses, key=lambda pause: pause["gap_seconds"], reverse=True)
    top_pauses = longest_first[:3]

    return {
        "wpm_overall": round(wpm_overall, 1),
        "pause_count_over_400ms": len(pauses),
        "pause_duration_variance": round(pause_variance, 3),
        "speaking_rate_variance_across_thirds": round(rate_variance, 1),
        "num_words": word_count,
        "total_duration_seconds": round(total_duration, 1),
        "top_pauses": top_pauses,
        "rates_by_third": [round(rate, 1) for rate in rates],
    }
|
|
|
|
def normalize_delivery_score(features: dict[str, Any]) -> float:
    """Collapse the four prosodic features into one 0-100 delivery score.

    The mapping is hand-crafted from the Week 8 reference ranges. It is not
    learned and not validated, so it must not be read as ground truth; the
    limitations are discussed in research-journal.md, Week 10.

    Args:
        features: Must contain ``"wpm_overall"``, ``"pause_count_over_400ms"``,
            ``"pause_duration_variance"`` and
            ``"speaking_rate_variance_across_thirds"``.

    Returns:
        The weighted sum of the four sub-scores, scaled to 0-100 and rounded
        to one decimal place.
    """

    def banded(value, ideal, tolerable):
        # 1.0 inside the ideal range, 0.7 inside the wider tolerable range,
        # 0.4 everywhere else. Both ranges are inclusive.
        if ideal[0] <= value <= ideal[1]:
            return 1.0
        if tolerable[0] <= value <= tolerable[1]:
            return 0.7
        return 0.4

    rate_score = banded(features["wpm_overall"], (155, 190), (140, 210))
    pause_count_score = banded(features["pause_count_over_400ms"], (5, 12), (3, 18))

    # The two variance features scale linearly and saturate at 1.0.
    pause_var_score = min(1.0, features["pause_duration_variance"] / 0.35)
    rate_var_score = min(
        1.0, features["speaking_rate_variance_across_thirds"] / 20.0
    )

    weighted = (
        0.30 * rate_score
        + 0.25 * pause_count_score
        + 0.20 * pause_var_score
        + 0.25 * rate_var_score
    )
    return round(weighted * 100, 1)
|
|
|
|
| |
|
|
|
|
def _first_json_object(raw: str) -> dict[str, Any] | None:
    """Return the first JSON object embedded in *raw*, or None if there is none."""
    try:
        # raw_decode stops at the end of the first complete JSON value, so
        # any text the model keeps generating after the object is ignored.
        parsed, _ = json.JSONDecoder().raw_decode(raw, raw.index("{"))
    except ValueError:
        # Covers both "no opening brace" (str.index) and invalid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        return None
    return parsed if isinstance(parsed, dict) else None


def score_content_with_llm(transcript: str) -> dict[str, Any]:
    """Ask the LLM endpoint to grade a transcript against the rubric.

    The transcript is spliced into ``RUBRIC_PROMPT`` and posted to
    ``LLM_URL``. The first JSON object found in the generated text is
    returned as the rubric.

    Args:
        transcript: Plain-text transcript of the speech.

    Returns:
        The parsed rubric dict. When no JSON object can be extracted from
        the reply, a fallback rubric is returned instead: all three scores
        are 0 and ``"coaching_note"`` carries the first 500 characters of
        the raw output.

    Raises:
        RuntimeError: If the API answers with any status other than 200, or
            if ``HF_TOKEN`` is not configured.
    """
    prompt = RUBRIC_PROMPT.replace("{transcript}", transcript)
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 350,
            "temperature": 0.2,
            "return_full_text": False,
        },
    }
    response = requests.post(
        LLM_URL,
        headers=_auth_headers("application/json"),
        json=payload,
        timeout=120,
    )
    if response.status_code != 200:
        raise RuntimeError(f"LLM API error {response.status_code}: {response.text[:400]}")

    data = response.json()
    if isinstance(data, list):
        # An empty list or a non-dict first element must fall through to the
        # "unparseable" fallback instead of raising IndexError/AttributeError.
        first = data[0] if data else {}
        raw = first.get("generated_text", "") if isinstance(first, dict) else first
    else:
        raw = data
    raw = raw if isinstance(raw, str) else str(raw or "")

    parsed = _first_json_object(raw)
    if parsed is None:
        parsed = {
            "claim_clarity": {"score": 0, "comment": "LLM returned unparseable JSON."},
            "evidence_quality": {"score": 0, "comment": "LLM returned unparseable JSON."},
            "rebuttal_strength": {"score": 0, "comment": "LLM returned unparseable JSON."},
            "coaching_note": f"Raw LLM output: {raw[:500]}",
        }
    return parsed
|
|
|
|
def _rubric_score_0_to_5(entry: Any) -> int:
    """Read ``entry["score"]`` as an int clamped to 0-5; anything unusable is 0."""
    if not isinstance(entry, dict):
        return 0
    try:
        value = int(entry.get("score") or 0)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, nested structures, NaN or infinity.
        return 0
    return max(0, min(5, value))


def content_score_out_of_100(rubric: dict[str, Any]) -> float:
    """Convert the three rubric scores into a single 0-100 content score.

    The rubric comes from an LLM, so its shape is not trusted: a missing
    dimension, a non-dict entry, or a non-numeric score counts as 0, and
    scores are clamped to 0-5 so the result can never leave the 0-100 range.

    Args:
        rubric: Mapping that should hold ``"claim_clarity"``,
            ``"evidence_quality"`` and ``"rebuttal_strength"`` entries, each
            a dict with a ``"score"`` key.

    Returns:
        The mean of the three scores as a percentage of 5, rounded to one
        decimal place. ``0.0`` when no usable score is present.
    """
    if not isinstance(rubric, dict):
        return 0.0
    keys = ("claim_clarity", "evidence_quality", "rebuttal_strength")
    scores = [_rubric_score_0_to_5(rubric.get(key)) for key in keys]
    if not any(scores):
        return 0.0
    mean_out_of_5 = sum(scores) / len(scores)
    return round((mean_out_of_5 / 5.0) * 100, 1)
|
|
|
|
| |
|
|
|
|
def format_seconds(s: float) -> str:
    """Format a duration in seconds as ``M:SS.cc``.

    The value is rounded to whole hundredths of a second BEFORE it is split
    into minutes and seconds. Rounding afterwards would let a value such as
    59.999 render as ``0:60.00`` instead of ``1:00.00``.

    Args:
        s: Duration in seconds.

    Returns:
        Minutes (not zero-padded), a colon, then seconds zero-padded to two
        digits with exactly two decimal places.
    """
    total_centiseconds = round(s * 100)
    minutes, centiseconds = divmod(total_centiseconds, 6000)
    whole_seconds, hundredths = divmod(centiseconds, 100)
    return f"{minutes}:{whole_seconds:02d}.{hundredths:02d}"
|
|
|
|
def _same_message_for_all_outputs(msg: str) -> tuple:
    """Repeat *msg* once per output slot (seven) so every panel shows it."""
    return (msg,) * 7


def _format_rubric_block(rubric: dict) -> str:
    """Render the three rubric dimensions as one 'Label: score/5 — comment' line each."""
    lines = []
    for key, label in (
        ("claim_clarity", "Claim clarity"),
        ("evidence_quality", "Evidence quality"),
        ("rebuttal_strength", "Rebuttal strength"),
    ):
        entry = rubric.get(key)
        if not isinstance(entry, dict):
            # The rubric is LLM output; tolerate a missing or malformed entry.
            entry = {}
        lines.append(f"{label}: {entry.get('score', 0)}/5 — {entry.get('comment', '')}")
    return "\n".join(lines)


def _format_top_pauses(top: list) -> str:
    """Render the longest pauses as a numbered list, or a 'none detected' note."""
    if not top:
        return "No pauses longer than 400 ms were detected."
    pause_lines = [
        f" {i+1}. {format_seconds(p['start_time'])}–{format_seconds(p['end_time'])} "
        f"({p['gap_seconds']:.2f}s) — after '{p['after_word']}'"
        for i, p in enumerate(top)
    ]
    return "Three longest pauses — worth listening back to:\n" + "\n".join(pause_lines)


def analyze(audio_path: str):
    """Run the full pipeline on one clip and return the text for every panel.

    Steps: transcribe, extract timestamped words, compute prosodic features,
    score the transcript with the LLM rubric, then combine the delivery and
    content scores by simple average.

    Args:
        audio_path: Path of the uploaded or recorded audio file; may be
            empty or ``None`` when nothing was provided.

    Returns:
        A 7-tuple of strings: score summary, prosodic features, rubric
        lines, longest pauses, coaching note, transcript, and a JSON dump of
        the features and rubric. When there is no input or a step fails, the
        same explanatory message is returned in all seven positions.
    """
    if not audio_path:
        return _same_message_for_all_outputs("Please upload or record an audio clip.")

    try:
        api_response = transcribe_with_word_timestamps(audio_path)
        words = extract_words_with_times(api_response)
        if not words:
            raise RuntimeError("Whisper returned no word-level timestamps.")
        try:
            features = compute_prosodic_features(words)
        except ValueError as e:
            # Only the feature step's ValueError means "clip too short or
            # empty". A ValueError from any other step (for example a JSON
            # decode failure) is a real error and is reported as one below.
            return _same_message_for_all_outputs(f"Short-clip warning: {e}")
        transcript = " ".join(w["word"] for w in words)
        rubric = score_content_with_llm(transcript)
        if not isinstance(rubric, dict):
            rubric = {}
        # Scoring stays inside the guarded region: the rubric is untrusted
        # LLM output and must not be able to crash the UI handler.
        delivery_score = normalize_delivery_score(features)
        content_score = content_score_out_of_100(rubric)
    except Exception as e:
        # Top-level UI boundary: surface the failure in the panels.
        return _same_message_for_all_outputs(f"Error: {e}")

    combined = round((delivery_score + content_score) / 2.0, 1)

    score_summary = (
        f"Delivery: {delivery_score} / 100\n"
        f"Content: {content_score} / 100\n"
        f"Combined: {combined} / 100"
    )

    prosodic_block = (
        f"Speaking rate (wpm): {features['wpm_overall']}\n"
        f"Pauses longer than 400 ms: {features['pause_count_over_400ms']}\n"
        f"Pause-duration variance: {features['pause_duration_variance']}\n"
        f"Speaking-rate variance (thirds): {features['speaking_rate_variance_across_thirds']}\n"
        f"Words transcribed: {features['num_words']}\n"
        f"Clip length (s): {features['total_duration_seconds']}"
    )
    rubric_block = _format_rubric_block(rubric)
    pauses_text = _format_top_pauses(features["top_pauses"])

    coaching_note = rubric.get("coaching_note", "")
    if not isinstance(coaching_note, str):
        # The output slot is a text box; coerce whatever the model produced.
        coaching_note = "" if coaching_note is None else str(coaching_note)

    # top_pauses is left out of the JSON dump; it is shown on the Coach tab.
    raw_json = json.dumps(
        {
            "features": {k: v for k, v in features.items() if k != "top_pauses"},
            "rubric": rubric,
        },
        indent=2,
    )

    return (
        score_summary,
        prosodic_block,
        rubric_block,
        pauses_text,
        coaching_note,
        transcript,
        raw_json,
    )
|
|
|
|
# Gradio UI: one audio input, one button, and four tabs of text outputs.
# Components are created in display order inside the Blocks context.
with gr.Blocks(theme=gr.themes.Soft(), title="WSDC Speech Judge Assistant") as demo:
    # Intro text shown above the controls.
    gr.Markdown(
        "# WSDC Speech Judge Assistant — Space 3\n"
        "Upload a short debate or speech clip. This Space transcribes it with Whisper-small "
        "(via the Hugging Face Inference API), computes four prosodic delivery features from "
        "the word-level timestamps, and sends the transcript to SmolLM2-1.7B-Instruct with a "
        "WSDC-style rubric prompt for content scoring. Tested on 20 clips; see "
        "[the research journal, Week 10](https://huggingface.co/spaces/profplate/space3-speech-judge-assistant/blob/main/research-journal.md) for the correlation analysis "
        "and limitations. — Prea Callahan, AI + Research Level 2, Spring 2026."
    )
    # type="filepath" makes Gradio hand analyze() a path string, which is
    # what transcribe_with_word_timestamps opens.
    audio_in = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Speech clip (10 seconds to 4 minutes)",
    )
    go = gr.Button("Score the speech", variant="primary")

    with gr.Tabs():
        with gr.TabItem("Score"):
            score_out = gr.Textbox(label="Summary", lines=4)
        with gr.TabItem("Breakdown"):
            prosodic_out = gr.Textbox(label="Prosodic features", lines=8)
            rubric_out = gr.Textbox(label="Content rubric (SmolLM2)", lines=6)
            transcript_out = gr.Textbox(label="Transcript (Whisper-small)", lines=6)
        with gr.TabItem("Coach"):
            pauses_out = gr.Textbox(label="Moments worth listening back to", lines=6)
            coaching_out = gr.Textbox(label="Coaching note", lines=4)
        with gr.TabItem("Raw JSON"):
            raw_out = gr.Code(label="All features and rubric output", language="json")

    # The outputs list must stay in the same order as the 7-tuple returned by
    # analyze(): summary, prosodic, rubric, pauses, coaching, transcript, raw.
    go.click(
        analyze,
        inputs=audio_in,
        outputs=[
            score_out,
            prosodic_out,
            rubric_out,
            pauses_out,
            coaching_out,
            transcript_out,
            raw_out,
        ],
    )


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|
|