# profplate's picture
# Initial publish from local folder
# 1fb58ac verified
"""
Space 3 — WSDC Speech Judge Assistant.
The full two-factor pipeline. Audio goes in; three things come out:
1. Delivery score (derived from four prosodic features computed from
Whisper-small word-level timestamps)
2. Content score (from SmolLM2-1.7B-Instruct rubric evaluation of the
transcript on three dimensions: claim clarity, evidence quality,
rebuttal strength)
3. Combined score (simple average of the two)
Architecture is the same thin-client-over-API pattern as Space 2 — no local
model weights, everything heavy happens on Hugging Face's Inference API
servers. See research-journal.md, Weeks 9-10, for the design notes and the
Spearman correlation analysis on 20 test clips.
Four tabs:
- Score: just the three numbers
- Breakdown: prosodic features, the LLM's rubric output in detail, and the transcript
- Coach: longest-pause timestamps and a one-paragraph coaching note
- Raw JSON: all features and the rubric output as JSON
"""
import json
import os
import statistics
from typing import Any
import gradio as gr
import requests
# Hugging Face access token, read from the environment. It defaults to an
# empty string, in which case _auth_headers() raises before any request is sent.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Hosted inference endpoints: Whisper for transcription, SmolLM2 for rubric scoring.
WHISPER_URL = "https://api-inference.huggingface.co/models/openai/whisper-small"
LLM_URL = (
    "https://api-inference.huggingface.co/models/HuggingFaceTB/SmolLM2-1.7B-Instruct"
)

# A gap between one word's end and the next word's start that exceeds this
# many seconds is counted as a pause by compute_prosodic_features().
PAUSE_THRESHOLD_SECONDS = 0.4

# compute_prosodic_features() raises ValueError for transcripts with fewer
# words than this.
MIN_WORDS_FOR_RELIABLE_FEATURES = 20
# Prompt template for the content rubric. The literal text {transcript} is
# swapped in with str.replace() by score_content_with_llm(); str.format()
# would trip over the literal braces of the JSON skeleton below.
RUBRIC_PROMPT = """You are an experienced WSDC (World Schools Debate) judge giving short, constructive feedback on a short speech transcript. Score the speech on each of three dimensions, from 1 (weak) to 5 (strong), and write one sentence of feedback for each dimension. At the end, write one short paragraph of overall coaching advice. Respond ONLY in strict JSON with these exact keys:
{
"claim_clarity": {"score": <int 1-5>, "comment": "<one sentence>"},
"evidence_quality": {"score": <int 1-5>, "comment": "<one sentence>"},
"rebuttal_strength": {"score": <int 1-5>, "comment": "<one sentence>"},
"coaching_note": "<one short paragraph, 2-3 sentences>"
}
TRANSCRIPT:
\"\"\"
{transcript}
\"\"\"
"""
def _auth_headers(content_type: str | None = None) -> dict[str, str]:
    """Build the HTTP headers for a Hugging Face Inference API request.

    Args:
        content_type: Optional value for the ``Content-Type`` header. The
            header is left out when this is ``None`` or empty.

    Returns:
        A dict with the bearer ``Authorization`` header and, when requested,
        ``Content-Type``.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is empty.
    """
    if HF_TOKEN:
        optional = {"Content-Type": content_type} if content_type else {}
        return {"Authorization": f"Bearer {HF_TOKEN}", **optional}
    raise RuntimeError(
        "HF_TOKEN is not set. Add it as a Space secret "
        "(Settings -> Variables and secrets -> New secret)."
    )
# ---------- Whisper ----------
def transcribe_with_word_timestamps(audio_path: str) -> dict[str, Any]:
    """Upload an audio file to the Whisper endpoint and return its JSON reply.

    The file's raw bytes are posted with an ``audio/wav`` content type and the
    query parameter ``return_timestamps=word``.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        The decoded JSON body of the response.

    Raises:
        RuntimeError: If the response status is not 200 (the message carries
            the status code and the first 400 characters of the body), or if
            ``_auth_headers`` finds no token.
    """
    with open(audio_path, "rb") as audio_file:
        audio_bytes = audio_file.read()

    reply = requests.post(
        WHISPER_URL,
        headers=_auth_headers("audio/wav"),
        params={"return_timestamps": "word"},
        data=audio_bytes,
        timeout=120,
    )
    if reply.status_code == 200:
        return reply.json()
    raise RuntimeError(
        f"Whisper API error {reply.status_code}: {reply.text[:400]}"
    )
def extract_words_with_times(api_response: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten a transcription reply into a list of timed words.

    Entries are read from ``api_response["chunks"]``, falling back to
    ``api_response["words"]``. Each entry may carry its text under ``"text"``
    or ``"word"``, and its timing either as a ``"timestamp"`` pair or as
    separate ``"start"`` / ``"end"`` keys.

    An entry is skipped when:
      * its text is empty or whitespace-only,
      * its timestamp is not a two-item list/tuple, or
      * either end of the timestamp is ``None``.

    Args:
        api_response: Decoded JSON reply from the transcription endpoint.

    Returns:
        A list of ``{"word": str, "start": float, "end": float}`` dicts in
        the order the entries appeared.
    """
    entries = api_response.get("chunks") or api_response.get("words") or []
    words: list[dict[str, Any]] = []
    for entry in entries:
        # Strip before testing for emptiness so a whitespace-only chunk does
        # not end up in the result as an empty word.
        text = (entry.get("text") or entry.get("word") or "").strip()
        if not text:
            continue

        stamp = entry.get("timestamp") or (entry.get("start"), entry.get("end"))
        # Anything other than a (start, end) pair is unusable; skipping it
        # avoids an unpacking ValueError on malformed timestamps.
        if not isinstance(stamp, (list, tuple)) or len(stamp) != 2:
            continue

        start, end = stamp
        if start is None or end is None:
            continue

        words.append({"word": text, "start": float(start), "end": float(end)})
    return words
# ---------- Prosodic features ----------
def compute_prosodic_features(words: list[dict[str, Any]]) -> dict[str, Any]:
    """Derive delivery features from a list of timed words.

    Args:
        words: Dicts with ``"word"``, ``"start"`` and ``"end"`` keys, in
            spoken order.

    Returns:
        A dict with the overall words-per-minute rate, the number of pauses
        above ``PAUSE_THRESHOLD_SECONDS``, the population variance of those
        pause lengths, the population variance of the speaking rate across
        the three thirds of the clip, the word count, the clip length, the
        three longest pauses, and the per-third rates.

    Raises:
        ValueError: If there are fewer than
            ``MIN_WORDS_FOR_RELIABLE_FEATURES`` words, or if the span from
            the first word's start to the last word's end is not positive.
    """
    word_count = len(words)
    if word_count < MIN_WORDS_FOR_RELIABLE_FEATURES:
        raise ValueError(
            f"Only {word_count} words transcribed. "
            f"Need at least {MIN_WORDS_FOR_RELIABLE_FEATURES} for reliable features."
        )

    clip_seconds = words[-1]["end"] - words[0]["start"]
    if clip_seconds <= 0:
        raise ValueError("Clip has zero or negative duration.")

    def spread(values: list[float]) -> float:
        # Population variance needs at least two samples to mean anything here.
        return statistics.pvariance(values) if len(values) >= 2 else 0.0

    # Walk adjacent word pairs and record every silence above the threshold,
    # remembering where it sits so the Coach tab can point at it.
    pauses = []
    for index, (current, following) in enumerate(zip(words, words[1:])):
        silence = following["start"] - current["end"]
        if silence <= PAUSE_THRESHOLD_SECONDS:
            continue
        pauses.append(
            {
                "gap_seconds": silence,
                "after_word_index": index,
                "after_word": current["word"],
                "start_time": current["end"],
                "end_time": following["start"],
            }
        )

    # Words-per-minute for each third of the clip; the final third absorbs
    # any remainder words. Thirds with a non-positive span are left out.
    cut = word_count // 3
    rates = []
    if cut >= 2:
        thirds = (words[:cut], words[cut : 2 * cut], words[2 * cut :])
        rates = [
            len(chunk) / (elapsed / 60.0)
            for chunk in thirds
            if (elapsed := chunk[-1]["end"] - chunk[0]["start"]) > 0
        ]

    longest_first = sorted(pauses, key=lambda p: p["gap_seconds"], reverse=True)

    return {
        "wpm_overall": round(word_count / (clip_seconds / 60.0), 1),
        "pause_count_over_400ms": len(pauses),
        "pause_duration_variance": round(spread([p["gap_seconds"] for p in pauses]), 3),
        "speaking_rate_variance_across_thirds": round(spread(rates), 1),
        "num_words": word_count,
        "total_duration_seconds": round(clip_seconds, 1),
        "top_pauses": longest_first[:3],
        "rates_by_third": [round(rate, 1) for rate in rates],
    }
def _band_score(value: float, ideal: tuple[float, float], tolerable: tuple[float, float]) -> float:
    """Return 1.0 inside *ideal*, 0.7 inside *tolerable*, else 0.4 (bounds inclusive)."""
    ideal_low, ideal_high = ideal
    tolerable_low, tolerable_high = tolerable
    if ideal_low <= value <= ideal_high:
        return 1.0
    if tolerable_low <= value <= tolerable_high:
        return 0.7
    return 0.4


def normalize_delivery_score(features: dict[str, Any]) -> float:
    """Collapse the four prosodic features into a single 0-100 delivery score.

    The mapping is hand-crafted from the Week 8 reference ranges. It is
    neither learned nor validated and must not be read as ground truth; the
    limitations are discussed in research-journal.md, Week 10.

    Args:
        features: Dict providing ``wpm_overall``, ``pause_count_over_400ms``,
            ``pause_duration_variance`` and
            ``speaking_rate_variance_across_thirds``.

    Returns:
        The weighted score, rounded to one decimal place.
    """
    # Speaking rate: 155-190 wpm is ideal, 140-210 tolerable, the rest penalized.
    rate_score = _band_score(features["wpm_overall"], (155, 190), (140, 210))

    # Pause count: 5-12 is ideal, 3-18 tolerable, the rest penalized.
    pause_count_score = _band_score(features["pause_count_over_400ms"], (5, 12), (3, 18))

    # Both variances are "more is better", capped at full marks.
    pause_var_score = min(1.0, features["pause_duration_variance"] / 0.35)
    rate_var_score = min(1.0, features["speaking_rate_variance_across_thirds"] / 20.0)

    weighted = (
        0.30 * rate_score
        + 0.25 * pause_count_score
        + 0.20 * pause_var_score
        + 0.25 * rate_var_score
    )
    return round(weighted * 100, 1)
# ---------- LLM content scoring ----------
def _parse_rubric_reply(raw: str) -> dict[str, Any]:
    """Decode the first JSON object in *raw*, or return a zero-score placeholder rubric."""
    start = raw.find("{")
    if start != -1:
        try:
            # raw_decode stops at the end of the first complete JSON value, so
            # trailing chatter after the object (even text containing braces)
            # no longer spoils the parse.
            parsed, _ = json.JSONDecoder().raw_decode(raw[start:])
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
    return {
        "claim_clarity": {"score": 0, "comment": "LLM returned unparseable JSON."},
        "evidence_quality": {"score": 0, "comment": "LLM returned unparseable JSON."},
        "rebuttal_strength": {"score": 0, "comment": "LLM returned unparseable JSON."},
        "coaching_note": f"Raw LLM output: {raw[:500]}",
    }


def score_content_with_llm(transcript: str) -> dict[str, Any]:
    """Ask the LLM endpoint to grade a transcript against the rubric prompt.

    Args:
        transcript: Text substituted for ``{transcript}`` in ``RUBRIC_PROMPT``.

    Returns:
        The rubric dict decoded from the model's reply. When no JSON object
        can be decoded, a placeholder rubric with zero scores is returned and
        its ``coaching_note`` carries the first 500 characters of the raw
        reply.

    Raises:
        RuntimeError: If the response status is not 200, or if
            ``_auth_headers`` finds no token.
    """
    prompt = RUBRIC_PROMPT.replace("{transcript}", transcript)
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": 350,
            "temperature": 0.2,
            "return_full_text": False,
        },
    }
    response = requests.post(
        LLM_URL,
        headers=_auth_headers("application/json"),
        json=payload,
        timeout=120,
    )
    if response.status_code != 200:
        raise RuntimeError(f"LLM API error {response.status_code}: {response.text[:400]}")

    data = response.json()
    if isinstance(data, list):
        # Guard against an empty list or a non-dict first element instead of
        # letting IndexError / AttributeError escape.
        first = data[0] if data else {}
        raw = first.get("generated_text", "") if isinstance(first, dict) else str(first)
    else:
        raw = str(data)
    if not isinstance(raw, str):
        raw = "" if raw is None else str(raw)

    return _parse_rubric_reply(raw)
def _rubric_score(entry: Any) -> int:
    """Coerce one rubric entry to an integer score clamped to 0-5 (0 when unusable)."""
    raw = entry.get("score") if isinstance(entry, dict) else entry
    try:
        value = int(float(raw))
    except (TypeError, ValueError, OverflowError):
        return 0
    return max(0, min(5, value))


def content_score_out_of_100(rubric: dict[str, Any]) -> float:
    """Convert the three rubric scores into a single 0-100 content score.

    The rubric comes from an LLM, so each entry is read defensively: a missing
    entry, a non-dict entry that is not itself numeric, or a score that
    cannot be read as a number counts as 0, and scores are clamped to the
    0-5 range so the result always stays within 0-100.

    Args:
        rubric: Dict expected to hold ``claim_clarity``, ``evidence_quality``
            and ``rebuttal_strength`` entries, each with a ``"score"``.

    Returns:
        The mean of the three scores rescaled from 0-5 to 0-100, rounded to
        one decimal place.
    """
    keys = ("claim_clarity", "evidence_quality", "rebuttal_strength")
    scores = [_rubric_score(rubric.get(key)) for key in keys]
    mean_out_of_5 = sum(scores) / len(scores)
    return round((mean_out_of_5 / 5.0) * 100, 1)
# ---------- Gradio glue ----------
def format_seconds(s: float) -> str:
    """Format a duration in seconds as ``M:SS.cc``.

    The value is rounded to whole hundredths of a second *before* it is split
    into minutes and seconds, so a value such as 59.999 carries over to
    ``1:00.00`` instead of rendering as ``0:60.00``.

    Args:
        s: Duration in seconds.

    Returns:
        Minutes (unpadded), a colon, then seconds zero-padded to two digits
        with two decimals.
    """
    total_centis = round(s * 100)
    minutes, centis = divmod(total_centis, 6000)
    return f"{minutes}:{centis // 100:02d}.{centis % 100:02d}"
def analyze(audio_path: str):
    """Run the whole pipeline on one clip and return the seven UI outputs.

    Steps: transcribe, extract timed words, compute prosodic features, score
    the transcript with the LLM rubric, then format everything for display.

    Args:
        audio_path: Path of the uploaded or recorded clip; falsy when the
            user supplied nothing.

    Returns:
        A 7-tuple of strings, in this order: score summary, prosodic
        features, rubric lines, longest pauses, coaching note, transcript,
        and a JSON dump of features plus rubric. On any failure all seven
        slots carry the same explanatory message.
    """

    def everywhere(message: str):
        # On early exit every output widget shows the same text.
        return (message,) * 7

    if not audio_path:
        return everywhere("Please upload or record an audio clip.")

    try:
        api_response = transcribe_with_word_timestamps(audio_path)
        words = extract_words_with_times(api_response)
        if not words:
            raise RuntimeError("Whisper returned no word-level timestamps.")
        # Only the feature step signals "clip too short" with ValueError.
        # Catching it here keeps unrelated ValueErrors (for example a JSON
        # decoding failure) from being mislabeled as a short-clip warning.
        try:
            features = compute_prosodic_features(words)
        except ValueError as e:
            return everywhere(f"Short-clip warning: {e}")
        transcript = " ".join(w["word"] for w in words)
        rubric = score_content_with_llm(transcript)
        # Scoring sits inside the guarded region so a malformed rubric
        # surfaces as an error message rather than an unhandled exception.
        delivery_score = normalize_delivery_score(features)
        content_score = content_score_out_of_100(rubric)
    except Exception as e:  # UI boundary: report the failure, never crash the handler
        return everywhere(f"Error: {e}")

    combined = round((delivery_score + content_score) / 2.0, 1)

    # ---- Score tab ----
    score_summary = (
        f"Delivery: {delivery_score} / 100\n"
        f"Content: {content_score} / 100\n"
        f"Combined: {combined} / 100"
    )

    # ---- Breakdown tab ----
    prosodic_block = (
        f"Speaking rate (wpm): {features['wpm_overall']}\n"
        f"Pauses longer than 400 ms: {features['pause_count_over_400ms']}\n"
        f"Pause-duration variance: {features['pause_duration_variance']}\n"
        f"Speaking-rate variance (thirds): {features['speaking_rate_variance_across_thirds']}\n"
        f"Words transcribed: {features['num_words']}\n"
        f"Clip length (s): {features['total_duration_seconds']}"
    )
    rubric_lines = []
    for key, label in (
        ("claim_clarity", "Claim clarity"),
        ("evidence_quality", "Evidence quality"),
        ("rebuttal_strength", "Rebuttal strength"),
    ):
        entry = rubric.get(key)
        # The rubric is LLM output: tolerate a bare number or a missing or
        # malformed entry instead of failing on .get().
        if isinstance(entry, (int, float)):
            entry = {"score": entry}
        elif not isinstance(entry, dict):
            entry = {}
        rubric_lines.append(
            f"{label}: {entry.get('score', 0)}/5 — {entry.get('comment', '')}"
        )
    rubric_block = "\n".join(rubric_lines)

    # ---- Coach tab ----
    top = features["top_pauses"]
    if top:
        # Start and end are separated by an en dash; previously the two
        # timestamps ran together (e.g. "0:12.340:13.10").
        pause_lines = [
            f" {rank}. {format_seconds(p['start_time'])}–{format_seconds(p['end_time'])} "
            f"({p['gap_seconds']:.2f}s) — after '{p['after_word']}'"
            for rank, p in enumerate(top, start=1)
        ]
        pauses_text = "Three longest pauses — worth listening back to:\n" + "\n".join(pause_lines)
    else:
        pauses_text = "No pauses longer than 400 ms were detected."

    coaching_note = rubric.get("coaching_note", "")
    if not isinstance(coaching_note, str):
        coaching_note = "" if coaching_note is None else str(coaching_note)

    # top_pauses is left out of the raw dump; it is already shown on the Coach tab.
    raw_json = json.dumps(
        {
            "features": {k: v for k, v in features.items() if k != "top_pauses"},
            "rubric": rubric,
        },
        indent=2,
    )
    return (
        score_summary,
        prosodic_block,
        rubric_block,
        pauses_text,
        coaching_note,
        transcript,
        raw_json,
    )
# Page layout: intro text, the audio input, a trigger button, and four result
# tabs. Components appear in the order they are created inside each context.
with gr.Blocks(theme=gr.themes.Soft(), title="WSDC Speech Judge Assistant") as demo:
    gr.Markdown(
        "# WSDC Speech Judge Assistant — Space 3\n"
        "Upload a short debate or speech clip. This Space transcribes it with Whisper-small "
        "(via the Hugging Face Inference API), computes four prosodic delivery features from "
        "the word-level timestamps, and sends the transcript to SmolLM2-1.7B-Instruct with a "
        "WSDC-style rubric prompt for content scoring. Tested on 20 clips; see "
        "[the research journal, Week 10](https://huggingface.co/spaces/profplate/space3-speech-judge-assistant/blob/main/research-journal.md) for the correlation analysis "
        "and limitations. — Prea Callahan, AI + Research Level 2, Spring 2026."
    )
    # type="filepath" hands analyze() a path on disk, which it opens and uploads.
    audio_in = gr.Audio(
        sources=["upload", "microphone"],
        type="filepath",
        label="Speech clip (10 seconds to 4 minutes)",
    )
    go = gr.Button("Score the speech", variant="primary")
    with gr.Tabs():
        with gr.TabItem("Score"):
            score_out = gr.Textbox(label="Summary", lines=4)
        with gr.TabItem("Breakdown"):
            prosodic_out = gr.Textbox(label="Prosodic features", lines=8)
            rubric_out = gr.Textbox(label="Content rubric (SmolLM2)", lines=6)
            transcript_out = gr.Textbox(label="Transcript (Whisper-small)", lines=6)
        with gr.TabItem("Coach"):
            pauses_out = gr.Textbox(label="Moments worth listening back to", lines=6)
            coaching_out = gr.Textbox(label="Coaching note", lines=4)
        with gr.TabItem("Raw JSON"):
            raw_out = gr.Code(label="All features and rubric output", language="json")
    # The outputs list must stay in the same order as the 7-tuple returned by
    # analyze(): summary, prosodic, rubric, pauses, coaching, transcript, raw JSON.
    go.click(
        analyze,
        inputs=audio_in,
        outputs=[
            score_out,
            prosodic_out,
            rubric_out,
            pauses_out,
            coaching_out,
            transcript_out,
            raw_out,
        ],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()