Upload 11 files
Browse files- main.py +20 -4
- tests/benchmark.py +302 -0
- tests/fixtures/.gitignore +6 -0
- tests/fixtures/.gitkeep +0 -0
- tests/fixtures/README.md +88 -0
main.py
CHANGED
|
@@ -61,7 +61,19 @@ logger = logging.getLogger(__name__)
|
|
| 61 |
# ---------------------------------------------------------------------------
|
| 62 |
# Configuration
|
| 63 |
# ---------------------------------------------------------------------------
|
| 64 |
-
MODEL_ID
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
API_KEY = os.getenv("DETECTOR_API_KEY", "your-fallback-test-key")
|
| 66 |
MAX_FILE_SIZE = 5 * 1024 * 1024 # 5 MB
|
| 67 |
MIN_AUDIO_DURATION = 1.0 # seconds
|
|
@@ -359,8 +371,9 @@ def run_detection_pipeline(audio_data: np.ndarray, sr: int) -> AnalysisResult:
|
|
| 359 |
model_score = 0.0
|
| 360 |
if classifier:
|
| 361 |
results = classifier(audio_data)
|
|
|
|
| 362 |
for res in results:
|
| 363 |
-
if res["label"].lower() in
|
| 364 |
model_score = res["score"]
|
| 365 |
break
|
| 366 |
|
|
@@ -508,8 +521,9 @@ async def _stream_url_analysis(url: str, request_id: str):
|
|
| 508 |
model_score = 0.0
|
| 509 |
if classifier:
|
| 510 |
results = await asyncio.to_thread(classifier, audio_data)
|
|
|
|
| 511 |
for res in results:
|
| 512 |
-
if res["label"].lower() in
|
| 513 |
model_score = res["score"]
|
| 514 |
break
|
| 515 |
|
|
@@ -556,8 +570,9 @@ async def _stream_file_analysis(file_path: str, request_id: str):
|
|
| 556 |
model_score = 0.0
|
| 557 |
if classifier:
|
| 558 |
results = await asyncio.to_thread(classifier, audio_data)
|
|
|
|
| 559 |
for res in results:
|
| 560 |
-
if res["label"].lower() in
|
| 561 |
model_score = res["score"]
|
| 562 |
break
|
| 563 |
|
|
@@ -825,6 +840,7 @@ async def on_startup():
|
|
| 825 |
|
| 826 |
logger.info("=== CheckAI Backend Starting ===")
|
| 827 |
logger.info(f"Model: {MODEL_ID}")
|
|
|
|
| 828 |
logger.info(f"Global concurrency: {MAX_GLOBAL_CONCURRENCY}")
|
| 829 |
logger.info(f"Daily limit per IP: {DAILY_LIMIT}")
|
| 830 |
logger.info(f"Allowed origins: {ALLOWED_ORIGINS}")
|
|
|
|
| 61 |
# ---------------------------------------------------------------------------
|
| 62 |
# Configuration
|
| 63 |
# ---------------------------------------------------------------------------
|
| 64 |
+
# MODEL_ID is env-configurable so we can A/B test candidate detectors without
|
| 65 |
+
# redeploying. Verified so far:
|
| 66 |
+
# - "MelodyMachine/Deepfake-audio-detection-V2" — BROKEN (constant ~1.0 on
|
| 67 |
+
# both real music and AI music; do not use)
|
| 68 |
+
# - "mo-thecreator/Deepfake-audio-detection" — to evaluate (speech-trained)
|
| 69 |
+
MODEL_ID = os.getenv("MODEL_ID", "MelodyMachine/Deepfake-audio-detection-V2")
|
| 70 |
+
|
| 71 |
+
# Which pipeline labels count as "this is AI"? Comma-separated, case-insensitive.
|
| 72 |
+
# Some HF models use LABEL_0 / LABEL_1 instead of semantic names — check the
|
| 73 |
+
# model's config.json and set this accordingly.
|
| 74 |
+
_ai_labels_raw = os.getenv("MODEL_AI_LABELS", "fake,ai,synthetic,spoof,label_1")
|
| 75 |
+
AI_LABELS = {s.strip().lower() for s in _ai_labels_raw.split(",") if s.strip()}
|
| 76 |
+
|
| 77 |
API_KEY = os.getenv("DETECTOR_API_KEY", "your-fallback-test-key")
|
| 78 |
MAX_FILE_SIZE = 5 * 1024 * 1024 # 5 MB
|
| 79 |
MIN_AUDIO_DURATION = 1.0 # seconds
|
|
|
|
| 371 |
model_score = 0.0
|
| 372 |
if classifier:
|
| 373 |
results = classifier(audio_data)
|
| 374 |
+
logger.info(f"[model] raw output: {results}")
|
| 375 |
for res in results:
|
| 376 |
+
if res["label"].lower() in AI_LABELS:
|
| 377 |
model_score = res["score"]
|
| 378 |
break
|
| 379 |
|
|
|
|
| 521 |
model_score = 0.0
|
| 522 |
if classifier:
|
| 523 |
results = await asyncio.to_thread(classifier, audio_data)
|
| 524 |
+
logger.info(f"[model] raw output: {results}")
|
| 525 |
for res in results:
|
| 526 |
+
if res["label"].lower() in AI_LABELS:
|
| 527 |
model_score = res["score"]
|
| 528 |
break
|
| 529 |
|
|
|
|
| 570 |
model_score = 0.0
|
| 571 |
if classifier:
|
| 572 |
results = await asyncio.to_thread(classifier, audio_data)
|
| 573 |
+
logger.info(f"[model] raw output: {results}")
|
| 574 |
for res in results:
|
| 575 |
+
if res["label"].lower() in AI_LABELS:
|
| 576 |
model_score = res["score"]
|
| 577 |
break
|
| 578 |
|
|
|
|
| 840 |
|
| 841 |
logger.info("=== CheckAI Backend Starting ===")
|
| 842 |
logger.info(f"Model: {MODEL_ID}")
|
| 843 |
+
logger.info(f"AI labels: {sorted(AI_LABELS)}")
|
| 844 |
logger.info(f"Global concurrency: {MAX_GLOBAL_CONCURRENCY}")
|
| 845 |
logger.info(f"Daily limit per IP: {DAILY_LIMIT}")
|
| 846 |
logger.info(f"Allowed origins: {ALLOWED_ORIGINS}")
|
tests/benchmark.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Model benchmark harness.
|
| 4 |
+
|
| 5 |
+
Runs every audio file in `backend/tests/fixtures/{ai,human}/` against the
|
| 6 |
+
deployed backend's `/analyze/upload` endpoint and reports:
|
| 7 |
+
|
| 8 |
+
* Confusion matrix (TP / FP / TN / FN)
|
| 9 |
+
* Accuracy, precision, recall, F1
|
| 10 |
+
* Per-clip table: expected vs. observed + raw scores
|
| 11 |
+
* Score distribution histogram (text bar chart)
|
| 12 |
+
* CSV export for spreadsheet analysis
|
| 13 |
+
|
| 14 |
+
Usage:
|
| 15 |
+
export DETECTOR_API_URL='https://michal-giza-audio-detector-backend.hf.space'
|
| 16 |
+
export DETECTOR_API_KEY='...'
|
| 17 |
+
|
| 18 |
+
# 1. Drop AI clips into backend/tests/fixtures/ai/*.{mp3,wav,m4a}
|
| 19 |
+
# 2. Drop HUMAN clips into backend/tests/fixtures/human/*.{mp3,wav,m4a}
|
| 20 |
+
# 3. Run:
|
| 21 |
+
python3 benchmark.py # verbose
|
| 22 |
+
python3 benchmark.py --csv results.csv # also write CSV
|
| 23 |
+
python3 benchmark.py --threshold 0.65 # explore other decision thresholds
|
| 24 |
+
|
| 25 |
+
Exit code 0 on benchmark completion (regardless of model quality).
|
| 26 |
+
Exit 2 if no fixtures are present.
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
import argparse
|
| 30 |
+
import csv
|
| 31 |
+
import os
|
| 32 |
+
import sys
|
| 33 |
+
import time
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
from typing import Iterator
|
| 36 |
+
|
| 37 |
+
import requests
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Config
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
GREEN = "\033[92m"
|
| 43 |
+
RED = "\033[91m"
|
| 44 |
+
YELLOW = "\033[93m"
|
| 45 |
+
CYAN = "\033[96m"
|
| 46 |
+
BOLD = "\033[1m"
|
| 47 |
+
RESET = "\033[0m"
|
| 48 |
+
|
| 49 |
+
BASE_URL = os.environ.get("DETECTOR_API_URL", "").rstrip("/")
|
| 50 |
+
API_KEY = os.environ.get("DETECTOR_API_KEY", "")
|
| 51 |
+
|
| 52 |
+
FIXTURES_DIR = Path(__file__).parent / "fixtures"
|
| 53 |
+
AUDIO_EXTS = {".mp3", ".wav", ".m4a", ".aac", ".flac", ".ogg"}
|
| 54 |
+
|
| 55 |
+
MIME_FOR_EXT = {
|
| 56 |
+
".mp3": "audio/mpeg",
|
| 57 |
+
".wav": "audio/wav",
|
| 58 |
+
".m4a": "audio/mp4",
|
| 59 |
+
".aac": "audio/aac",
|
| 60 |
+
".flac": "audio/flac",
|
| 61 |
+
".ogg": "audio/ogg",
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Model-under-test wrapper
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
def analyze_clip(path: Path, timeout: int = 120) -> dict:
    """Upload a single audio clip to ``/analyze/upload`` and return the reply.

    Args:
        path: Audio file sent as the multipart ``file`` field. Its suffix
            selects the MIME type from ``MIME_FOR_EXT``; unknown suffixes are
            sent as ``application/octet-stream``.
        timeout: Passed straight through to ``requests.post``.

    Returns:
        The decoded JSON body of a 200 response.

    Raises:
        RuntimeError: The backend answered with any status other than 200.
            The message holds the status code and the first 200 characters
            of the response text.
    """
    suffix = path.suffix.lower()
    content_type = MIME_FOR_EXT.get(suffix, "application/octet-stream")

    with path.open("rb") as handle:
        response = requests.post(
            f"{BASE_URL}/analyze/upload",
            headers={"X-Api-Key": API_KEY},
            files={"file": (path.name, handle, content_type)},
            timeout=timeout,
        )

    if response.status_code == 200:
        return response.json()
    raise RuntimeError(f"HTTP {response.status_code}: {response.text[:200]}")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def iter_fixtures() -> Iterator[tuple[Path, bool]]:
    """Yield ``(clip_path, is_ai_expected)`` for each audio clip in the fixtures.

    The ``ai`` folder is visited before ``human``; inside a folder the entries
    come out in sorted order. A folder that does not exist is skipped, and
    only regular files whose lower-cased suffix is in ``AUDIO_EXTS`` are
    yielded.
    """
    labelled_folders = (("ai", True), ("human", False))
    for folder_name, label in labelled_folders:
        folder = FIXTURES_DIR / folder_name
        if folder.exists():
            for entry in sorted(folder.iterdir()):
                if not entry.is_file():
                    continue
                if entry.suffix.lower() in AUDIO_EXTS:
                    yield entry, label
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
# ---------------------------------------------------------------------------
|
| 97 |
+
# Metrics
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
def compute_metrics(rows: list[dict], threshold: float) -> dict:
    """Compute the confusion matrix and derived rates for one threshold.

    A row is predicted "AI" when its ``confidence`` is greater than or equal
    to ``threshold``. Rows whose confidence is ``None`` or NaN cannot be
    scored and are left out of every count; comparing NaN against the
    threshold would otherwise silently file them under "predicted HUMAN".

    Args:
        rows: Dicts carrying at least ``confidence`` (number) and
            ``expected_ai`` (truthy when the clip is AI-generated).
        threshold: Decision threshold applied to ``confidence``.

    Returns:
        A dict with integer ``tp``, ``fp``, ``tn``, ``fn`` and ``total``, and
        float ``accuracy``, ``precision``, ``recall`` and ``f1``. Any rate
        whose denominator is zero is reported as ``0.0``.
    """
    import math  # function-scope: the module's import block has no `math`

    tp = fp = tn = fn = 0
    for row in rows:
        confidence = row["confidence"]
        if confidence is None or math.isnan(confidence):
            continue  # unscorable row; do not count it as a prediction
        predicted_ai = confidence >= threshold
        actual_ai = bool(row["expected_ai"])
        if predicted_ai and actual_ai:
            tp += 1
        elif predicted_ai:
            fp += 1
        elif actual_ai:
            fn += 1
        else:
            tn += 1

    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall)
        else 0.0
    )
    return {
        "tp": tp, "fp": fp, "tn": tn, "fn": fn,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "total": total,
    }
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def text_histogram(values: list[float], width: int = 40, buckets: int = 20) -> str:
    """Render a small text histogram of scores expected to lie in [0, 1].

    ``None``, NaN and infinite entries are ignored, because ``int()`` cannot
    convert them to a bucket index. Values below 0 land in the first bucket
    and values of 1 or more land in the last one, so an out-of-range score
    can never wrap around through negative indexing.

    Args:
        values: Scores to bucket.
        width: Bar length, in characters, given to the fullest bucket.
        buckets: Number of equal-width buckets spanning [0, 1].

    Returns:
        One line per bucket joined with newlines, or ``"(no data)"`` when no
        usable value is present.
    """
    import math  # function-scope: the module's import block has no `math`

    usable = [v for v in values if v is not None and math.isfinite(v)]
    if not usable:
        return "(no data)"

    counts = [0] * buckets
    for v in usable:
        idx = min(max(int(v * buckets), 0), buckets - 1)
        counts[idx] += 1

    peak = max(counts)  # at least 1, because `usable` is non-empty
    lines = []
    for i, c in enumerate(counts):
        lo = i / buckets
        hi = (i + 1) / buckets
        bar = "█" * int(c / peak * width)
        lines.append(f" [{lo:.2f}-{hi:.2f}) {c:3d} {bar}")
    return "\n".join(lines)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
# ---------------------------------------------------------------------------
|
| 152 |
+
# Main
|
| 153 |
+
# ---------------------------------------------------------------------------
|
| 154 |
+
def main() -> int:
    """Run the benchmark against the deployed backend and print the report.

    Steps: parse the CLI flags, check that the endpoint settings and fixtures
    exist, send every fixture through ``analyze_clip``, then print the
    per-clip table, the confusion matrix, the wav2vec2 score histograms, an
    optional threshold sweep, and optionally write a CSV of every row.

    Sub-scores that the backend omits (or returns as null) are recorded as
    NaN. NaN values are kept in the table and CSV but excluded from the
    histograms and means, so a missing score can neither crash the run nor
    be reported as "meaningful discrimination".

    Returns:
        0 when at least one clip was analysed, 1 when every request failed,
        2 when the environment variables or the fixtures are missing.
    """
    import math  # function-scope: the module's import block has no `math`

    def _as_score(raw) -> float:
        """Coerce an optional sub-score to float; missing/None becomes NaN."""
        return float("nan") if raw is None else float(raw)

    def _finite(scores: list) -> list:
        """Keep only the scores that can be bucketed and averaged."""
        return [s for s in scores if math.isfinite(s)]

    parser = argparse.ArgumentParser()
    parser.add_argument("--threshold", type=float, default=0.5,
                        help="Decision threshold on `confidence` (default 0.5)")
    parser.add_argument("--csv", type=Path, default=None,
                        help="Optional CSV export path")
    parser.add_argument("--sweep", action="store_true",
                        help="Also show metrics at 9 thresholds 0.1..0.9")
    args = parser.parse_args()

    if not BASE_URL or not API_KEY:
        print("DETECTOR_API_URL and DETECTOR_API_KEY must be set.", file=sys.stderr)
        return 2

    fixtures = list(iter_fixtures())
    if not fixtures:
        print(f"{YELLOW}No fixtures found in {FIXTURES_DIR}/.{RESET}", file=sys.stderr)
        print(" Expected layout:", file=sys.stderr)
        print(f" {FIXTURES_DIR}/ai/*.mp3", file=sys.stderr)
        print(f" {FIXTURES_DIR}/human/*.mp3", file=sys.stderr)
        return 2

    n_ai = sum(1 for _, is_ai in fixtures if is_ai)
    n_human = len(fixtures) - n_ai
    print(f"{BOLD}Benchmark — {BASE_URL}{RESET}")
    print(f" fixtures: {len(fixtures)} ({n_ai} AI, {n_human} human)")
    print(f" threshold: {args.threshold}")
    print()

    # --- Run ---
    rows: list[dict] = []
    print(f"{BOLD}{'path':<45} {'expect':<7} {'conf':<6} {'wav2vec':<7} {'fp':<6} {'verdict':<7}{RESET}")
    print("-" * 86)
    for path, expected_ai in fixtures:
        rel = path.relative_to(FIXTURES_DIR)
        try:
            start = time.time()
            body = analyze_clip(path)
            elapsed = time.time() - start
            conf = body["confidence"]
            # `details` may be absent or null; treat both as "no sub-scores".
            details = body.get("details") or {}
            wav2vec = _as_score(details.get("wav2vec2_score"))
            fp_score = _as_score(details.get("fingerprint_score"))
            predicted = conf >= args.threshold
            correct = predicted == expected_ai
            verdict = "AI" if predicted else "HUMAN"
            color = GREEN if correct else RED
            exp_label = "AI" if expected_ai else "HUMAN"
            print(
                f"{color}{str(rel):<45} {exp_label:<7} {conf:<6.3f} "
                f"{wav2vec:<7.3f} {fp_score:<6.3f} {verdict:<7}{RESET} "
                f"({elapsed:.1f}s)"
            )
            rows.append({
                "path": str(rel),
                "expected_ai": expected_ai,
                "confidence": conf,
                "wav2vec2_score": wav2vec,
                "fingerprint_score": fp_score,
                "elapsed_seconds": elapsed,
            })
        except Exception as e:
            # Deliberate best-effort: one failing clip (network error, 429,
            # malformed body) must not abort the rest of the benchmark.
            print(f"{RED}{str(rel):<45} ERROR: {e}{RESET}")
            rows.append({
                "path": str(rel),
                "expected_ai": expected_ai,
                "confidence": float("nan"),
                "wav2vec2_score": float("nan"),
                "fingerprint_score": float("nan"),
                "elapsed_seconds": 0.0,
                "error": str(e),
            })

    # --- Metrics ---
    clean = [r for r in rows if "error" not in r]
    if not clean:
        print(f"\n{RED}No successful runs.{RESET}")
        return 1

    failed = len(rows) - len(clean)
    if failed:
        # Make the reduced sample size visible next to the metrics.
        print()
        print(f"{YELLOW}{failed} clip(s) failed and are excluded from the metrics below.{RESET}")

    metrics = compute_metrics(clean, args.threshold)
    print()
    print(f"{BOLD}Confusion matrix @ threshold={args.threshold}{RESET}")
    print(" predicted AI predicted HUMAN")
    print(f" actual AI {metrics['tp']:>4d} {metrics['fn']:>4d}")
    print(f" actual HUMAN {metrics['fp']:>4d} {metrics['tn']:>4d}")
    print()
    print(f" accuracy {metrics['accuracy']:.3f}")
    print(f" precision {metrics['precision']:.3f} (of predicted-AI, how many were AI)")
    print(f" recall {metrics['recall']:.3f} (of actual-AI, how many we caught)")
    print(f" f1 {metrics['f1']:.3f}")

    # --- Score distributions (this is what reveals whether the model discriminates) ---
    # NaN sub-scores are dropped here: they cannot be bucketed, and a NaN mean
    # would fail both `<` comparisons below and be reported as "meaningful".
    ai_scores = _finite([r["wav2vec2_score"] for r in clean if r["expected_ai"]])
    human_scores = _finite([r["wav2vec2_score"] for r in clean if not r["expected_ai"]])
    print()
    print(f"{BOLD}wav2vec2 score distribution — AI clips (n={len(ai_scores)}){RESET}")
    print(text_histogram(ai_scores))
    print()
    print(f"{BOLD}wav2vec2 score distribution — HUMAN clips (n={len(human_scores)}){RESET}")
    print(text_histogram(human_scores))
    print()

    # Quick sanity read: overlapping means = the model does not discriminate.
    if ai_scores and human_scores:
        mean_ai = sum(ai_scores) / len(ai_scores)
        mean_human = sum(human_scores) / len(human_scores)
        separation = abs(mean_ai - mean_human)
        print(f" mean(AI wav2vec2) = {mean_ai:.3f}")
        print(f" mean(HUMAN wav2vec2) = {mean_human:.3f}")
        print(f" separation = {separation:.3f}")
        if separation < 0.1:
            print(f" {RED}✗ model does not discriminate — replace it.{RESET}")
        elif separation < 0.3:
            print(f" {YELLOW}⚠ weak discrimination — consider alternatives.{RESET}")
        else:
            print(f" {GREEN}✓ meaningful discrimination.{RESET}")

    # --- Threshold sweep ---
    if args.sweep:
        print()
        print(f"{BOLD}Threshold sweep{RESET}")
        print(f" {'t':<6} {'accuracy':<10} {'precision':<11} {'recall':<8} {'f1':<6}")
        for t in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            m = compute_metrics(clean, t)
            print(f" {t:<6.2f} {m['accuracy']:<10.3f} "
                  f"{m['precision']:<11.3f} {m['recall']:<8.3f} {m['f1']:<6.3f}")

    # --- CSV export ---
    if args.csv:
        # All rows are written, including failed ones, so errors stay auditable.
        with args.csv.open("w", newline="") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=[
                    "path", "expected_ai", "confidence",
                    "wav2vec2_score", "fingerprint_score",
                    "elapsed_seconds", "error",
                ],
            )
            writer.writeheader()
            for r in rows:
                writer.writerow({k: r.get(k, "") for k in writer.fieldnames})
        print()
        print(f"CSV written to {args.csv}")

    return 0
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
if __name__ == "__main__":
|
| 302 |
+
sys.exit(main())
|
tests/fixtures/.gitignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Patterns containing a slash are matched relative to the directory holding
# this .gitignore (tests/fixtures/), so they must not repeat "fixtures/".
*/*.mp3
*/*.wav
*/*.m4a
*/*.aac
*/*.flac
*/*.ogg
|
tests/fixtures/.gitkeep
ADDED
|
File without changes
|
tests/fixtures/README.md
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Benchmark fixtures
|
| 2 |
+
|
| 3 |
+
Drop audio clips here to run `benchmark.py` against the deployed backend.
|
| 4 |
+
The folder structure **is** the ground-truth label:
|
| 5 |
+
|
| 6 |
+
```
|
| 7 |
+
fixtures/
|
| 8 |
+
├── ai/     ← AI-generated clips (expected: is_ai=true)
|
| 9 |
+
└── human/  ← real human-performed / human-produced clips (expected: is_ai=false)
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
Supported: `.mp3 .wav .m4a .aac .flac .ogg`. Clips are gitignored — never
|
| 13 |
+
commit copyrighted audio or paid-generator outputs.
|
| 14 |
+
|
| 15 |
+
## How many clips?
|
| 16 |
+
|
| 17 |
+
| Use case | Per folder | Total |
|
| 18 |
+
|---|---|---|
|
| 19 |
+
| Quick sanity check | 5 | 10 |
|
| 20 |
+
| Meaningful comparison between models | 20–30 | 40–60 |
|
| 21 |
+
| Publishable numbers | 100+ | 200+ |
|
| 22 |
+
|
| 23 |
+
For the **model replacement decision**, 20–30 per folder (40–60 total) is
|
| 24 |
+
enough to distinguish a broken model from a working one and to choose
|
| 25 |
+
among 2–3 candidates.
|
| 26 |
+
|
| 27 |
+
## Collection tips
|
| 28 |
+
|
| 29 |
+
### AI clips
|
| 30 |
+
- **Suno**: 5–10 across genres (rock, pop, hip-hop, classical, EDM)
|
| 31 |
+
- **Udio**: 5–10 different prompts
|
| 32 |
+
- **ElevenLabs**: 3–5 AI vocals / music
|
| 33 |
+
- **Soundraw / AIVA / Boomy / Mubert**: 1–2 each for coverage
|
| 34 |
+
- **Style variety matters more than quantity** — if all your AI clips are
|
| 35 |
+
Suno pop songs, you're only measuring Suno-pop detection.
|
| 36 |
+
|
| 37 |
+
### Human clips
|
| 38 |
+
- **Varied decades**: 1970s – 2020s
|
| 39 |
+
- **Varied production quality**: studio albums, live recordings,
|
| 40 |
+
lo-fi / demos, acoustic, heavy production
|
| 41 |
+
- **Varied sources**:
|
| 42 |
+
- Your own Apple Music library export
|
| 43 |
+
- Free-to-use samples from `freemusicarchive.org` or `ccmixter.org`
|
| 44 |
+
- 30s previews from iTunes (use `fetch_apple_previews.py --json` to
|
| 45 |
+
grab URLs, then `curl` + `ffmpeg -t 20` to make fixtures)
|
| 46 |
+
- **Avoid**: recent 2024+ chart hits (might be AI-assisted); solo
|
| 47 |
+
synthesized instruments (too easy); meme songs (too out-of-dist)
|
| 48 |
+
|
| 49 |
+
## Duration
|
| 50 |
+
|
| 51 |
+
Backend rejects clips > 30s. Trim with:
|
| 52 |
+
```bash
|
| 53 |
+
ffmpeg -i input.mp3 -t 20 -c copy output.mp3
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
Or batch:
|
| 57 |
+
```bash
|
| 58 |
+
for f in *.mp3; do ffmpeg -i "$f" -t 20 -c copy "trimmed_$f"; done
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
## Running the benchmark
|
| 62 |
+
|
| 63 |
+
```bash
|
| 64 |
+
cd backend/tests
|
| 65 |
+
export DETECTOR_API_URL='https://michal-giza-audio-detector-backend.hf.space'
|
| 66 |
+
export DETECTOR_API_KEY='...'
|
| 67 |
+
|
| 68 |
+
python3 benchmark.py # basic run
|
| 69 |
+
python3 benchmark.py --sweep # try 9 thresholds
|
| 70 |
+
python3 benchmark.py --csv results.csv # also export CSV
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
## What to look at
|
| 74 |
+
|
| 75 |
+
The **score distribution** section is more important than the accuracy
|
| 76 |
+
number. If the wav2vec2 score histograms for AI and HUMAN clips
|
| 77 |
+
**overlap completely**, the model isn't discriminating — it's just
|
| 78 |
+
returning the same value for everything. No threshold will save it.
|
| 79 |
+
|
| 80 |
+
If they separate cleanly (AI scores cluster high, HUMAN scores cluster
|
| 81 |
+
low or vice-versa), the model is working and threshold tuning could
|
| 82 |
+
produce a usable classifier.
|
| 83 |
+
|
| 84 |
+
## Quota awareness
|
| 85 |
+
|
| 86 |
+
Backend enforces **50 requests/IP/day**. A 40-clip benchmark run uses
|
| 87 |
+
40 of those. If you hit the quota you'll see `429` responses — rerun
|
| 88 |
+
tomorrow, or temporarily increase `DAILY_LIMIT` in `main.py` for eval.
|