Spaces:

michal-giza
/

audio-detector-backend

Running

App Files Files Community

michal-giza commited on Apr 17

Commit

70eab61

verified ·

1 Parent(s): 9cedd4f

Upload 11 files

Browse files

Files changed (5) hide show

main.py +20 -4
tests/benchmark.py +302 -0
tests/fixtures/.gitignore +6 -0
tests/fixtures/.gitkeep +0 -0
tests/fixtures/README.md +88 -0

main.py CHANGED Viewed

@@ -61,7 +61,19 @@ logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
-MODEL_ID = "MelodyMachine/Deepfake-audio-detection-V2"
 API_KEY = os.getenv("DETECTOR_API_KEY", "your-fallback-test-key")
 MAX_FILE_SIZE = 5 * 1024 * 1024  # 5 MB
 MIN_AUDIO_DURATION = 1.0  # seconds
@@ -359,8 +371,9 @@ def run_detection_pipeline(audio_data: np.ndarray, sr: int) -> AnalysisResult:
     model_score = 0.0
     if classifier:
         results = classifier(audio_data)
         for res in results:
-            if res["label"].lower() in ["fake", "ai", "synthetic"]:
                 model_score = res["score"]
                 break
@@ -508,8 +521,9 @@ async def _stream_url_analysis(url: str, request_id: str):
     model_score = 0.0
     if classifier:
         results = await asyncio.to_thread(classifier, audio_data)
         for res in results:
-            if res["label"].lower() in ["fake", "ai", "synthetic"]:
                 model_score = res["score"]
                 break
@@ -556,8 +570,9 @@ async def _stream_file_analysis(file_path: str, request_id: str):
     model_score = 0.0
     if classifier:
         results = await asyncio.to_thread(classifier, audio_data)
         for res in results:
-            if res["label"].lower() in ["fake", "ai", "synthetic"]:
                 model_score = res["score"]
                 break
@@ -825,6 +840,7 @@ async def on_startup():
     logger.info("=== CheckAI Backend Starting ===")
     logger.info(f"Model: {MODEL_ID}")
     logger.info(f"Global concurrency: {MAX_GLOBAL_CONCURRENCY}")
     logger.info(f"Daily limit per IP: {DAILY_LIMIT}")
     logger.info(f"Allowed origins: {ALLOWED_ORIGINS}")

 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
+# MODEL_ID is env-configurable so we can A/B test candidate detectors without
+# redeploying. Verified so far:
+#   - "MelodyMachine/Deepfake-audio-detection-V2" → BROKEN (constant ~1.0 on
+#     both real music and AI music; do not use). NOTE: it is still the
+#     fallback default below, so set MODEL_ID explicitly when deploying.
+#   - "mo-thecreator/Deepfake-audio-detection"    → to evaluate (speech-trained)
+MODEL_ID = os.getenv("MODEL_ID", "MelodyMachine/Deepfake-audio-detection-V2")
+# Which pipeline labels count as "this is AI"? Comma-separated, case-insensitive.
+# Some HF models use LABEL_0 / LABEL_1 instead of semantic names — check the
+# model's config.json and set this accordingly.
+_ai_labels_raw = os.getenv("MODEL_AI_LABELS", "fake,ai,synthetic,spoof,label_1")
+AI_LABELS = {s.strip().lower() for s in _ai_labels_raw.split(",") if s.strip()}
 API_KEY = os.getenv("DETECTOR_API_KEY", "your-fallback-test-key")
 MAX_FILE_SIZE = 5 * 1024 * 1024  # 5 MB
 MIN_AUDIO_DURATION = 1.0  # seconds
     model_score = 0.0
     if classifier:
         results = classifier(audio_data)
+        logger.info(f"[model] raw output: {results}")
         for res in results:
+            if res["label"].lower() in AI_LABELS:
                 model_score = res["score"]
                 break
     model_score = 0.0
     if classifier:
         results = await asyncio.to_thread(classifier, audio_data)
+        logger.info(f"[model] raw output: {results}")
         for res in results:
+            if res["label"].lower() in AI_LABELS:
                 model_score = res["score"]
                 break
     model_score = 0.0
     if classifier:
         results = await asyncio.to_thread(classifier, audio_data)
+        logger.info(f"[model] raw output: {results}")
         for res in results:
+            if res["label"].lower() in AI_LABELS:
                 model_score = res["score"]
                 break
     logger.info("=== CheckAI Backend Starting ===")
     logger.info(f"Model: {MODEL_ID}")
+    logger.info(f"AI labels: {sorted(AI_LABELS)}")
     logger.info(f"Global concurrency: {MAX_GLOBAL_CONCURRENCY}")
     logger.info(f"Daily limit per IP: {DAILY_LIMIT}")
     logger.info(f"Allowed origins: {ALLOWED_ORIGINS}")

tests/benchmark.py ADDED Viewed

	@@ -0,0 +1,302 @@

+#!/usr/bin/env python3
+"""
+Model benchmark harness.
+Runs every audio file in `backend/tests/fixtures/{ai,human}/` against the
+deployed backend's `/analyze/upload` endpoint and reports:
+  * Confusion matrix (TP / FP / TN / FN)
+  * Accuracy, precision, recall, F1
+  * Per-clip table: expected vs. observed + raw scores
+  * Score distribution histogram (text bar chart)
+  * CSV export for spreadsheet analysis
+Usage:
+    export DETECTOR_API_URL='https://michal-giza-audio-detector-backend.hf.space'
+    export DETECTOR_API_KEY='...'
+    # 1. Drop AI clips into backend/tests/fixtures/ai/*.{mp3,wav,m4a}
+    # 2. Drop HUMAN clips into backend/tests/fixtures/human/*.{mp3,wav,m4a}
+    # 3. Run:
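+    python3 benchmark.py                     # verbose
+    python3 benchmark.py --csv results.csv   # also write CSV
+    python3 benchmark.py --threshold 0.65    # explore other decision thresholds
+    python3 benchmark.py --sweep             # also report metrics at thresholds 0.1..0.9
+Exit code 0 on benchmark completion (regardless of model quality).
+Exit 1 if every request failed (no successful runs to score).
+Exit 2 if no fixtures are present, or if DETECTOR_API_URL / DETECTOR_API_KEY is unset.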
+"""
+import argparse
+import csv
+import os
+import sys
+import time
+from pathlib import Path
+from typing import Iterator
+import requests
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
# ANSI escape sequences used to colorize terminal output.
RESET = "\033[0m"
BOLD = "\033[1m"
RED = "\033[91m"
GREEN = "\033[92m"
YELLOW = "\033[93m"
CYAN = "\033[96m"

# Backend location and credentials come from the environment. A trailing
# slash on the URL is removed so endpoint paths can be appended directly.
API_KEY = os.environ.get("DETECTOR_API_KEY", "")
BASE_URL = os.environ.get("DETECTOR_API_URL", "").rstrip("/")

# Fixture clips live in a `fixtures/` directory next to this script.
FIXTURES_DIR = Path(__file__).parent / "fixtures"

# MIME type sent with the upload for each supported audio extension.
MIME_FOR_EXT = dict(
    [
        (".mp3", "audio/mpeg"),
        (".wav", "audio/wav"),
        (".m4a", "audio/mp4"),
        (".aac", "audio/aac"),
        (".flac", "audio/flac"),
        (".ogg", "audio/ogg"),
    ]
)

# Every extension with a known MIME type is accepted as a fixture.
AUDIO_EXTS = set(MIME_FOR_EXT)
+# ---------------------------------------------------------------------------
+# Model-under-test wrapper
+# ---------------------------------------------------------------------------
def analyze_clip(path: Path, timeout: int = 120) -> dict:
    """Upload a single clip to `/analyze/upload` and return the decoded JSON.

    Args:
        path: Audio file to send as the multipart `file` field.
        timeout: Value forwarded to `requests.post` as its `timeout`.

    Returns:
        The response body parsed as JSON.

    Raises:
        RuntimeError: If the backend answers with any status other than 200;
            the message carries the status code and the first 200 characters
            of the response text.
    """
    # Unknown extensions fall back to a generic binary content type.
    content_type = MIME_FOR_EXT.get(path.suffix.lower(), "application/octet-stream")
    endpoint = f"{BASE_URL}/analyze/upload"

    with path.open("rb") as stream:
        upload = {"file": (path.name, stream, content_type)}
        response = requests.post(
            endpoint,
            headers={"X-Api-Key": API_KEY},
            files=upload,
            timeout=timeout,
        )

    if response.status_code == 200:
        return response.json()
    raise RuntimeError(
        f"HTTP {response.status_code}: {response.text[:200]}"
    )
def iter_fixtures() -> Iterator[tuple[Path, bool]]:
    """Walk the fixture folders and yield `(clip_path, is_ai_expected)` pairs.

    The `ai/` folder is visited first (label True), then `human/` (label
    False). A missing folder is skipped. Within a folder, entries are yielded
    in sorted order and only regular files with a supported audio extension
    are included.
    """
    labelled_dirs = (("ai", True), ("human", False))
    for folder_name, is_ai in labelled_dirs:
        folder = FIXTURES_DIR / folder_name
        if folder.exists():
            clips = (
                entry
                for entry in sorted(folder.iterdir())
                if entry.is_file() and entry.suffix.lower() in AUDIO_EXTS
            )
            for clip in clips:
                yield clip, is_ai
+# ---------------------------------------------------------------------------
+# Metrics
+# ---------------------------------------------------------------------------
def compute_metrics(rows: list[dict], threshold: float) -> dict:
    """Compute the confusion matrix and derived rates at a decision threshold.

    A row is predicted "AI" when its `confidence` is >= `threshold`; the
    ground truth is the row's `expected_ai` flag.

    Rows whose confidence is not a finite number (for example the NaN
    placeholder stored for failed requests) are skipped. Comparing NaN with
    the threshold is always False, so such rows would otherwise be silently
    counted as "predicted HUMAN" and distort TN / FN.

    Args:
        rows: Dicts with at least `confidence` and `expected_ai` keys.
        threshold: Decision threshold applied to `confidence`.

    Returns:
        Dict with `tp`, `fp`, `tn`, `fn`, `accuracy`, `precision`, `recall`,
        `f1`, `total` (number of rows scored) and `skipped` (rows ignored
        because their confidence was not a finite number). Every rate is 0.0
        when its denominator is zero.
    """
    import math  # function-scope import keeps this block self-contained

    tp = fp = tn = fn = skipped = 0
    for r in rows:
        confidence = r["confidence"]
        if not isinstance(confidence, (int, float)) or not math.isfinite(confidence):
            skipped += 1
            continue
        predicted_ai = confidence >= threshold
        actual_ai = r["expected_ai"]
        if predicted_ai and actual_ai:
            tp += 1
        elif predicted_ai:
            fp += 1
        elif actual_ai:
            fn += 1
        else:
            tn += 1

    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (
        2 * precision * recall / (precision + recall)
        if (precision + recall)
        else 0.0
    )
    return {
        "tp": tp, "fp": fp, "tn": tn, "fn": fn,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "total": total,
        "skipped": skipped,
    }
def text_histogram(values: list[float], width: int = 40, buckets: int = 20) -> str:
    """Render a small text histogram of scores expected to lie in [0, 1].

    Non-finite or non-numeric entries (NaN, +/-inf, None) are ignored instead
    of crashing `int()`. Finite values outside [0, 1] are clamped into the
    first or last bucket, so a slightly negative score cannot wrap around to
    a bucket at the other end via negative indexing.

    Args:
        values: Scores to bucket.
        width: Length in characters of the bar drawn for the fullest bucket.
        buckets: Number of equal-width buckets covering [0, 1].

    Returns:
        One line per bucket joined with newlines, followed by a note line when
        any entries were ignored; "(no data)" when nothing usable remains.
    """
    import math  # function-scope import keeps this block self-contained

    usable = [
        v for v in values
        if isinstance(v, (int, float)) and math.isfinite(v)
    ]
    if not usable:
        return "(no data)"

    counts = [0] * buckets
    for v in usable:
        # Clamp on both sides: 1.0 belongs in the last bucket, negatives in the first.
        idx = min(max(int(v * buckets), 0), buckets - 1)
        counts[idx] += 1

    peak = max(counts)  # >= 1 because at least one usable value was counted
    lines = []
    for i, c in enumerate(counts):
        lo = i / buckets
        hi = (i + 1) / buckets
        bar = "█" * int(c / peak * width)
        lines.append(f"  [{lo:.2f}-{hi:.2f})  {c:3d}  {bar}")

    skipped = len(values) - len(usable)
    if skipped:
        lines.append(f"  ({skipped} non-finite value(s) skipped)")
    return "\n".join(lines)
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
def main() -> int:
    """Run every fixture through the backend and print a benchmark report.

    Steps: parse CLI flags, upload each clip, print a per-clip table, then
    print the confusion matrix, the wav2vec2 score distributions, an optional
    threshold sweep, and optionally write a CSV of all rows (including
    failed ones).

    Returns:
        Process exit code: 0 when the benchmark completed, 1 when every
        request failed, 2 when the API URL/key or the fixtures are missing.
    """
    import math  # function-scope import keeps this block self-contained

    def to_score(value) -> float:
        """Coerce an optional backend score to float; unusable values become NaN."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return float("nan")

    parser = argparse.ArgumentParser()
    parser.add_argument("--threshold", type=float, default=0.5,
                        help="Decision threshold on `confidence` (default 0.5)")
    parser.add_argument("--csv", type=Path, default=None,
                        help="Optional CSV export path")
    parser.add_argument("--sweep", action="store_true",
                        help="Also show metrics at 9 thresholds 0.1..0.9")
    args = parser.parse_args()

    if not BASE_URL or not API_KEY:
        print("DETECTOR_API_URL and DETECTOR_API_KEY must be set.", file=sys.stderr)
        return 2

    fixtures = list(iter_fixtures())
    if not fixtures:
        print(f"{YELLOW}No fixtures found in {FIXTURES_DIR}/.{RESET}", file=sys.stderr)
        print("  Expected layout:", file=sys.stderr)
        print(f"    {FIXTURES_DIR}/ai/*.mp3", file=sys.stderr)
        print(f"    {FIXTURES_DIR}/human/*.mp3", file=sys.stderr)
        return 2

    n_ai = sum(1 for _, is_ai in fixtures if is_ai)
    n_human = len(fixtures) - n_ai
    print(f"{BOLD}Benchmark — {BASE_URL}{RESET}")
    print(f"  fixtures: {len(fixtures)}  ({n_ai} AI, {n_human} human)")
    print(f"  threshold: {args.threshold}")
    print()

    # --- Run ---
    rows: list[dict] = []
    print(f"{BOLD}{'path':<45} {'expect':<7} {'conf':<6} {'wav2vec':<7} {'fp':<6} {'verdict':<7}{RESET}")
    print("-" * 86)
    for path, expected_ai in fixtures:
        rel = path.relative_to(FIXTURES_DIR)
        try:
            # perf_counter is monotonic, so the timing is immune to clock changes.
            start = time.perf_counter()
            body = analyze_clip(path)
            elapsed = time.perf_counter() - start
            # A missing or non-numeric confidence is a failed run, not a "HUMAN" verdict.
            conf = float(body["confidence"])
            # `details` (and its keys) are optional and may be null in the reply.
            details = body.get("details") or {}
            wav2vec = to_score(details.get("wav2vec2_score"))
            fp_score = to_score(details.get("fingerprint_score"))
            predicted = conf >= args.threshold
            correct = predicted == expected_ai
            verdict = "AI" if predicted else "HUMAN"
            color = GREEN if correct else RED
            exp_label = "AI" if expected_ai else "HUMAN"
            print(
                f"{color}{str(rel):<45} {exp_label:<7} {conf:<6.3f} "
                f"{wav2vec:<7.3f} {fp_score:<6.3f} {verdict:<7}{RESET}  "
                f"({elapsed:.1f}s)"
            )
            rows.append({
                "path": str(rel),
                "expected_ai": expected_ai,
                "confidence": conf,
                "wav2vec2_score": wav2vec,
                "fingerprint_score": fp_score,
                "elapsed_seconds": elapsed,
            })
        except Exception as e:
            # Deliberate best-effort: one bad clip must not abort the whole run.
            print(f"{RED}{str(rel):<45}  ERROR: {e}{RESET}")
            rows.append({
                "path": str(rel),
                "expected_ai": expected_ai,
                "confidence": float("nan"),
                "wav2vec2_score": float("nan"),
                "fingerprint_score": float("nan"),
                "elapsed_seconds": 0.0,
                "error": str(e),
            })

    # --- Metrics ---
    clean = [r for r in rows if "error" not in r]
    if not clean:
        print(f"\n{RED}No successful runs.{RESET}")
        return 1

    failed = len(rows) - len(clean)
    if failed:
        print(f"\n{YELLOW}{failed} clip(s) failed and are excluded from the metrics below.{RESET}")

    metrics = compute_metrics(clean, args.threshold)
    print()
    print(f"{BOLD}Confusion matrix @ threshold={args.threshold}{RESET}")
    print("                predicted AI   predicted HUMAN")
    print(f"  actual AI        {metrics['tp']:>4d}           {metrics['fn']:>4d}")
    print(f"  actual HUMAN     {metrics['fp']:>4d}           {metrics['tn']:>4d}")
    print()
    print(f"  accuracy  {metrics['accuracy']:.3f}")
    print(f"  precision {metrics['precision']:.3f}  (of predicted-AI, how many were AI)")
    print(f"  recall    {metrics['recall']:.3f}  (of actual-AI, how many we caught)")
    print(f"  f1        {metrics['f1']:.3f}")

    # --- Score distributions (this is what reveals whether the model discriminates) ---
    # Only finite scores are used: the NaN placeholder for a missing
    # wav2vec2_score would otherwise poison the means, and a NaN separation
    # fails both `<` checks below and gets reported as "meaningful".
    def finite_scores(want_ai: bool) -> list[float]:
        return [
            r["wav2vec2_score"]
            for r in clean
            if r["expected_ai"] == want_ai and math.isfinite(r["wav2vec2_score"])
        ]

    ai_scores = finite_scores(True)
    human_scores = finite_scores(False)
    missing = len(clean) - len(ai_scores) - len(human_scores)
    print()
    print(f"{BOLD}wav2vec2 score distribution — AI clips (n={len(ai_scores)}){RESET}")
    print(text_histogram(ai_scores))
    print()
    print(f"{BOLD}wav2vec2 score distribution — HUMAN clips (n={len(human_scores)}){RESET}")
    print(text_histogram(human_scores))
    print()
    if missing:
        print(f"  {YELLOW}{missing} clip(s) had no usable wav2vec2 score and were left out.{RESET}")

    # Quick sanity read — means overlap = model doesn't discriminate.
    if ai_scores and human_scores:
        mean_ai = sum(ai_scores) / len(ai_scores)
        mean_human = sum(human_scores) / len(human_scores)
        separation = abs(mean_ai - mean_human)
        print(f"  mean(AI wav2vec2)     = {mean_ai:.3f}")
        print(f"  mean(HUMAN wav2vec2)  = {mean_human:.3f}")
        print(f"  separation            = {separation:.3f}")
        if separation < 0.1:
            print(f"  {RED}→ model does not discriminate — replace it.{RESET}")
        elif separation < 0.3:
            print(f"  {YELLOW}→ weak discrimination — consider alternatives.{RESET}")
        else:
            print(f"  {GREEN}→ meaningful discrimination.{RESET}")

    # --- Threshold sweep ---
    if args.sweep:
        print()
        print(f"{BOLD}Threshold sweep{RESET}")
        print(f"  {'t':<6} {'accuracy':<10} {'precision':<11} {'recall':<8} {'f1':<6}")
        for t in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            m = compute_metrics(clean, t)
            print(f"  {t:<6.2f} {m['accuracy']:<10.3f} "
                  f"{m['precision']:<11.3f} {m['recall']:<8.3f} {m['f1']:<6.3f}")

    # --- CSV export ---
    if args.csv:
        # Explicit encoding: fixture paths and error messages may be non-ASCII.
        with args.csv.open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(
                f,
                fieldnames=[
                    "path", "expected_ai", "confidence",
                    "wav2vec2_score", "fingerprint_score",
                    "elapsed_seconds", "error",
                ],
            )
            writer.writeheader()
            for r in rows:
                # Successful rows have no "error" key; write an empty cell for it.
                writer.writerow({k: r.get(k, "") for k in writer.fieldnames})
        print()
        print(f"CSV written to {args.csv}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

tests/fixtures/.gitignore ADDED Viewed

	@@ -0,0 +1,6 @@

# Patterns are resolved relative to this file's own directory
# (tests/fixtures/), so they must not repeat the "fixtures/" prefix.
# These match clips one level down, e.g. ai/*.mp3 and human/*.wav.
/*/*.mp3
/*/*.wav
/*/*.m4a
/*/*.aac
/*/*.flac
/*/*.ogg

tests/fixtures/.gitkeep ADDED Viewed

File without changes

tests/fixtures/README.md ADDED Viewed

	@@ -0,0 +1,88 @@

+# Benchmark fixtures
+Drop audio clips here to run `benchmark.py` against the deployed backend.
+The folder structure **is** the ground-truth label:
+```
+fixtures/
+├── ai/       ← AI-generated clips (expected: is_ai=true)
+└── human/    ← real human-performed / human-produced clips (expected: is_ai=false)
+```
+Supported: `.mp3 .wav .m4a .aac .flac .ogg`. Clips are gitignored — never
+commit copyrighted audio or paid-generator outputs.
+## How many clips?
+| Use case | Per folder | Total |
+|---|---|---|
+| Quick sanity check | 5 | 10 |
+| Meaningful comparison between models | 20–30 | 40–60 |
+| Publishable numbers | 100+ | 200+ |
+For the **model replacement decision**, 20–30 per folder (40–60 total) is
+enough to distinguish a broken model from a working one and to choose
+among 2–3 candidates.
+## Collection tips
+### AI clips
+- **Suno**: 5–10 across genres (rock, pop, hip-hop, classical, EDM)
+- **Udio**: 5–10 different prompts
+- **ElevenLabs**: 3–5 AI vocals / music
+- **Soundraw / AIVA / Boomy / Mubert**: 1–2 each for coverage
+- **Style variety matters more than quantity** — if all your AI clips are
+  Suno pop songs, you're only measuring Suno-pop detection.
+### Human clips
+- **Varied decades**: 1970s → 2020s
+- **Varied production quality**: studio albums, live recordings,
+  lo-fi / demos, acoustic, heavy production
+- **Varied sources**:
+  - Your own Apple Music library export
+  - Free-to-use samples from `freemusicarchive.org` or `ccmixter.org`
+  - 30s previews from iTunes (use `fetch_apple_previews.py --json` to
+    grab URLs, then `curl` + `ffmpeg -t 20` to make fixtures)
+- **Avoid**: recent 2024+ chart hits (might be AI-assisted); solo
+  synthesized instruments (too easy); meme songs (too out-of-dist)
+## Duration
+Backend rejects clips > 30s. Trim with:
+```bash
+ffmpeg -i input.mp3 -t 20 -c copy output.mp3
+```
+Or batch:
+```bash
+for f in *.mp3; do ffmpeg -i "$f" -t 20 -c copy "trimmed_$f"; done
+```
+## Running the benchmark
+```bash
+cd backend/tests
+export DETECTOR_API_URL='https://michal-giza-audio-detector-backend.hf.space'
+export DETECTOR_API_KEY='...'
+python3 benchmark.py                         # basic run
+python3 benchmark.py --sweep                 # try 9 thresholds
+python3 benchmark.py --csv results.csv       # also export CSV
+```
+## What to look at
+The **score distribution** section is more important than the accuracy
+number. If the wav2vec2 score histograms for AI and HUMAN clips
+**overlap completely**, the model isn't discriminating — it's just
+returning the same value for everything. No threshold will save it.
+If they separate cleanly (AI scores cluster high, HUMAN scores cluster
+low or vice-versa), the model is working and threshold tuning could
+produce a usable classifier.
+## Quota awareness
+Backend enforces **50 requests/IP/day**. A 40-clip benchmark run uses
+40 of those. If you hit the quota you'll see `429` responses — rerun
+tomorrow, or temporarily increase `DAILY_LIMIT` in `main.py` for eval.