# audio-detector-backend/tests/test_ensemble.py
# michal-giza's picture
# music detection model update
# 2aefee4 verified
"""Unit tests for the 3-model ensemble math in `_finalise_score`.
These pin the product behaviour the user explicitly asked for in the
build-15 spec: when 2 of 3 models say AI but the third disagrees, the
combined score reflects that disagreement and the verdict is
issued with moderate (not maximal) confidence.
Also locks in the graceful 2-model fallback when the music model
fails to load so the product never silently regresses if the upstream
HF model goes 404.
"""
from __future__ import annotations
import math
import os
import sys
from pathlib import Path
import pytest
# Make `import main` work when pytest is invoked from the repo root.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import main # noqa: E402
@pytest.fixture(autouse=True)
def _reset_music_model_state():
    """Put `main.music_classifier_weights` back after every test.

    Tests in this module flip that attribute between None and a
    sentinel object. This autouse fixture snapshots the value before
    the test body runs and writes it back afterwards, so one test's
    toggle never leaks into the next.
    """
    saved_weights = main.music_classifier_weights
    yield
    main.music_classifier_weights = saved_weights
# ---------------------------------------------------------------------------
# 3-model ensemble (all weights present)
# ---------------------------------------------------------------------------
class _SentinelArray:
"""Stand-in for a numpy ndarray — `_finalise_score` only checks
`is not None`, never inspects shape or values, so we can avoid a
numpy import in the test."""
def _enable_music_model():
    """Simulate a loaded music model by installing a non-None sentinel."""
    sentinel = _SentinelArray()
    main.music_classifier_weights = sentinel
def _disable_music_model():
    """Simulate a music model that failed to load by clearing its weights."""
    main.music_classifier_weights = None
def test_all_three_models_strongly_agree_on_ai():
    """Unanimous 0.95 from every model keeps the combined score at 0.95."""
    _enable_music_model()
    unanimous = 0.95
    result = main._finalise_score(
        model_score=unanimous,
        fingerprint_score=unanimous,
        music_model_score=unanimous,
    )
    assert result.is_ai is True
    assert math.isclose(result.confidence, unanimous, abs_tol=1e-6)
    # The strategy label must advertise that the 3-model path was taken.
    assert "3-model" in result.details["ensemble_strategy"]
def test_all_three_models_strongly_agree_on_real():
    """Unanimous 0.05 from every model gives a 0.05 score and a not-AI verdict."""
    _enable_music_model()
    unanimous = 0.05
    result = main._finalise_score(
        model_score=unanimous,
        fingerprint_score=unanimous,
        music_model_score=unanimous,
    )
    assert result.is_ai is False
    assert math.isclose(result.confidence, unanimous, abs_tol=1e-6)
def test_two_models_say_ai_one_dissents__verdict_ai_but_lower_confidence():
    """Two AI votes plus one human vote: verdict stays AI, confidence drops.

    wav2vec2 and the music model both report 0.95 while the fingerprint
    reports 0.05. The expected value uses the 0.30 / 0.50 / 0.20 weights
    for wav2vec2 / music model / fingerprint respectively.
    """
    _enable_music_model()
    ai_vote, human_vote = 0.95, 0.05
    expected = ai_vote * 0.30 + ai_vote * 0.50 + human_vote * 0.20

    result = main._finalise_score(
        model_score=ai_vote,
        fingerprint_score=human_vote,
        music_model_score=ai_vote,
    )

    assert math.isclose(result.confidence, expected, abs_tol=1e-6)
    assert result.is_ai is True
    # Key product behaviour: one dissenter must drag the score away from
    # near-certainty toward a more honest middle value.
    assert result.confidence < 0.85, (
        f"Two-of-three agreement should NOT yield >0.85 confidence — "
        f"got {result.confidence}"
    )
    assert result.confidence > 0.5, "Majority vote should still tip into AI verdict"
def test_two_models_say_real_one_dissents__verdict_real():
    """Two human votes against a music-model AI vote land near the midpoint.

    0.05*0.30 + 0.95*0.50 + 0.05*0.20 = 0.015 + 0.475 + 0.01 = 0.5, i.e.
    right on the decision boundary. Float rounding could tip `is_ai`
    either way there, so only the "pulled toward the middle" band is
    asserted and the verdict flag is deliberately left unchecked.
    """
    _enable_music_model()
    result = main._finalise_score(
        model_score=0.05,
        fingerprint_score=0.05,
        music_model_score=0.95,
    )
    assert 0.45 < result.confidence < 0.55
def test_music_model_dominant_when_strongest_signal():
    """A lone music-model AI vote contributes exactly its 0.50 weight.

    The music model carries the largest single weight, reflecting the
    intent that the music-native signal matters most on a
    music-positioned product.
    """
    _enable_music_model()
    # The music model alone says AI; wav2vec2 and fingerprint say real.
    result = main._finalise_score(
        model_score=0.0,
        fingerprint_score=0.0,
        music_model_score=1.0,
    )
    # 0.0*0.30 + 1.0*0.50 + 0.0*0.20 = 0.50
    assert math.isclose(result.confidence, 0.50, abs_tol=1e-6)
def test_details_payload_includes_all_three_scores():
    """`details` must expose every per-model sub-score under its known key.

    The Flutter ConfidenceBreakdown widget reads these three keys, so
    renaming or dropping one is a backwards-incompatible client change.
    """
    _enable_music_model()
    result = main._finalise_score(
        model_score=0.6, fingerprint_score=0.4, music_model_score=0.7
    )
    expected_sub_scores = {
        "wav2vec2_score": 0.6,
        "music_model_score": 0.7,
        "fingerprint_score": 0.4,
    }
    # Presence first, so a missing key is reported as such rather than
    # surfacing as a KeyError in the value comparison below.
    for key in expected_sub_scores:
        assert key in result.details
    for key, value in expected_sub_scores.items():
        assert result.details[key] == pytest.approx(value)
# ---------------------------------------------------------------------------
# 2-model fallback (music model offline)
# ---------------------------------------------------------------------------
def test_fallback_when_music_model_offline_uses_legacy_70_30_weights():
    """Without music-model weights the score reverts to the 0.7 / 0.3 split.

    Before build 15 the ensemble weighted wav2vec2 at 0.7 and the
    fingerprint at 0.3. If the music model cannot be loaded (HF blip,
    404, ...) `_finalise_score` must fall back to exactly that split so
    users on the failed-load path see no behaviour change.
    """
    _disable_music_model()
    wav2vec2, fingerprint = 0.8, 0.4
    expected = wav2vec2 * 0.7 + fingerprint * 0.3

    result = main._finalise_score(
        model_score=wav2vec2,
        fingerprint_score=fingerprint,
        music_model_score=0.0,
    )

    assert math.isclose(result.confidence, expected, abs_tol=1e-6)
    assert result.is_ai is True
    assert "music model offline" in result.details["ensemble_strategy"]
def test_fallback_includes_music_model_score_field_at_zero():
    """The fallback path still reports `music_model_score`, valued 0.0.

    Flutter clients always render the third bar, so the key has to be
    present even when the music model is offline.
    """
    _disable_music_model()
    result = main._finalise_score(
        model_score=0.5, fingerprint_score=0.5, music_model_score=0.0
    )
    reported = result.details.get("music_model_score")
    assert reported == 0.0
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
def test_zero_input_returns_zero_score():
    """All-zero sub-scores give a zero score regardless of model state.

    The check runs once with the music model enabled (3-model path) and
    once with it disabled (2-model fallback), so the "regardless of
    model state" promise is actually exercised on both paths. The
    autouse fixture restores the module attribute afterwards.
    """
    for set_model_state in (_enable_music_model, _disable_music_model):
        set_model_state()
        r = main._finalise_score(0.0, 0.0, 0.0)
        # Name the failing path so a regression is easy to localise.
        assert r.confidence == 0.0, set_model_state.__name__
        assert r.is_ai is False, set_model_state.__name__
def test_all_one_input_returns_one_score():
    """All-one sub-scores on the 3-model path give a score of 1.0 and an AI verdict."""
    _enable_music_model()
    result = main._finalise_score(1.0, 1.0, 1.0)
    assert math.isclose(result.confidence, 1.0, abs_tol=1e-6)
    assert result.is_ai is True
def test_weights_sum_to_one_in_three_model_path():
    """Three identical inputs must come back unchanged on the 3-model path.

    This holds only if confidence is a weighted *average* (weights
    summing to one) rather than a plain weighted sum.
    """
    _enable_music_model()
    for score in (0.1, 0.5, 0.9):
        result = main._finalise_score(score, score, score)
        assert math.isclose(result.confidence, score, abs_tol=1e-6)
def test_weights_sum_to_one_in_fallback_path():
    """Two identical inputs must come back unchanged on the 2-model fallback."""
    _disable_music_model()
    for score in (0.1, 0.5, 0.9):
        result = main._finalise_score(score, score, music_model_score=0.0)
        assert math.isclose(result.confidence, score, abs_tol=1e-6)