Spaces:

michal-giza
/

audio-detector-backend

Running

App Files Files Community

audio-detector-backend / tests /test_ensemble.py

michal-giza

music detection model update

2aefee4 verified about 1 month ago

raw

history blame contribute delete

7.73 kB

	"""Unit tests for the 3-model ensemble math in `_finalise_score`.

	These pin the product behaviour the user explicitly asked for in the
	build-15 spec: when 2 of 3 models say AI but the third disagrees, the
	combined score reflects that disagreement and the verdict is
	issued with moderate (not maximal) confidence.

	Also locks in the graceful 2-model fallback when the music model
	fails to load so the product never silently regresses if the upstream
	HF model goes 404.
	"""

	from __future__ import annotations

	import math
	import os
	import sys
	from pathlib import Path

	import pytest

	# Make `import main` work when pytest is invoked from the repo root.
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	import main # noqa: E402


	@pytest.fixture(autouse=True)
	def _reset_music_model_state():
	    """Snapshot and restore `main.music_classifier_weights` around each test.

	    Tests in this module flip that attribute between None and a sentinel
	    object to choose which ensemble path they exercise. Recording the
	    value up front and writing it back afterwards keeps one test's choice
	    from leaking into the next.
	    """
	    saved_weights = main.music_classifier_weights
	    yield
	    # Teardown: undo whatever the test assigned.
	    main.music_classifier_weights = saved_weights


	# ---------------------------------------------------------------------------
	# 3-model ensemble (all weights present)
	# ---------------------------------------------------------------------------

	class _SentinelArray:
	    """Placeholder object used in place of real music-model weights.

	    Per the original author's note, `_finalise_score` only checks that the
	    weights are `is not None` and never inspects shape or values, so an
	    empty class is enough and the tests can skip importing numpy.
	    """


	def _enable_music_model():
	    """Install a sentinel as the music-model weights so they are not None."""
	    placeholder_weights = _SentinelArray()
	    main.music_classifier_weights = placeholder_weights


	def _disable_music_model():
	    """Set the music-model weights to None, as if the model never loaded."""
	    # None is the marker the tests use for "music model unavailable".
	    main.music_classifier_weights = None


	def test_all_three_models_strongly_agree_on_ai():
	    """Unanimous 0.95 scores give an AI verdict at 0.95 confidence."""
	    _enable_music_model()
	    unanimous = 0.95
	    result = main._finalise_score(
	        model_score=unanimous,
	        fingerprint_score=unanimous,
	        music_model_score=unanimous,
	    )
	    assert result.is_ai is True
	    assert math.isclose(result.confidence, unanimous, abs_tol=1e-6)
	    # The strategy label must identify the 3-model path.
	    assert "3-model" in result.details["ensemble_strategy"]


	def test_all_three_models_strongly_agree_on_real():
	    """Unanimous 0.05 scores give a not-AI verdict at 0.05 confidence."""
	    _enable_music_model()
	    unanimous = 0.05
	    result = main._finalise_score(
	        model_score=unanimous,
	        fingerprint_score=unanimous,
	        music_model_score=unanimous,
	    )
	    assert result.is_ai is False
	    assert math.isclose(result.confidence, unanimous, abs_tol=1e-6)


	def test_two_models_say_ai_one_dissents__verdict_ai_but_lower_confidence():
	    """Two AI votes and one human vote: still AI, but visibly less certain.

	    wav2vec2 and the music model both report 0.95 while the fingerprint
	    reports 0.05. The expected score applies weights 0.30 / 0.50 / 0.20
	    to those three inputs respectively.
	    """
	    _enable_music_model()
	    wav2vec2, music, fingerprint = 0.95, 0.95, 0.05
	    expected = wav2vec2 * 0.30 + music * 0.50 + fingerprint * 0.20
	    result = main._finalise_score(
	        model_score=wav2vec2,
	        fingerprint_score=fingerprint,
	        music_model_score=music,
	    )
	    assert math.isclose(result.confidence, expected, abs_tol=1e-6)
	    assert result.is_ai is True
	    # Core requirement: the dissenting model drags the score away from
	    # near-certainty...
	    assert result.confidence < 0.85, (
	        f"Two-of-three agreement should NOT yield >0.85 confidence — "
	        f"got {result.confidence}"
	    )
	    # ...yet the majority still keeps it on the AI side of 0.5.
	    assert result.confidence > 0.5, "Majority vote should still tip into AI verdict"


	def test_two_models_say_real_one_dissents__verdict_real():
	    """Mirror case: two models say real and only the music model says AI.

	    Only the confidence band is asserted. `is_ai` is deliberately left
	    unchecked because the score lands on the decision boundary.
	    """
	    _enable_music_model()
	    result = main._finalise_score(
	        model_score=0.05,
	        fingerprint_score=0.05,
	        music_model_score=0.95,
	    )
	    # 0.05*0.30 + 0.95*0.50 + 0.05*0.20 = 0.015 + 0.475 + 0.01 = 0.5
	    # That sits right on the boundary, so floating-point rounding may
	    # flip `is_ai`. Per the original note the verdict test is `> 0.5`,
	    # which would be False at exactly 0.5. What is pinned here is only
	    # that the dissent pulls the score toward the middle.
	    assert 0.45 < result.confidence < 0.55


	def test_music_model_dominant_when_strongest_signal():
	    """A lone music-model AI vote alone reaches 0.50 confidence.

	    Pins the intent that the music model carries the largest single
	    weight in the ensemble.
	    """
	    _enable_music_model()
	    # The music model is the only one voting AI; the other two vote real.
	    result = main._finalise_score(
	        model_score=0.0,
	        fingerprint_score=0.0,
	        music_model_score=1.0,
	    )
	    # Expected: 0.0*0.30 + 1.0*0.50 + 0.0*0.20 = 0.50
	    assert math.isclose(result.confidence, 0.50, abs_tol=1e-6)


	def test_details_payload_includes_all_three_scores():
	    """`details` exposes each sub-score under its established key.

	    Per the original note, the Flutter ConfidenceBreakdown widget reads
	    these three keys, so renaming or dropping one breaks existing clients.
	    """
	    _enable_music_model()
	    result = main._finalise_score(
	        model_score=0.6, fingerprint_score=0.4, music_model_score=0.7
	    )
	    expected_by_key = {
	        "wav2vec2_score": 0.6,
	        "music_model_score": 0.7,
	        "fingerprint_score": 0.4,
	    }
	    # Presence first, so a missing key fails as an assertion, not a KeyError.
	    for key in expected_by_key:
	        assert key in result.details
	    for key, expected in expected_by_key.items():
	        assert result.details[key] == pytest.approx(expected)


	# ---------------------------------------------------------------------------
	# 2-model fallback (music model offline)
	# ---------------------------------------------------------------------------

	def test_fallback_when_music_model_offline_uses_legacy_70_30_weights():
	    """Without music-model weights the score is 0.7*wav2vec2 + 0.3*fingerprint.

	    Per the original note this is the pre-build-15 split, so a failed
	    music-model load (HF blip, 404, ...) must not change behaviour for
	    users on that path.
	    """
	    _disable_music_model()
	    wav2vec2, fingerprint = 0.8, 0.4
	    result = main._finalise_score(
	        model_score=wav2vec2,
	        fingerprint_score=fingerprint,
	        music_model_score=0.0,
	    )
	    expected = wav2vec2 * 0.7 + fingerprint * 0.3
	    assert math.isclose(result.confidence, expected, abs_tol=1e-6)
	    assert result.is_ai is True
	    # The strategy label must say that the fallback path was taken.
	    assert "music model offline" in result.details["ensemble_strategy"]


	def test_fallback_includes_music_model_score_field_at_zero():
	    """On the fallback path `music_model_score` is still reported, as 0.0.

	    Per the original note, Flutter clients always render the third bar,
	    so the key must never be missing.
	    """
	    _disable_music_model()
	    result = main._finalise_score(
	        model_score=0.5,
	        fingerprint_score=0.5,
	        music_model_score=0.0,
	    )
	    # `.get` returns None for a missing key, which also fails this check.
	    reported = result.details.get("music_model_score")
	    assert reported == 0.0


	# ---------------------------------------------------------------------------
	# Edge cases
	# ---------------------------------------------------------------------------

	def test_zero_input_returns_zero_score():
	    """All-zero sub-scores give a 0.0 score and a not-AI verdict.

	    The same assertions run with the music model enabled (3-model path)
	    and disabled (2-model fallback), so the "regardless of model state"
	    promise is exercised on both paths rather than only the first.
	    """
	    for label, set_model_state in (
	        ("music model enabled", _enable_music_model),
	        ("music model disabled", _disable_music_model),
	    ):
	        set_model_state()
	        r = main._finalise_score(0.0, 0.0, 0.0)
	        # The label tells a failure apart by which path produced it.
	        assert r.confidence == 0.0, label
	        assert r.is_ai is False, label


	def test_all_one_input_returns_one_score():
	    """All-one sub-scores on the 3-model path give ~1.0 and an AI verdict."""
	    _enable_music_model()
	    result = main._finalise_score(1.0, 1.0, 1.0)
	    assert math.isclose(result.confidence, 1.0, abs_tol=1e-6)
	    assert result.is_ai is True


	def test_weights_sum_to_one_in_three_model_path():
	    """Identical inputs come back unchanged on the 3-model path.

	    That only holds if the score is a weighted average whose weights sum
	    to one, rather than a plain sum.
	    """
	    _enable_music_model()
	    for score in (0.1, 0.5, 0.9):
	        result = main._finalise_score(score, score, score)
	        assert math.isclose(result.confidence, score, abs_tol=1e-6)


	def test_weights_sum_to_one_in_fallback_path():
	    """Identical inputs come back unchanged on the 2-model fallback path."""
	    _disable_music_model()
	    for score in (0.1, 0.5, 0.9):
	        # The music score is passed as 0.0; the assertion requires that
	        # it does not shift the result on this path.
	        result = main._finalise_score(score, score, music_model_score=0.0)
	        assert math.isclose(result.confidence, score, abs_tol=1e-6)