"""Unit tests for the 3-model ensemble math in `_finalise_score`. These pin the product behaviour the user explicitly asked for in the build-15 spec: when 2 of 3 models say AI but the third disagrees, the combined score reflects that disagreement and the verdict is issued with moderate (not maximal) confidence. Also locks in the graceful 2-model fallback when the music model fails to load so the product never silently regresses if the upstream HF model goes 404. """ from __future__ import annotations import math import os import sys from pathlib import Path import pytest # Make `import main` work when pytest is invoked from the repo root. sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) import main # noqa: E402 @pytest.fixture(autouse=True) def _reset_music_model_state(): """Each test owns its own view of whether the music model loaded. We toggle `main.music_classifier_weights` between None and a sentinel array; the actual weight values don't matter to `_finalise_score`, only whether they are present. """ original = main.music_classifier_weights yield main.music_classifier_weights = original # --------------------------------------------------------------------------- # 3-model ensemble (all weights present) # --------------------------------------------------------------------------- class _SentinelArray: """Stand-in for a numpy ndarray — `_finalise_score` only checks `is not None`, never inspects shape or values, so we can avoid a numpy import in the test.""" def _enable_music_model(): main.music_classifier_weights = _SentinelArray() def _disable_music_model(): main.music_classifier_weights = None def test_all_three_models_strongly_agree_on_ai(): """Confident AI from all three → confidence ≈ max.""" _enable_music_model() r = main._finalise_score( model_score=0.95, fingerprint_score=0.95, music_model_score=0.95 ) assert r.is_ai is True assert math.isclose(r.confidence, 0.95, abs_tol=1e-6) assert "3-model" in r.details["ensemble_strategy"] def test_all_three_models_strongly_agree_on_real(): """Confident human from all three → low confidence (i.e. not-AI).""" _enable_music_model() r = main._finalise_score( model_score=0.05, fingerprint_score=0.05, music_model_score=0.05 ) assert r.is_ai is False assert math.isclose(r.confidence, 0.05, abs_tol=1e-6) def test_two_models_say_ai_one_dissents__verdict_ai_but_lower_confidence(): """The behaviour the user spec'd: 2 of 3 say AI, dissenting model pulls the combined score toward the middle.""" _enable_music_model() # wav2vec2 = AI (0.95), music_model = AI (0.95), fingerprint = human (0.05). # Weights: 0.30 + 0.50 + 0.20. expected = 0.95 * 0.30 + 0.95 * 0.50 + 0.05 * 0.20 r = main._finalise_score( model_score=0.95, fingerprint_score=0.05, music_model_score=0.95 ) assert math.isclose(r.confidence, expected, abs_tol=1e-6) assert r.is_ai is True # The crucial behaviour: the dissent has moved the verdict away # from "near-1.0 confident" toward a more honest middle. assert r.confidence < 0.85, ( f"Two-of-three agreement should NOT yield >0.85 confidence — " f"got {r.confidence}" ) assert r.confidence > 0.5, "Majority vote should still tip into AI verdict" def test_two_models_say_real_one_dissents__verdict_real(): """Symmetric inverse: 2 say real, 1 says AI → verdict real.""" _enable_music_model() r = main._finalise_score( model_score=0.05, fingerprint_score=0.05, music_model_score=0.95 ) # 0.05*0.30 + 0.95*0.50 + 0.05*0.20 = 0.015 + 0.475 + 0.01 = 0.5 # Right at the boundary; depending on rounding, is_ai may flip. # The spec'd behaviour is "the dissent pulls toward middle" and # at exactly 0.5 the verdict is `> 0.5` so == False. assert r.confidence > 0.45 and r.confidence < 0.55 def test_music_model_dominant_when_strongest_signal(): """Highest single-model weight is the music model. Verifies the intent that music-native signal carries the most weight on a music-positioned product.""" _enable_music_model() # Only the music model says AI; the other two say real. r = main._finalise_score( model_score=0.0, fingerprint_score=0.0, music_model_score=1.0 ) # Expected: 0.0*0.30 + 1.0*0.50 + 0.0*0.20 = 0.50 assert math.isclose(r.confidence, 0.50, abs_tol=1e-6) def test_details_payload_includes_all_three_scores(): """Flutter's ConfidenceBreakdown widget reads the three sub-scores out of `details`. Renaming or dropping any of them is a backwards-incompatible client change.""" _enable_music_model() r = main._finalise_score( model_score=0.6, fingerprint_score=0.4, music_model_score=0.7 ) assert "wav2vec2_score" in r.details assert "music_model_score" in r.details assert "fingerprint_score" in r.details assert r.details["wav2vec2_score"] == pytest.approx(0.6) assert r.details["music_model_score"] == pytest.approx(0.7) assert r.details["fingerprint_score"] == pytest.approx(0.4) # --------------------------------------------------------------------------- # 2-model fallback (music model offline) # --------------------------------------------------------------------------- def test_fallback_when_music_model_offline_uses_legacy_70_30_weights(): """The pre-build-15 ensemble had wav2vec2 at 0.7 and fingerprint at 0.3. When the music model fails to load (HF blip, 404, etc), `_finalise_score` MUST gracefully revert to that exact split so users on the failed-load path see no behaviour change at all.""" _disable_music_model() r = main._finalise_score( model_score=0.8, fingerprint_score=0.4, music_model_score=0.0 ) expected = 0.8 * 0.7 + 0.4 * 0.3 assert math.isclose(r.confidence, expected, abs_tol=1e-6) assert r.is_ai is True assert "music model offline" in r.details["ensemble_strategy"] def test_fallback_includes_music_model_score_field_at_zero(): """Even on the fallback path the `music_model_score` field is present (with value 0.0). Flutter clients that always render the third bar can't crash on a missing key.""" _disable_music_model() r = main._finalise_score( model_score=0.5, fingerprint_score=0.5, music_model_score=0.0 ) assert r.details.get("music_model_score") == 0.0 # --------------------------------------------------------------------------- # Edge cases # --------------------------------------------------------------------------- def test_zero_input_returns_zero_score(): """Pure silence in, near-zero score out, regardless of model state.""" _enable_music_model() r = main._finalise_score(0.0, 0.0, 0.0) assert r.confidence == 0.0 assert r.is_ai is False def test_all_one_input_returns_one_score(): _enable_music_model() r = main._finalise_score(1.0, 1.0, 1.0) assert math.isclose(r.confidence, 1.0, abs_tol=1e-6) assert r.is_ai is True def test_weights_sum_to_one_in_three_model_path(): """Sanity: confidence is a weighted *average*, not a sum. With three identical inputs the output equals the input.""" _enable_music_model() for v in [0.1, 0.5, 0.9]: r = main._finalise_score(v, v, v) assert math.isclose(r.confidence, v, abs_tol=1e-6) def test_weights_sum_to_one_in_fallback_path(): """Same sanity check on the 2-model fallback.""" _disable_music_model() for v in [0.1, 0.5, 0.9]: r = main._finalise_score(v, v, music_model_score=0.0) assert math.isclose(r.confidence, v, abs_tol=1e-6)