| """Unit tests for the 3-model ensemble math in `_finalise_score`. |
| |
| These pin the product behaviour the user explicitly asked for in the |
| build-15 spec: when 2 of 3 models say AI but the third disagrees, the |
| combined score reflects that disagreement and the verdict is |
| issued with moderate (not maximal) confidence. |
| |
| Also locks in the graceful 2-model fallback when the music model |
| fails to load so the product never silently regresses if the upstream |
| HF model goes 404. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import math |
| import os |
| import sys |
| from pathlib import Path |
|
|
| import pytest |
|
|
| |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) |
|
|
| import main |
|
|
|
|
| @pytest.fixture(autouse=True) |
| def _reset_music_model_state(): |
| """Each test owns its own view of whether the music model loaded. |
| |
| We toggle `main.music_classifier_weights` between None and a |
| sentinel array; the actual weight values don't matter to |
| `_finalise_score`, only whether they are present. |
| """ |
| original = main.music_classifier_weights |
| yield |
| main.music_classifier_weights = original |
|
|
|
|
| |
| |
| |
|
|
| class _SentinelArray: |
| """Stand-in for a numpy ndarray — `_finalise_score` only checks |
| `is not None`, never inspects shape or values, so we can avoid a |
| numpy import in the test.""" |
|
|
|
|
| def _enable_music_model(): |
| main.music_classifier_weights = _SentinelArray() |
|
|
|
|
| def _disable_music_model(): |
| main.music_classifier_weights = None |
|
|
|
|
| def test_all_three_models_strongly_agree_on_ai(): |
| """Confident AI from all three → confidence ≈ max.""" |
| _enable_music_model() |
| r = main._finalise_score( |
| model_score=0.95, fingerprint_score=0.95, music_model_score=0.95 |
| ) |
| assert r.is_ai is True |
| assert math.isclose(r.confidence, 0.95, abs_tol=1e-6) |
| assert "3-model" in r.details["ensemble_strategy"] |
|
|
|
|
| def test_all_three_models_strongly_agree_on_real(): |
| """Confident human from all three → low confidence (i.e. not-AI).""" |
| _enable_music_model() |
| r = main._finalise_score( |
| model_score=0.05, fingerprint_score=0.05, music_model_score=0.05 |
| ) |
| assert r.is_ai is False |
| assert math.isclose(r.confidence, 0.05, abs_tol=1e-6) |
|
|
|
|
| def test_two_models_say_ai_one_dissents__verdict_ai_but_lower_confidence(): |
| """The behaviour the user spec'd: 2 of 3 say AI, dissenting model |
| pulls the combined score toward the middle.""" |
| _enable_music_model() |
| |
| |
| expected = 0.95 * 0.30 + 0.95 * 0.50 + 0.05 * 0.20 |
| r = main._finalise_score( |
| model_score=0.95, fingerprint_score=0.05, music_model_score=0.95 |
| ) |
| assert math.isclose(r.confidence, expected, abs_tol=1e-6) |
| assert r.is_ai is True |
| |
| |
| assert r.confidence < 0.85, ( |
| f"Two-of-three agreement should NOT yield >0.85 confidence — " |
| f"got {r.confidence}" |
| ) |
| assert r.confidence > 0.5, "Majority vote should still tip into AI verdict" |
|
|
|
|
| def test_two_models_say_real_one_dissents__verdict_real(): |
| """Symmetric inverse: 2 say real, 1 says AI → verdict real.""" |
| _enable_music_model() |
| r = main._finalise_score( |
| model_score=0.05, fingerprint_score=0.05, music_model_score=0.95 |
| ) |
| |
| |
| |
| |
| assert r.confidence > 0.45 and r.confidence < 0.55 |
|
|
|
|
| def test_music_model_dominant_when_strongest_signal(): |
| """Highest single-model weight is the music model. Verifies the |
| intent that music-native signal carries the most weight on a |
| music-positioned product.""" |
| _enable_music_model() |
| |
| r = main._finalise_score( |
| model_score=0.0, fingerprint_score=0.0, music_model_score=1.0 |
| ) |
| |
| assert math.isclose(r.confidence, 0.50, abs_tol=1e-6) |
|
|
|
|
| def test_details_payload_includes_all_three_scores(): |
| """Flutter's ConfidenceBreakdown widget reads the three sub-scores |
| out of `details`. Renaming or dropping any of them is a |
| backwards-incompatible client change.""" |
| _enable_music_model() |
| r = main._finalise_score( |
| model_score=0.6, fingerprint_score=0.4, music_model_score=0.7 |
| ) |
| assert "wav2vec2_score" in r.details |
| assert "music_model_score" in r.details |
| assert "fingerprint_score" in r.details |
| assert r.details["wav2vec2_score"] == pytest.approx(0.6) |
| assert r.details["music_model_score"] == pytest.approx(0.7) |
| assert r.details["fingerprint_score"] == pytest.approx(0.4) |
|
|
|
|
| |
| |
| |
|
|
| def test_fallback_when_music_model_offline_uses_legacy_70_30_weights(): |
| """The pre-build-15 ensemble had wav2vec2 at 0.7 and fingerprint |
| at 0.3. When the music model fails to load (HF blip, 404, etc), |
| `_finalise_score` MUST gracefully revert to that exact split so |
| users on the failed-load path see no behaviour change at all.""" |
| _disable_music_model() |
| r = main._finalise_score( |
| model_score=0.8, fingerprint_score=0.4, music_model_score=0.0 |
| ) |
| expected = 0.8 * 0.7 + 0.4 * 0.3 |
| assert math.isclose(r.confidence, expected, abs_tol=1e-6) |
| assert r.is_ai is True |
| assert "music model offline" in r.details["ensemble_strategy"] |
|
|
|
|
| def test_fallback_includes_music_model_score_field_at_zero(): |
| """Even on the fallback path the `music_model_score` field is |
| present (with value 0.0). Flutter clients that always render the |
| third bar can't crash on a missing key.""" |
| _disable_music_model() |
| r = main._finalise_score( |
| model_score=0.5, fingerprint_score=0.5, music_model_score=0.0 |
| ) |
| assert r.details.get("music_model_score") == 0.0 |
|
|
|
|
| |
| |
| |
|
|
| def test_zero_input_returns_zero_score(): |
| """Pure silence in, near-zero score out, regardless of model state.""" |
| _enable_music_model() |
| r = main._finalise_score(0.0, 0.0, 0.0) |
| assert r.confidence == 0.0 |
| assert r.is_ai is False |
|
|
|
|
| def test_all_one_input_returns_one_score(): |
| _enable_music_model() |
| r = main._finalise_score(1.0, 1.0, 1.0) |
| assert math.isclose(r.confidence, 1.0, abs_tol=1e-6) |
| assert r.is_ai is True |
|
|
|
|
| def test_weights_sum_to_one_in_three_model_path(): |
| """Sanity: confidence is a weighted *average*, not a sum. With |
| three identical inputs the output equals the input.""" |
| _enable_music_model() |
| for v in [0.1, 0.5, 0.9]: |
| r = main._finalise_score(v, v, v) |
| assert math.isclose(r.confidence, v, abs_tol=1e-6) |
|
|
|
|
| def test_weights_sum_to_one_in_fallback_path(): |
| """Same sanity check on the 2-model fallback.""" |
| _disable_music_model() |
| for v in [0.1, 0.5, 0.9]: |
| r = main._finalise_score(v, v, music_model_score=0.0) |
| assert math.isclose(r.confidence, v, abs_tol=1e-6) |
|
|