"""Unit tests for the 3-model ensemble math in `_finalise_score`.

These pin the product behaviour the user explicitly asked for in the
build-15 spec: when 2 of 3 models say AI but the third disagrees, the
combined score reflects that disagreement and the verdict is
issued with moderate (not maximal) confidence.

Also locks in the graceful 2-model fallback when the music model
fails to load so the product never silently regresses if the upstream
HF model goes 404.
"""

from __future__ import annotations

import math
import os
import sys
from pathlib import Path

import pytest

# Make `import main` work when pytest is invoked from the repo root.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

import main  # noqa: E402


@pytest.fixture(autouse=True)
def _reset_music_model_state():
    """Snapshot `main.music_classifier_weights` and restore it afterwards.

    Tests in this module flip that attribute between None and a
    placeholder object to pick the ensemble path they want. Restoring
    the pre-test value on teardown keeps one test's choice from
    leaking into the next.
    """
    saved_weights = main.music_classifier_weights
    yield
    # Teardown: put back whatever was installed before the test ran.
    main.music_classifier_weights = saved_weights


# ---------------------------------------------------------------------------
# 3-model ensemble (all weights present)
# ---------------------------------------------------------------------------

class _SentinelArray:
    """Stand-in for a numpy ndarray — `_finalise_score` only checks
    `is not None`, never inspects shape or values, so we can avoid a
    numpy import in the test."""


def _enable_music_model():
    """Simulate a loaded music model by installing a non-None placeholder."""
    placeholder = _SentinelArray()
    main.music_classifier_weights = placeholder


def _disable_music_model():
    """Simulate a failed music-model load by clearing the weights to None."""
    main.music_classifier_weights = None


def test_all_three_models_strongly_agree_on_ai():
    """Unanimous 0.95 from every model → AI verdict at 0.95 confidence."""
    _enable_music_model()
    unanimous = 0.95
    result = main._finalise_score(
        model_score=unanimous,
        fingerprint_score=unanimous,
        music_model_score=unanimous,
    )
    assert result.is_ai is True
    assert math.isclose(result.confidence, unanimous, abs_tol=1e-6)
    # The strategy label must advertise that the 3-model path was taken.
    assert "3-model" in result.details["ensemble_strategy"]


def test_all_three_models_strongly_agree_on_real():
    """Unanimous 0.05 from every model → not-AI verdict at 0.05 confidence."""
    _enable_music_model()
    unanimous = 0.05
    result = main._finalise_score(
        model_score=unanimous,
        fingerprint_score=unanimous,
        music_model_score=unanimous,
    )
    assert result.is_ai is False
    assert math.isclose(result.confidence, unanimous, abs_tol=1e-6)


def test_two_models_say_ai_one_dissents__verdict_ai_but_lower_confidence():
    """Spec'd behaviour: with 2 of 3 models saying AI, the lone
    dissenter drags the combined score toward the middle while the
    verdict stays AI."""
    _enable_music_model()
    says_ai, says_human = 0.95, 0.05
    # wav2vec2 and the music model vote AI; the fingerprint votes human.
    # Weights under test: wav2vec2 0.30, music model 0.50, fingerprint 0.20.
    expected = says_ai * 0.30 + says_ai * 0.50 + says_human * 0.20
    result = main._finalise_score(
        model_score=says_ai,
        fingerprint_score=says_human,
        music_model_score=says_ai,
    )
    assert math.isclose(result.confidence, expected, abs_tol=1e-6)
    assert result.is_ai is True
    # Key property: the dissent keeps the score out of the
    # "near-certain" band ...
    assert result.confidence < 0.85, (
        "Two-of-three agreement should NOT yield >0.85 confidence — "
        f"got {result.confidence}"
    )
    # ... while the majority still carries it over the AI threshold.
    assert result.confidence > 0.5, "Majority vote should still tip into AI verdict"


def test_two_models_say_real_one_dissents__verdict_real():
    """Symmetric inverse: 2 say real, 1 says AI → verdict real.

    Two scenarios are covered:

    * Fingerprint is the lone AI vote. This is the exact mirror of the
      AI-majority test (same weights, votes flipped), so the score
      sits clearly below 0.5 and the real verdict is asserted.
    * Music model is the lone AI vote. Its 0.50 weight puts the score
      at the 0.5 boundary, where float rounding could flip `is_ai`,
      so only the confidence band is asserted there.
    """
    _enable_music_model()

    # Mirror case: wav2vec2 = human, music_model = human, fingerprint = AI.
    # 0.05*0.30 + 0.05*0.50 + 0.95*0.20 = 0.015 + 0.025 + 0.19 = 0.23
    expected = 0.05 * 0.30 + 0.05 * 0.50 + 0.95 * 0.20
    r = main._finalise_score(
        model_score=0.05, fingerprint_score=0.95, music_model_score=0.05
    )
    assert math.isclose(r.confidence, expected, abs_tol=1e-6)
    assert r.is_ai is False, "Majority vote should keep the verdict at real"

    # Boundary case: the heaviest-weighted model is the dissenter.
    # 0.05*0.30 + 0.95*0.50 + 0.05*0.20 = 0.015 + 0.475 + 0.01 = 0.5
    # The dissent pulls the score to the middle; `is_ai` is left
    # unasserted because the value sits right on the `> 0.5` threshold.
    r = main._finalise_score(
        model_score=0.05, fingerprint_score=0.05, music_model_score=0.95
    )
    assert 0.45 < r.confidence < 0.55


def test_music_model_dominant_when_strongest_signal():
    """The music model carries the largest single weight, matching the
    intent that the music-native signal counts most on a
    music-positioned product."""
    _enable_music_model()
    # The music model alone votes AI; wav2vec2 and fingerprint vote real.
    result = main._finalise_score(
        model_score=0.0,
        fingerprint_score=0.0,
        music_model_score=1.0,
    )
    # 0.0*0.30 + 1.0*0.50 + 0.0*0.20 = 0.50, i.e. the music weight itself.
    music_weight = 0.50
    assert math.isclose(result.confidence, music_weight, abs_tol=1e-6)


def test_details_payload_includes_all_three_scores():
    """The Flutter ConfidenceBreakdown widget reads all three
    sub-scores from `details`, so renaming or removing any key would
    break existing clients."""
    _enable_music_model()
    result = main._finalise_score(
        model_score=0.6, fingerprint_score=0.4, music_model_score=0.7
    )
    expected_sub_scores = {
        "wav2vec2_score": 0.6,
        "music_model_score": 0.7,
        "fingerprint_score": 0.4,
    }
    # Presence first, so a missing key fails as a clear membership error.
    for key in expected_sub_scores:
        assert key in result.details
    # Then each value must echo the corresponding input score.
    for key, value in expected_sub_scores.items():
        assert result.details[key] == pytest.approx(value)


# ---------------------------------------------------------------------------
# 2-model fallback (music model offline)
# ---------------------------------------------------------------------------

def test_fallback_when_music_model_offline_uses_legacy_70_30_weights():
    """Before build 15 the ensemble was wav2vec2 at 0.7 plus
    fingerprint at 0.3. If the music model fails to load (HF blip,
    404, etc), `_finalise_score` MUST fall back to exactly that split
    so users on the failed-load path see no behaviour change."""
    _disable_music_model()
    wav2vec2, fingerprint = 0.8, 0.4
    expected = wav2vec2 * 0.7 + fingerprint * 0.3
    result = main._finalise_score(
        model_score=wav2vec2,
        fingerprint_score=fingerprint,
        music_model_score=0.0,
    )
    assert math.isclose(result.confidence, expected, abs_tol=1e-6)
    assert result.is_ai is True
    # The strategy label must flag that the fallback path was used.
    assert "music model offline" in result.details["ensemble_strategy"]


def test_fallback_includes_music_model_score_field_at_zero():
    """The fallback path still reports `music_model_score` (as 0.0),
    so Flutter clients that always draw the third bar never hit a
    missing key."""
    _disable_music_model()
    result = main._finalise_score(
        model_score=0.5,
        fingerprint_score=0.5,
        music_model_score=0.0,
    )
    reported = result.details.get("music_model_score")
    assert reported == 0.0


# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------

def test_zero_input_returns_zero_score():
    """Pure silence in, zero score out, regardless of model state.

    Runs once with the music model enabled (3-model path) and once
    with it disabled (2-model fallback), so the "regardless of model
    state" claim is checked on both paths.
    """
    for set_model_state in (_enable_music_model, _disable_music_model):
        set_model_state()
        r = main._finalise_score(0.0, 0.0, 0.0)
        assert r.confidence == 0.0, (
            f"Expected 0.0 after {set_model_state.__name__}(), "
            f"got {r.confidence}"
        )
        assert r.is_ai is False


def test_all_one_input_returns_one_score():
    """All three inputs at 1.0 on the 3-model path → confidence ≈ 1.0, AI verdict."""
    _enable_music_model()
    result = main._finalise_score(1.0, 1.0, 1.0)
    assert math.isclose(result.confidence, 1.0, abs_tol=1e-6)
    assert result.is_ai is True


def test_weights_sum_to_one_in_three_model_path():
    """Sanity: confidence is a weighted *average*, not a sum, so
    feeding the same value to all three inputs must return that value."""
    _enable_music_model()
    for score in (0.1, 0.5, 0.9):
        result = main._finalise_score(score, score, score)
        assert math.isclose(result.confidence, score, abs_tol=1e-6)


def test_weights_sum_to_one_in_fallback_path():
    """Same weighted-average sanity check on the 2-model fallback:
    identical wav2vec2 and fingerprint inputs must come back unchanged."""
    _disable_music_model()
    for score in (0.1, 0.5, 0.9):
        result = main._finalise_score(score, score, music_model_score=0.0)
        assert math.isclose(result.confidence, score, abs_tol=1e-6)