"""
Golden reference test for NVIDIA-Nemotron-Parse-v1.2 through vLLM.

This mirrors the generation layer in test_golden.py, but exercises the vLLM
encoder/decoder interface. vLLM returns completion text, while Transformers
stores the full decoded decoder sequence in golden_outputs.json, so the
comparison normalizes vLLM output back to the full decoded form.
"""

from __future__ import annotations

import json

import pytest

pytest.importorskip("vllm", reason="vLLM is not installed")

from test_golden import (
    GOLDEN_FILE,
    MAX_NEW_TOKENS_GOLDEN,
    MODEL_PATH,
    TASK_PROMPT,
    make_test_image,
)


def _load_golden_generation() -> str:
    """Return the golden decoded generation text read from ``GOLDEN_FILE``.

    The calling test is skipped via ``pytest.skip`` when the golden file does
    not exist, since there is nothing to compare against until it has been
    captured.

    Returns:
        The string stored under ``["generation"]["decoded_text"]`` in the
        golden JSON document.

    Raises:
        KeyError: If the JSON document lacks the ``generation`` or
            ``decoded_text`` key.
    """
    if not GOLDEN_FILE.exists():
        pytest.skip("golden_outputs.json not found - run: python test_golden.py --capture")
    # JSON is UTF-8 by specification; decode explicitly so the result does not
    # depend on the platform's locale encoding (e.g. cp1252 on Windows).
    with open(GOLDEN_FILE, encoding="utf-8") as f:
        return json.load(f)["generation"]["decoded_text"]


def _as_full_decoded_text(vllm_text: str, expected_full_text: str) -> str:
    """Normalize vLLM completion text to the full-decoded form of the golden.

    Args:
        vllm_text: Completion text returned by vLLM.
        expected_full_text: Golden decoded text used to decide which pieces
            (task prompt prefix, trailing ``</s>``) may need to be restored.

    Returns:
        ``vllm_text`` with ``TASK_PROMPT`` prepended when the golden text
        starts with it and the vLLM text does not, and with ``</s>`` appended
        only when doing so reproduces the golden text exactly.
    """
    eos = "</s>"

    # Only prepend the prompt when vLLM omitted it and the golden carries it.
    prompt_missing = not vllm_text.startswith(TASK_PROMPT)
    if prompt_missing and expected_full_text.startswith(TASK_PROMPT):
        candidate = TASK_PROMPT + vllm_text
    else:
        candidate = vllm_text

    # Nothing to restore when the golden has no EOS or the text already has one.
    if not expected_full_text.endswith(eos) or candidate.endswith(eos):
        return candidate

    # vLLM stops on EOS but does not include that stop token in completion
    # text; restore it only if that yields an exact match with the golden.
    closed = candidate + eos
    return closed if closed == expected_full_text else candidate


def test_vllm_generation_matches_golden():
    """Check that vLLM output matches the golden text for both request forms.

    Two requests are submitted in one ``generate`` call: one using a plain
    prompt with attached image data, and one using the explicit
    encoder/decoder prompt layout. Each completion is normalized with
    ``_as_full_decoded_text`` and compared to the golden string; every
    mismatch is collected so a single failure reports all differing requests.
    """
    from vllm import LLM, SamplingParams

    # Load the golden first so the test can skip before the engine is built.
    expected = _load_golden_generation()
    image = make_test_image()

    # Special tokens are kept in the text so it can be compared with the
    # golden decoded sequence.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_k=1,
        repetition_penalty=1.1,
        max_tokens=MAX_NEW_TOKENS_GOLDEN,
        skip_special_tokens=False,
    )
    llm = LLM(
        model=MODEL_PATH,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="bfloat16",
        trust_remote_code=True,
        attention_config={"backend": "TRITON_ATTN"},
    )

    # Label -> request; insertion order defines the submission order.
    # Each request gets its own copy of the image.
    labelled_requests = {
        "implicit": {
            "prompt": TASK_PROMPT,
            "multi_modal_data": {"image": image.copy()},
        },
        "explicit_encoder_decoder": {
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {"image": image.copy()},
            },
            "decoder_prompt": TASK_PROMPT,
        },
    }

    outputs = llm.generate(list(labelled_requests.values()), sampling_params)
    assert len(outputs) == len(labelled_requests)

    failures = []
    for label, result in zip(labelled_requests, outputs):
        raw_text = result.outputs[0].text
        normalized = _as_full_decoded_text(raw_text, expected)
        if normalized == expected:
            continue
        report = (
            f"vLLM {label} generation differs from golden.\n"
            f"  raw vLLM text: {raw_text!r}\n"
            f"  normalized:   {normalized!r}\n"
            f"  expected:     {expected!r}"
        )
        failures.append(report)

    assert not failures, "\n\n".join(failures)