""" Golden reference test for NVIDIA-Nemotron-Parse-v1.2 through vLLM. This mirrors the generation layer in test_golden.py, but exercises the vLLM encoder/decoder interface. vLLM returns completion text, while Transformers stores the full decoded decoder sequence in golden_outputs.json, so the comparison normalizes vLLM output back to the full decoded form. """ from __future__ import annotations import json import pytest pytest.importorskip("vllm", reason="vLLM is not installed") from test_golden import ( GOLDEN_FILE, MAX_NEW_TOKENS_GOLDEN, MODEL_PATH, TASK_PROMPT, make_test_image, ) def _load_golden_generation() -> str: if not GOLDEN_FILE.exists(): pytest.skip("golden_outputs.json not found - run: python test_golden.py --capture") with open(GOLDEN_FILE) as f: return json.load(f)["generation"]["decoded_text"] def _as_full_decoded_text(vllm_text: str, expected_full_text: str) -> str: if vllm_text.startswith(TASK_PROMPT): full_text = vllm_text elif expected_full_text.startswith(TASK_PROMPT): full_text = TASK_PROMPT + vllm_text else: full_text = vllm_text # vLLM stops on EOS but does not include that stop token in completion text. if expected_full_text.endswith("") and not full_text.endswith(""): with_eos = full_text + "" if with_eos == expected_full_text: return with_eos return full_text def test_vllm_generation_matches_golden(): from vllm import LLM, SamplingParams expected = _load_golden_generation() image = make_test_image() sampling_params = SamplingParams( temperature=0.0, top_k=1, repetition_penalty=1.1, max_tokens=MAX_NEW_TOKENS_GOLDEN, skip_special_tokens=False, ) llm = LLM( model=MODEL_PATH, max_num_seqs=2, limit_mm_per_prompt={"image": 1}, dtype="bfloat16", trust_remote_code=True, attention_config={"backend": "TRITON_ATTN"}, ) request_names = ["implicit", "explicit_encoder_decoder"] requests = [ { "prompt": TASK_PROMPT, "multi_modal_data": {"image": image.copy()}, }, { "encoder_prompt": { "prompt": "", "multi_modal_data": {"image": image.copy()}, }, "decoder_prompt": TASK_PROMPT, }, ] outputs = llm.generate(requests, sampling_params) assert len(outputs) == len(request_names) mismatches = [] for name, output in zip(request_names, outputs): text = output.outputs[0].text full_text = _as_full_decoded_text(text, expected) if full_text != expected: mismatches.append( f"vLLM {name} generation differs from golden.\n" f" raw vLLM text: {text!r}\n" f" normalized: {full_text!r}\n" f" expected: {expected!r}" ) assert not mismatches, "\n\n".join(mismatches)