using UnityEngine;
using UnityEngine.UI;
using UnityEngine.Video;
using System.Collections;
using System.IO;

/// <summary>
/// Drives a vision-language model from a Unity scene: routes a video (or a
/// loaded image file) into a <see cref="RawImage"/>, repeatedly asks the model
/// to describe whatever texture is currently displayed, and writes the model's
/// text to a UI label.
/// </summary>
public class VLMController : MonoBehaviour
{
    /// <summary>Fallback render-texture edge length used when the clip does not report a size.</summary>
    private const int DefaultRenderTextureSize = 1024;

    [Header("Model")]
    [SerializeField] private ModelVLM vlmModel;

    [Header("UI Components")]
    [SerializeField] private RawImage imageDisplay;
    [SerializeField] private InputField promptInput;
    [SerializeField] private Text outputText;
    [SerializeField] private VideoPlayer videoPlayer;

    [Header("Inference Loop")]
    [SerializeField] private float inferenceIntervalSeconds = 0.25f;
    [SerializeField] private string defaultPrompt = "Describe the image briefly.";

    // Render target owned by this component; the video player draws into it.
    private RenderTexture videoRenderTexture;

    // Texture created by LoadImageFromFile; owned (and destroyed) by this component.
    private Texture2D loadedImageTexture;

    private Coroutine continuousInferenceCoroutine;

    // Set in OnDestroy so the loop and any pending async continuation can bail out.
    private bool isDestroying;

    // Earliest unscaled time at which the next generation may start.
    private float nextInferenceTime;

    /// <summary>
    /// Subscribes to the model's events, wires the video into the UI, initializes
    /// the model and, on success, starts the continuous inference loop.
    /// </summary>
    private async void Start()
    {
        if (vlmModel == null)
        {
            Debug.LogError("VLM Model reference not assigned!");
            return;
        }

        vlmModel.OnTokenGenerated += OnTokenGenerated;
        vlmModel.OnGenerationComplete += OnGenerationComplete;
        vlmModel.OnGenerationError += OnGenerationError;

        SetupVideoToImageUI();
        SetOutputText("Initializing models...");

        try
        {
            await vlmModel.Initialize();
        }
        catch (System.Exception ex)
        {
            // An exception escaping an async void method cannot be observed by
            // any caller, so report it here instead of leaving the UI stuck on
            // the "Initializing" message.
            Debug.LogException(ex, this);
            if (!isDestroying)
            {
                SetOutputText("Failed to initialize models.");
            }
            return;
        }

        // The component (or the model) may have been destroyed while awaiting;
        // touching UI or starting a coroutine after that point is invalid.
        if (isDestroying || vlmModel == null)
        {
            return;
        }

        if (!vlmModel.IsInitialized)
        {
            SetOutputText("Failed to initialize models.");
            return;
        }

        if (promptInput != null && string.IsNullOrWhiteSpace(promptInput.text))
        {
            promptInput.text = defaultPrompt;
        }

        SetOutputText("Starting continuous scene description...");
        StartContinuousInference();
    }

    /// <summary>
    /// Starts a single generation using the prompt field's text (or the default
    /// prompt when the field is missing or blank) and the currently displayed
    /// texture. Does nothing if the model is missing, uninitialized or already
    /// generating.
    /// </summary>
    private void StartGeneration()
    {
        if (vlmModel == null || !vlmModel.IsInitialized)
        {
            return;
        }

        if (vlmModel.IsGenerating)
        {
            return;
        }

        string prompt = !string.IsNullOrWhiteSpace(promptInput?.text)
            ? promptInput.text
            : defaultPrompt;

        Texture image = GetCurrentImage();
        if (image == null)
        {
            SetOutputText("No frame available for inference.");
            return;
        }

        // Fire and forget: results are delivered through the model's events.
        _ = vlmModel.GenerateFromPrompt(prompt, image);
    }

    /// <summary>Shows the text supplied with the model's token event.</summary>
    private void OnTokenGenerated(string decodedText)
    {
        SetOutputText(decodedText);
    }

    /// <summary>Shows the final text when a generation completes.</summary>
    private void OnGenerationComplete(string finalText, int tokenCount, long elapsedMs)
    {
        SetOutputText(finalText);
    }

    /// <summary>Shows the error message reported by the model.</summary>
    private void OnGenerationError(string errorMessage)
    {
        SetOutputText($"Error: {errorMessage}");
    }

    /// <summary>
    /// Loads an image file from disk into a new texture and displays it,
    /// releasing any previously loaded image texture. Failures (missing file,
    /// unreadable file, undecodable data) are logged and leave the current
    /// display unchanged.
    /// </summary>
    /// <param name="filePath">Path of the image file to load.</param>
    public void LoadImageFromFile(string filePath)
    {
        if (!File.Exists(filePath))
        {
            Debug.LogError($"Image file not found: {filePath}");
            return;
        }

        byte[] imageData;
        try
        {
            imageData = File.ReadAllBytes(filePath);
        }
        catch (System.Exception ex) when (ex is IOException || ex is System.UnauthorizedAccessException)
        {
            // The file can vanish or be locked between the Exists check and the
            // read; treat that like every other load failure in this method.
            Debug.LogError($"Failed to load image from: {filePath} ({ex.Message})");
            return;
        }

        // 2x2 is only a placeholder size for the texture handed to LoadImage.
        Texture2D loadedTexture = new Texture2D(2, 2);
        if (!loadedTexture.LoadImage(imageData))
        {
            Debug.LogError($"Failed to load image from: {filePath}");
            Destroy(loadedTexture);
            return;
        }

        ReleaseLoadedImageTexture();
        loadedImageTexture = loadedTexture;

        if (imageDisplay != null)
        {
            imageDisplay.texture = loadedTexture;
        }
    }

    /// <summary>
    /// Returns the texture currently shown by the image display, or null when
    /// there is no display or it has no texture.
    /// </summary>
    private Texture GetCurrentImage()
    {
        if (imageDisplay == null || imageDisplay.texture == null)
        {
            return null;
        }

        return imageDisplay.texture;
    }

    /// <summary>
    /// (Re)starts the continuous inference coroutine, allowing the first
    /// generation to begin immediately.
    /// </summary>
    private void StartContinuousInference()
    {
        if (continuousInferenceCoroutine != null)
        {
            StopCoroutine(continuousInferenceCoroutine);
        }

        nextInferenceTime = Time.unscaledTime;
        continuousInferenceCoroutine = StartCoroutine(ContinuousInferenceLoop());
    }

    /// <summary>
    /// Coroutine that, until the component is being destroyed, starts a
    /// generation whenever the model is idle and the interval has elapsed, then
    /// waits for that generation to finish before scheduling the next one.
    /// </summary>
    private IEnumerator ContinuousInferenceLoop()
    {
        while (!isDestroying)
        {
            bool canGenerate = vlmModel != null
                && vlmModel.IsInitialized
                && !vlmModel.IsGenerating;

            if (canGenerate && Time.unscaledTime >= nextInferenceTime)
            {
                StartGeneration();

                // Also wake up on destruction or model loss so the loop cannot hang.
                yield return new WaitUntil(
                    () => isDestroying || vlmModel == null || !vlmModel.IsGenerating);

                // The interval is measured from the end of the previous generation.
                nextInferenceTime = Time.unscaledTime + Mathf.Max(0f, inferenceIntervalSeconds);
            }
            else
            {
                yield return null;
            }
        }
    }

    /// <summary>
    /// Configures the video player to render into a texture shown by the image
    /// display and begins preparing the video.
    /// </summary>
    /// <returns>
    /// False when there is no image display, no video player could be found, or
    /// the player has no clip/URL configured; true otherwise.
    /// </returns>
    private bool SetupVideoToImageUI()
    {
        if (imageDisplay == null)
        {
            return false;
        }

        if (videoPlayer == null)
        {
            // Fall back to any VideoPlayer in the scene when none was assigned.
            videoPlayer = FindFirstObjectByType<VideoPlayer>();
        }

        if (videoPlayer == null || !HasVideoSourceConfigured(videoPlayer))
        {
            return false;
        }

        videoPlayer.playbackSpeed = 1f;
        videoPlayer.skipOnDrop = false;
        videoPlayer.renderMode = VideoRenderMode.RenderTexture;

        // Unsubscribe first so repeated setup never registers a handler twice.
        videoPlayer.prepareCompleted -= OnVideoPrepared;
        videoPlayer.errorReceived -= OnVideoError;
        videoPlayer.prepareCompleted += OnVideoPrepared;
        videoPlayer.errorReceived += OnVideoError;

        AssignVideoRenderTexture();
        imageDisplay.texture = videoRenderTexture;
        videoPlayer.Prepare();
        return true;
    }

    /// <summary>
    /// Returns true when the player has a clip assigned (clip source) or a
    /// non-blank URL (URL source); false for any other configuration.
    /// </summary>
    private static bool HasVideoSourceConfigured(VideoPlayer player)
    {
        if (player.source == VideoSource.VideoClip)
        {
            return player.clip != null;
        }

        if (player.source == VideoSource.Url)
        {
            return !string.IsNullOrWhiteSpace(player.url);
        }

        return false;
    }

    /// <summary>
    /// Ensures a render texture sized to the clip (or the default size when the
    /// clip is absent or reports zero dimensions) and makes it the player's target.
    /// </summary>
    private void AssignVideoRenderTexture()
    {
        int width = DefaultRenderTextureSize;
        int height = DefaultRenderTextureSize;

        if (videoPlayer.clip != null)
        {
            if (videoPlayer.clip.width > 0)
            {
                width = (int)videoPlayer.clip.width;
            }

            if (videoPlayer.clip.height > 0)
            {
                height = (int)videoPlayer.clip.height;
            }
        }

        EnsureVideoRenderTexture(width, height);
        videoPlayer.targetTexture = videoRenderTexture;
    }

    /// <summary>
    /// Prepare-completed handler: resizes the render texture to the player's
    /// texture size when available and shows the player's target texture.
    /// </summary>
    private void OnVideoPrepared(VideoPlayer player)
    {
        if (player == null)
        {
            return;
        }

        if (player.texture != null)
        {
            EnsureVideoRenderTexture(player.texture.width, player.texture.height);
            player.targetTexture = videoRenderTexture;
        }

        if (imageDisplay != null)
        {
            imageDisplay.texture = player.targetTexture;
        }
    }

    /// <summary>
    /// Makes sure <c>videoRenderTexture</c> exists with the requested size,
    /// releasing and recreating it when the size differs.
    /// </summary>
    private void EnsureVideoRenderTexture(int width, int height)
    {
        bool alreadyMatches = videoRenderTexture != null
            && videoRenderTexture.width == width
            && videoRenderTexture.height == height;
        if (alreadyMatches)
        {
            return;
        }

        if (videoRenderTexture != null)
        {
            videoRenderTexture.Release();
            Destroy(videoRenderTexture);
        }

        videoRenderTexture = new RenderTexture(width, height, 0, RenderTextureFormat.ARGB32)
        {
            name = "VLM_VideoRenderTexture"
        };
        videoRenderTexture.Create();
    }

    /// <summary>Logs errors reported by the video player.</summary>
    private void OnVideoError(VideoPlayer player, string message)
    {
        Debug.LogError($"VideoPlayer error: {message}");
    }

    /// <summary>Writes text to the output label when one is assigned.</summary>
    private void SetOutputText(string text)
    {
        if (outputText != null)
        {
            outputText.text = text;
        }
    }

    /// <summary>
    /// Destroys the texture created by <see cref="LoadImageFromFile"/>, clearing
    /// it from the image display first if it is the one being shown.
    /// </summary>
    private void ReleaseLoadedImageTexture()
    {
        if (loadedImageTexture == null)
        {
            return;
        }

        if (imageDisplay != null && imageDisplay.texture == loadedImageTexture)
        {
            imageDisplay.texture = null;
        }

        Destroy(loadedImageTexture);
        loadedImageTexture = null;
    }

    /// <summary>
    /// Stops the inference loop, unsubscribes from model and video events, and
    /// releases the textures this component created.
    /// </summary>
    private void OnDestroy()
    {
        isDestroying = true;

        if (continuousInferenceCoroutine != null)
        {
            StopCoroutine(continuousInferenceCoroutine);
            continuousInferenceCoroutine = null;
        }

        if (vlmModel != null)
        {
            vlmModel.OnTokenGenerated -= OnTokenGenerated;
            vlmModel.OnGenerationComplete -= OnGenerationComplete;
            vlmModel.OnGenerationError -= OnGenerationError;
        }

        if (videoPlayer != null)
        {
            videoPlayer.prepareCompleted -= OnVideoPrepared;
            videoPlayer.errorReceived -= OnVideoError;

            // Detach only our own texture; leave a target assigned by someone else alone.
            if (videoPlayer.targetTexture == videoRenderTexture)
            {
                videoPlayer.targetTexture = null;
            }
        }

        if (videoRenderTexture != null)
        {
            videoRenderTexture.Release();
            Destroy(videoRenderTexture);
            videoRenderTexture = null;
        }

        ReleaseLoadedImageTexture();
    }
}