import os from abc import ABCMeta, abstractmethod from typing import Optional, Union, Dict, List from termcolor import colored import torch from transformers import ( LlavaConfig, ) import decord import PIL.Image from tarsier2.dataset.utils import format_one_sample from tarsier2.modeling_tarsier2 import Tarsier2ForConditionalGeneration from tarsier2.modeling_qwen2_vl_fast import Qwen2VLForCausalLM from tarsier2.dataset.tarsier_datamodule import init_processor decord.bridge.set_bridge("torch") EOL_PROMPTS = { 'text': '\nSummary above sentence in one word:', 'image': '\nSummary above image in one word:', 'video': '