---
# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
# Doc / guide: https://huggingface.co/docs/hub/model-cards
library_name: nanovlm
license: mit
pipeline_tag: image-text-to-text
tags:
  - vision-language
  - multimodal
  - research
---

**Usage:**

Clone the [nanoVLM repository](https://github.com/huggingface/nanoVLM), follow its install instructions, and then run the following code:

```python
from models.vision_language_model import VisionLanguageModel

model = VisionLanguageModel.from_pretrained("lusxvr/nanoVLM-460M-8k")
```

**Evaluation results:**

```json
{
  "results": {
    "docvqa_val_anls": 0.7695125277111309,
    "docvqa_val_anls_stderr": 0.00541393399604242,
    "infovqa_val_anls": 0.3571108969338406,
    "infovqa_val_anls_stderr": 0.007752114157367035,
    "mme_mme_cognition_score": 302.5,
    "mme_mme_perception_score": 1259.329131652661,
    "mmmu_val_mmmu_acc": 0.31889,
    "mmstar_coarse perception": 0.5657538907367322,
    "mmstar_average": 0.36039272693114954,
    "mmstar_fine-grained perception": 0.30654016212232865,
    "mmstar_instance reasoning": 0.4100153400624586,
    "mmstar_logical reasoning": 0.36221314439136226,
    "mmstar_math": 0.2498719148177849,
    "mmstar_science & technology": 0.26796190945623083,
    "ocrbench_ocrbench_accuracy": 0.688,
    "scienceqa_exact_match": 0.565432680971469,
    "scienceqa_exact_match_stderr": 0.007612653385710115,
    "textvqa_val_exact_match": 0.6537400000000001,
    "textvqa_val_exact_match_stderr": 0.006407293747178485,
    "chartqa_relaxed_overall": 0.7288,
    "chartqa_relaxed_overall_stderr": 0.008893360486581982,
    "chartqa_relaxed_human_split": 0.552,
    "chartqa_relaxed_human_split_stderr": 0.01407107658130413,
    "chartqa_relaxed_augmented_split": 0.9056,
    "chartqa_relaxed_augmented_split_stderr": 0.00827318974367371,
    "ai2d_exact_match": 0.4375,
    "ai2d_exact_match_stderr": 0.0625,
    "mathvista_testmini_cot_gpt_eval_score": 34.9,
    "mathvista_testmini_format_gpt_eval_score": 40.3,
    "mathvista_testmini_solution_gpt_eval_score": 35.2
  }
}
```