Zero-Shot Image Classification
Transformers
ONNX
Chinese
English
m2_encoder
image-feature-extraction
feature-extraction
multimodal
image-text-retrieval
bilingual
chinese
english
vision-language
custom-code
custom_code
Eval Results (legacy)
Instructions to use malusama/M2-Encoder-0.4B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use malusama/M2-Encoder-0.4B with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("zero-shot-image-classification", model="malusama/M2-Encoder-0.4B", trust_remote_code=True)
pipe(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/parrots.png",
    candidate_labels=["animals", "humans", "landscape"],
)
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("malusama/M2-Encoder-0.4B", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| from sacred import Experiment | |
| ex = Experiment("VLMo") | |
| def _loss_names(d): | |
| ret = { | |
| "itm": 0, # image-text matching loss | |
| "itc": 0, # image-text contrastive loss | |
| "caption": 0, # image captioning loss | |
| "mvlm": 0, # masked language modeling loss | |
| "textmlm": 0, # text-only masked language modeling | |
| "imagemlm": 0, # image-only masked language modeling | |
| "vqa": 0, | |
| "nlvr2": 0, | |
| "irtr": 0, # retrieval task ft | |
| } | |
| ret.update(d) | |
| return ret | |
| def config(): | |
| exp_name = "vlmo" | |
| seed = 1 | |
| datasets = ["coco", "vg", "sbu", "gcc"] # dataset name, the definition can refer to: vlmo/datamodules/__init__.py # noqa | |
| loss_names = _loss_names({"itm": 0, "itc": 0, "mvlm": 0}) # training loss | |
| batch_size = 1024 # this is a desired batch size; pl trainer will accumulate gradients. | |
| # BEiT-v3 setting | |
| encoder_layers = 12 # the layer number of backbone | |
| encoder_embed_dim = 768 # the hidden size of tokenizer | |
| out_embed_dim = 768 # the hidden size of output embedding | |
| beit_version = "base" # model size: base(0.4B)|large(1B)|huge(10B) | |
| beit3_vl_layers = 3 # the layer number of vl_backbone | |
| deepnorm_init = True # init method | |
| share_layer = False # if share the weight between layer within backbone | |
| share_attn = False # if share the attention weight of different layer | |
| one_attn = False # if share the attention weight of vision and language | |
| # Image setting | |
| train_transform_keys = ["square_transform_randaug"] # train transform: refer to vlmo/transforms/__init__.py | |
| val_transform_keys = ["square_transform"] # test transform: refer to vlmo/transforms/__init__.py | |
| image_size = 224 # image size | |
| reclip_image_size = None # reclip image size | |
| patch_size = 16 # patch size | |
| draw_false_image = 0 # if get negative image | |
| image_only = False # only input image | |
| text_only = False # only input text | |
| # Video setting, video_num_frm is not None means video input | |
| video_num_frm = None | |
| # Visual tokenizer setting based on beit2 | |
| tokenizer_model = "beit2_visual_tokenizer" | |
| codebook_size = 8192 | |
| codebook_dim = 32 | |
| visual_mask_size = 14 | |
| visual_mask_num = 80 | |
| # Text Setting | |
| lang = 'cn' # language for zero-shot imagenet testing: cn|en | |
| vqav2_label_size = 3129 | |
| max_text_len = 52 # the number of characters | |
| max_text_len_of_initckpt = 196 | |
| tokenizer_type = "BertTokenizer" # Chinese text | |
| vocab_size = 21128 | |
| tokenizer = "./vocab.txt" | |
| whole_word_masking = True | |
| mlm_prob = 0.15 # language mask ratio | |
| draw_false_text = 0 | |
| mvlm_prob = 0.50 # vision-language mlm task | |
| mask_ratio = 0 # flip: mask ratio for image | |
| # cap setting | |
| cap_onlytext = False # default caption image to text | |
| # imagemlm setting | |
| split_data_for_imagemlm = False # if True, split a batch data to two parts, and the first part for imagemlm. | |
| # itc setting | |
| itc_mask = False # itc use masked token | |
| aggregate_nodes = -1 # aggregate nodes num for compute_itc, default -1 is for all nodes | |
| # Transformer Setting | |
| model_arch = "vlmo_base_patch16" | |
| drop_path_rate = 0.1 | |
| # Downstream Setting | |
| get_recall_metric = False | |
| get_recall_rerank_metric = False | |
| get_zeroshot_metric = False | |
| get_muge_feat = False | |
| get_f30k_feat = False | |
| k_test = 32 | |
| # PL Trainer Setting | |
| resume_from = None | |
| fast_dev_run = False | |
| val_check_interval = 1.0 | |
| test_only = False | |
| use_sharded_training = False | |
| resume_during_training = False | |
| save_top_k = 10 | |
| every_n_train_steps = 2000 # the step to save checkpoint | |
| log_metric_steps = 100 # the step to log metric | |
| # below params vary with the environment | |
| use_pcache = False # data storage method: pcache or nas | |
| pcache_root = "" | |
| # main_site: pcache://multimodalproxyi-pool.cz50c.alipay.com:39999/mnt/ | |
| # public_cloud: pcache://pcache_public_cloud.pcache.local:39999/mnt/abc7c88079a60b45ddfce7afa40720b7/ | |
| gpu_env = "main_site" # public_cloud or main_site | |
| data_root = "" # data root for data list | |
| log_dir = "result" | |
| per_gpu_batchsize = 4 # you should define this manually with per_gpu_batchsize=# | |
| num_gpus = 1 | |
| num_nodes = 1 | |
| load_path = "" | |
| num_workers = 8 | |
| precision = 16 | |
| local_run = True | |
| flash_attn = False | |
| deepspeed_config = None # "ds_config.json" | |
| coalesce_backbone = False | |
| mask_data = "v+l" # 'v+l':choose input of imagemlm+textmlm task, 'vl': choose input of mvlm task. | |
| communication_benchmark = False | |
| checkpoint_activations = False | |
| # dataset setting | |
| single_cap = True # if have only one caption | |
| random_one = False # if choose one caption from caption list | |
| # ITC setting | |
| itc_feats_name = "cls_vlffn_feats" # feat for itc loss | |
| itc_distill = "" | |
| itc_distill_dim = 1024 | |
| itc_teacher_weights = "" | |
| # mup training setting | |
| mup = False | |
| base_encoder_embed_dim = 1 | |
| delta_encoder_embed_dim = 2 | |
| mup_encoder_attention_heads = 1 | |
| base_encoder_ffn_embed_dim = 1 | |
| delta_encoder_ffn_embed_dim = 2 | |
| # atorch | |
| atorch_config = None | |
| compile_op = False | |
| optimizer_state_shard_save = False | |
| model_state_shard_save = False | |
| # itc loss | |
| local_loss = False | |
| use_dual_softmax = False | |
| num_frames = 1 | |
| # ----------------------- LMM pretraining config ----------------------- | |
| # norm setting | |
| deepnorm = False | |