{ "version": "1.0", "truncation": { "direction": "Right", "max_length": 256, "strategy": "LongestFirst", "stride": 0 }, "padding": { "strategy": { "Fixed": 256 }, "direction": "Right", "pad_to_multiple_of": null, "pad_id": 1, "pad_type_id": 0, "pad_token": "<pad>" }, "added_tokens": [ { "id": 0, "content": "[CLS]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 1, "content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 2, "content": "[SEP]", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 3, "content": "<unk>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 5, "content": "ar_AR", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 6, "content": "cs_CZ", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 7, "content": "de_DE", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 8, "content": "en_XX", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 9, "content": "es_XX", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 10, "content": "et_EE", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 11, "content": "fi_FI", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 12, "content": "fr_XX", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 13, "content": "gu_IN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 14, "content": "hi_IN", "single_word": false, "lstrip": false, 
"rstrip": false, "normalized": false, "special": true }, { "id": 15, "content": "it_IT", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 16, "content": "ja_XX", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 17, "content": "kk_KZ", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 18, "content": "ko_KR", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 19, "content": "lt_LT", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 20, "content": "lv_LV", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 21, "content": "my_MM", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 22, "content": "ne_NP", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 23, "content": "nl_XX", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 24, "content": "ro_RO", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 25, "content": "ru_RU", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 26, "content": "si_LK", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 27, "content": "tr_TR", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 28, "content": "vi_VN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 29, "content": "zh_CN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 30, "content": 
"af_ZA", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 31, "content": "az_AZ", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 32, "content": "bn_IN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 33, "content": "fa_IR", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 34, "content": "he_IL", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 35, "content": "hr_HR", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 36, "content": "id_ID", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 37, "content": "ka_GE", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 38, "content": "km_KH", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 39, "content": "mk_MK", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 40, "content": "ml_IN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 41, "content": "mn_MN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 42, "content": "mr_IN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 43, "content": "pl_PL", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 44, "content": "ps_AF", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 45, "content": "pt_XX", "single_word": false, "lstrip": false, "rstrip": false, "normalized": 
false, "special": true }, { "id": 46, "content": "sv_SE", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 47, "content": "sw_KE", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 48, "content": "ta_IN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 49, "content": "te_IN", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 50, "content": "th_TH", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 51, "content": "tl_XX", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 52, "content": "uk_UA", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 53, "content": "ur_PK", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 54, "content": "xh_ZA", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 55, "content": "gl_ES", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 56, "content": "sl_SI", "single_word": false, "lstrip": false, "rstrip": false, "normalized": false, "special": true }, { "id": 57, "content": "[MASK]", "single_word": false, "lstrip": true, "rstrip": false, "normalized": true, "special": true }, { "id": 58, "content": "<s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 59, "content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 60, "content": "<2as>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 61, "content": "<2bn>", "single_word": false, "lstrip": 
false, "rstrip": false, "normalized": true, "special": false }, { "id": 62, "content": "<2en>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 63, "content": "<2gu>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 64, "content": "<2hi>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 65, "content": "<2kn>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 66, "content": "<2ml>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 67, "content": "<2mr>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 68, "content": "<2or>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 69, "content": "<2pa>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 70, "content": "<2ta>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false }, { "id": 71, "content": "<2te>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "special": false } ], "normalizer": { "type": "Sequence", "normalizers": [ { "type": "Replace", "pattern": { "Regex": "[\\n\\r\\t]" }, "content": " " }, { "type": "NFKC" }, { "type": "Strip", "strip_left": false, "strip_right": true }, { "type": "Replace", "pattern": { "Regex": " {2,}" }, "content": "▁" } ] }, "pre_tokenizer": { "type": "Metaspace", "replacement": "▁", "prepend_scheme": "always", "split": true }, "post_processor": { "type": "TemplateProcessing", "single": [ { "SpecialToken": { "id": "gu_IN", "type_id": 0 } }, { "Sequence": { "id": "A", "type_id": 0 } }, { "SpecialToken": { "id": "[SEP]", "type_id": 0 } } ], "pair": [ { "SpecialToken": { "id": 
"gu_IN", "type_id": 0 } }, { "Sequence": { "id": "A", "type_id": 0 } }, { "Sequence": { "id": "B", "type_id": 0 } }, { "SpecialToken": { "id": "[SEP]", "type_id": 0 } } ], "special_tokens": { "[SEP]": { "id": "[SEP]", "ids": [ 2 ], "tokens": [ "[SEP]" ] }, "gu_IN": { "id": "gu_IN", "ids": [ 13 ], "tokens": [ "gu_IN" ] } } }, "decoder": { "type": "Metaspace", "replacement": "▁", "prepend_scheme": "always", "split": true }, "model": { "type": "Unigram", "unk_id": 3, "vocab": [ [ "[CLS]", 0.0 ], [ "<pad>", 0.0 ], [ "[SEP]", 0.0 ], [ "<unk>", 0.0 ], [ "▁", -2.0 ], [ "ar_AR", 0.0 ], [ "cs_CZ", 0.0 ], [ "de_DE", 0.0 ], [ "en_XX", 0.0 ], [ "es_XX", 0.0 ], [ "et_EE", 0.0 ], [ "fi_FI", 0.0 ], [ "fr_XX", 0.0 ], [ "gu_IN", 0.0 ], [ "hi_IN", 0.0 ], [ "it_IT", 0.0 ], [ "ja_XX", 0.0 ], [ "kk_KZ", 0.0 ], [ "ko_KR", 0.0 ], [ "lt_LT", 0.0 ], [ "lv_LV", 0.0 ], [ "my_MM", 0.0 ], [ "ne_NP", 0.0 ], [ "nl_XX", 0.0 ], [ "ro_RO", 0.0 ], [ "ru_RU", 0.0 ], [ "si_LK", 0.0 ], [ "tr_TR", 0.0 ], [ "vi_VN", 0.0 ], [ "zh_CN", 0.0 ], [ "af_ZA", 0.0 ], [ "az_AZ", 0.0 ], [ "bn_IN", 0.0 ], [ "fa_IR", 0.0 ], [ "he_IL", 0.0 ], [ "hr_HR", 0.0 ], [ "id_ID", 0.0 ], [ "ka_GE", 0.0 ], [ "km_KH", 0.0 ], [ "mk_MK", 0.0 ], [ "ml_IN", 0.0 ], [ "mn_MN", 0.0 ], [ "mr_IN", 0.0 ], [ "pl_PL", 0.0 ], [ "ps_AF", 0.0 ], [ "pt_XX", 0.0 ], [ "sv_SE", 0.0 ], [ "sw_KE", 0.0 ], [ "ta_IN", 0.0 ], [ "te_IN", 0.0 ], [ "th_TH", 0.0 ], [ "tl_XX", 0.0 ], [ "uk_UA", 0.0 ], [ "ur_PK", 0.0 ], [ "xh_ZA", 0.0 ], [ "gl_ES", 0.0 ], [ "sl_SI", 0.0 ], [ "[MASK]", 0.0 ] ], "byte_fallback": false } }