# Mapping of the BCP-47 codes used in the MADLAD-400 models to language names.
# Each key is the language code wrapped as "<2{code}>"; each value is the
# language name as a plain string.
#
# Source:
# [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/pdf/2309.04662)
#
# The "# Page N" markers below presumably refer to the pages of the paper the
# entries were taken from -- confirm against the paper.
# Names are kept verbatim (including abbreviations such as "W.", "N.", "E.",
# "S." and "H."), because callers may match on the exact strings.
langid_to_language = {
    # Page 16
    "<2en>": "English",
    "<2ru>": "Russian",
    "<2es>": "Spanish",
    "<2fr>": "French",
    "<2de>": "German",
    "<2it>": "Italian",
    "<2pt>": "Portuguese",
    "<2pl>": "Polish",
    "<2nl>": "Dutch",
    "<2vi>": "Vietnamese",
    "<2tr>": "Turkish",
    "<2sv>": "Swedish",
    "<2id>": "Indonesian",
    "<2ro>": "Romanian",
    "<2cs>": "Czech",
    "<2zh>": "Mandarin Chinese",
    "<2hu>": "Hungarian",
    "<2ja>": "Japanese",
    "<2th>": "Thai",
    "<2fi>": "Finnish",
    "<2fa>": "Persian",
    "<2uk>": "Ukrainian",
    "<2da>": "Danish",
    "<2el>": "Greek",
    "<2no>": "Norwegian",
    "<2bg>": "Bulgarian",
    "<2sk>": "Slovak",
    "<2ko>": "Korean",
    "<2ar>": "Arabic",
    "<2lt>": "Lithuanian",
    "<2ca>": "Catalan",
    "<2sl>": "Slovenian",
    "<2he>": "Hebrew",
    "<2et>": "Estonian",
    "<2lv>": "Latvian",
    "<2hi>": "Hindi",
    "<2sq>": "Albanian",
    "<2ms>": "Malay",
    "<2az>": "Azerbaijani",
    "<2sr>": "Serbian",
    "<2ta>": "Tamil",
    "<2hr>": "Croatian",
    "<2kk>": "Kazakh",
    "<2is>": "Icelandic",
    "<2ml>": "Malayalam",
    "<2mr>": "Marathi",
    "<2te>": "Telugu",
    "<2af>": "Afrikaans",
    "<2gl>": "Galician",
    "<2fil>": "Filipino",
    "<2be>": "Belarusian",
    # Page 17
    "<2mk>": "Macedonian",
    "<2eu>": "Basque",
    "<2bn>": "Bengali",
    "<2ka>": "Georgian",
    "<2mn>": "Mongolian",
    "<2bs>": "Bosnian",
    "<2uz>": "Uzbek",
    "<2ur>": "Urdu",
    "<2sw>": "Swahili",
    "<2yue>": "Cantonese",
    "<2ne>": "Nepali",
    "<2kn>": "Kannada",
    "<2kaa>": "Kara-Kalpak",
    "<2gu>": "Gujarati",
    "<2si>": "Sinhala",
    "<2cy>": "Welsh",
    "<2eo>": "Esperanto",
    "<2la>": "Latin",
    "<2hy>": "Armenian",
    "<2ky>": "Kyrghyz",
    "<2tg>": "Tajik",
    "<2ga>": "Irish",
    "<2mt>": "Maltese",
    "<2my>": "Myanmar (Burmese)",
    "<2km>": "Khmer",
    "<2tt>": "Tatar",
    "<2so>": "Somali",
    "<2ku>": "Kurdish (Kurmanji)",
    "<2ps>": "Pashto",
    "<2pa>": "Punjabi",
    "<2rw>": "Kinyarwanda",
    "<2lo>": "Lao",
    "<2ha>": "Hausa",
    "<2dv>": "Dhivehi",
    "<2fy>": "W. Frisian",
    "<2lb>": "Luxembourgish",
    "<2ckb>": "Kurdish (Sorani)",
    "<2mg>": "Malagasy",
    "<2gd>": "Scottish Gaelic",
    "<2am>": "Amharic",
    "<2ug>": "Uyghur",
    "<2ht>": "Haitian Creole",
    "<2grc>": "Ancient Greek",
    "<2hmn>": "Hmong",
    "<2sd>": "Sindhi",
    "<2jv>": "Javanese",
    "<2mi>": "Maori",
    "<2tk>": "Turkmen",
    "<2ceb>": "Cebuano",
    "<2yi>": "Yiddish",
    "<2ba>": "Bashkir",
    "<2fo>": "Faroese",
    "<2or>": "Odia (Oriya)",
    "<2xh>": "Xhosa",
    "<2su>": "Sundanese",
    "<2kl>": "Kalaallisut",
    "<2ny>": "Chichewa",
    "<2sm>": "Samoan",
    "<2sn>": "Shona",
    "<2co>": "Corsican",
    "<2zu>": "Zulu",
    "<2ig>": "Igbo",
    "<2yo>": "Yoruba",
    "<2pap>": "Papiamento",
    "<2st>": "Sesotho",
    "<2haw>": "Hawaiian",
    "<2as>": "Assamese",
    "<2oc>": "Occitan",
    "<2cv>": "Chuvash",
    "<2lus>": "Mizo",
    "<2tet>": "Tetum",
    "<2gsw>": "Swiss German",
    "<2sah>": "Yakut",
    "<2br>": "Breton",
    "<2rm>": "Romansh",
    "<2sa>": "Sanskrit",
    "<2bo>": "Tibetan",
    "<2om>": "Oromo",
    "<2se>": "N. Sami",
    "<2ce>": "Chechen",
    "<2cnh>": "Hakha Chin",
    # Page 18
    "<2ilo>": "Ilocano",
    "<2hil>": "Hiligaynon",
    "<2udm>": "Udmurt",
    "<2os>": "Ossetian",
    "<2lg>": "Luganda",
    "<2ti>": "Tigrinya",
    "<2vec>": "Venetian",
    "<2ts>": "Tsonga",
    "<2tyv>": "Tuvinian",
    "<2kbd>": "Kabardian",
    "<2ee>": "Ewe",
    "<2iba>": "Iban",
    "<2av>": "Avar",
    "<2kha>": "Khasi",
    "<2to>": "Tonga (Tonga Islands)",
    "<2tn>": "Tswana",
    "<2nso>": "Sepedi",
    "<2fj>": "Fijian",
    "<2zza>": "Zaza",
    "<2ak>": "Twi",
    "<2ada>": "Adangme",
    "<2otq>": "Querétaro Otomi",
    "<2dz>": "Dzongkha",
    "<2bua>": "Buryat",
    "<2cfm>": "Falam Chin",
    "<2ln>": "Lingala",
    "<2chm>": "Meadow Mari",
    "<2gn>": "Guarani",
    "<2krc>": "Karachay-Balkar",
    "<2wa>": "Walloon",
    "<2hif>": "Fiji Hindi",
    "<2yua>": "Yucateco",
    "<2srn>": "Sranan Tongo",
    "<2war>": "Waray (Philippines)",
    "<2rom>": "Romani",
    "<2bik>": "Central Bikol",
    "<2pam>": "Pampanga",
    "<2sg>": "Sango",
    "<2lu>": "Luba-Katanga",
    "<2ady>": "Adyghe",
    "<2kbp>": "Kabiyè",
    "<2syr>": "Syriac",
    "<2ltg>": "Latgalian",
    "<2myv>": "Erzya",
    "<2iso>": "Isoko",
    "<2kac>": "Kachin",
    "<2bho>": "Bhojpuri",
    "<2ay>": "Aymara",
    "<2kum>": "Kumyk",
    "<2qu>": "Quechua",
    "<2za>": "Zhuang",
    "<2pag>": "Pangasinan",
    "<2ngu>": "Guerrero Nahuatl",
    "<2ve>": "Venda",
    "<2pck>": "Paite Chin",
    "<2zap>": "Zapotec",
    "<2tyz>": "Tày",
    "<2hui>": "Huli",
    "<2bbc>": "Batak Toba",
    "<2tzo>": "Tzotzil",
    "<2tiv>": "Tiv",
    "<2ksd>": "Kuanua",
    "<2gom>": "Goan Konkani",
    "<2min>": "Minangkabau",
    "<2ang>": "Old English",
    "<2nhe>": "E. Huasteca Nahuatl",
    "<2bgp>": "E. Baluchi",
    "<2nzi>": "Nzima",
    "<2nnb>": "Nande",
    "<2nv>": "Navajo",
    # Disabled entry, kept commented out as in the original:
    # '<2zxx>': 'Noise',
    "<2bci>": "Baoulé",
    "<2kv>": "Komi",
    "<2new>": "Newari",
    "<2mps>": "Dadibi",
    "<2alt>": "S. Altai",
    "<2meu>": "Motu",
    "<2bew>": "Betawi",
    "<2fon>": "Fon",
    "<2iu>": "Inuktitut",
    "<2abt>": "Ambulas",
    # Page 19
    "<2mgh>": "Makhuwa-Meetto",
    "<2mnw>": "Mon",
    "<2tvl>": "Tuvalu",
    "<2dov>": "Dombe",
    "<2tlh>": "Klingon",
    "<2ho>": "Hiri Motu",
    "<2kw>": "Cornish",
    "<2mrj>": "Hill Mari",
    "<2meo>": "Kedah Malay",
    "<2crh>": "Crimean Tatar",
    "<2mbt>": "Matigsalug Manobo",
    "<2emp>": "N. Emberá",
    "<2ace>": "Achinese",
    "<2ium>": "Iu Mien",
    "<2mam>": "Mam",
    "<2gym>": "Ngäbere",
    "<2mai>": "Maithili",
    "<2crs>": "Seselwa Creole French",
    "<2pon>": "Pohnpeian",
    "<2ubu>": "Umbu-Ungu",
    "<2fip>": "Fipa",
    "<2quc>": "K’iche’",
    "<2gv>": "Manx",
    "<2kj>": "Kuanyama",
    "<2btx>": "Batak Karo",
    "<2ape>": "Bukiyip",
    "<2chk>": "Chuukese",
    "<2rcf>": "Réunion Creole French",
    "<2shn>": "Shan",
    "<2tzh>": "Tzeltal",
    "<2mdf>": "Moksha",
    "<2ppk>": "Uma",
    "<2ss>": "Swati",
    "<2gag>": "Gagauz",
    "<2cab>": "Garifuna",
    "<2kri>": "Krio",
    "<2seh>": "Sena",
    "<2ibb>": "Ibibio",
    "<2tbz>": "Ditammari",
    "<2bru>": "E. Bru",
    "<2enq>": "Enga",
    "<2ach>": "Acoli",
    "<2cuk>": "San Blas Kuna",
    "<2kmb>": "Kimbundu",
    "<2wo>": "Wolof",
    "<2kek>": "Kekchí",
    "<2qub>": "Huallaga Huánuco Quechua",
    "<2tab>": "Tabassaran",
    "<2bts>": "Batak Simalungun",
    "<2kos>": "Kosraean",
    "<2rwo>": "Rawa",
    "<2cak>": "Kaqchikel",
    "<2tuc>": "Mutu",
    "<2bum>": "Bulu",
    "<2cjk>": "Chokwe",
    "<2gil>": "Gilbertese",
    "<2stq>": "Saterfriesisch",
    "<2tsg>": "Tausug",
    "<2quh>": "S. Bolivian Quechua",
    "<2mak>": "Makasar",
    "<2arn>": "Mapudungun",
    "<2ban>": "Balinese",
    "<2jiv>": "Shuar",
    "<2sja>": "Epena",
    "<2yap>": "Yapese",
    "<2tcy>": "Tulu",
    "<2toj>": "Tojolabal",
    "<2twu>": "Termanu",
    "<2xal>": "Kalmyk",
    "<2amu>": "Guerrero Amuzgo",
    "<2rmc>": "Carpathian Romani",
    "<2hus>": "Huastec",
    "<2nia>": "Nias",
    "<2kjh>": "Khakas",
    "<2bm>": "Bambara",
    "<2guh>": "Guahibo",
    "<2mas>": "Masai",
    "<2acf>": "St Lucian Creole French",
    "<2dtp>": "Kadazan Dusun",
    "<2ksw>": "S’gaw Karen",
    "<2bzj>": "Belize Kriol English",
    # Page 20
    "<2din>": "Dinka",
    "<2zne>": "Zande",
    "<2mad>": "Madurese",
    "<2msi>": "Sabah Malay",
    "<2mag>": "Magahi",
    "<2mkn>": "Kupang Malay",
    "<2kg>": "Kongo",
    "<2lhu>": "Lahu",
    "<2ch>": "Chamorro",
    "<2qvi>": "Imbabura H. Quichua",
    "<2mh>": "Marshallese",
    "<2djk>": "E. Maroon Creole",
    "<2sus>": "Susu",
    "<2mfe>": "Morisien",
    "<2srm>": "Saramaccan",
    "<2dyu>": "Dyula",
    "<2ctu>": "Chol",
    "<2gui>": "E. Bolivian Guaraní",
    "<2pau>": "Palauan",
    "<2inb>": "Inga",
    "<2bi>": "Bislama",
    "<2mni>": "Meiteilon (Manipuri)",
    "<2guc>": "Wayuu",
    "<2jam>": "Jamaican Creole English",
    "<2wal>": "Wolaytta",
    "<2jac>": "Popti’",
    "<2bas>": "Basa (Cameroon)",
    "<2gor>": "Gorontalo",
    "<2skr>": "Saraiki",
    "<2nyu>": "Nyungwe",
    "<2noa>": "Woun Meu",
    "<2sda>": "Toraja-Sa’dan",
    "<2gub>": "Guajajára",
    "<2nog>": "Nogai",
    "<2cni>": "Asháninka",
    "<2teo>": "Teso",
    "<2tdx>": "Tandroy-Mahafaly Malagasy",
    "<2sxn>": "Sangir",
    "<2rki>": "Rakhine",
    "<2nr>": "South Ndebele",
    "<2frp>": "Arpitan",
    "<2alz>": "Alur",
    "<2taj>": "E. Tamang",
    "<2lrc>": "N. Luri",
    "<2cce>": "Chopi",
    "<2rn>": "Rundi",
    "<2jvn>": "Caribbean Javanese",
    "<2hvn>": "Sabu",
    "<2nij>": "Ngaju",
    "<2dwr>": "Dawro",
    "<2izz>": "Izii",
    "<2msm>": "Agusan Manobo",
    "<2bus>": "Bokobaru",
    "<2ktu>": "Kituba (DRC)",
    "<2chr>": "Cherokee",
    "<2maz>": "Central Mazahua",
    "<2tzj>": "Tz’utujil",
    "<2suz>": "Sunwar",
    "<2knj>": "W. Kanjobal",
    "<2bim>": "Bimoba",
    "<2gvl>": "Gulay",
    "<2bqc>": "Boko (Benin)",
    "<2tca>": "Ticuna",
    "<2pis>": "Pijin",
    "<2prk>": "Parauk",
    "<2laj>": "Lango (Uganda)",
    "<2mel>": "Central Melanau",
    "<2qxr>": "Cañar H. Quichua",
    "<2niq>": "Nandi",
    "<2ahk>": "Akha",
    "<2shp>": "Shipibo-Conibo",
    "<2hne>": "Chhattisgarhi",
    "<2spp>": "Supyyire Senoufo",
    "<2koi>": "Komi-Permyak",
    "<2krj>": "Kinaray-A",
    "<2quf>": "Lambayeque Quechua",
    "<2luz>": "S. Luri",
    "<2agr>": "Aguaruna",
    "<2tsc>": "Tswa",
    "<2mqy>": "Manggarai",
    "<2gof>": "Gofa",
    # Page 21
    "<2gbm>": "Garhwali",
    "<2miq>": "Mískito",
    "<2dje>": "Zarma",
    "<2awa>": "Awadhi",
    "<2bjj>": "Kanauji",
    "<2qvz>": "N. Pastaza Quichua",
    "<2sjp>": "Surjapuri",
    "<2tll>": "Tetela",
    "<2raj>": "Rajasthani",
    "<2kjg>": "Khmu",
    "<2bgz>": "Banggai",
    "<2quy>": "Ayacucho Quechua",
    "<2cbk>": "Chavacano",
    "<2akb>": "Batak Angkola",
    "<2oj>": "Ojibwa",
    "<2ify>": "Keley-I Kallahan",
    "<2mey>": "Hassaniyya",
    # NOTE(review): "Kashimiri" looks like a misspelling of "Kashmiri". Left
    # unchanged because callers may match on the exact string -- confirm
    # against the paper's table before correcting.
    "<2ks>": "Kashimiri",
    "<2cac>": "Chuj",
    "<2brx>": "Bodo (India)",
    "<2qup>": "S. Pastaza Quechua",
    "<2syl>": "Sylheti",
    "<2jax>": "Jambi Malay",
    "<2ff>": "Fulfulde",
    "<2ber>": "Tamazight (Tfng)",
    "<2tks>": "Takestani",
    "<2trp>": "Kok Borok",
    "<2mrw>": "Maranao",
    "<2adh>": "Adhola",
    "<2smt>": "Simte",
    "<2srr>": "Serer",
    "<2ffm>": "Maasina Fulfulde",
    "<2qvc>": "Cajamarca Quechua",
    "<2mtr>": "Mewari",
    "<2ann>": "Obolo",
    "<2kaa-Latn>": "Kara-Kalpak (Latn)",
    "<2aa>": "Afar",
    "<2noe>": "Nimadi",
    "<2nut>": "Nung (Viet Nam)",
    "<2gyn>": "Guyanese Creole English",
    "<2kwi>": "Awa-Cuaiquer",
    "<2xmm>": "Manado Malay",
    "<2msb>": "Masbatenyo",
}