Spaces:
Running on Zero
Running on Zero
# Mapping of the BCP-47 codes used in the MADLAD-400 models to language names.
# Source table:
# [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/pdf/2309.04662)
#
# Every key is written in the form "<2{code}>" and every value is the
# human-readable language name for that code.
# NOTE(review): the "<2{code}>" form is presumably the language tag the models
# expect in their input -- confirm against the model documentation.
# The "# Page N" markers below appear to refer to pages of the paper linked
# above, so the entries can be cross-checked against it.
langid_to_language = {
    # Page 16
    "<2en>": "English",
    "<2ru>": "Russian",
    "<2es>": "Spanish",
    "<2fr>": "French",
    "<2de>": "German",
    "<2it>": "Italian",
    "<2pt>": "Portuguese",
    "<2pl>": "Polish",
    "<2nl>": "Dutch",
    "<2vi>": "Vietnamese",
    "<2tr>": "Turkish",
    "<2sv>": "Swedish",
    "<2id>": "Indonesian",
    "<2ro>": "Romanian",
    "<2cs>": "Czech",
    "<2zh>": "Mandarin Chinese",
    "<2hu>": "Hungarian",
    "<2ja>": "Japanese",
    "<2th>": "Thai",
    "<2fi>": "Finnish",
    "<2fa>": "Persian",
    "<2uk>": "Ukrainian",
    "<2da>": "Danish",
    "<2el>": "Greek",
    "<2no>": "Norwegian",
    "<2bg>": "Bulgarian",
    "<2sk>": "Slovak",
    "<2ko>": "Korean",
    "<2ar>": "Arabic",
    "<2lt>": "Lithuanian",
    "<2ca>": "Catalan",
    "<2sl>": "Slovenian",
    "<2he>": "Hebrew",
    "<2et>": "Estonian",
    "<2lv>": "Latvian",
    "<2hi>": "Hindi",
    "<2sq>": "Albanian",
    "<2ms>": "Malay",
    "<2az>": "Azerbaijani",
    "<2sr>": "Serbian",
    "<2ta>": "Tamil",
    "<2hr>": "Croatian",
    "<2kk>": "Kazakh",
    "<2is>": "Icelandic",
    "<2ml>": "Malayalam",
    "<2mr>": "Marathi",
    "<2te>": "Telugu",
    "<2af>": "Afrikaans",
    "<2gl>": "Galician",
    "<2fil>": "Filipino",
    "<2be>": "Belarusian",
    # Page 17
    "<2mk>": "Macedonian",
    "<2eu>": "Basque",
    "<2bn>": "Bengali",
    "<2ka>": "Georgian",
    "<2mn>": "Mongolian",
    "<2bs>": "Bosnian",
    "<2uz>": "Uzbek",
    "<2ur>": "Urdu",
    "<2sw>": "Swahili",
    "<2yue>": "Cantonese",
    "<2ne>": "Nepali",
    "<2kn>": "Kannada",
    "<2kaa>": "Kara-Kalpak",
    "<2gu>": "Gujarati",
    "<2si>": "Sinhala",
    "<2cy>": "Welsh",
    "<2eo>": "Esperanto",
    "<2la>": "Latin",
    "<2hy>": "Armenian",
    "<2ky>": "Kyrghyz",
    "<2tg>": "Tajik",
    "<2ga>": "Irish",
    "<2mt>": "Maltese",
    "<2my>": "Myanmar (Burmese)",
    "<2km>": "Khmer",
    "<2tt>": "Tatar",
    "<2so>": "Somali",
    "<2ku>": "Kurdish (Kurmanji)",
    "<2ps>": "Pashto",
    "<2pa>": "Punjabi",
    "<2rw>": "Kinyarwanda",
    "<2lo>": "Lao",
    "<2ha>": "Hausa",
    "<2dv>": "Dhivehi",
    "<2fy>": "W. Frisian",
    "<2lb>": "Luxembourgish",
    "<2ckb>": "Kurdish (Sorani)",
    "<2mg>": "Malagasy",
    "<2gd>": "Scottish Gaelic",
    "<2am>": "Amharic",
    "<2ug>": "Uyghur",
    "<2ht>": "Haitian Creole",
    "<2grc>": "Ancient Greek",
    "<2hmn>": "Hmong",
    "<2sd>": "Sindhi",
    "<2jv>": "Javanese",
    "<2mi>": "Maori",
    "<2tk>": "Turkmen",
    "<2ceb>": "Cebuano",
    "<2yi>": "Yiddish",
    "<2ba>": "Bashkir",
    "<2fo>": "Faroese",
    "<2or>": "Odia (Oriya)",
    "<2xh>": "Xhosa",
    "<2su>": "Sundanese",
    "<2kl>": "Kalaallisut",
    "<2ny>": "Chichewa",
    "<2sm>": "Samoan",
    "<2sn>": "Shona",
    "<2co>": "Corsican",
    "<2zu>": "Zulu",
    "<2ig>": "Igbo",
    "<2yo>": "Yoruba",
    "<2pap>": "Papiamento",
    "<2st>": "Sesotho",
    "<2haw>": "Hawaiian",
    "<2as>": "Assamese",
    "<2oc>": "Occitan",
    "<2cv>": "Chuvash",
    "<2lus>": "Mizo",
    "<2tet>": "Tetum",
    "<2gsw>": "Swiss German",
    "<2sah>": "Yakut",
    "<2br>": "Breton",
    "<2rm>": "Romansh",
    "<2sa>": "Sanskrit",
    "<2bo>": "Tibetan",
    "<2om>": "Oromo",
    "<2se>": "N. Sami",
    "<2ce>": "Chechen",
    "<2cnh>": "Hakha Chin",
    # Page 18
    "<2ilo>": "Ilocano",
    "<2hil>": "Hiligaynon",
    "<2udm>": "Udmurt",
    "<2os>": "Ossetian",
    "<2lg>": "Luganda",
    "<2ti>": "Tigrinya",
    "<2vec>": "Venetian",
    "<2ts>": "Tsonga",
    "<2tyv>": "Tuvinian",
    "<2kbd>": "Kabardian",
    "<2ee>": "Ewe",
    "<2iba>": "Iban",
    "<2av>": "Avar",
    "<2kha>": "Khasi",
    "<2to>": "Tonga (Tonga Islands)",
    "<2tn>": "Tswana",
    "<2nso>": "Sepedi",
    "<2fj>": "Fijian",
    "<2zza>": "Zaza",
    "<2ak>": "Twi",
    "<2ada>": "Adangme",
    "<2otq>": "Querétaro Otomi",
    "<2dz>": "Dzongkha",
    "<2bua>": "Buryat",
    "<2cfm>": "Falam Chin",
    "<2ln>": "Lingala",
    "<2chm>": "Meadow Mari",
    "<2gn>": "Guarani",
    "<2krc>": "Karachay-Balkar",
    "<2wa>": "Walloon",
    "<2hif>": "Fiji Hindi",
    "<2yua>": "Yucateco",
    "<2srn>": "Sranan Tongo",
    "<2war>": "Waray (Philippines)",
    "<2rom>": "Romani",
    "<2bik>": "Central Bikol",
    "<2pam>": "Pampanga",
    "<2sg>": "Sango",
    "<2lu>": "Luba-Katanga",
    "<2ady>": "Adyghe",
    "<2kbp>": "Kabiyè",
    "<2syr>": "Syriac",
    "<2ltg>": "Latgalian",
    "<2myv>": "Erzya",
    "<2iso>": "Isoko",
    "<2kac>": "Kachin",
    "<2bho>": "Bhojpuri",
    "<2ay>": "Aymara",
    "<2kum>": "Kumyk",
    "<2qu>": "Quechua",
    "<2za>": "Zhuang",
    "<2pag>": "Pangasinan",
    "<2ngu>": "Guerrero Nahuatl",
    "<2ve>": "Venda",
    "<2pck>": "Paite Chin",
    "<2zap>": "Zapotec",
    "<2tyz>": "Tày",
    "<2hui>": "Huli",
    "<2bbc>": "Batak Toba",
    "<2tzo>": "Tzotzil",
    "<2tiv>": "Tiv",
    "<2ksd>": "Kuanua",
    "<2gom>": "Goan Konkani",
    "<2min>": "Minangkabau",
    "<2ang>": "Old English",
    "<2nhe>": "E. Huasteca Nahuatl",
    "<2bgp>": "E. Baluchi",
    "<2nzi>": "Nzima",
    "<2nnb>": "Nande",
    "<2nv>": "Navajo",
    # The "Noise" entry was already commented out in the original; kept here
    # for reference only.
    # '<2zxx>': 'Noise',
    "<2bci>": "Baoulé",
    "<2kv>": "Komi",
    "<2new>": "Newari",
    "<2mps>": "Dadibi",
    "<2alt>": "S. Altai",
    "<2meu>": "Motu",
    "<2bew>": "Betawi",
    "<2fon>": "Fon",
    "<2iu>": "Inuktitut",
    "<2abt>": "Ambulas",
    # Page 19
    "<2mgh>": "Makhuwa-Meetto",
    "<2mnw>": "Mon",
    "<2tvl>": "Tuvalu",
    "<2dov>": "Dombe",
    "<2tlh>": "Klingon",
    "<2ho>": "Hiri Motu",
    "<2kw>": "Cornish",
    "<2mrj>": "Hill Mari",
    "<2meo>": "Kedah Malay",
    "<2crh>": "Crimean Tatar",
    "<2mbt>": "Matigsalug Manobo",
    "<2emp>": "N. Emberá",
    "<2ace>": "Achinese",
    "<2ium>": "Iu Mien",
    "<2mam>": "Mam",
    "<2gym>": "Ngäbere",
    "<2mai>": "Maithili",
    "<2crs>": "Seselwa Creole French",
    "<2pon>": "Pohnpeian",
    "<2ubu>": "Umbu-Ungu",
    "<2fip>": "Fipa",
    "<2quc>": "K’iche’",
    "<2gv>": "Manx",
    "<2kj>": "Kuanyama",
    "<2btx>": "Batak Karo",
    "<2ape>": "Bukiyip",
    "<2chk>": "Chuukese",
    "<2rcf>": "Réunion Creole French",
    "<2shn>": "Shan",
    "<2tzh>": "Tzeltal",
    "<2mdf>": "Moksha",
    "<2ppk>": "Uma",
    "<2ss>": "Swati",
    "<2gag>": "Gagauz",
    "<2cab>": "Garifuna",
    "<2kri>": "Krio",
    "<2seh>": "Sena",
    "<2ibb>": "Ibibio",
    "<2tbz>": "Ditammari",
    "<2bru>": "E. Bru",
    "<2enq>": "Enga",
    "<2ach>": "Acoli",
    "<2cuk>": "San Blas Kuna",
    "<2kmb>": "Kimbundu",
    "<2wo>": "Wolof",
    "<2kek>": "Kekchí",
    "<2qub>": "Huallaga Huánuco Quechua",
    "<2tab>": "Tabassaran",
    "<2bts>": "Batak Simalungun",
    "<2kos>": "Kosraean",
    "<2rwo>": "Rawa",
    "<2cak>": "Kaqchikel",
    "<2tuc>": "Mutu",
    "<2bum>": "Bulu",
    "<2cjk>": "Chokwe",
    "<2gil>": "Gilbertese",
    "<2stq>": "Saterfriesisch",
    "<2tsg>": "Tausug",
    "<2quh>": "S. Bolivian Quechua",
    "<2mak>": "Makasar",
    "<2arn>": "Mapudungun",
    "<2ban>": "Balinese",
    "<2jiv>": "Shuar",
    "<2sja>": "Epena",
    "<2yap>": "Yapese",
    "<2tcy>": "Tulu",
    "<2toj>": "Tojolabal",
    "<2twu>": "Termanu",
    "<2xal>": "Kalmyk",
    "<2amu>": "Guerrero Amuzgo",
    "<2rmc>": "Carpathian Romani",
    "<2hus>": "Huastec",
    "<2nia>": "Nias",
    "<2kjh>": "Khakas",
    "<2bm>": "Bambara",
    "<2guh>": "Guahibo",
    "<2mas>": "Masai",
    "<2acf>": "St Lucian Creole French",
    "<2dtp>": "Kadazan Dusun",
    "<2ksw>": "S’gaw Karen",
    "<2bzj>": "Belize Kriol English",
    # Page 20
    "<2din>": "Dinka",
    "<2zne>": "Zande",
    "<2mad>": "Madurese",
    "<2msi>": "Sabah Malay",
    "<2mag>": "Magahi",
    "<2mkn>": "Kupang Malay",
    "<2kg>": "Kongo",
    "<2lhu>": "Lahu",
    "<2ch>": "Chamorro",
    "<2qvi>": "Imbabura H. Quichua",
    "<2mh>": "Marshallese",
    "<2djk>": "E. Maroon Creole",
    "<2sus>": "Susu",
    "<2mfe>": "Morisien",
    "<2srm>": "Saramaccan",
    "<2dyu>": "Dyula",
    "<2ctu>": "Chol",
    "<2gui>": "E. Bolivian Guaraní",
    "<2pau>": "Palauan",
    "<2inb>": "Inga",
    "<2bi>": "Bislama",
    "<2mni>": "Meiteilon (Manipuri)",
    "<2guc>": "Wayuu",
    "<2jam>": "Jamaican Creole English",
    "<2wal>": "Wolaytta",
    "<2jac>": "Popti’",
    "<2bas>": "Basa (Cameroon)",
    "<2gor>": "Gorontalo",
    "<2skr>": "Saraiki",
    "<2nyu>": "Nyungwe",
    "<2noa>": "Woun Meu",
    "<2sda>": "Toraja-Sa’dan",
    "<2gub>": "Guajajára",
    "<2nog>": "Nogai",
    "<2cni>": "Asháninka",
    "<2teo>": "Teso",
    "<2tdx>": "Tandroy-Mahafaly Malagasy",
    "<2sxn>": "Sangir",
    "<2rki>": "Rakhine",
    "<2nr>": "South Ndebele",
    "<2frp>": "Arpitan",
    "<2alz>": "Alur",
    "<2taj>": "E. Tamang",
    "<2lrc>": "N. Luri",
    "<2cce>": "Chopi",
    "<2rn>": "Rundi",
    "<2jvn>": "Caribbean Javanese",
    "<2hvn>": "Sabu",
    "<2nij>": "Ngaju",
    "<2dwr>": "Dawro",
    "<2izz>": "Izii",
    "<2msm>": "Agusan Manobo",
    "<2bus>": "Bokobaru",
    "<2ktu>": "Kituba (DRC)",
    "<2chr>": "Cherokee",
    "<2maz>": "Central Mazahua",
    "<2tzj>": "Tz’utujil",
    "<2suz>": "Sunwar",
    "<2knj>": "W. Kanjobal",
    "<2bim>": "Bimoba",
    "<2gvl>": "Gulay",
    "<2bqc>": "Boko (Benin)",
    "<2tca>": "Ticuna",
    "<2pis>": "Pijin",
    "<2prk>": "Parauk",
    "<2laj>": "Lango (Uganda)",
    "<2mel>": "Central Melanau",
    "<2qxr>": "Cañar H. Quichua",
    "<2niq>": "Nandi",
    "<2ahk>": "Akha",
    "<2shp>": "Shipibo-Conibo",
    "<2hne>": "Chhattisgarhi",
    "<2spp>": "Supyyire Senoufo",
    "<2koi>": "Komi-Permyak",
    "<2krj>": "Kinaray-A",
    "<2quf>": "Lambayeque Quechua",
    "<2luz>": "S. Luri",
    "<2agr>": "Aguaruna",
    "<2tsc>": "Tswa",
    "<2mqy>": "Manggarai",
    "<2gof>": "Gofa",
    # Page 21
    "<2gbm>": "Garhwali",
    "<2miq>": "Mískito",
    "<2dje>": "Zarma",
    "<2awa>": "Awadhi",
    "<2bjj>": "Kanauji",
    "<2qvz>": "N. Pastaza Quichua",
    "<2sjp>": "Surjapuri",
    "<2tll>": "Tetela",
    "<2raj>": "Rajasthani",
    "<2kjg>": "Khmu",
    "<2bgz>": "Banggai",
    "<2quy>": "Ayacucho Quechua",
    "<2cbk>": "Chavacano",
    "<2akb>": "Batak Angkola",
    "<2oj>": "Ojibwa",
    "<2ify>": "Keley-I Kallahan",
    "<2mey>": "Hassaniyya",
    # Spelling corrected from "Kashimiri".
    "<2ks>": "Kashmiri",
    "<2cac>": "Chuj",
    "<2brx>": "Bodo (India)",
    "<2qup>": "S. Pastaza Quechua",
    "<2syl>": "Sylheti",
    "<2jax>": "Jambi Malay",
    "<2ff>": "Fulfulde",
    "<2ber>": "Tamazight (Tfng)",
    "<2tks>": "Takestani",
    "<2trp>": "Kok Borok",
    "<2mrw>": "Maranao",
    "<2adh>": "Adhola",
    "<2smt>": "Simte",
    "<2srr>": "Serer",
    "<2ffm>": "Maasina Fulfulde",
    "<2qvc>": "Cajamarca Quechua",
    "<2mtr>": "Mewari",
    "<2ann>": "Obolo",
    "<2kaa-Latn>": "Kara-Kalpak (Latn)",
    "<2aa>": "Afar",
    "<2noe>": "Nimadi",
    "<2nut>": "Nung (Viet Nam)",
    "<2gyn>": "Guyanese Creole English",
    "<2kwi>": "Awa-Cuaiquer",
    "<2xmm>": "Manado Malay",
    "<2msb>": "Masbatenyo",
}