madlad-400-translate / langmap /langid_mapping.py
Daryl Lim
Rename LangMap/ to langmap/ and fix package issues
a760c43
Raw
History Blame
11.2 kB
# Mapping from the language tokens used by the MADLAD-400 models to English
# language names.
#
# Each key is a BCP-47 language code wrapped in the "<2xx>" token form
# (e.g. "<2en>"); each value is the human-readable language name.
# The "# Page N" markers below refer to the language tables in the paper:
# [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/pdf/2309.04662)
langid_to_language = {
    # Page 16
    "<2en>": "English",
    "<2ru>": "Russian",
    "<2es>": "Spanish",
    "<2fr>": "French",
    "<2de>": "German",
    "<2it>": "Italian",
    "<2pt>": "Portuguese",
    "<2pl>": "Polish",
    "<2nl>": "Dutch",
    "<2vi>": "Vietnamese",
    "<2tr>": "Turkish",
    "<2sv>": "Swedish",
    "<2id>": "Indonesian",
    "<2ro>": "Romanian",
    "<2cs>": "Czech",
    "<2zh>": "Mandarin Chinese",
    "<2hu>": "Hungarian",
    "<2ja>": "Japanese",
    "<2th>": "Thai",
    "<2fi>": "Finnish",
    "<2fa>": "Persian",
    "<2uk>": "Ukrainian",
    "<2da>": "Danish",
    "<2el>": "Greek",
    "<2no>": "Norwegian",
    "<2bg>": "Bulgarian",
    "<2sk>": "Slovak",
    "<2ko>": "Korean",
    "<2ar>": "Arabic",
    "<2lt>": "Lithuanian",
    "<2ca>": "Catalan",
    "<2sl>": "Slovenian",
    "<2he>": "Hebrew",
    "<2et>": "Estonian",
    "<2lv>": "Latvian",
    "<2hi>": "Hindi",
    "<2sq>": "Albanian",
    "<2ms>": "Malay",
    "<2az>": "Azerbaijani",
    "<2sr>": "Serbian",
    "<2ta>": "Tamil",
    "<2hr>": "Croatian",
    "<2kk>": "Kazakh",
    "<2is>": "Icelandic",
    "<2ml>": "Malayalam",
    "<2mr>": "Marathi",
    "<2te>": "Telugu",
    "<2af>": "Afrikaans",
    "<2gl>": "Galician",
    "<2fil>": "Filipino",
    "<2be>": "Belarusian",
    # Page 17
    "<2mk>": "Macedonian",
    "<2eu>": "Basque",
    "<2bn>": "Bengali",
    "<2ka>": "Georgian",
    "<2mn>": "Mongolian",
    "<2bs>": "Bosnian",
    "<2uz>": "Uzbek",
    "<2ur>": "Urdu",
    "<2sw>": "Swahili",
    "<2yue>": "Cantonese",
    "<2ne>": "Nepali",
    "<2kn>": "Kannada",
    "<2kaa>": "Kara-Kalpak",
    "<2gu>": "Gujarati",
    "<2si>": "Sinhala",
    "<2cy>": "Welsh",
    "<2eo>": "Esperanto",
    "<2la>": "Latin",
    "<2hy>": "Armenian",
    "<2ky>": "Kyrghyz",
    "<2tg>": "Tajik",
    "<2ga>": "Irish",
    "<2mt>": "Maltese",
    "<2my>": "Myanmar (Burmese)",
    "<2km>": "Khmer",
    "<2tt>": "Tatar",
    "<2so>": "Somali",
    "<2ku>": "Kurdish (Kurmanji)",
    "<2ps>": "Pashto",
    "<2pa>": "Punjabi",
    "<2rw>": "Kinyarwanda",
    "<2lo>": "Lao",
    "<2ha>": "Hausa",
    "<2dv>": "Dhivehi",
    "<2fy>": "W. Frisian",
    "<2lb>": "Luxembourgish",
    "<2ckb>": "Kurdish (Sorani)",
    "<2mg>": "Malagasy",
    "<2gd>": "Scottish Gaelic",
    "<2am>": "Amharic",
    "<2ug>": "Uyghur",
    "<2ht>": "Haitian Creole",
    "<2grc>": "Ancient Greek",
    "<2hmn>": "Hmong",
    "<2sd>": "Sindhi",
    "<2jv>": "Javanese",
    "<2mi>": "Maori",
    "<2tk>": "Turkmen",
    "<2ceb>": "Cebuano",
    "<2yi>": "Yiddish",
    "<2ba>": "Bashkir",
    "<2fo>": "Faroese",
    "<2or>": "Odia (Oriya)",
    "<2xh>": "Xhosa",
    "<2su>": "Sundanese",
    "<2kl>": "Kalaallisut",
    "<2ny>": "Chichewa",
    "<2sm>": "Samoan",
    "<2sn>": "Shona",
    "<2co>": "Corsican",
    "<2zu>": "Zulu",
    "<2ig>": "Igbo",
    "<2yo>": "Yoruba",
    "<2pap>": "Papiamento",
    "<2st>": "Sesotho",
    "<2haw>": "Hawaiian",
    "<2as>": "Assamese",
    "<2oc>": "Occitan",
    "<2cv>": "Chuvash",
    "<2lus>": "Mizo",
    "<2tet>": "Tetum",
    "<2gsw>": "Swiss German",
    "<2sah>": "Yakut",
    "<2br>": "Breton",
    "<2rm>": "Romansh",
    "<2sa>": "Sanskrit",
    "<2bo>": "Tibetan",
    "<2om>": "Oromo",
    "<2se>": "N. Sami",
    "<2ce>": "Chechen",
    "<2cnh>": "Hakha Chin",
    # Page 18
    "<2ilo>": "Ilocano",
    "<2hil>": "Hiligaynon",
    "<2udm>": "Udmurt",
    "<2os>": "Ossetian",
    "<2lg>": "Luganda",
    "<2ti>": "Tigrinya",
    "<2vec>": "Venetian",
    "<2ts>": "Tsonga",
    "<2tyv>": "Tuvinian",
    "<2kbd>": "Kabardian",
    "<2ee>": "Ewe",
    "<2iba>": "Iban",
    "<2av>": "Avar",
    "<2kha>": "Khasi",
    "<2to>": "Tonga (Tonga Islands)",
    "<2tn>": "Tswana",
    "<2nso>": "Sepedi",
    "<2fj>": "Fijian",
    "<2zza>": "Zaza",
    "<2ak>": "Twi",
    "<2ada>": "Adangme",
    "<2otq>": "Querétaro Otomi",
    "<2dz>": "Dzongkha",
    "<2bua>": "Buryat",
    "<2cfm>": "Falam Chin",
    "<2ln>": "Lingala",
    "<2chm>": "Meadow Mari",
    "<2gn>": "Guarani",
    "<2krc>": "Karachay-Balkar",
    "<2wa>": "Walloon",
    "<2hif>": "Fiji Hindi",
    "<2yua>": "Yucateco",
    "<2srn>": "Sranan Tongo",
    "<2war>": "Waray (Philippines)",
    "<2rom>": "Romani",
    "<2bik>": "Central Bikol",
    "<2pam>": "Pampanga",
    "<2sg>": "Sango",
    "<2lu>": "Luba-Katanga",
    "<2ady>": "Adyghe",
    "<2kbp>": "Kabiyè",
    "<2syr>": "Syriac",
    "<2ltg>": "Latgalian",
    "<2myv>": "Erzya",
    "<2iso>": "Isoko",
    "<2kac>": "Kachin",
    "<2bho>": "Bhojpuri",
    "<2ay>": "Aymara",
    "<2kum>": "Kumyk",
    "<2qu>": "Quechua",
    "<2za>": "Zhuang",
    "<2pag>": "Pangasinan",
    "<2ngu>": "Guerrero Nahuatl",
    "<2ve>": "Venda",
    "<2pck>": "Paite Chin",
    "<2zap>": "Zapotec",
    "<2tyz>": "Tày",
    "<2hui>": "Huli",
    "<2bbc>": "Batak Toba",
    "<2tzo>": "Tzotzil",
    "<2tiv>": "Tiv",
    "<2ksd>": "Kuanua",
    "<2gom>": "Goan Konkani",
    "<2min>": "Minangkabau",
    "<2ang>": "Old English",
    "<2nhe>": "E. Huasteca Nahuatl",
    "<2bgp>": "E. Baluchi",
    "<2nzi>": "Nzima",
    "<2nnb>": "Nande",
    "<2nv>": "Navajo",
    # "zxx" is the BCP-47 code for non-linguistic content; this entry was
    # already disabled in the mapping and is kept disabled here.
    # '<2zxx>': 'Noise',
    "<2bci>": "Baoulé",
    "<2kv>": "Komi",
    "<2new>": "Newari",
    "<2mps>": "Dadibi",
    "<2alt>": "S. Altai",
    "<2meu>": "Motu",
    "<2bew>": "Betawi",
    "<2fon>": "Fon",
    "<2iu>": "Inuktitut",
    "<2abt>": "Ambulas",
    # Page 19
    "<2mgh>": "Makhuwa-Meetto",
    "<2mnw>": "Mon",
    "<2tvl>": "Tuvalu",
    "<2dov>": "Dombe",
    "<2tlh>": "Klingon",
    "<2ho>": "Hiri Motu",
    "<2kw>": "Cornish",
    "<2mrj>": "Hill Mari",
    "<2meo>": "Kedah Malay",
    "<2crh>": "Crimean Tatar",
    "<2mbt>": "Matigsalug Manobo",
    "<2emp>": "N. Emberá",
    "<2ace>": "Achinese",
    "<2ium>": "Iu Mien",
    "<2mam>": "Mam",
    "<2gym>": "Ngäbere",
    "<2mai>": "Maithili",
    "<2crs>": "Seselwa Creole French",
    "<2pon>": "Pohnpeian",
    "<2ubu>": "Umbu-Ungu",
    "<2fip>": "Fipa",
    "<2quc>": "K’iche’",
    "<2gv>": "Manx",
    "<2kj>": "Kuanyama",
    "<2btx>": "Batak Karo",
    "<2ape>": "Bukiyip",
    "<2chk>": "Chuukese",
    "<2rcf>": "Réunion Creole French",
    "<2shn>": "Shan",
    "<2tzh>": "Tzeltal",
    "<2mdf>": "Moksha",
    "<2ppk>": "Uma",
    "<2ss>": "Swati",
    "<2gag>": "Gagauz",
    "<2cab>": "Garifuna",
    "<2kri>": "Krio",
    "<2seh>": "Sena",
    "<2ibb>": "Ibibio",
    "<2tbz>": "Ditammari",
    "<2bru>": "E. Bru",
    "<2enq>": "Enga",
    "<2ach>": "Acoli",
    "<2cuk>": "San Blas Kuna",
    "<2kmb>": "Kimbundu",
    "<2wo>": "Wolof",
    "<2kek>": "Kekchí",
    "<2qub>": "Huallaga Huánuco Quechua",
    "<2tab>": "Tabassaran",
    "<2bts>": "Batak Simalungun",
    "<2kos>": "Kosraean",
    "<2rwo>": "Rawa",
    "<2cak>": "Kaqchikel",
    "<2tuc>": "Mutu",
    "<2bum>": "Bulu",
    "<2cjk>": "Chokwe",
    "<2gil>": "Gilbertese",
    "<2stq>": "Saterfriesisch",
    "<2tsg>": "Tausug",
    "<2quh>": "S. Bolivian Quechua",
    "<2mak>": "Makasar",
    "<2arn>": "Mapudungun",
    "<2ban>": "Balinese",
    "<2jiv>": "Shuar",
    "<2sja>": "Epena",
    "<2yap>": "Yapese",
    "<2tcy>": "Tulu",
    "<2toj>": "Tojolabal",
    "<2twu>": "Termanu",
    "<2xal>": "Kalmyk",
    "<2amu>": "Guerrero Amuzgo",
    "<2rmc>": "Carpathian Romani",
    "<2hus>": "Huastec",
    "<2nia>": "Nias",
    "<2kjh>": "Khakas",
    "<2bm>": "Bambara",
    "<2guh>": "Guahibo",
    "<2mas>": "Masai",
    "<2acf>": "St Lucian Creole French",
    "<2dtp>": "Kadazan Dusun",
    "<2ksw>": "S’gaw Karen",
    "<2bzj>": "Belize Kriol English",
    # Page 20
    "<2din>": "Dinka",
    "<2zne>": "Zande",
    "<2mad>": "Madurese",
    "<2msi>": "Sabah Malay",
    "<2mag>": "Magahi",
    "<2mkn>": "Kupang Malay",
    "<2kg>": "Kongo",
    "<2lhu>": "Lahu",
    "<2ch>": "Chamorro",
    "<2qvi>": "Imbabura H. Quichua",
    "<2mh>": "Marshallese",
    "<2djk>": "E. Maroon Creole",
    "<2sus>": "Susu",
    "<2mfe>": "Morisien",
    "<2srm>": "Saramaccan",
    "<2dyu>": "Dyula",
    "<2ctu>": "Chol",
    "<2gui>": "E. Bolivian Guaraní",
    "<2pau>": "Palauan",
    "<2inb>": "Inga",
    "<2bi>": "Bislama",
    "<2mni>": "Meiteilon (Manipuri)",
    "<2guc>": "Wayuu",
    "<2jam>": "Jamaican Creole English",
    "<2wal>": "Wolaytta",
    "<2jac>": "Popti’",
    "<2bas>": "Basa (Cameroon)",
    "<2gor>": "Gorontalo",
    "<2skr>": "Saraiki",
    "<2nyu>": "Nyungwe",
    "<2noa>": "Woun Meu",
    "<2sda>": "Toraja-Sa’dan",
    "<2gub>": "Guajajára",
    "<2nog>": "Nogai",
    "<2cni>": "Asháninka",
    "<2teo>": "Teso",
    "<2tdx>": "Tandroy-Mahafaly Malagasy",
    "<2sxn>": "Sangir",
    "<2rki>": "Rakhine",
    "<2nr>": "South Ndebele",
    "<2frp>": "Arpitan",
    "<2alz>": "Alur",
    "<2taj>": "E. Tamang",
    "<2lrc>": "N. Luri",
    "<2cce>": "Chopi",
    "<2rn>": "Rundi",
    "<2jvn>": "Caribbean Javanese",
    "<2hvn>": "Sabu",
    "<2nij>": "Ngaju",
    "<2dwr>": "Dawro",
    "<2izz>": "Izii",
    "<2msm>": "Agusan Manobo",
    "<2bus>": "Bokobaru",
    "<2ktu>": "Kituba (DRC)",
    "<2chr>": "Cherokee",
    "<2maz>": "Central Mazahua",
    "<2tzj>": "Tz’utujil",
    "<2suz>": "Sunwar",
    "<2knj>": "W. Kanjobal",
    "<2bim>": "Bimoba",
    "<2gvl>": "Gulay",
    "<2bqc>": "Boko (Benin)",
    "<2tca>": "Ticuna",
    "<2pis>": "Pijin",
    "<2prk>": "Parauk",
    "<2laj>": "Lango (Uganda)",
    "<2mel>": "Central Melanau",
    "<2qxr>": "Cañar H. Quichua",
    "<2niq>": "Nandi",
    "<2ahk>": "Akha",
    "<2shp>": "Shipibo-Conibo",
    "<2hne>": "Chhattisgarhi",
    "<2spp>": "Supyyire Senoufo",
    "<2koi>": "Komi-Permyak",
    "<2krj>": "Kinaray-A",
    "<2quf>": "Lambayeque Quechua",
    "<2luz>": "S. Luri",
    "<2agr>": "Aguaruna",
    "<2tsc>": "Tswa",
    "<2mqy>": "Manggarai",
    "<2gof>": "Gofa",
    # Page 21
    "<2gbm>": "Garhwali",
    "<2miq>": "Mískito",
    "<2dje>": "Zarma",
    "<2awa>": "Awadhi",
    "<2bjj>": "Kanauji",
    "<2qvz>": "N. Pastaza Quichua",
    "<2sjp>": "Surjapuri",
    "<2tll>": "Tetela",
    "<2raj>": "Rajasthani",
    "<2kjg>": "Khmu",
    "<2bgz>": "Banggai",
    "<2quy>": "Ayacucho Quechua",
    "<2cbk>": "Chavacano",
    "<2akb>": "Batak Angkola",
    "<2oj>": "Ojibwa",
    "<2ify>": "Keley-I Kallahan",
    "<2mey>": "Hassaniyya",
    # Spelling corrected from "Kashimiri".
    "<2ks>": "Kashmiri",
    "<2cac>": "Chuj",
    "<2brx>": "Bodo (India)",
    "<2qup>": "S. Pastaza Quechua",
    "<2syl>": "Sylheti",
    "<2jax>": "Jambi Malay",
    "<2ff>": "Fulfulde",
    "<2ber>": "Tamazight (Tfng)",
    "<2tks>": "Takestani",
    "<2trp>": "Kok Borok",
    "<2mrw>": "Maranao",
    "<2adh>": "Adhola",
    "<2smt>": "Simte",
    "<2srr>": "Serer",
    "<2ffm>": "Maasina Fulfulde",
    "<2qvc>": "Cajamarca Quechua",
    "<2mtr>": "Mewari",
    "<2ann>": "Obolo",
    "<2kaa-Latn>": "Kara-Kalpak (Latn)",
    "<2aa>": "Afar",
    "<2noe>": "Nimadi",
    "<2nut>": "Nung (Viet Nam)",
    "<2gyn>": "Guyanese Creole English",
    "<2kwi>": "Awa-Cuaiquer",
    "<2xmm>": "Manado Malay",
    "<2msb>": "Masbatenyo",
}