# Mapping the BCP-47 codes used in the MADLAD-400 models to the language names.
# Restricted to languages evaluated in Table 14, Section A.9 of:
# [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/pdf/2309.04662)
# Evaluation sets: WMT, Flores-200, NTREX, Gatones
#
# Keys are the target-language tags in the form "<2xx>", where "xx" is the
# language code; values are the human-readable English language names.
# The "# Page N" markers group the entries by the page they were taken from.
langid_to_language = {
    # Page 16
    "<2en>": "English",
    "<2ru>": "Russian",
    "<2es>": "Spanish",
    "<2fr>": "French",
    "<2de>": "German",
    "<2it>": "Italian",
    "<2pt>": "Portuguese",
    "<2pl>": "Polish",
    "<2nl>": "Dutch",
    "<2vi>": "Vietnamese",
    "<2tr>": "Turkish",
    "<2sv>": "Swedish",
    "<2id>": "Indonesian",
    "<2ro>": "Romanian",
    "<2cs>": "Czech",
    "<2zh>": "Mandarin Chinese",
    "<2hu>": "Hungarian",
    "<2ja>": "Japanese",
    "<2th>": "Thai",
    "<2fi>": "Finnish",
    "<2fa>": "Persian",
    "<2uk>": "Ukrainian",
    "<2da>": "Danish",
    "<2el>": "Greek",
    "<2no>": "Norwegian",
    "<2bg>": "Bulgarian",
    "<2sk>": "Slovak",
    "<2ko>": "Korean",
    "<2ar>": "Arabic",
    "<2lt>": "Lithuanian",
    "<2ca>": "Catalan",
    "<2sl>": "Slovenian",
    "<2he>": "Hebrew",
    "<2et>": "Estonian",
    "<2lv>": "Latvian",
    "<2hi>": "Hindi",
    "<2sq>": "Albanian",
    "<2ms>": "Malay",
    "<2az>": "Azerbaijani",
    "<2sr>": "Serbian",
    "<2ta>": "Tamil",
    "<2hr>": "Croatian",
    "<2kk>": "Kazakh",
    "<2is>": "Icelandic",
    "<2ml>": "Malayalam",
    "<2mr>": "Marathi",
    "<2te>": "Telugu",
    "<2af>": "Afrikaans",
    "<2gl>": "Galician",
    "<2fil>": "Filipino",
    "<2be>": "Belarusian",
    # Page 17
    "<2mk>": "Macedonian",
    "<2eu>": "Basque",
    "<2bn>": "Bengali",
    "<2ka>": "Georgian",
    "<2mn>": "Mongolian",
    "<2bs>": "Bosnian",
    "<2uz>": "Uzbek",
    "<2ur>": "Urdu",
    "<2sw>": "Swahili",
    "<2yue>": "Cantonese",
    "<2ne>": "Nepali",
    "<2kn>": "Kannada",
    "<2gu>": "Gujarati",
    "<2si>": "Sinhala",
    "<2cy>": "Welsh",
    "<2eo>": "Esperanto",
    "<2hy>": "Armenian",
    "<2ky>": "Kyrghyz",
    "<2tg>": "Tajik",
    "<2ga>": "Irish",
    "<2mt>": "Maltese",
    "<2my>": "Myanmar (Burmese)",
    "<2km>": "Khmer",
    "<2tt>": "Tatar",
    "<2so>": "Somali",
    "<2ku>": "Kurdish (Kurmanji)",
    "<2ps>": "Pashto",
    "<2pa>": "Punjabi",
    "<2rw>": "Kinyarwanda",
    "<2lo>": "Lao",
    "<2ha>": "Hausa",
    "<2dv>": "Dhivehi",
    "<2lb>": "Luxembourgish",
    "<2ckb>": "Kurdish (Sorani)",
    "<2mg>": "Malagasy",
    "<2gd>": "Scottish Gaelic",
    "<2am>": "Amharic",
    "<2ug>": "Uyghur",
    "<2ht>": "Haitian Creole",
    "<2hmn>": "Hmong",
    "<2sd>": "Sindhi",
    "<2jv>": "Javanese",
    "<2mi>": "Maori",
    "<2tk>": "Turkmen",
    "<2ceb>": "Cebuano",
    "<2yi>": "Yiddish",
    "<2ba>": "Bashkir",
    "<2fo>": "Faroese",
    "<2or>": "Odia (Oriya)",
    "<2xh>": "Xhosa",
    "<2su>": "Sundanese",
    "<2kl>": "Kalaallisut",
    "<2ny>": "Chichewa",
    "<2sm>": "Samoan",
    "<2sn>": "Shona",
    "<2zu>": "Zulu",
    "<2ig>": "Igbo",
    "<2yo>": "Yoruba",
    "<2pap>": "Papiamento",
    "<2st>": "Sesotho",
    "<2as>": "Assamese",
    "<2oc>": "Occitan",
    "<2cv>": "Chuvash",
    "<2lus>": "Mizo",
    "<2sa>": "Sanskrit",
    "<2bo>": "Tibetan",
    "<2om>": "Oromo",
    "<2ce>": "Chechen",
    # Page 18
    "<2ilo>": "Ilocano",
    "<2lg>": "Luganda",
    "<2ti>": "Tigrinya",
    "<2vec>": "Venetian",
    "<2ts>": "Tsonga",
    "<2ee>": "Ewe",
    "<2av>": "Avar",
    "<2to>": "Tonga (Tonga Islands)",
    "<2tn>": "Tswana",
    "<2nso>": "Sepedi",
    "<2fj>": "Fijian",
    "<2zza>": "Zaza",
    "<2ak>": "Twi",
    "<2dz>": "Dzongkha",
    "<2ln>": "Lingala",
    "<2gn>": "Guarani",
    "<2yua>": "Yucateco",
    "<2war>": "Waray (Philippines)",
    "<2sg>": "Sango",
    "<2ady>": "Adyghe",
    "<2kbp>": "Kabiyè",
    "<2ltg>": "Latgalian",
    "<2iso>": "Isoko",
    "<2kac>": "Kachin",
    "<2bho>": "Bhojpuri",
    "<2ay>": "Aymara",
    "<2qu>": "Quechua",
    "<2pag>": "Pangasinan",
    "<2ve>": "Venda",
    "<2bbc>": "Batak Toba",
    "<2tiv>": "Tiv",
    "<2gom>": "Goan Konkani",
    "<2min>": "Minangkabau",
    "<2bci>": "Baoulé",
    "<2bew>": "Betawi",
    "<2fon>": "Fon",
    # Page 19
    "<2meo>": "Kedah Malay",
    "<2crh>": "Crimean Tatar",
    "<2ace>": "Achinese",
    "<2mai>": "Maithili",
    "<2quc>": "K\u2019iche\u2019",
    "<2shn>": "Shan",
    "<2ss>": "Swati",
    "<2kri>": "Krio",
    "<2kmb>": "Kimbundu",
    "<2wo>": "Wolof",
    "<2ban>": "Balinese",
    "<2bm>": "Bambara",
    # Page 20
    "<2din>": "Dinka",
    "<2mad>": "Madurese",
    "<2mag>": "Magahi",
    "<2kg>": "Kongo",
    "<2dyu>": "Dyula",
    "<2mni>": "Meiteilon (Manipuri)",
    "<2skr>": "Saraiki",
    "<2rn>": "Rundi",
    "<2chr>": "Cherokee",
    "<2hne>": "Chhattisgarhi",
    # Page 21
    "<2awa>": "Awadhi",
    "<2quy>": "Ayacucho Quechua",
    "<2mey>": "Hassaniyya",
    "<2ks>": "Kashmiri",  # spelling corrected from "Kashimiri"
    "<2ff>": "Fulfulde",
    "<2ber>": "Tamazight (Tfng)",
}