from langmap.langid_mapping import langid_to_language


def test_langid_mapping_is_nonempty():
    """The mapping must contain at least one entry."""
    entry_count = len(langid_to_language)
    assert entry_count > 0


def test_langid_mapping_count():
    """The mapping must contain exactly 183 entries."""
    entry_count = len(langid_to_language)
    assert entry_count == 183, f"Expected 183 evaluated languages, got {entry_count}"


def test_keys_are_bcp47_tokens():
    """Every key must be wrapped as ``<2...>``."""
    for token in langid_to_language:
        has_prefix = token.startswith("<2")
        assert has_prefix, f"Key {token} does not start with '<2'"
        has_suffix = token.endswith(">")
        assert has_suffix, f"Key {token} does not end with '>'"


def test_values_are_nonempty_strings():
    """Every value must be a string with at least one non-whitespace character."""
    for token, name in langid_to_language.items():
        # Short-circuit keeps ``strip`` from being called on non-string values.
        is_nonblank_text = isinstance(name, str) and bool(name.strip())
        assert is_nonblank_text, f"Empty language name for {token}"


def test_no_leading_or_trailing_whitespace():
    """Language names must already be stripped of surrounding whitespace."""
    for token, name in langid_to_language.items():
        trimmed = name.strip()
        assert name == trimmed, (
            f"Language name for {token} has leading/trailing whitespace: {name!r}"
        )


def test_no_duplicate_language_names():
    """No two keys may map to the same language name."""
    all_names = list(langid_to_language.values())
    distinct_names = set(all_names)
    assert len(all_names) == len(distinct_names), "Duplicate language names found"


def test_known_evaluated_languages_present():
    """Check a sample of languages from each evaluation set listed in Table 14."""
    known_names = set(langid_to_language.values())
    # Ordered (evaluation set label, sample languages) pairs; checked in this order.
    samples_by_eval_set = (
        ("WMT", ("Czech", "German", "Finnish", "French", "Kazakh")),
        ("Flores-200", ("Achinese", "Balinese", "Fon", "Swati", "Venetian")),
        ("NTREX", ("Dhivehi", "Hmong", "Hassaniyya", "Cantonese", "Venda")),
        ("Gatones", ("Adyghe", "Cherokee", "Goan Konkani", "Saraiki", "Zaza")),
    )
    for eval_set, sample_languages in samples_by_eval_set:
        for language in sample_languages:
            assert language in known_names, (
                f"{language} ({eval_set}) missing from mapping"
            )


def test_removed_languages_absent():
    """Languages outside the Table 14 evaluation sets must not appear as values."""
    known_names = set(langid_to_language.values())
    removed_languages = (
        "Hawaiian",
        "Latin",
        "Corsican",
        "Kara-Kalpak",
        "Ancient Greek",
        "W. Frisian",
        "Breton",
    )
    for language in removed_languages:
        assert language not in known_names, (
            f"{language} should have been removed (not in Table 14)"
        )