#!/usr/bin/env python3
"""
Generate langid_mapping.py from MADLAD-400 paper Table 9.

Parses Table 9 (Section A.1, pages 16-22) from the MADLAD-400 paper PDF,
assigns regions using script-based defaults and language-specific overrides,
and writes the result to langmap/langid_mapping.py.

Usage:
    python scripts/generate_langmap.py <path-to-paper.pdf>

Dev dependencies (requirements-dev.txt):
    pdfplumber
"""

import re
import sys
from pathlib import Path

try:
    import pdfplumber
except ImportError:
    # pdfplumber is a dev-only dependency that is needed solely by
    # parse_table9(). Deferring the failure keeps the pure helpers
    # (fix_name_spacing, assign_region, write_mapping) importable without it;
    # parse_table9() raises a descriptive ImportError when it is missing.
    pdfplumber = None

# Pages 16-22 of the paper contain Table 9 (0-indexed: 15-21)
TABLE_PAGES = range(15, 22)

# On the last page of the table (page 22, 0-indexed 21), the table ends around
# y=430 and is followed by Section A.2. Stop processing words below this threshold.
PAGE_22_MAX_TOP = 440

# Column x-coordinate boundaries for Table 9
COL_BCP47_MIN = 100
COL_BCP47_MAX = 140
COL_NAME_MIN = 140
COL_NAME_MAX = 220
COL_SCRIPT_MIN = 220
COL_SCRIPT_MAX = 260
COL_DATA_MIN = 260

# Tolerance in pixels for matching name/script cells to BCP-47 cells in the same row.
# Some multi-line name cells have a slightly different y-position than the BCP-47 cell.
ROW_TOLERANCE = 5

# BCP-47 code pattern: lowercase base subtag, optional uppercase subtags (e.g.
# tly-IR, nan-Latn-TW)
BCP47_RE = re.compile(r"^[a-z]{2,}(-[A-Za-z0-9]+)*$")

# Default region based on writing script
SCRIPT_TO_REGION: dict[str, str] = {
    "Arab": "Middle East & North Africa",
    "Armn": "Europe",
    "Beng": "South Asia",
    "Cans": "Americas",
    "Cher": "Americas",
    "Cyrl": "Europe",
    "Deva": "South Asia",
    "Ethi": "Africa",
    "Geor": "Europe",
    "Grek": "Europe",
    "Gujr": "South Asia",
    "Guru": "South Asia",
    "Hang": "East Asia",
    "Hans": "East Asia",
    "Hant": "East Asia",
    "Hebr": "Middle East & North Africa",
    "Jpan": "East Asia",
    "Khmr": "Southeast Asia",
    "Knda": "South Asia",
    "Kore": "East Asia",
    "Laoo": "Southeast Asia",
    "Mlym": "South Asia",
    "Mong": "East Asia",
    "Mymr": "Southeast Asia",
    "Olck": "South Asia",
    "Orya": "South Asia",
    "Sinh": "South Asia",
    "Syrc": "Middle East & North Africa",
    "Taml": "South Asia",
    "Telu": "South Asia",
    "Tfng": "Africa",
    "Thaa": "South Asia",
    "Thai": "Southeast Asia",
    "Tibt": "East Asia",
}

# Language-specific overrides (primarily for Latin-script languages and corrections).
# Keys are matched against the full BCP-47 code, so a code with subtags
# (e.g. "kaa-Latn") needs its own entry.
LANGUAGE_OVERRIDES: dict[str, str] = {
    # Europe (Latin-script)
    "ang": "Europe",
    "br": "Europe",
    "bs": "Europe",
    "ca": "Europe",
    "co": "Europe",
    "cs": "Europe",
    "cy": "Europe",
    "da": "Europe",
    "de": "Europe",
    "en": "Europe",
    "eo": "Europe",
    "es": "Europe",
    "et": "Europe",
    "eu": "Europe",
    "fi": "Europe",
    "fo": "Europe",
    "fr": "Europe",
    "frp": "Europe",
    "fy": "Europe",
    "ga": "Europe",
    "gag": "Europe",
    "gd": "Europe",
    "gl": "Europe",
    "gsw": "Europe",
    "gv": "Europe",
    "hr": "Europe",
    "hu": "Europe",
    "is": "Europe",
    "it": "Europe",
    "kw": "Europe",
    "la": "Europe",
    "lb": "Europe",
    "lt": "Europe",
    "ltg": "Europe",
    "lv": "Europe",
    "mk": "Europe",
    "mt": "Europe",
    "nl": "Europe",
    "nn": "Europe",
    "no": "Europe",
    "oc": "Europe",
    "pl": "Europe",
    "pt": "Europe",
    "rm": "Europe",
    "rmc": "Europe",
    "ro": "Europe",
    "rom": "Europe",
    "se": "Europe",
    "sk": "Europe",
    "sl": "Europe",
    "sq": "Europe",
    "sr": "Europe",
    "stq": "Europe",
    "sv": "Europe",
    "vec": "Europe",
    "wa": "Europe",
    # Africa (Latin-script)
    "aa": "Africa",
    "ach": "Africa",
    "ada": "Africa",
    "adh": "Africa",
    "af": "Africa",
    "ak": "Africa",
    "alz": "Africa",
    "ann": "Africa",
    "bas": "Africa",
    "bci": "Africa",
    "bim": "Africa",
    "bm": "Africa",
    "bqc": "Africa",
    "bum": "Africa",
    "bus": "Africa",
    "cce": "Africa",
    "cjk": "Africa",
    "din": "Africa",
    "dje": "Africa",
    "dov": "Africa",
    "dwr": "Africa",
    "dyu": "Africa",
    "ee": "Africa",
    "ff": "Africa",
    "ffm": "Africa",
    "fip": "Africa",
    "fon": "Africa",
    "gub": "Americas",
    "gvl": "Africa",
    "ha": "Africa",
    "ig": "Africa",
    "ibb": "Africa",
    "iso": "Africa",
    "izz": "Africa",
    "kbp": "Africa",
    "kg": "Africa",
    "kj": "Africa",
    "kmb": "Africa",
    "kri": "Africa",
    "ktu": "Africa",
    "laj": "Africa",
    "lg": "Africa",
    "ln": "Africa",
    "lu": "Africa",
    "mas": "Africa",
    "mfe": "Africa",
    "mg": "Africa",
    "mgh": "Africa",
    "niq": "Africa",
    "nnb": "Africa",
    "nso": "Africa",
    "nr": "Africa",
    "ny": "Africa",
    "nyu": "Africa",
    "nzi": "Africa",
    "om": "Africa",
    "rn": "Africa",
    "rw": "Africa",
    "seh": "Africa",
    "sg": "Africa",
    "sn": "Africa",
    "so": "Africa",
    "spp": "Africa",
    "srr": "Africa",
    "ss": "Africa",
    "st": "Africa",
    "sus": "Africa",
    "sw": "Africa",
    "tbz": "Africa",
    "tdx": "Africa",
    "teo": "Africa",
    "tiv": "Africa",
    "tll": "Africa",
    "tn": "Africa",
    "ts": "Africa",
    "tsc": "Africa",
    "ve": "Africa",
    "wal": "Africa",
    "wo": "Africa",
    "xh": "Africa",
    "yo": "Africa",
    "zne": "Africa",
    "zu": "Africa",
    # Americas (Latin-script)
    "acf": "Americas",
    "agr": "Americas",
    "cab": "Americas",
    "amu": "Americas",
    "arn": "Americas",
    "ay": "Americas",
    "bzj": "Americas",
    "cac": "Americas",
    "cak": "Americas",
    "cni": "Americas",
    "ctu": "Americas",
    "cuk": "Americas",
    "djk": "Americas",
    "emp": "Americas",
    "gn": "Americas",
    "guc": "Americas",
    "guh": "Americas",
    "gui": "Americas",
    "gym": "Americas",
    "gyn": "Americas",
    "haw": "Americas",
    "ht": "Americas",
    "hus": "Americas",
    "inb": "Americas",
    "jac": "Americas",
    "jam": "Americas",
    "jiv": "Americas",
    "kek": "Americas",
    "kl": "Americas",
    "knj": "Americas",
    "kwi": "Americas",
    "mam": "Americas",
    "maz": "Americas",
    "miq": "Americas",
    "ngu": "Americas",
    "nhe": "Americas",
    "noa": "Americas",
    "nv": "Americas",
    "otq": "Americas",
    "pap": "Americas",
    "qu": "Americas",
    "qub": "Americas",
    "quc": "Americas",
    "quf": "Americas",
    "quh": "Americas",
    "qup": "Americas",
    "quy": "Americas",
    "qvc": "Americas",
    "qvi": "Americas",
    "qvz": "Americas",
    "qxr": "Americas",
    "rcf": "Americas",
    "shp": "Americas",
    "sja": "Americas",
    "srm": "Americas",
    "srn": "Americas",
    "tca": "Americas",
    "toj": "Americas",
    "tzh": "Americas",
    "tzj": "Americas",
    "tzo": "Americas",
    "yua": "Americas",
    "zap": "Americas",
    # Southeast Asia (Latin-script)
    "ace": "Southeast Asia",
    "ahk": "Southeast Asia",
    "akb": "Southeast Asia",
    "ban": "Southeast Asia",
    "bbc": "Southeast Asia",
    "bew": "Southeast Asia",
    "bgz": "Southeast Asia",
    "bik": "Southeast Asia",
    "bru": "Southeast Asia",
    "btx": "Southeast Asia",
    "bts": "Southeast Asia",
    "cbk": "Southeast Asia",
    "ceb": "Southeast Asia",
    "cfm": "Southeast Asia",
    "cjm": "Southeast Asia",
    "cnh": "Southeast Asia",
    "dtp": "Southeast Asia",
    "fil": "Southeast Asia",
    "gor": "Southeast Asia",
    "hil": "Southeast Asia",
    "hmn": "Southeast Asia",
    "iba": "Southeast Asia",
    "id": "Southeast Asia",
    "ify": "Southeast Asia",
    "ilo": "Southeast Asia",
    "ium": "Southeast Asia",
    "jax": "Southeast Asia",
    "jv": "Southeast Asia",
    "jvn": "Southeast Asia",
    "kac": "Southeast Asia",
    "krj": "Southeast Asia",
    "lhu": "Southeast Asia",
    "mad": "Southeast Asia",
    "mak": "Southeast Asia",
    "mbt": "Southeast Asia",
    "mel": "Southeast Asia",
    "meo": "Southeast Asia",
    "min": "Southeast Asia",
    "mkn": "Southeast Asia",
    "mqy": "Southeast Asia",
    "mrw": "Southeast Asia",
    "ms": "Southeast Asia",
    "msi": "Southeast Asia",
    "msb": "Southeast Asia",
    "msm": "Southeast Asia",
    "nia": "Southeast Asia",
    "nij": "Southeast Asia",
    "nut": "Southeast Asia",
    "pag": "Southeast Asia",
    "pam": "Southeast Asia",
    "pck": "Southeast Asia",
    "ppk": "Southeast Asia",
    "prk": "Southeast Asia",
    "sda": "Southeast Asia",
    "su": "Southeast Asia",
    "sxn": "Southeast Asia",
    "tet": "Southeast Asia",
    "tl": "Southeast Asia",
    "tsg": "Southeast Asia",
    "tyz": "Southeast Asia",
    "vi": "Southeast Asia",
    "war": "Southeast Asia",
    "xmm": "Southeast Asia",
    # Oceania (Latin-script)
    "abt": "Oceania",
    "ape": "Oceania",
    "bi": "Oceania",
    "ch": "Oceania",
    "chk": "Oceania",
    "enq": "Oceania",
    "fj": "Oceania",
    "gil": "Oceania",
    "hif": "Oceania",
    "ho": "Oceania",
    "hui": "Oceania",
    "hvn": "Oceania",
    "kos": "Oceania",
    "ksd": "Oceania",
    "meu": "Oceania",
    "mh": "Oceania",
    "mi": "Oceania",
    "mps": "Oceania",
    "pau": "Oceania",
    "pis": "Oceania",
    "pon": "Oceania",
    "rwo": "Oceania",
    "sm": "Oceania",
    "to": "Oceania",
    "tuc": "Oceania",
    "tvl": "Oceania",
    "twu": "Oceania",
    "ubu": "Oceania",
    "yap": "Oceania",
    # South Asia (Latin-script)
    "kha": "South Asia",
    "lus": "South Asia",
    "smt": "South Asia",
    "trp": "South Asia",
    # East Asia (Latin-script)
    "mn": "East Asia",
    "za": "East Asia",
    # Middle East & North Africa (Latin-script)
    "bgp": "Middle East & North Africa",
    "ku": "Middle East & North Africa",
    "tr": "Middle East & North Africa",
    "zza": "Middle East & North Africa",
    # Central Asia (Latin-script)
    "az": "Central Asia",
    "kaa-Latn": "Central Asia",
    "kk": "Central Asia",
    "ky": "Central Asia",
    "tg": "Central Asia",
    "tk": "Central Asia",
    "uz": "Central Asia",
    # Explicit regions for further languages: some move away from the
    # Cyrl -> Europe script default, others pin "Europe" explicitly.
    "ba": "Central Asia",
    "ce": "Europe",
    "cv": "Europe",
    "kv": "Europe",
    "os": "Europe",
    "sah": "East Asia",
    "tt": "Europe",
    # Non-geographic / constructed languages: assign to Americas (creator's continent)
    "crs": "Americas",  # Seselwa Creole French (Indian Ocean) — grouped with Atlantic/Indian Ocean creoles
    "tlh": "Americas",  # Klingon — constructed language
    "zxx": "Americas",  # Noise/non-linguistic content — placeholder region
}


def fix_name_spacing(name: str) -> str:
    """Insert spaces lost during PDF word extraction.

    PDF word extraction sometimes concatenates adjacent words when they appear
    close together on the page. Three repairs are applied, in this order:

    1. a space between a lowercase (including accented Latin-1 lowercase)
       letter and a following ASCII uppercase letter;
    2. a space before an opening parenthesis that directly follows a word
       character;
    3. a space after a period that is directly followed by an ASCII
       uppercase letter.

    Args:
        name: Language name as joined from the extracted words.

    Returns:
        The name with the missing spaces restored.
    """
    # "HaitianCreole" -> "Haitian Creole"
    name = re.sub(r"([a-zà-öø-ÿ])([A-Z])", r"\1 \2", name)
    # "Malay(Arabic)" -> "Malay (Arabic)"
    name = re.sub(r"(\w)\(", r"\1 (", name)
    # "St.Lucian" -> "St. Lucian"
    name = re.sub(r"\.([A-Z])", r". \1", name)
    return name


def _row_texts(column: list[dict], row_top: float) -> list[str]:
    """Return the texts of *column* words on the row at *row_top*, left to right."""
    same_row = [w for w in column if abs(w["top"] - row_top) <= ROW_TOLERANCE]
    return [w["text"] for w in sorted(same_row, key=lambda w: w["x0"])]


def parse_table9(pdf_path: str) -> list[dict[str, str]]:
    """Parse Table 9 from the MADLAD-400 paper using word-based extraction.

    pdfplumber's extract_table() does not work on this PDF because the table
    lines are not detected as table borders. Instead we extract words and
    assign them to columns by x-coordinate, then group by row using a
    y-position tolerance to handle multi-line name cells.

    Args:
        pdf_path: Path to the MADLAD-400 paper PDF.

    Returns:
        One dict per accepted table row with the keys "bcp47", "name" and
        "script". Rows are skipped when the first-column word is not a BCP-47
        code (or is "total"/"median"), when the name or script cell is empty,
        or when every data cell is "-".

    Raises:
        ImportError: If pdfplumber is not installed.
    """
    if pdfplumber is None:
        raise ImportError(
            "pdfplumber is required to parse the paper PDF; "
            "install the dev dependencies from requirements-dev.txt"
        )

    entries: list[dict[str, str]] = []
    # Derived from TABLE_PAGES so the cut-off follows the page range if it changes.
    last_table_page = TABLE_PAGES[-1]

    with pdfplumber.open(pdf_path) as pdf:
        for page_num in TABLE_PAGES:
            page = pdf.pages[page_num]

            # On the last table page, cut off below the table (avoids Section A.2 code block)
            max_top = PAGE_22_MAX_TOP if page_num == last_table_page else float("inf")
            words = [w for w in page.extract_words() if w["top"] <= max_top]

            # Partition words by column
            bcp47_col = [w for w in words if COL_BCP47_MIN <= w["x0"] < COL_BCP47_MAX]
            name_col = [w for w in words if COL_NAME_MIN <= w["x0"] < COL_NAME_MAX]
            script_col = [w for w in words if COL_SCRIPT_MIN <= w["x0"] < COL_SCRIPT_MAX]
            data_col = [w for w in words if w["x0"] >= COL_DATA_MIN]

            for bcp_w in bcp47_col:
                bcp47 = bcp_w["text"]

                # Skip non-BCP-47 text (headers, prose, code)
                if not BCP47_RE.match(bcp47) or bcp47 in ("total", "median"):
                    continue

                # Collect the cells in the same row (within ROW_TOLERANCE pixels)
                bcp_top = bcp_w["top"]
                name = fix_name_spacing(" ".join(_row_texts(name_col, bcp_top)))
                script = " ".join(_row_texts(script_col, bcp_top))
                data_texts = _row_texts(data_col, bcp_top)

                # Require both name and script to be present
                if not name or not script:
                    continue

                # Skip rows where all data columns are "-" (self-audit omissions)
                if data_texts and all(d.strip() == "-" for d in data_texts):
                    continue

                entries.append({"bcp47": bcp47, "name": name, "script": script})

    return entries


def assign_region(bcp47: str, script: str) -> str:
    """Assign a geographic region based on language code and script.

    Args:
        bcp47: Full BCP-47 code as it appears in the table (e.g. "kaa-Latn").
        script: Script code from the table (e.g. "Cyrl").

    Returns:
        The LANGUAGE_OVERRIDES entry for *bcp47* if there is one, otherwise the
        SCRIPT_TO_REGION default for *script*, otherwise "Other".
    """
    if bcp47 in LANGUAGE_OVERRIDES:
        return LANGUAGE_OVERRIDES[bcp47]
    if script in SCRIPT_TO_REGION:
        return SCRIPT_TO_REGION[script]
    return "Other"


def write_mapping(entries: list[dict[str, str]], output_path: Path) -> None:
    """Write the language mapping to a Python file.

    The file defines a single dict, ``langid_to_language``, keyed by the
    ``<2xx>`` language token, with entries sorted by region and then by name.

    Args:
        entries: Rows with the keys "bcp47", "name" and "script".
        output_path: Destination file; it is overwritten.
    """
    mapping: list[tuple[str, str, str]] = [
        (
            f"<2{entry['bcp47']}>",
            entry["name"],
            assign_region(entry["bcp47"], entry["script"]),
        )
        for entry in entries
    ]

    # Sort by region, then by name within each region
    mapping.sort(key=lambda x: (x[2], x[1]))

    lines = [
        "# Auto-generated by scripts/generate_langmap.py from MADLAD-400 paper Table 9 (Section A.1)",
        "# Source: https://arxiv.org/pdf/2309.04662",
        f"# {len(mapping)} languages with training data (excludes 79 self-audit omissions)",
        "#",
        "# To regenerate:",
        "# python scripts/generate_langmap.py <path-to-paper.pdf>",
        "langid_to_language = {",
    ]
    for token, name, region in mapping:
        # Escape backslashes first, then quotes, so the name round-trips
        # through a double-quoted Python string literal.
        escaped_name = name.replace("\\", "\\\\").replace('"', '\\"')
        lines.append(f' "{token}": {{"name": "{escaped_name}", "region": "{region}"}},')
    lines.append("}")
    lines.append("")

    # Python source is UTF-8 by default; do not depend on the locale encoding,
    # since language names contain non-ASCII characters.
    output_path.write_text("\n".join(lines), encoding="utf-8")


def main() -> None:
    """Parse the PDF named on the command line and write langmap/langid_mapping.py.

    Exits with status 1 when the PDF argument is missing, or (after writing
    the file) when any language fell back to the "Other" region.
    """
    if len(sys.argv) != 2:
        print("Usage: python scripts/generate_langmap.py <path-to-paper.pdf>")
        sys.exit(1)

    pdf_path = sys.argv[1]
    entries = parse_table9(pdf_path)
    print(f"Parsed {len(entries)} languages from Table 9")

    # Report region assignments
    other: list[tuple[str, str]] = [
        (entry["bcp47"], entry["name"])
        for entry in entries
        if assign_region(entry["bcp47"], entry["script"]) == "Other"
    ]

    if other:
        print(f"\n{len(other)} languages assigned to 'Other' region (need manual override):")
        for bcp47, name in other:
            print(f" {bcp47}: {name}")
        print("\nAdd overrides to LANGUAGE_OVERRIDES in the script and re-run.")

    output_path = Path(__file__).parent.parent / "langmap" / "langid_mapping.py"
    write_mapping(entries, output_path)
    print(f"\nWrote {len(entries)} entries to {output_path}")

    # The file is still written so it can be inspected, but signal failure.
    if other:
        sys.exit(1)


if __name__ == "__main__":
    main()